From 99ae6968b06f0ff3f1f187c39a4086960d42a5ad Mon Sep 17 00:00:00 2001
From: kapsakcj <kapsakcj@gmail.com>
Date: Tue, 31 Oct 2023 14:10:36 -0400
Subject: [PATCH 1/7] big changes to BWA task: new optional inputs for memory
 and docker; memory default increased from 8 to 16; broke up one liner to
 produce intermediate SAM file for downstream usage; added 2 samtools view
 commands to generate separate BAMs for aligned and unaligned reads; updated
 cmds for FASTQ extraction from 2 BAMS; added indexing for unaligned BAM;
 added 4 new outputs: read1&2 unaligned, sorted_bam_unaligned, and
 sorted_bam_unaligned_bai

---
 tasks/alignment/task_bwa.wdl | 71 +++++++++++++++++++++++++++++-------
 1 file changed, 57 insertions(+), 14 deletions(-)

diff --git a/tasks/alignment/task_bwa.wdl b/tasks/alignment/task_bwa.wdl
index ad067ea90..e8abe636a 100644
--- a/tasks/alignment/task_bwa.wdl
+++ b/tasks/alignment/task_bwa.wdl
@@ -7,7 +7,9 @@ task bwa {
     String samplename
     File? reference_genome
     Int cpu = 6
+    Int memory = 16
     Int disk_size = 100
+    String docker = "us-docker.pkg.dev/general-theiagen/staphb/ivar:1.3.1-titan"
   }
   command <<<
     # date and version control
@@ -25,25 +27,62 @@ task bwa {
       ref_genome="/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta"  
     fi
 
-    # Map with BWA MEM
-    echo "Running bwa mem -t ~{cpu} ${ref_genome} ~{read1} ~{read2} | samtools sort | samtools view -F 4 -o ~{samplename}.sorted.bam "
+    # Map with BWA MEM; pipe to samtools sort to write sorted SAM file
     bwa mem \
-    -t ~{cpu} \
-    "${ref_genome}" \
-    ~{read1} ~{read2} |\
-    samtools sort | samtools view -F 4 -o ~{samplename}.sorted.bam
+      -t ~{cpu} \
+      "${ref_genome}" \
+      ~{read1} \
+      ~{read2} | \
+    samtools sort -@ ~{cpu} > ~{samplename}.sorted.sam
+    
+    # convert SAM to BAM that only includes aligned reads
+    samtools view \
+      -@ ~{cpu} \
+      -F 4 \
+      -b \
+      -o ~{samplename}.sorted.bam \
+      ~{samplename}.sorted.sam
 
+    # convert SAM to BAM that only includes unaligned reads
+    samtools view \
+      -@ ~{cpu} \
+      -f 4 \
+      -b \
+      -o ~{samplename}.sorted.unaligned-reads.bam \
+      ~{samplename}.sorted.sam
+
+    # if read2 was provided by user, extract both read1 and read2 from aligned and unaligned BAMs
     if [[ ! -z "~{read2}" ]]; then
-      echo "processing paired reads"
-      samtools fastq -F4 -1 ~{samplename}_R1.fastq.gz -2 ~{samplename}_R2.fastq.gz ~{samplename}.sorted.bam
+      echo "Generating FASTQs for aligned and unaligned paired reads"
+      samtools fastq \
+        -F 4 \
+        -1 ~{samplename}_R1.fastq.gz \
+        -2 ~{samplename}_R2.fastq.gz \
+        ~{samplename}.sorted.bam
+      # note the lowercase 'f' here is imporant
+      samtools fastq \
+        -f 4 \
+        -1 ~{samplename}_unaligned_R1.fastq.gz \
+        -2 ~{samplename}_unaligned_R2.fastq.gz \
+        ~{samplename}.sorted.unaligned-reads.bam
     else
-      echo "processing single-end reads"
-      samtools fastq -F4 ~{samplename}.sorted.bam | gzip > ~{samplename}_R1.fastq.gz
+      echo "Generating FASTQs for aligned and unaligned single-end reads"
+      samtools fastq \
+        -@ ~{cpu} \
+        -F 4 \
+        -1 ~{samplename}_R1.fastq.gz \
+        ~{samplename}.sorted.bam 
+      # again, lowercase 'f' is important for getting all unaligned reads
+      samtools fastq \
+        -@ ~{cpu} \
+        -f 4 \
+        -1 ~{samplename}_unaligned_R1.fastq.gz \
+        ~{samplename}.sorted.unaligned-reads.bam
     fi
 
-
     # index BAMs
-    samtools index ~{samplename}.sorted.bam
+    samtools index ~{samplename}.sorted.bam 
+    samtools index ~{samplename}.sorted.unaligned-reads.bam
   >>>
   output {
     String bwa_version = read_string("BWA_VERSION")
@@ -52,10 +91,14 @@ task bwa {
     File sorted_bai = "${samplename}.sorted.bam.bai"
     File read1_aligned = "~{samplename}_R1.fastq.gz"
     File? read2_aligned = "~{samplename}_R2.fastq.gz"
+    File read1_unaligned = "~{samplename}_unaligned_R1.fastq.gz"
+    File? read2_unaligned = "~{samplename}_unaligned_R2.fastq.gz"
+    File sorted_bam_unaligned = "~{samplename}.sorted.unaligned-reads.bam"
+    File sorted_bam_unaligned_bai = "~{samplename}.sorted.unaligned-reads.bam.bai"
   }
   runtime {
-    docker: "us-docker.pkg.dev/general-theiagen/staphb/ivar:1.3.1-titan"
-    memory: "8 GB"
+    docker: docker
+    memory: memory + " GB"
     cpu: cpu
     disks:  "local-disk " + disk_size + " SSD"
     disk: disk_size + " GB" # TES

From 4abfd6127ff956af0d2898c51d4eccae99b9440b Mon Sep 17 00:00:00 2001
From: kapsakcj <kapsakcj@gmail.com>
Date: Mon, 13 Nov 2023 14:52:17 -0500
Subject: [PATCH 2/7] added 4 new file outputs to wf_ivar_consensus subworkflow
 and theiacov_illumina_pe workflow; both ran successfully with miniwdl

---
 workflows/theiacov/wf_theiacov_illumina_pe.wdl | 4 ++++
 workflows/utilities/wf_ivar_consensus.wdl      | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/workflows/theiacov/wf_theiacov_illumina_pe.wdl b/workflows/theiacov/wf_theiacov_illumina_pe.wdl
index 8dff1ee42..9e6d6afa6 100644
--- a/workflows/theiacov/wf_theiacov_illumina_pe.wdl
+++ b/workflows/theiacov/wf_theiacov_illumina_pe.wdl
@@ -299,6 +299,10 @@ workflow theiacov_illumina_pe {
     File? read2_aligned = ivar_consensus.read2_aligned
     String aligned_bam = select_first([ivar_consensus.aligned_bam, ""])
     String aligned_bai = select_first([ivar_consensus.aligned_bai, ""])
+    File? read1_unaligned = ivar_consensus.read1_unaligned
+    File? read2_unaligned = ivar_consensus.read2_unaligned
+    File? sorted_bam_unaligned = ivar_consensus.sorted_bam_unaligned
+    File? sorted_bam_unaligned_bai = ivar_consensus.sorted_bam_unaligned_bai
     # Read Alignment - primer trimming outputs
     Float? primer_trimmed_read_percent = ivar_consensus.primer_trimmed_read_percent
     String? ivar_version_primtrim = ivar_consensus.ivar_version_primtrim
diff --git a/workflows/utilities/wf_ivar_consensus.wdl b/workflows/utilities/wf_ivar_consensus.wdl
index e88ead2a1..1a902f901 100644
--- a/workflows/utilities/wf_ivar_consensus.wdl
+++ b/workflows/utilities/wf_ivar_consensus.wdl
@@ -75,6 +75,10 @@ workflow ivar_consensus {
     File? read2_aligned = bwa.read2_aligned
     File aligned_bam =  select_first([primer_trim.trim_sorted_bam, bwa.sorted_bam, ""]) # set default values for select_first() to avoid workflow failures
     File aligned_bai = select_first([primer_trim.trim_sorted_bai, bwa.sorted_bai, ""])
+    File read1_unaligned = bwa.read1_unaligned
+    File? read2_unaligned = bwa.read2_unaligned
+    File sorted_bam_unaligned = bwa.sorted_bam_unaligned
+    File sorted_bam_unaligned_bai = bwa.sorted_bam_unaligned_bai
     
     # primer trimming outputs
     Float? primer_trimmed_read_percent = primer_trim.primer_trimmed_read_percent

From 3d26d0d98d08fbdf724552c4accaa8d06d695234 Mon Sep 17 00:00:00 2001
From: kapsakcj <kapsakcj@gmail.com>
Date: Mon, 13 Nov 2023 15:02:27 -0500
Subject: [PATCH 3/7] added 3 new optional outputs to theiacov_illumina_se:
 read1_unaligned, sorted_bam_unaligned, sorted_bam_unaligned_bai. tested fine
 w miniwdl

---
 workflows/theiacov/wf_theiacov_illumina_se.wdl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/workflows/theiacov/wf_theiacov_illumina_se.wdl b/workflows/theiacov/wf_theiacov_illumina_se.wdl
index 349251177..96f297152 100644
--- a/workflows/theiacov/wf_theiacov_illumina_se.wdl
+++ b/workflows/theiacov/wf_theiacov_illumina_se.wdl
@@ -212,6 +212,9 @@ workflow theiacov_illumina_se {
     String? assembly_method = ivar_consensus.assembly_method_nonflu
     File? aligned_bam = ivar_consensus.aligned_bam
     File? aligned_bai = ivar_consensus.aligned_bai
+    File? read1_unaligned = ivar_consensus.read1_unaligned
+    File? sorted_bam_unaligned = ivar_consensus.sorted_bam_unaligned
+    File? sorted_bam_unaligned_bai = ivar_consensus.sorted_bam_unaligned_bai
     # Read Alignment - primer trimming outputs
     Float? primer_trimmed_read_percent = ivar_consensus.primer_trimmed_read_percent
     String? ivar_version_primtrim = ivar_consensus.ivar_version_primtrim

From 88e1d69533ba95f0fcc50c7d85ea5b3802ae3787 Mon Sep 17 00:00:00 2001
From: kapsakcj <kapsakcj@gmail.com>
Date: Tue, 14 Nov 2023 11:48:37 -0500
Subject: [PATCH 4/7] added in cpu option for samtools fastq commands. still
 need to figure out why so much data is being streamed to STDOUT and not
 directly to a file

---
 tasks/alignment/task_bwa.wdl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tasks/alignment/task_bwa.wdl b/tasks/alignment/task_bwa.wdl
index e8abe636a..425e69e51 100644
--- a/tasks/alignment/task_bwa.wdl
+++ b/tasks/alignment/task_bwa.wdl
@@ -55,12 +55,14 @@ task bwa {
     if [[ ! -z "~{read2}" ]]; then
       echo "Generating FASTQs for aligned and unaligned paired reads"
       samtools fastq \
+        -@ ~{cpu} \
         -F 4 \
         -1 ~{samplename}_R1.fastq.gz \
         -2 ~{samplename}_R2.fastq.gz \
         ~{samplename}.sorted.bam
       # note the lowercase 'f' here is imporant
       samtools fastq \
+        -@ ~{cpu} \
         -f 4 \
         -1 ~{samplename}_unaligned_R1.fastq.gz \
         -2 ~{samplename}_unaligned_R2.fastq.gz \

From b5b41dd66c55b3c2c78d04afd75de1102ea6716f Mon Sep 17 00:00:00 2001
From: kapsakcj <kapsakcj@gmail.com>
Date: Thu, 7 Dec 2023 12:14:51 -0500
Subject: [PATCH 5/7] updates to bwa task, ran successfully w miniwdl. Added
 echo statements to count reads before and after alignment & re-production of
 FASTQs; broke samtools sort into multiple lines for readability; added
 helpful echo statements throughout; fixed the samtools fastq commands that
 were not appropriately outputting FASTQ files

---
 tasks/alignment/task_bwa.wdl | 39 ++++++++++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 6 deletions(-)

diff --git a/tasks/alignment/task_bwa.wdl b/tasks/alignment/task_bwa.wdl
index 425e69e51..cfec703d8 100644
--- a/tasks/alignment/task_bwa.wdl
+++ b/tasks/alignment/task_bwa.wdl
@@ -27,13 +27,18 @@ task bwa {
       ref_genome="/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta"  
     fi
 
+    echo "input R1 has $(zcat ~{read1} | grep -c '^@') reads as input"
+    echo "input R2 has $(zcat ~{read2} | grep -c '^@') reads as input"
+
     # Map with BWA MEM; pipe to samtools sort to write sorted SAM file
     bwa mem \
       -t ~{cpu} \
       "${ref_genome}" \
       ~{read1} \
       ~{read2} | \
-    samtools sort -@ ~{cpu} > ~{samplename}.sorted.sam
+    samtools sort \
+      -@ ~{cpu} - \
+      > ~{samplename}.sorted.sam
     
     # convert SAM to BAM that only includes aligned reads
     samtools view \
@@ -51,15 +56,21 @@ task bwa {
       -o ~{samplename}.sorted.unaligned-reads.bam \
       ~{samplename}.sorted.sam
 
+    # see here for "samtools fastq" options: https://www.htslib.org/doc/samtools-fasta.html
+    # TL;DR is that "samtools fastq -1 R1.fastq -2 R2.fastq" works with paired-end inputs and will output R1 and R2 reads to separate files due to tags in the SAM & BAM file
+
+    # AFAIK for single end alignments w/ bwa mem, the output SAM/BAM files do not have tags to differentiate between R1 and R2 reads, so the "samtools fastq -0 R1.fastq" command is used to output all reads to a single file
+
     # if read2 was provided by user, extract both read1 and read2 from aligned and unaligned BAMs
     if [[ ! -z "~{read2}" ]]; then
-      echo "Generating FASTQs for aligned and unaligned paired reads"
+      echo "Generating FASTQs for aligned reads"
       samtools fastq \
         -@ ~{cpu} \
         -F 4 \
         -1 ~{samplename}_R1.fastq.gz \
         -2 ~{samplename}_R2.fastq.gz \
         ~{samplename}.sorted.bam
+      echo "Generating FASTQs for unaligned reads"
       # note the lowercase 'f' here is imporant
       samtools fastq \
         -@ ~{cpu} \
@@ -68,23 +79,39 @@ task bwa {
         -2 ~{samplename}_unaligned_R2.fastq.gz \
         ~{samplename}.sorted.unaligned-reads.bam
     else
-      echo "Generating FASTQs for aligned and unaligned single-end reads"
+      echo "Generating FASTQs for aligned single-end reads"
       samtools fastq \
         -@ ~{cpu} \
         -F 4 \
-        -1 ~{samplename}_R1.fastq.gz \
-        ~{samplename}.sorted.bam 
+        -0 ~{samplename}_R1.fastq.gz \
+        ~{samplename}.sorted.bam
+      echo "Generating FASTQs for unaligned single-end reads" 
       # again, lowercase 'f' is important for getting all unaligned reads
       samtools fastq \
         -@ ~{cpu} \
         -f 4 \
-        -1 ~{samplename}_unaligned_R1.fastq.gz \
+        -0 ~{samplename}_unaligned_R1.fastq.gz \
         ~{samplename}.sorted.unaligned-reads.bam
     fi
 
     # index BAMs
     samtools index ~{samplename}.sorted.bam 
     samtools index ~{samplename}.sorted.unaligned-reads.bam
+
+    # count output reads to ensure we are outputting all reads, regardless if the aligned or not
+    # if read2 does exist as input, count both R1 and R2
+    if [[ ! -z "~{read2}" ]]; then
+      echo "output R1_aligned has $(zcat ~{samplename}_R1.fastq.gz | grep -c '^@') reads as input"
+      echo "output R2_aligned has $(zcat ~{samplename}_R2.fastq.gz | grep -c '^@') reads as input"
+      echo
+      echo "output R1_unaligned has $(zcat ~{samplename}_unaligned_R1.fastq.gz | grep -c '^@') reads as input"
+      echo "output R2_unaligned has $(zcat ~{samplename}_unaligned_R2.fastq.gz | grep -c '^@') reads as input"
+    # else = if read2 does not exist as input, only count R1
+    else
+      echo "output R1_aligned has $(zcat ~{samplename}_R1.fastq.gz | grep -c '^@') reads as input"
+      echo
+      echo "output R1_unaligned has $(zcat ~{samplename}_unaligned_R1.fastq.gz | grep -c '^@') reads as input"
+    fi
   >>>
   output {
     String bwa_version = read_string("BWA_VERSION")

From 714931ca344241c283479a2cffcfc3aead1fbf2e Mon Sep 17 00:00:00 2001
From: kapsakcj <kapsakcj@gmail.com>
Date: Fri, 8 Dec 2023 16:12:30 -0500
Subject: [PATCH 6/7] turn on memory retry feature for bwa WDL task

---
 tasks/alignment/task_bwa.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tasks/alignment/task_bwa.wdl b/tasks/alignment/task_bwa.wdl
index cfec703d8..0448a74c5 100644
--- a/tasks/alignment/task_bwa.wdl
+++ b/tasks/alignment/task_bwa.wdl
@@ -132,6 +132,6 @@ task bwa {
     disks:  "local-disk " + disk_size + " SSD"
     disk: disk_size + " GB" # TES
     preemptible: 0
-    #maxRetries: 3
+    maxRetries: 3
   }
 }
\ No newline at end of file

From c107e8135d8159881bd246f61a5a323a006ae0af Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Wed, 7 Feb 2024 17:13:04 +0000
Subject: [PATCH 7/7] add a check on read compression and end of line
 formatting

---
 tasks/alignment/task_bwa.wdl | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/tasks/alignment/task_bwa.wdl b/tasks/alignment/task_bwa.wdl
index 0448a74c5..eeecbf0eb 100644
--- a/tasks/alignment/task_bwa.wdl
+++ b/tasks/alignment/task_bwa.wdl
@@ -27,8 +27,15 @@ task bwa {
       ref_genome="/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta"  
     fi
 
-    echo "input R1 has $(zcat ~{read1} | grep -c '^@') reads as input"
-    echo "input R2 has $(zcat ~{read2} | grep -c '^@') reads as input"
+    # set cat command based on compression
+    if [[ "~{read1}" == *".gz" ]] ; then
+      cat_reads="zcat"
+    else
+      cat_reads="cat"
+    fi
+
+    echo -e "\ninput R1 has $(${cat_reads} ~{read1} | grep -c '^@') reads as input"
+    echo "input R2 has $(${cat_reads} ~{read2} | grep -c '^@') reads as input"
 
     # Map with BWA MEM; pipe to samtools sort to write sorted SAM file
     bwa mem \
@@ -63,7 +70,7 @@ task bwa {
 
     # if read2 was provided by user, extract both read1 and read2 from aligned and unaligned BAMs
     if [[ ! -z "~{read2}" ]]; then
-      echo "Generating FASTQs for aligned reads"
+      echo -e "\nGenerating FASTQs for aligned reads"
       samtools fastq \
         -@ ~{cpu} \
         -F 4 \
@@ -79,13 +86,13 @@ task bwa {
         -2 ~{samplename}_unaligned_R2.fastq.gz \
         ~{samplename}.sorted.unaligned-reads.bam
     else
-      echo "Generating FASTQs for aligned single-end reads"
+      echo -e "\nGenerating FASTQs for aligned single-end reads\n"
       samtools fastq \
         -@ ~{cpu} \
         -F 4 \
         -0 ~{samplename}_R1.fastq.gz \
         ~{samplename}.sorted.bam
-      echo "Generating FASTQs for unaligned single-end reads" 
+      echo -e "Generating FASTQs for unaligned single-end reads\n"
       # again, lowercase 'f' is important for getting all unaligned reads
       samtools fastq \
         -@ ~{cpu} \
@@ -101,7 +108,7 @@ task bwa {
     # count output reads to ensure we are outputting all reads, regardless if the aligned or not
     # if read2 does exist as input, count both R1 and R2
     if [[ ! -z "~{read2}" ]]; then
-      echo "output R1_aligned has $(zcat ~{samplename}_R1.fastq.gz | grep -c '^@') reads as input"
+      echo -e "\noutput R1_aligned has $(zcat ~{samplename}_R1.fastq.gz | grep -c '^@') reads as input"
       echo "output R2_aligned has $(zcat ~{samplename}_R2.fastq.gz | grep -c '^@') reads as input"
       echo
       echo "output R1_unaligned has $(zcat ~{samplename}_unaligned_R1.fastq.gz | grep -c '^@') reads as input"