theiagen · cimendes · Feb 21, 2024 · Oct 31, 2023 · Nov 13, 2023 · Nov 13, 2023
@@ -7,7 +7,9 @@ task bwa {
     String samplename
     File? reference_genome
     Int cpu = 6
+    Int memory = 16
     Int disk_size = 100
+    String docker = "us-docker.pkg.dev/general-theiagen/staphb/ivar:1.3.1-titan"
   }
   command <<<
     # date and version control
@@ -25,25 +27,98 @@ task bwa {
       ref_genome="/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta"  
     fi
 
-    # Map with BWA MEM
-    echo "Running bwa mem -t ~{cpu} ${ref_genome} ~{read1} ~{read2} | samtools sort | samtools view -F 4 -o ~{samplename}.sorted.bam "
+    # set cat command based on compression
+    if [[ "~{read1}" == *".gz" ]] ; then
+      cat_reads="zcat"
+    else
+      cat_reads="cat"
+    fi
+
+    echo -e "\ninput R1 has $(${cat_reads} ~{read1} | grep -c '^@') reads as input"
+    echo "input R2 has $(${cat_reads} ~{read2} | grep -c '^@') reads as input"
+
+    # Map with BWA MEM; pipe to samtools sort to write sorted SAM file
     bwa mem \
-    -t ~{cpu} \
-    "${ref_genome}" \
-    ~{read1} ~{read2} |\
-    samtools sort | samtools view -F 4 -o ~{samplename}.sorted.bam
+      -t ~{cpu} \
+      "${ref_genome}" \
+      ~{read1} \
+      ~{read2} | \
+    samtools sort \
+      -@ ~{cpu} - \
+      > ~{samplename}.sorted.sam
+
+    # convert SAM to BAM that only includes aligned reads
+    samtools view \
+      -@ ~{cpu} \
+      -F 4 \
+      -b \
+      -o ~{samplename}.sorted.bam \
+      ~{samplename}.sorted.sam
+
+    # convert SAM to BAM that only includes unaligned reads
+    samtools view \
+      -@ ~{cpu} \
+      -f 4 \
+      -b \
+      -o ~{samplename}.sorted.unaligned-reads.bam \
+      ~{samplename}.sorted.sam
+
+    # see here for "samtools fastq" options: https://www.htslib.org/doc/samtools-fasta.html
+    # TL;DR is that "samtools fastq -1 R1.fastq -2 R2.fastq" works with paired-end inputs and will output R1 and R2 reads to separate files due to tags in the SAM & BAM file
 
+    # AFAIK for single end alignments w/ bwa mem, the output SAM/BAM files do not have tags to differentiate between R1 and R2 reads, so the "samtools fastq -0 R1.fastq" command is used to output all reads to a single file
+
+    # if read2 was provided by user, extract both read1 and read2 from aligned and unaligned BAMs
     if [[ ! -z "~{read2}" ]]; then
-      echo "processing paired reads"
-      samtools fastq -F4 -1 ~{samplename}_R1.fastq.gz -2 ~{samplename}_R2.fastq.gz ~{samplename}.sorted.bam
+      echo -e "\nGenerating FASTQs for aligned reads"
+      samtools fastq \
+        -@ ~{cpu} \
+        -F 4 \
+        -1 ~{samplename}_R1.fastq.gz \
+        -2 ~{samplename}_R2.fastq.gz \
+        ~{samplename}.sorted.bam
+      echo "Generating FASTQs for unaligned reads"
+      # note the lowercase 'f' here is imporant
+      samtools fastq \
+        -@ ~{cpu} \
+        -f 4 \
+        -1 ~{samplename}_unaligned_R1.fastq.gz \
+        -2 ~{samplename}_unaligned_R2.fastq.gz \
+        ~{samplename}.sorted.unaligned-reads.bam
     else
-      echo "processing single-end reads"
-      samtools fastq -F4 ~{samplename}.sorted.bam | gzip > ~{samplename}_R1.fastq.gz
+      echo -e "\nGenerating FASTQs for aligned single-end reads\n"
+      samtools fastq \
+        -@ ~{cpu} \
+        -F 4 \
+        -0 ~{samplename}_R1.fastq.gz \
+        ~{samplename}.sorted.bam
+      echo -e "Generating FASTQs for unaligned single-end reads\n"
+      # again, lowercase 'f' is important for getting all unaligned reads
+      samtools fastq \
+        -@ ~{cpu} \
+        -f 4 \
+        -0 ~{samplename}_unaligned_R1.fastq.gz \
+        ~{samplename}.sorted.unaligned-reads.bam
     fi
 
-
     # index BAMs
-    samtools index ~{samplename}.sorted.bam
+    samtools index ~{samplename}.sorted.bam 
+    samtools index ~{samplename}.sorted.unaligned-reads.bam
+
+    # count output reads to ensure we are outputting all reads, regardless if the aligned or not
+    # if read2 does exist as input, count both R1 and R2
+    if [[ ! -z "~{read2}" ]]; then
+      echo -e "\noutput R1_aligned has $(zcat ~{samplename}_R1.fastq.gz | grep -c '^@') reads as input"
+      echo "output R2_aligned has $(zcat ~{samplename}_R2.fastq.gz | grep -c '^@') reads as input"
+      echo
+      echo "output R1_unaligned has $(zcat ~{samplename}_unaligned_R1.fastq.gz | grep -c '^@') reads as input"
+      echo "output R2_unaligned has $(zcat ~{samplename}_unaligned_R2.fastq.gz | grep -c '^@') reads as input"
+    # else = if read2 does not exist as input, only count R1
+    else
+      echo "output R1_aligned has $(zcat ~{samplename}_R1.fastq.gz | grep -c '^@') reads as input"
+      echo
+      echo "output R1_unaligned has $(zcat ~{samplename}_unaligned_R1.fastq.gz | grep -c '^@') reads as input"
+    fi
   >>>
   output {
     String bwa_version = read_string("BWA_VERSION")
@@ -52,14 +127,18 @@ task bwa {
     File sorted_bai = "${samplename}.sorted.bam.bai"
     File read1_aligned = "~{samplename}_R1.fastq.gz"
     File? read2_aligned = "~{samplename}_R2.fastq.gz"
+    File read1_unaligned = "~{samplename}_unaligned_R1.fastq.gz"
+    File? read2_unaligned = "~{samplename}_unaligned_R2.fastq.gz"
+    File sorted_bam_unaligned = "~{samplename}.sorted.unaligned-reads.bam"
+    File sorted_bam_unaligned_bai = "~{samplename}.sorted.unaligned-reads.bam.bai"
   }
   runtime {
-    docker: "us-docker.pkg.dev/general-theiagen/staphb/ivar:1.3.1-titan"
-    memory: "8 GB"
+    docker: docker
+    memory: memory + " GB"
     cpu: cpu
     disks:  "local-disk " + disk_size + " SSD"
     disk: disk_size + " GB" # TES
     preemptible: 0
-    #maxRetries: 3
+    maxRetries: 3
   }
 }
@@ -342,6 +342,10 @@ workflow theiacov_illumina_pe {
     File? read2_aligned = ivar_consensus.read2_aligned
     String aligned_bam = select_first([ivar_consensus.aligned_bam, ""])
     String aligned_bai = select_first([ivar_consensus.aligned_bai, ""])
+    File? read1_unaligned = ivar_consensus.read1_unaligned
+    File? read2_unaligned = ivar_consensus.read2_unaligned
+    File? sorted_bam_unaligned = ivar_consensus.sorted_bam_unaligned
+    File? sorted_bam_unaligned_bai = ivar_consensus.sorted_bam_unaligned_bai
     # Read Alignment - primer trimming outputs
     Float? primer_trimmed_read_percent = ivar_consensus.primer_trimmed_read_percent
     String? ivar_version_primtrim = ivar_consensus.ivar_version_primtrim

@@ -218,6 +218,9 @@ workflow theiacov_illumina_se {
     String? assembly_method = ivar_consensus.assembly_method_nonflu
     File? aligned_bam = ivar_consensus.aligned_bam
     File? aligned_bai = ivar_consensus.aligned_bai
+    File? read1_unaligned = ivar_consensus.read1_unaligned
+    File? sorted_bam_unaligned = ivar_consensus.sorted_bam_unaligned
+    File? sorted_bam_unaligned_bai = ivar_consensus.sorted_bam_unaligned_bai
     # Read Alignment - primer trimming outputs
     Float? primer_trimmed_read_percent = ivar_consensus.primer_trimmed_read_percent
     String? ivar_version_primtrim = ivar_consensus.ivar_version_primtrim

@@ -75,6 +75,10 @@ workflow ivar_consensus {
     File? read2_aligned = bwa.read2_aligned
     File aligned_bam =  select_first([primer_trim.trim_sorted_bam, bwa.sorted_bam, ""]) # set default values for select_first() to avoid workflow failures
     File aligned_bai = select_first([primer_trim.trim_sorted_bai, bwa.sorted_bai, ""])
+    File read1_unaligned = bwa.read1_unaligned
+    File? read2_unaligned = bwa.read2_unaligned
+    File sorted_bam_unaligned = bwa.sorted_bam_unaligned
+    File sorted_bam_unaligned_bai = bwa.sorted_bam_unaligned_bai
 
     # primer trimming outputs
     Float? primer_trimmed_read_percent = primer_trim.primer_trimmed_read_percent