diff --git a/tasks/alignment/task_bwa.wdl b/tasks/alignment/task_bwa.wdl index ad067ea90..eeecbf0eb 100644 --- a/tasks/alignment/task_bwa.wdl +++ b/tasks/alignment/task_bwa.wdl @@ -7,7 +7,9 @@ task bwa { String samplename File? reference_genome Int cpu = 6 + Int memory = 16 Int disk_size = 100 + String docker = "us-docker.pkg.dev/general-theiagen/staphb/ivar:1.3.1-titan" } command <<< # date and version control @@ -25,25 +27,98 @@ task bwa { ref_genome="/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta" fi - # Map with BWA MEM - echo "Running bwa mem -t ~{cpu} ${ref_genome} ~{read1} ~{read2} | samtools sort | samtools view -F 4 -o ~{samplename}.sorted.bam " + # set cat command based on compression + if [[ "~{read1}" == *".gz" ]] ; then + cat_reads="zcat" + else + cat_reads="cat" + fi + + echo -e "\ninput R1 has $(${cat_reads} ~{read1} | grep -c '^@') reads as input" + echo "input R2 has $(${cat_reads} ~{read2} | grep -c '^@') reads as input" + + # Map with BWA MEM; pipe to samtools sort to write sorted SAM file bwa mem \ - -t ~{cpu} \ - "${ref_genome}" \ - ~{read1} ~{read2} |\ - samtools sort | samtools view -F 4 -o ~{samplename}.sorted.bam + -t ~{cpu} \ + "${ref_genome}" \ + ~{read1} \ + ~{read2} | \ + samtools sort \ + -@ ~{cpu} - \ + > ~{samplename}.sorted.sam + + # convert SAM to BAM that only includes aligned reads + samtools view \ + -@ ~{cpu} \ + -F 4 \ + -b \ + -o ~{samplename}.sorted.bam \ + ~{samplename}.sorted.sam + + # convert SAM to BAM that only includes unaligned reads + samtools view \ + -@ ~{cpu} \ + -f 4 \ + -b \ + -o ~{samplename}.sorted.unaligned-reads.bam \ + ~{samplename}.sorted.sam + + # see here for "samtools fastq" options: https://www.htslib.org/doc/samtools-fasta.html + # TL;DR is that "samtools fastq -1 R1.fastq -2 R2.fastq" works with paired-end inputs and will output R1 and R2 reads to separate files due to tags in the SAM & BAM file + # AFAIK for single end alignments w/ bwa mem, the output SAM/BAM files do not have tags to differentiate between R1 and R2 reads, so the "samtools fastq -0 R1.fastq" command is used to output all reads to a single file + + # if read2 was provided by user, extract both read1 and read2 from aligned and unaligned BAMs if [[ ! -z "~{read2}" ]]; then - echo "processing paired reads" - samtools fastq -F4 -1 ~{samplename}_R1.fastq.gz -2 ~{samplename}_R2.fastq.gz ~{samplename}.sorted.bam + echo -e "\nGenerating FASTQs for aligned reads" + samtools fastq \ + -@ ~{cpu} \ + -F 4 \ + -1 ~{samplename}_R1.fastq.gz \ + -2 ~{samplename}_R2.fastq.gz \ + ~{samplename}.sorted.bam + echo "Generating FASTQs for unaligned reads" + # note the lowercase 'f' here is imporant + samtools fastq \ + -@ ~{cpu} \ + -f 4 \ + -1 ~{samplename}_unaligned_R1.fastq.gz \ + -2 ~{samplename}_unaligned_R2.fastq.gz \ + ~{samplename}.sorted.unaligned-reads.bam else - echo "processing single-end reads" - samtools fastq -F4 ~{samplename}.sorted.bam | gzip > ~{samplename}_R1.fastq.gz + echo -e "\nGenerating FASTQs for aligned single-end reads\n" + samtools fastq \ + -@ ~{cpu} \ + -F 4 \ + -0 ~{samplename}_R1.fastq.gz \ + ~{samplename}.sorted.bam + echo -e "Generating FASTQs for unaligned single-end reads\n" + # again, lowercase 'f' is important for getting all unaligned reads + samtools fastq \ + -@ ~{cpu} \ + -f 4 \ + -0 ~{samplename}_unaligned_R1.fastq.gz \ + ~{samplename}.sorted.unaligned-reads.bam fi - # index BAMs - samtools index ~{samplename}.sorted.bam + samtools index ~{samplename}.sorted.bam + samtools index ~{samplename}.sorted.unaligned-reads.bam + + # count output reads to ensure we are outputting all reads, regardless if the aligned or not + # if read2 does exist as input, count both R1 and R2 + if [[ ! -z "~{read2}" ]]; then + echo -e "\noutput R1_aligned has $(zcat ~{samplename}_R1.fastq.gz | grep -c '^@') reads as input" + echo "output R2_aligned has $(zcat ~{samplename}_R2.fastq.gz | grep -c '^@') reads as input" + echo + echo "output R1_unaligned has $(zcat ~{samplename}_unaligned_R1.fastq.gz | grep -c '^@') reads as input" + echo "output R2_unaligned has $(zcat ~{samplename}_unaligned_R2.fastq.gz | grep -c '^@') reads as input" + # else = if read2 does not exist as input, only count R1 + else + echo "output R1_aligned has $(zcat ~{samplename}_R1.fastq.gz | grep -c '^@') reads as input" + echo + echo "output R1_unaligned has $(zcat ~{samplename}_unaligned_R1.fastq.gz | grep -c '^@') reads as input" + fi >>> output { String bwa_version = read_string("BWA_VERSION") @@ -52,14 +127,18 @@ task bwa { File sorted_bai = "${samplename}.sorted.bam.bai" File read1_aligned = "~{samplename}_R1.fastq.gz" File? read2_aligned = "~{samplename}_R2.fastq.gz" + File read1_unaligned = "~{samplename}_unaligned_R1.fastq.gz" + File? read2_unaligned = "~{samplename}_unaligned_R2.fastq.gz" + File sorted_bam_unaligned = "~{samplename}.sorted.unaligned-reads.bam" + File sorted_bam_unaligned_bai = "~{samplename}.sorted.unaligned-reads.bam.bai" } runtime { - docker: "us-docker.pkg.dev/general-theiagen/staphb/ivar:1.3.1-titan" - memory: "8 GB" + docker: docker + memory: memory + " GB" cpu: cpu disks: "local-disk " + disk_size + " SSD" disk: disk_size + " GB" # TES preemptible: 0 - #maxRetries: 3 + maxRetries: 3 } } \ No newline at end of file diff --git a/workflows/theiacov/wf_theiacov_illumina_pe.wdl b/workflows/theiacov/wf_theiacov_illumina_pe.wdl index 7f9fc2598..36cbe8e89 100644 --- a/workflows/theiacov/wf_theiacov_illumina_pe.wdl +++ b/workflows/theiacov/wf_theiacov_illumina_pe.wdl @@ -342,6 +342,10 @@ workflow theiacov_illumina_pe { File? read2_aligned = ivar_consensus.read2_aligned String aligned_bam = select_first([ivar_consensus.aligned_bam, ""]) String aligned_bai = select_first([ivar_consensus.aligned_bai, ""]) + File? read1_unaligned = ivar_consensus.read1_unaligned + File? read2_unaligned = ivar_consensus.read2_unaligned + File? sorted_bam_unaligned = ivar_consensus.sorted_bam_unaligned + File? sorted_bam_unaligned_bai = ivar_consensus.sorted_bam_unaligned_bai # Read Alignment - primer trimming outputs Float? primer_trimmed_read_percent = ivar_consensus.primer_trimmed_read_percent String? ivar_version_primtrim = ivar_consensus.ivar_version_primtrim diff --git a/workflows/theiacov/wf_theiacov_illumina_se.wdl b/workflows/theiacov/wf_theiacov_illumina_se.wdl index 02c15bb6c..45f9e37a9 100644 --- a/workflows/theiacov/wf_theiacov_illumina_se.wdl +++ b/workflows/theiacov/wf_theiacov_illumina_se.wdl @@ -218,6 +218,9 @@ workflow theiacov_illumina_se { String? assembly_method = ivar_consensus.assembly_method_nonflu File? aligned_bam = ivar_consensus.aligned_bam File? aligned_bai = ivar_consensus.aligned_bai + File? read1_unaligned = ivar_consensus.read1_unaligned + File? sorted_bam_unaligned = ivar_consensus.sorted_bam_unaligned + File? sorted_bam_unaligned_bai = ivar_consensus.sorted_bam_unaligned_bai # Read Alignment - primer trimming outputs Float? primer_trimmed_read_percent = ivar_consensus.primer_trimmed_read_percent String? ivar_version_primtrim = ivar_consensus.ivar_version_primtrim diff --git a/workflows/utilities/wf_ivar_consensus.wdl b/workflows/utilities/wf_ivar_consensus.wdl index e88ead2a1..1a902f901 100644 --- a/workflows/utilities/wf_ivar_consensus.wdl +++ b/workflows/utilities/wf_ivar_consensus.wdl @@ -75,6 +75,10 @@ workflow ivar_consensus { File? read2_aligned = bwa.read2_aligned File aligned_bam = select_first([primer_trim.trim_sorted_bam, bwa.sorted_bam, ""]) # set default values for select_first() to avoid workflow failures File aligned_bai = select_first([primer_trim.trim_sorted_bai, bwa.sorted_bai, ""]) + File read1_unaligned = bwa.read1_unaligned + File? read2_unaligned = bwa.read2_unaligned + File sorted_bam_unaligned = bwa.sorted_bam_unaligned + File sorted_bam_unaligned_bai = bwa.sorted_bam_unaligned_bai # primer trimming outputs Float? primer_trimmed_read_percent = primer_trim.primer_trimmed_read_percent