broadinstitute · gbggrant · Apr 13, 2022 · Apr 4, 2022 · Apr 4, 2022 · Apr 4, 2022
diff --git a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md
@@ -1,3 +1,8 @@
+# 1.0.6
+2022-03-29 (Date of Last Commit)
+
+* Clip adapter bases pre-alignment
+
 # 1.0.5
 2022-03-29 (Date of Last Commit)
 

diff --git a/pipelines/broad/rna_seq/RNAWithUMIsPipeline.changelog.md b/pipelines/broad/rna_seq/RNAWithUMIsPipeline.changelog.md
@@ -1,3 +1,8 @@
+# 1.0.4
+2022-03-29 (Date of Last Commit)
+
+* Clip adapter bases pre-alignment
+
 # 1.0.3
 2022-03-29 (Date of Last Commit)
 

diff --git a/pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl b/pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl
@@ -20,7 +20,7 @@ import "../../../tasks/broad/RNAWithUMIsTasks.wdl" as tasks
 
 workflow RNAWithUMIsPipeline {
 
-  String pipeline_version = "1.0.3"
+  String pipeline_version = "1.0.4"
 
   input {
     File? bam
@@ -30,12 +30,11 @@ workflow RNAWithUMIsPipeline {
     String read2Structure
     String output_basename
 
-    # The following inputs are only required if fastqs are given as input.
-    String? platform
-    String? library_name
-    String? platform_unit
-    String? read_group_name
-    String? sequencing_center = "BI"
+    String platform
+    String library_name
+    String platform_unit
+    String read_group_name
+    String sequencing_center = "BI"
 
     File starIndex
     File gtf
@@ -60,11 +59,11 @@ workflow RNAWithUMIsPipeline {
     starIndex: "TAR file containing genome indices used for the STAR aligner"
     output_basename: "String used as a prefix in workflow output files"
     gtf: "Gene annotation file (GTF) used for the rnaseqc tool"
-    platform: "String used to describe the sequencing platform; only required when using FASTQ files as input"
-    library_name: "String used to describe the library; only required when using FASTQ files as input"
-    platform_unit: "String used to describe the platform unit; only required when using FASTQ files as input"
-    read_group_name: "String used to describe the read group name; only required when using FASTQ files as input"
-    sequencing_center: "String used to describe the sequencing center; only required when using FASTQ files as input; default is set to 'BI'"
+    platform: "String used to describe the sequencing platform"
+    library_name: "String used to describe the library"
+    platform_unit: "String used to describe the platform unit"
+    read_group_name: "String used to describe the read group name"
+    sequencing_center: "String used to describe the sequencing center; default is set to 'BI'"
     ref: "FASTA file used for metric collection with Picard tools"
     refIndex: "FASTA index file used for metric collection with Picard tools"
     refDict: "Dictionary file used for metric collection with Picard tools"
@@ -75,29 +74,18 @@ workflow RNAWithUMIsPipeline {
     population_vcf_index: "Population VCF index file used for contamination estimation"
   }
 
-  call tasks.VerifyPipelineInputs {
-    input:
-      bam = bam,
-      r1_fastq = r1_fastq,
-      r2_fastq = r2_fastq,
-      library_name = library_name,
-      platform = platform,
-      platform_unit = platform_unit,
-      read_group_name = read_group_name,
-      sequencing_center = sequencing_center
-  }
-
-  if (VerifyPipelineInputs.fastq_run) {
+  # Assume FASTQ input whenever r1_fastq is provided; r2_fastq is then expected to be provided as well.
+  if (defined(r1_fastq)) {
     call tasks.FastqToUbam {
       input:
         r1_fastq = select_first([r1_fastq]),
         r2_fastq = select_first([r2_fastq]),
         bam_filename = output_basename,
-        library_name = select_first([library_name]),
-        platform = select_first([platform]),
-        platform_unit = select_first([platform_unit]),
-        read_group_name = select_first([read_group_name]),
-        sequencing_center = select_first([sequencing_center])
+        library_name = library_name,
+        platform = platform,
+        platform_unit = platform_unit,
+        read_group_name = read_group_name,
+        sequencing_center = sequencing_center
     }
   }
 
@@ -110,9 +98,43 @@ workflow RNAWithUMIsPipeline {
       read2Structure = read2Structure
   }
 
-  call tasks.STAR {
+  # Convert SAM to fastq for adapter clipping
+  # This step also removes reads that fail platform/vendor quality checks
+  call tasks.SamToFastq {
     input:
       bam = ExtractUMIs.bam_umis_extracted,
+      output_prefix = output_basename
+  }
+
+  # Adapter clipping: pass both SamToFastq outputs (fastq1 and fastq2) to Fastp
+  call tasks.Fastp {
+    input:
+      fastq1 = SamToFastq.fastq1,
+      fastq2 = SamToFastq.fastq2,
+      output_prefix = output_basename + ".adapter_clipped"
+  }
+
+  # Back to SAM before alignment
+  call tasks.FastqToUbam as FastqToUbamAfterClipping {
+    input:
+        r1_fastq = Fastp.fastq1_clipped,
+        r2_fastq = Fastp.fastq2_clipped,
+        bam_filename = output_basename + ".adapter_clipped",
+        library_name = library_name,
+        platform = platform,
+        platform_unit = platform_unit,
+        read_group_name = read_group_name,
+        sequencing_center = sequencing_center
+  }
+
+  call tasks.FastQC {
+    input:
+      unmapped_bam = FastqToUbamAfterClipping.unmapped_bam
+  }
+
+  call tasks.STAR {
+    input:
+      bam = FastqToUbamAfterClipping.unmapped_bam,
       starIndex = starIndex
   }
 
@@ -125,17 +147,20 @@ workflow RNAWithUMIsPipeline {
   call UmiMD.UMIAwareDuplicateMarking {
     input:
       aligned_bam = STAR.aligned_bam,
-      output_basename = output_basename
+      unaligned_bam = ExtractUMIs.bam_umis_extracted,
+      output_basename = output_basename,
+      remove_duplicates = false
   }
 
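+  # We set remove_duplicates = true for the transcriptome BAM; the genome-aligned call above uses remove_duplicates = false.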
   call UmiMD.UMIAwareDuplicateMarking as UMIAwareDuplicateMarkingTranscriptome {
     input:
       aligned_bam = CopyReadGroupsToHeader.output_bam,
-      output_basename = output_basename + ".transcriptome"
+      unaligned_bam = ExtractUMIs.bam_umis_extracted,
+      output_basename = output_basename + ".transcriptome",
+      remove_duplicates = true
   }
 
-  ### PLACEHOLDER for CROSSCHECK ###
-
   call tasks.GetSampleName {
     input:
       bam = bam_to_use
@@ -208,6 +233,8 @@ workflow RNAWithUMIsPipeline {
     File picard_quality_distribution_pdf = CollectMultipleMetrics.quality_distribution_pdf
     Float contamination = CalculateContamination.contamination
     Float contamination_error = CalculateContamination.contamination_error
+    File fastqc_html_report = FastQC.fastqc_html
+    Float fastqc_adapter_content = FastQC.adapter_content # Adapter content from the FastQC run on the adapter-clipped unmapped BAM
   }
 }