RNA pipeline with adapter clipping (#662)

* enable adapter clipping * Bring fastp tool into warp managed docker images * add fastQC as output to pipeline * Update to public version of Illumina_adapters * add new outputs to TDR, update docker tag for ingest script * sort unmapped just in case and fastp disable length filtering * round fastqc_percent_reads_with_adapter to 5 digits
broadinstitute · Apr 13, 2022 · 000b278 · 000b278
1 parent 956a82d
commit 000b278
Show file tree

Hide file tree

Showing 22 changed files with 644 additions and 80 deletions.
diff --git a/dockers/broad/rna_seq/fastp/Dockerfile b/dockers/broad/rna_seq/fastp/Dockerfile
@@ -0,0 +1,36 @@
+FROM debian:bullseye-slim
+
+ARG FASTP_VERSION=0.20.1
+
+ENV TERM=xterm-256color \
+        FASTP_URL=http://opengene.org/fastp/fastp.${FASTP_VERSION} \
+        TINI_VERSION=v0.19.0 
+
+LABEL MAINTAINER="Broad Institute DSDE <dsde-engineering@broadinstitute.org>" \
+        FASTP_VERSION=${FASTP_VERSION}
+
+WORKDIR /usr/gitc
+
+# Install dependencies
+
+RUN set -eux; \
+        apt-get update; \
+        apt-get install -y \
+            wget \
+    ; \
+# Install fastp
+    wget ${FASTP_URL} ;\
+    \
+    mv fastp.${FASTP_VERSION} /usr/local/bin/fastp; \
+    chmod a+x /usr/local/bin/fastp \
+    ; \
+# Install tini 
+    wget https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini -O /sbin/tini; \
+    chmod +x /sbin/tini \
+    ; \
+# Clean up cached files
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Set tini as default entrypoint
+ENTRYPOINT [ "/sbin/tini", "--" ]
+
diff --git a/dockers/broad/rna_seq/fastp/README.md b/dockers/broad/rna_seq/fastp/README.md
@@ -0,0 +1,33 @@
+# RNA Seq fastp
+
+## Quick reference
+
+Copy and paste to pull this image
+
+#### `us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500`
+
+- __What is this image:__ This image is a lightweight debian based image for running the fastp tool set within our RNA sequencing pipeline.
+- __What is fastp:__ fastp from OpenGene is a tool designed to provide fast all-in-one preprocessing for FastQ files. See [here](https://github.com/OpenGene/fastp) for more information.
+- __How to see tool version used in image:__ Please see below.
+
+## Versioning
+
+fastp uses the following convention for versioning:
+
+#### `us.gcr.io/broad-gotc-prod/fastp:<image-version>-<fastp-version>-<unix-timestamp>` 
+
+We keep track of all past versions in [docker_versions](docker_versions.tsv) with the last image listed being the currently used version in WARP.
+
+You can see more information about the image, including the tool versions, by running the following command:
+
+```bash
+$ docker pull us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500
+$ docker inspect us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500
+```
+
+## Usage
+
+```bash
+$ docker run --rm -it \
+    us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500 fastp
+```
diff --git a/dockers/broad/rna_seq/fastp/docker_build.sh b/dockers/broad/rna_seq/fastp/docker_build.sh
@@ -0,0 +1,71 @@
+set -e
+
+# Update verson when changes to Dockerfile are made
+DOCKER_IMAGE_VERSION=1.0.0
+TIMESTAMP=$(date +"%s")
+DIR=$(cd $(dirname $0) && pwd)
+
+# Registries and tags
+GCR_URL="us.gcr.io/broad-gotc-prod/fastp"
+QUAY_URL="quay.io/broadinstitute/gotc-prod-fastp"
+
+# Fgbio version
+FASTP_VERSION="0.20.1"
+
+# Necessary tools and help text
+TOOLS=(docker gcloud)
+HELP="$(basename "$0") [-h|--help] [-t|--tools] -- script to build the fastp image and push to GCR & Quay
+
+where:
+    -h|--help Show help text
+    -v|--version of fastp to use (default: $FGBIO_VERSION)
+    -t|--tools Show tools needed to run script
+    "
+
+function main(){
+    for t in "${TOOLS[@]}"; do which $t >/dev/null || ok=no; done
+        if [[ $ok == no ]]; then
+            echo "Missing one of the following tools: "
+            for t in "${TOOLS[@]}"; do echo "$t"; done
+            exit 1
+        fi
+
+    while [[ $# -gt 0 ]]
+    do 
+    key="$1"
+    case $key in
+        -v|--version)
+        FASTP_VERSION="$2"
+        shift
+        shift
+        ;;
+        -h|--help)
+        echo "$HELP"
+        exit 0
+        ;;
+        -t|--tools)
+        for t in "${TOOLS[@]}"; do echo $t; done
+        exit 0
+        ;;
+        *)
+        shift
+        ;;
+    esac
+    done
+
+    IMAGE_TAG="$DOCKER_IMAGE_VERSION-$FASTP_VERSION-$TIMESTAMP"
+
+    echo "building and pushing GCR Image - $GCR_URL:$IMAGE_TAG"
+    docker build --no-cache -t "$GCR_URL:$IMAGE_TAG" \
+        --build-arg FASTP_VERSION="$FASTP_VERSION" "$DIR"
+    docker push "$GCR_URL:$IMAGE_TAG"
+
+    #echo "tagging and pushing Quay Image"
+    #docker tag "$GCR_URL:$IMAGE_TAG" "$QUAY_URL:$IMAGE_TAG"
+    #docker push "$QUAY_URL:$IMAGE_TAG"
+
+    echo "$GCR_URL:$IMAGE_TAG" >> "$DIR/docker_versions.tsv"
+    echo "done"
+}
+
+main "$@"
diff --git a/dockers/broad/rna_seq/fastp/docker_versions.tsv b/dockers/broad/rna_seq/fastp/docker_versions.tsv
@@ -0,0 +1 @@
+us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500
diff --git a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md
@@ -1,3 +1,8 @@
+# 1.0.7
+2022-04-12 (Date of Last Commit)
+
+* Clip adapter bases pre-alignment & associated updates for TDR ingest
+
 # 1.0.6
 2022-04-04 (Date of Last Commit)
 

diff --git a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl
@@ -7,7 +7,7 @@ import "../../../../tasks/broad/Utilities.wdl" as utils
 
 workflow BroadInternalRNAWithUMIs {
 
-  String pipeline_version = "1.0.6"
+  String pipeline_version = "1.0.7"
 
   input {
     # input needs to be either "hg19" or "hg38"
@@ -157,7 +157,9 @@ workflow BroadInternalRNAWithUMIs {
         picard_fingerprint_detail_metrics = CheckFingerprint.fingerprint_detail_metrics_file,
         unified_metrics = MergeMetrics.unified_metrics,
         contamination = RNAWithUMIs.contamination,
-        contamination_error = RNAWithUMIs.contamination_error
+        contamination_error = RNAWithUMIs.contamination_error,
+        fastqc_html_report = RNAWithUMIs.fastqc_html_report,
+        fastqc_percent_reads_with_adapter = RNAWithUMIs.fastqc_percent_reads_with_adapter
     }
 
     call tasks.updateOutputsInTDR {
@@ -196,5 +198,7 @@ workflow BroadInternalRNAWithUMIs {
     File unified_metrics = MergeMetrics.unified_metrics
     Float contamination = RNAWithUMIs.contamination
     Float contamination_error = RNAWithUMIs.contamination_error
+    File fastqc_html_report = RNAWithUMIs.fastqc_html_report
+    Float fastqc_percent_reads_with_adapter = RNAWithUMIs.fastqc_percent_reads_with_adapter
   }
 }
diff --git a/...ad/internal/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.fastqs.plumbing.json b/...ad/internal/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.fastqs.plumbing.json
@@ -0,0 +1,18 @@
+{
+  "BroadInternalRNAWithUMIs.sample_lsid": "broadinstitute.org:bsp.prod.sample:K4Y2X",
+  "BroadInternalRNAWithUMIs.output_basename": "SM-K4Y2X_350ng_.65X_D3",
+
+  "BroadInternalRNAWithUMIs.reference_build": "hg19",
+
+  "BroadInternalRNAWithUMIs.r1_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read1_plumbing.fastq",
+  "BroadInternalRNAWithUMIs.r2_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read2_plumbing.fastq",
+  "BroadInternalRNAWithUMIs.read1Structure": "3M2S146T",
+  "BroadInternalRNAWithUMIs.read2Structure": "3M2S146T",
+  "BroadInternalRNAWithUMIs.library_name": "SM-K4Y2X_350ng_.65X_D3",
+  "BroadInternalRNAWithUMIs.platform": "ILLUMINA",
+  "BroadInternalRNAWithUMIs.platform_unit": "barcode1",
+  "BroadInternalRNAWithUMIs.read_group_name": "RG1",
+
+  "BroadInternalRNAWithUMIs.vault_token_path": "{VAULT_TOKEN_PATH}",
+  "BroadInternalRNAWithUMIs.environment": "{ENV}"
+}
diff --git a/...ad/internal/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.fastqs.plumbing.json b/...ad/internal/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.fastqs.plumbing.json
@@ -0,0 +1,18 @@
+{
+  "BroadInternalRNAWithUMIs.sample_lsid": "broadinstitute.org:bsp.prod.sample:K4Y2X",
+  "BroadInternalRNAWithUMIs.output_basename": "SM-K4Y2X_350ng_.65X_D3",
+
+  "BroadInternalRNAWithUMIs.reference_build": "hg38",
+
+  "BroadInternalRNAWithUMIs.r1_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read1_plumbing.fastq",
+  "BroadInternalRNAWithUMIs.r2_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read2_plumbing.fastq",
+  "BroadInternalRNAWithUMIs.read1Structure": "3M2S146T",
+  "BroadInternalRNAWithUMIs.read2Structure": "3M2S146T",
+  "BroadInternalRNAWithUMIs.library_name": "SM-K4Y2X_350ng_.65X_D3",
+  "BroadInternalRNAWithUMIs.platform": "ILLUMINA",
+  "BroadInternalRNAWithUMIs.platform_unit": "barcode1",
+  "BroadInternalRNAWithUMIs.read_group_name": "RG1",
+
+  "BroadInternalRNAWithUMIs.vault_token_path": "{VAULT_TOKEN_PATH}",
+  "BroadInternalRNAWithUMIs.environment": "{ENV}"
+}
diff --git a/pipelines/broad/rna_seq/RNAWithUMIsPipeline.changelog.md b/pipelines/broad/rna_seq/RNAWithUMIsPipeline.changelog.md
@@ -1,3 +1,8 @@
+# 1.0.4
+2022-04-08 (Date of Last Commit)
+
+* Clip adapter bases pre-alignment
+
 # 1.0.3
 2022-03-29 (Date of Last Commit)
 

diff --git a/pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl b/pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl
@@ -20,7 +20,7 @@ import "../../../tasks/broad/RNAWithUMIsTasks.wdl" as tasks
 
 workflow RNAWithUMIsPipeline {
 
-  String pipeline_version = "1.0.3"
+  String pipeline_version = "1.0.4"
 
   input {
     File? bam
@@ -30,12 +30,11 @@ workflow RNAWithUMIsPipeline {
     String read2Structure
     String output_basename
 
-    # The following inputs are only required if fastqs are given as input.
-    String? platform
-    String? library_name
-    String? platform_unit
-    String? read_group_name
-    String? sequencing_center = "BI"
+    String platform
+    String library_name
+    String platform_unit
+    String read_group_name
+    String sequencing_center = "BI"
 
     File starIndex
     File gtf
@@ -60,11 +59,11 @@ workflow RNAWithUMIsPipeline {
     starIndex: "TAR file containing genome indices used for the STAR aligner"
     output_basename: "String used as a prefix in workflow output files"
     gtf: "Gene annotation file (GTF) used for the rnaseqc tool"
-    platform: "String used to describe the sequencing platform; only required when using FASTQ files as input"
-    library_name: "String used to describe the library; only required when using FASTQ files as input"
-    platform_unit: "String used to describe the platform unit; only required when using FASTQ files as input"
-    read_group_name: "String used to describe the read group name; only required when using FASTQ files as input"
-    sequencing_center: "String used to describe the sequencing center; only required when using FASTQ files as input; default is set to 'BI'"
+    platform: "String used to describe the sequencing platform"
+    library_name: "String used to describe the library"
+    platform_unit: "String used to describe the platform unit"
+    read_group_name: "String used to describe the read group name"
+    sequencing_center: "String used to describe the sequencing center; default is set to 'BI'"
     ref: "FASTA file used for metric collection with Picard tools"
     refIndex: "FASTA index file used for metric collection with Picard tools"
     refDict: "Dictionary file used for metric collection with Picard tools"
@@ -79,12 +78,7 @@ workflow RNAWithUMIsPipeline {
     input:
       bam = bam,
       r1_fastq = r1_fastq,
-      r2_fastq = r2_fastq,
-      library_name = library_name,
-      platform = platform,
-      platform_unit = platform_unit,
-      read_group_name = read_group_name,
-      sequencing_center = sequencing_center
+      r2_fastq = r2_fastq
   }
 
   if (VerifyPipelineInputs.fastq_run) {
@@ -93,11 +87,11 @@ workflow RNAWithUMIsPipeline {
         r1_fastq = select_first([r1_fastq]),
         r2_fastq = select_first([r2_fastq]),
         bam_filename = output_basename,
-        library_name = select_first([library_name]),
-        platform = select_first([platform]),
-        platform_unit = select_first([platform_unit]),
-        read_group_name = select_first([read_group_name]),
-        sequencing_center = select_first([sequencing_center])
+        library_name = library_name,
+        platform = platform,
+        platform_unit = platform_unit,
+        read_group_name = read_group_name,
+        sequencing_center = sequencing_center
     }
   }
 
@@ -110,9 +104,43 @@ workflow RNAWithUMIsPipeline {
       read2Structure = read2Structure
   }
 
-  call tasks.STAR {
+  # Convert SAM to fastq for adapter clipping
+  # This step also removes reads that fail platform/vendor quality checks
+  call tasks.SamToFastq {
     input:
       bam = ExtractUMIs.bam_umis_extracted,
+      output_prefix = output_basename
+  }
+
+  # Adapter clipping
+  call tasks.Fastp {
+    input:
+      fastq1 = SamToFastq.fastq1,
+      fastq2 = SamToFastq.fastq2,
+      output_prefix = output_basename + ".adapter_clipped"
+  }
+
+  # Back to SAM before alignment
+  call tasks.FastqToUbam as FastqToUbamAfterClipping {
+    input:
+        r1_fastq = Fastp.fastq1_clipped,
+        r2_fastq = Fastp.fastq2_clipped,
+        bam_filename = output_basename + ".adapter_clipped",
+        library_name = library_name,
+        platform = platform,
+        platform_unit = platform_unit,
+        read_group_name = read_group_name,
+        sequencing_center = sequencing_center
+  }
+
+  call tasks.FastQC {
+    input:
+      unmapped_bam = FastqToUbamAfterClipping.unmapped_bam
+  }
+
+  call tasks.STAR {
+    input:
+      bam = FastqToUbamAfterClipping.unmapped_bam,
       starIndex = starIndex
   }
 
@@ -122,19 +150,32 @@ workflow RNAWithUMIsPipeline {
       bam_without_readgroups = STAR.transcriptome_bam
   }
 
+  # Mark duplicate reads in the genome-aligned bam. The output is coordinate-sorted.
   call UmiMD.UMIAwareDuplicateMarking {
     input:
       aligned_bam = STAR.aligned_bam,
-      output_basename = output_basename
+      unaligned_bam = ExtractUMIs.bam_umis_extracted,
+      output_basename = output_basename,
+      remove_duplicates = false,
+      coordinate_sort_output = true
   }
 
+  # Remove, rather than mark, duplicates reads from the transcriptome-aligned bam
+  # The output is query-name-sorted.
   call UmiMD.UMIAwareDuplicateMarking as UMIAwareDuplicateMarkingTranscriptome {
     input:
       aligned_bam = CopyReadGroupsToHeader.output_bam,
-      output_basename = output_basename + ".transcriptome"
+      unaligned_bam = ExtractUMIs.bam_umis_extracted,
+      output_basename = output_basename + ".transcriptome",
+      remove_duplicates = true,
+      coordinate_sort_output = false
   }
 
-  ### PLACEHOLDER for CROSSCHECK ###
+  call tasks.PostprocessTranscriptomeForRSEM {
+    input:
+      prefix = output_basename + ".transcriptome.RSEM",
+      input_bam = UMIAwareDuplicateMarkingTranscriptome.duplicate_marked_bam
+  }
 
   call tasks.GetSampleName {
     input:
@@ -208,6 +249,8 @@ workflow RNAWithUMIsPipeline {
     File picard_quality_distribution_pdf = CollectMultipleMetrics.quality_distribution_pdf
     Float contamination = CalculateContamination.contamination
     Float contamination_error = CalculateContamination.contamination_error
+    File fastqc_html_report = FastQC.fastqc_html
+    Float fastqc_percent_reads_with_adapter = FastQC.fastqc_percent_reads_with_adapter
   }
 }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500