diff --git a/dockers/broad/rna_seq/fastp/Dockerfile b/dockers/broad/rna_seq/fastp/Dockerfile
new file mode 100644
index 0000000000..adf501195b
--- /dev/null
+++ b/dockers/broad/rna_seq/fastp/Dockerfile
@@ -0,0 +1,36 @@
+FROM debian:bullseye-slim
+
+ARG FASTP_VERSION=0.20.1
+
+ENV TERM=xterm-256color \
+        FASTP_URL=http://opengene.org/fastp/fastp.${FASTP_VERSION} \
+        TINI_VERSION=v0.19.0 
+
+LABEL MAINTAINER="Broad Institute DSDE <dsde-engineering@broadinstitute.org>" \
+        FASTP_VERSION=${FASTP_VERSION}
+
+WORKDIR /usr/gitc
+
+# Install dependencies
+
+RUN set -eux; \
+        apt-get update; \
+        apt-get install -y \
+            wget \
+    ; \
+# Install fastp
+    wget ${FASTP_URL} ;\
+    \
+    mv fastp.${FASTP_VERSION} /usr/local/bin/fastp; \
+    chmod a+x /usr/local/bin/fastp \
+    ; \
+# Install tini 
+    wget https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini -O /sbin/tini; \
+    chmod +x /sbin/tini \
+    ; \
+# Clean up cached files
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Set tini as default entrypoint
+ENTRYPOINT [ "/sbin/tini", "--" ]
+
diff --git a/dockers/broad/rna_seq/fastp/README.md b/dockers/broad/rna_seq/fastp/README.md
new file mode 100644
index 0000000000..2bd0154518
--- /dev/null
+++ b/dockers/broad/rna_seq/fastp/README.md
@@ -0,0 +1,33 @@
+# RNA Seq fastp
+
+## Quick reference
+
+Copy and paste to pull this image
+
+#### `us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500`
+
+- __What is this image:__ This image is a lightweight debian based image for running the fastp tool set within our RNA sequencing pipeline.
+- __What is fastp:__ fastp from OpenGene is a tool designed to provide fast all-in-one preprocessing for FastQ files. See [here](https://github.com/OpenGene/fastp) for more information.
+- __How to see tool version used in image:__ Please see below.
+
+## Versioning
+
+fastp uses the following convention for versioning:
+
+#### `us.gcr.io/broad-gotc-prod/fastp:<image-version>-<fastp-version>-<unix-timestamp>` 
+
+We keep track of all past versions in [docker_versions](docker_versions.tsv) with the last image listed being the currently used version in WARP.
+
+You can see more information about the image, including the tool versions, by running the following command:
+
+```bash
+$ docker pull us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500
+$ docker inspect us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500
+```
+
+## Usage
+
+```bash
+$ docker run --rm -it \
+    us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500 fastp
+```
\ No newline at end of file
diff --git a/dockers/broad/rna_seq/fastp/docker_build.sh b/dockers/broad/rna_seq/fastp/docker_build.sh
new file mode 100755
index 0000000000..e3298dff0a
--- /dev/null
+++ b/dockers/broad/rna_seq/fastp/docker_build.sh
@@ -0,0 +1,71 @@
+set -e
+
+# Update verson when changes to Dockerfile are made
+DOCKER_IMAGE_VERSION=1.0.0
+TIMESTAMP=$(date +"%s")
+DIR=$(cd $(dirname $0) && pwd)
+
+# Registries and tags
+GCR_URL="us.gcr.io/broad-gotc-prod/fastp"
+QUAY_URL="quay.io/broadinstitute/gotc-prod-fastp"
+
+# Fgbio version
+FASTP_VERSION="0.20.1"
+
+# Necessary tools and help text
+TOOLS=(docker gcloud)
+HELP="$(basename "$0") [-h|--help] [-t|--tools] -- script to build the fastp image and push to GCR & Quay
+
+where:
+    -h|--help Show help text
+    -v|--version of fastp to use (default: $FGBIO_VERSION)
+    -t|--tools Show tools needed to run script
+    "
+
+function main(){
+    for t in "${TOOLS[@]}"; do which $t >/dev/null || ok=no; done
+        if [[ $ok == no ]]; then
+            echo "Missing one of the following tools: "
+            for t in "${TOOLS[@]}"; do echo "$t"; done
+            exit 1
+        fi
+
+    while [[ $# -gt 0 ]]
+    do 
+    key="$1"
+    case $key in
+        -v|--version)
+        FASTP_VERSION="$2"
+        shift
+        shift
+        ;;
+        -h|--help)
+        echo "$HELP"
+        exit 0
+        ;;
+        -t|--tools)
+        for t in "${TOOLS[@]}"; do echo $t; done
+        exit 0
+        ;;
+        *)
+        shift
+        ;;
+    esac
+    done
+
+    IMAGE_TAG="$DOCKER_IMAGE_VERSION-$FASTP_VERSION-$TIMESTAMP"
+
+    echo "building and pushing GCR Image - $GCR_URL:$IMAGE_TAG"
+    docker build --no-cache -t "$GCR_URL:$IMAGE_TAG" \
+        --build-arg FASTP_VERSION="$FASTP_VERSION" "$DIR"
+    docker push "$GCR_URL:$IMAGE_TAG"
+
+    #echo "tagging and pushing Quay Image"
+    #docker tag "$GCR_URL:$IMAGE_TAG" "$QUAY_URL:$IMAGE_TAG"
+    #docker push "$QUAY_URL:$IMAGE_TAG"
+
+    echo "$GCR_URL:$IMAGE_TAG" >> "$DIR/docker_versions.tsv"
+    echo "done"
+}
+
+main "$@"
\ No newline at end of file
diff --git a/dockers/broad/rna_seq/fastp/docker_versions.tsv b/dockers/broad/rna_seq/fastp/docker_versions.tsv
new file mode 100644
index 0000000000..5a25da6143
--- /dev/null
+++ b/dockers/broad/rna_seq/fastp/docker_versions.tsv
@@ -0,0 +1 @@
+us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500
diff --git a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md
index 9905f00e31..310f8c39f9 100644
--- a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md
+++ b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md
@@ -1,3 +1,8 @@
+# 1.0.7
+2022-04-12 (Date of Last Commit)
+
+* Clip adapter bases pre-alignment & associated updates for TDR ingest
+
 # 1.0.6
 2022-04-04 (Date of Last Commit)
 
diff --git a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl
index d2f1323e26..3b5c05ccaf 100644
--- a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl
+++ b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl
@@ -7,7 +7,7 @@ import "../../../../tasks/broad/Utilities.wdl" as utils
 
 workflow BroadInternalRNAWithUMIs {
 
-  String pipeline_version = "1.0.6"
+  String pipeline_version = "1.0.7"
 
   input {
     # input needs to be either "hg19" or "hg38"
@@ -157,7 +157,9 @@ workflow BroadInternalRNAWithUMIs {
         picard_fingerprint_detail_metrics = CheckFingerprint.fingerprint_detail_metrics_file,
         unified_metrics = MergeMetrics.unified_metrics,
         contamination = RNAWithUMIs.contamination,
-        contamination_error = RNAWithUMIs.contamination_error
+        contamination_error = RNAWithUMIs.contamination_error,
+        fastqc_html_report = RNAWithUMIs.fastqc_html_report,
+        fastqc_percent_reads_with_adapter = RNAWithUMIs.fastqc_percent_reads_with_adapter
     }
 
     call tasks.updateOutputsInTDR {
@@ -196,5 +198,7 @@ workflow BroadInternalRNAWithUMIs {
     File unified_metrics = MergeMetrics.unified_metrics
     Float contamination = RNAWithUMIs.contamination
     Float contamination_error = RNAWithUMIs.contamination_error
+    File fastqc_html_report = RNAWithUMIs.fastqc_html_report
+    Float fastqc_percent_reads_with_adapter = RNAWithUMIs.fastqc_percent_reads_with_adapter
   }
 }
diff --git a/pipelines/broad/internal/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.fastqs.plumbing.json b/pipelines/broad/internal/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.fastqs.plumbing.json
new file mode 100644
index 0000000000..1c5590a574
--- /dev/null
+++ b/pipelines/broad/internal/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.fastqs.plumbing.json
@@ -0,0 +1,18 @@
+{
+  "BroadInternalRNAWithUMIs.sample_lsid": "broadinstitute.org:bsp.prod.sample:K4Y2X",
+  "BroadInternalRNAWithUMIs.output_basename": "SM-K4Y2X_350ng_.65X_D3",
+
+  "BroadInternalRNAWithUMIs.reference_build": "hg19",
+
+  "BroadInternalRNAWithUMIs.r1_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read1_plumbing.fastq",
+  "BroadInternalRNAWithUMIs.r2_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read2_plumbing.fastq",
+  "BroadInternalRNAWithUMIs.read1Structure": "3M2S146T",
+  "BroadInternalRNAWithUMIs.read2Structure": "3M2S146T",
+  "BroadInternalRNAWithUMIs.library_name": "SM-K4Y2X_350ng_.65X_D3",
+  "BroadInternalRNAWithUMIs.platform": "ILLUMINA",
+  "BroadInternalRNAWithUMIs.platform_unit": "barcode1",
+  "BroadInternalRNAWithUMIs.read_group_name": "RG1",
+
+  "BroadInternalRNAWithUMIs.vault_token_path": "{VAULT_TOKEN_PATH}",
+  "BroadInternalRNAWithUMIs.environment": "{ENV}"
+}
\ No newline at end of file
diff --git a/pipelines/broad/internal/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.fastqs.plumbing.json b/pipelines/broad/internal/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.fastqs.plumbing.json
new file mode 100644
index 0000000000..09ac79d9d3
--- /dev/null
+++ b/pipelines/broad/internal/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.fastqs.plumbing.json
@@ -0,0 +1,18 @@
+{
+  "BroadInternalRNAWithUMIs.sample_lsid": "broadinstitute.org:bsp.prod.sample:K4Y2X",
+  "BroadInternalRNAWithUMIs.output_basename": "SM-K4Y2X_350ng_.65X_D3",
+
+  "BroadInternalRNAWithUMIs.reference_build": "hg38",
+
+  "BroadInternalRNAWithUMIs.r1_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read1_plumbing.fastq",
+  "BroadInternalRNAWithUMIs.r2_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read2_plumbing.fastq",
+  "BroadInternalRNAWithUMIs.read1Structure": "3M2S146T",
+  "BroadInternalRNAWithUMIs.read2Structure": "3M2S146T",
+  "BroadInternalRNAWithUMIs.library_name": "SM-K4Y2X_350ng_.65X_D3",
+  "BroadInternalRNAWithUMIs.platform": "ILLUMINA",
+  "BroadInternalRNAWithUMIs.platform_unit": "barcode1",
+  "BroadInternalRNAWithUMIs.read_group_name": "RG1",
+
+  "BroadInternalRNAWithUMIs.vault_token_path": "{VAULT_TOKEN_PATH}",
+  "BroadInternalRNAWithUMIs.environment": "{ENV}"
+}
\ No newline at end of file
diff --git a/pipelines/broad/rna_seq/RNAWithUMIsPipeline.changelog.md b/pipelines/broad/rna_seq/RNAWithUMIsPipeline.changelog.md
index 29e681384f..43eb1010a5 100644
--- a/pipelines/broad/rna_seq/RNAWithUMIsPipeline.changelog.md
+++ b/pipelines/broad/rna_seq/RNAWithUMIsPipeline.changelog.md
@@ -1,3 +1,8 @@
+# 1.0.4
+2022-04-08 (Date of Last Commit)
+
+* Clip adapter bases pre-alignment
+
 # 1.0.3
 2022-03-29 (Date of Last Commit)
 
diff --git a/pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl b/pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl
index 593ed94bb8..6645098bf2 100644
--- a/pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl
+++ b/pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl
@@ -20,7 +20,7 @@ import "../../../tasks/broad/RNAWithUMIsTasks.wdl" as tasks
 
 workflow RNAWithUMIsPipeline {
 
-  String pipeline_version = "1.0.3"
+  String pipeline_version = "1.0.4"
 
   input {
     File? bam
@@ -30,12 +30,11 @@ workflow RNAWithUMIsPipeline {
     String read2Structure
     String output_basename
 
-    # The following inputs are only required if fastqs are given as input.
-    String? platform
-    String? library_name
-    String? platform_unit
-    String? read_group_name
-    String? sequencing_center = "BI"
+    String platform
+    String library_name
+    String platform_unit
+    String read_group_name
+    String sequencing_center = "BI"
 
     File starIndex
     File gtf
@@ -60,11 +59,11 @@ workflow RNAWithUMIsPipeline {
     starIndex: "TAR file containing genome indices used for the STAR aligner"
     output_basename: "String used as a prefix in workflow output files"
     gtf: "Gene annotation file (GTF) used for the rnaseqc tool"
-    platform: "String used to describe the sequencing platform; only required when using FASTQ files as input"
-    library_name: "String used to describe the library; only required when using FASTQ files as input"
-    platform_unit: "String used to describe the platform unit; only required when using FASTQ files as input"
-    read_group_name: "String used to describe the read group name; only required when using FASTQ files as input"
-    sequencing_center: "String used to describe the sequencing center; only required when using FASTQ files as input; default is set to 'BI'"
+    platform: "String used to describe the sequencing platform"
+    library_name: "String used to describe the library"
+    platform_unit: "String used to describe the platform unit"
+    read_group_name: "String used to describe the read group name"
+    sequencing_center: "String used to describe the sequencing center; default is set to 'BI'"
     ref: "FASTA file used for metric collection with Picard tools"
     refIndex: "FASTA index file used for metric collection with Picard tools"
     refDict: "Dictionary file used for metric collection with Picard tools"
@@ -79,12 +78,7 @@ workflow RNAWithUMIsPipeline {
     input:
       bam = bam,
       r1_fastq = r1_fastq,
-      r2_fastq = r2_fastq,
-      library_name = library_name,
-      platform = platform,
-      platform_unit = platform_unit,
-      read_group_name = read_group_name,
-      sequencing_center = sequencing_center
+      r2_fastq = r2_fastq
   }
 
   if (VerifyPipelineInputs.fastq_run) {
@@ -93,11 +87,11 @@ workflow RNAWithUMIsPipeline {
         r1_fastq = select_first([r1_fastq]),
         r2_fastq = select_first([r2_fastq]),
         bam_filename = output_basename,
-        library_name = select_first([library_name]),
-        platform = select_first([platform]),
-        platform_unit = select_first([platform_unit]),
-        read_group_name = select_first([read_group_name]),
-        sequencing_center = select_first([sequencing_center])
+        library_name = library_name,
+        platform = platform,
+        platform_unit = platform_unit,
+        read_group_name = read_group_name,
+        sequencing_center = sequencing_center
     }
   }
 
@@ -110,9 +104,43 @@ workflow RNAWithUMIsPipeline {
       read2Structure = read2Structure
   }
 
-  call tasks.STAR {
+  # Convert SAM to fastq for adapter clipping
+  # This step also removes reads that fail platform/vendor quality checks
+  call tasks.SamToFastq {
     input:
       bam = ExtractUMIs.bam_umis_extracted,
+      output_prefix = output_basename
+  }
+
+  # Adapter clipping
+  call tasks.Fastp {
+    input:
+      fastq1 = SamToFastq.fastq1,
+      fastq2 = SamToFastq.fastq2,
+      output_prefix = output_basename + ".adapter_clipped"
+  }
+
+  # Back to SAM before alignment
+  call tasks.FastqToUbam as FastqToUbamAfterClipping {
+    input:
+        r1_fastq = Fastp.fastq1_clipped,
+        r2_fastq = Fastp.fastq2_clipped,
+        bam_filename = output_basename + ".adapter_clipped",
+        library_name = library_name,
+        platform = platform,
+        platform_unit = platform_unit,
+        read_group_name = read_group_name,
+        sequencing_center = sequencing_center
+  }
+
+  call tasks.FastQC {
+    input:
+      unmapped_bam = FastqToUbamAfterClipping.unmapped_bam
+  }
+
+  call tasks.STAR {
+    input:
+      bam = FastqToUbamAfterClipping.unmapped_bam,
       starIndex = starIndex
   }
 
@@ -122,19 +150,32 @@ workflow RNAWithUMIsPipeline {
       bam_without_readgroups = STAR.transcriptome_bam
   }
 
+  # Mark duplicate reads in the genome-aligned bam. The output is coordinate-sorted.
   call UmiMD.UMIAwareDuplicateMarking {
     input:
       aligned_bam = STAR.aligned_bam,
-      output_basename = output_basename
+      unaligned_bam = ExtractUMIs.bam_umis_extracted,
+      output_basename = output_basename,
+      remove_duplicates = false,
+      coordinate_sort_output = true
   }
 
+  # Remove, rather than mark, duplicates reads from the transcriptome-aligned bam
+  # The output is query-name-sorted.
   call UmiMD.UMIAwareDuplicateMarking as UMIAwareDuplicateMarkingTranscriptome {
     input:
       aligned_bam = CopyReadGroupsToHeader.output_bam,
-      output_basename = output_basename + ".transcriptome"
+      unaligned_bam = ExtractUMIs.bam_umis_extracted,
+      output_basename = output_basename + ".transcriptome",
+      remove_duplicates = true,
+      coordinate_sort_output = false
   }
 
-  ### PLACEHOLDER for CROSSCHECK ###
+  call tasks.PostprocessTranscriptomeForRSEM {
+    input:
+      prefix = output_basename + ".transcriptome.RSEM",
+      input_bam = UMIAwareDuplicateMarkingTranscriptome.duplicate_marked_bam
+  }
 
   call tasks.GetSampleName {
     input:
@@ -208,6 +249,8 @@ workflow RNAWithUMIsPipeline {
     File picard_quality_distribution_pdf = CollectMultipleMetrics.quality_distribution_pdf
     Float contamination = CalculateContamination.contamination
     Float contamination_error = CalculateContamination.contamination_error
+    File fastqc_html_report = FastQC.fastqc_html
+    Float fastqc_percent_reads_with_adapter = FastQC.fastqc_percent_reads_with_adapter
   }
 }
 
diff --git a/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-HFCB1_100ng_.65X_A3.hg19.plumbing.json b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-HFCB1_100ng_.65X_A3.hg19.plumbing.json
index 0e7643e620..6e299bf072 100644
--- a/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-HFCB1_100ng_.65X_A3.hg19.plumbing.json
+++ b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-HFCB1_100ng_.65X_A3.hg19.plumbing.json
@@ -5,6 +5,11 @@
 
   "RNAWithUMIsPipeline.output_basename": "SM-HFCB1_100ng_.65X_A3",
 
+  "RNAWithUMIsPipeline.library_name": "SM-HFCB1_100ng_.65X_A3",
+  "RNAWithUMIsPipeline.platform": "ILLUMINA",
+  "RNAWithUMIsPipeline.platform_unit": "barcode1",
+  "RNAWithUMIsPipeline.read_group_name": "RG1",
+
   "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/hg19/v0/star/STAR2.7.10a_genome_hg19_noALT_noHLA_noDecoy_v19_oh145.tar.gz",
   "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/hg19/v0/annotation/gencode.v19.genes.v7.collapsed_only.patched_contigs.gtf",
   "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta",
diff --git a/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-HFCB1_100ng_.65X_A3.hg38.plumbing.json b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-HFCB1_100ng_.65X_A3.hg38.plumbing.json
index 516f4c3879..eeee337a52 100644
--- a/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-HFCB1_100ng_.65X_A3.hg38.plumbing.json
+++ b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-HFCB1_100ng_.65X_A3.hg38.plumbing.json
@@ -5,6 +5,11 @@
 
   "RNAWithUMIsPipeline.output_basename": "SM-HFCB1_100ng_.65X_A3",
 
+  "RNAWithUMIsPipeline.library_name": "SM-HFCB1_100ng_.65X_A3",
+  "RNAWithUMIsPipeline.platform": "ILLUMINA",
+  "RNAWithUMIsPipeline.platform_unit": "barcode1",
+  "RNAWithUMIsPipeline.read_group_name": "RG1",
+
   "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/star/STAR2.7.10a_genome_GRCh38_noALT_noHLA_noDecoy_v34_oh145.tar.gz",
   "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode.v34.annotation_collapsed_only.gtf",
   "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta",
diff --git a/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.fastqs.plumbing.json b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.fastqs.plumbing.json
new file mode 100644
index 0000000000..923fa59a61
--- /dev/null
+++ b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.fastqs.plumbing.json
@@ -0,0 +1,24 @@
+{
+  "RNAWithUMIsPipeline.r1_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read1_plumbing.fastq",
+  "RNAWithUMIsPipeline.r2_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read2_plumbing.fastq",
+  "RNAWithUMIsPipeline.read1Structure": "3M2S146T",
+  "RNAWithUMIsPipeline.read2Structure": "3M2S146T",
+
+  "RNAWithUMIsPipeline.output_basename": "SM-K4Y2X_350ng_.65X_D3",
+
+  "RNAWithUMIsPipeline.library_name": "SM-K4Y2X_350ng_.65X_D3",
+  "RNAWithUMIsPipeline.platform": "ILLUMINA",
+  "RNAWithUMIsPipeline.platform_unit": "barcode1",
+  "RNAWithUMIsPipeline.read_group_name": "RG1",
+
+  "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/hg19/v0/star/STAR2.7.10a_genome_hg19_noALT_noHLA_noDecoy_v19_oh145.tar.gz",
+  "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/hg19/v0/annotation/gencode.v19.genes.v7.collapsed_only.patched_contigs.gtf",
+  "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta",
+  "RNAWithUMIsPipeline.refIndex": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta.fai",
+  "RNAWithUMIsPipeline.refDict": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.dict",
+  "RNAWithUMIsPipeline.refFlat": "gs://gcp-public-data--broad-references/hg19/v0/annotation/Homo_sapiens_assembly19.refFlat.txt",
+  "RNAWithUMIsPipeline.ribosomalIntervals": "gs://gcp-public-data--broad-references/hg19/v0/annotation/Homo_sapiens_assembly19.rRNA.interval_list",
+  "RNAWithUMIsPipeline.exonBedFile": "gs://gcp-public-data--broad-references/hg19/v0/annotation/gencode.v19.hg19.insert_size_intervals_geq1000bp.bed",
+  "RNAWithUMIsPipeline.population_vcf": "gs://gatk-best-practices/somatic-b37/small_exac_common_3.vcf",
+  "RNAWithUMIsPipeline.population_vcf_index": "gs://gatk-best-practices/somatic-b37/small_exac_common_3.vcf.idx"
+}
\ No newline at end of file
diff --git a/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.plumbing.json b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.plumbing.json
new file mode 100644
index 0000000000..d5e704de07
--- /dev/null
+++ b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.plumbing.json
@@ -0,0 +1,23 @@
+{
+  "RNAWithUMIsPipeline.bam": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_plumbing.u.bam",
+  "RNAWithUMIsPipeline.read1Structure": "3M2S146T",
+  "RNAWithUMIsPipeline.read2Structure": "3M2S146T",
+
+  "RNAWithUMIsPipeline.output_basename": "SM-K4Y2X_350ng_.65X_D3",
+
+  "RNAWithUMIsPipeline.library_name": "SM-K4Y2X_350ng_.65X_D3",
+  "RNAWithUMIsPipeline.platform": "ILLUMINA",
+  "RNAWithUMIsPipeline.platform_unit": "barcode1",
+  "RNAWithUMIsPipeline.read_group_name": "RG1",
+
+  "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/hg19/v0/star/STAR2.7.10a_genome_hg19_noALT_noHLA_noDecoy_v19_oh145.tar.gz",
+  "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/hg19/v0/annotation/gencode.v19.genes.v7.collapsed_only.patched_contigs.gtf",
+  "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta",
+  "RNAWithUMIsPipeline.refIndex": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta.fai",
+  "RNAWithUMIsPipeline.refDict": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.dict",
+  "RNAWithUMIsPipeline.refFlat": "gs://gcp-public-data--broad-references/hg19/v0/annotation/Homo_sapiens_assembly19.refFlat.txt",
+  "RNAWithUMIsPipeline.ribosomalIntervals": "gs://gcp-public-data--broad-references/hg19/v0/annotation/Homo_sapiens_assembly19.rRNA.interval_list",
+  "RNAWithUMIsPipeline.exonBedFile": "gs://gcp-public-data--broad-references/hg19/v0/annotation/gencode.v19.hg19.insert_size_intervals_geq1000bp.bed",
+  "RNAWithUMIsPipeline.population_vcf": "gs://gatk-best-practices/somatic-b37/small_exac_common_3.vcf",
+  "RNAWithUMIsPipeline.population_vcf_index": "gs://gatk-best-practices/somatic-b37/small_exac_common_3.vcf.idx"
+}
\ No newline at end of file
diff --git a/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.fastqs.plumbing.json b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.fastqs.plumbing.json
new file mode 100644
index 0000000000..a3d23e61cb
--- /dev/null
+++ b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.fastqs.plumbing.json
@@ -0,0 +1,24 @@
+{
+  "RNAWithUMIsPipeline.r1_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read1_plumbing.fastq",
+  "RNAWithUMIsPipeline.r2_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read2_plumbing.fastq",
+  "RNAWithUMIsPipeline.read1Structure": "3M2S146T",
+  "RNAWithUMIsPipeline.read2Structure": "3M2S146T",
+
+  "RNAWithUMIsPipeline.output_basename": "SM-K4Y2X_350ng_.65X_D3",
+
+  "RNAWithUMIsPipeline.library_name": "SM-K4Y2X_350ng_.65X_D3",
+  "RNAWithUMIsPipeline.platform": "ILLUMINA",
+  "RNAWithUMIsPipeline.platform_unit": "barcode1",
+  "RNAWithUMIsPipeline.read_group_name": "RG1",
+
+  "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/star/STAR2.7.10a_genome_GRCh38_noALT_noHLA_noDecoy_v34_oh145.tar.gz",
+  "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode.v34.annotation_collapsed_only.gtf",
+  "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta",
+  "RNAWithUMIsPipeline.refIndex": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta.fai",
+  "RNAWithUMIsPipeline.refDict": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.dict",
+  "RNAWithUMIsPipeline.refFlat": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/hg38_GENCODE_v34_refFlat.txt",
+  "RNAWithUMIsPipeline.ribosomalIntervals": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode_v34_rRNA.interval_list",
+  "RNAWithUMIsPipeline.exonBedFile": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode.v34.GRCh38.insert_size_intervals_geq1000bp.bed",
+  "RNAWithUMIsPipeline.population_vcf": "gs://gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz",
+  "RNAWithUMIsPipeline.population_vcf_index": "gs://gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz.tbi"
+}
\ No newline at end of file
diff --git a/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.plumbing.json b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.plumbing.json
new file mode 100644
index 0000000000..b1287095ea
--- /dev/null
+++ b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.plumbing.json
@@ -0,0 +1,23 @@
+{
+  "RNAWithUMIsPipeline.bam": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_plumbing.u.bam",
+  "RNAWithUMIsPipeline.read1Structure": "3M2S146T",
+  "RNAWithUMIsPipeline.read2Structure": "3M2S146T",
+
+  "RNAWithUMIsPipeline.output_basename": "SM-K4Y2X_350ng_.65X_D3",
+
+  "RNAWithUMIsPipeline.library_name": "SM-K4Y2X_350ng_.65X_D3",
+  "RNAWithUMIsPipeline.platform": "ILLUMINA",
+  "RNAWithUMIsPipeline.platform_unit": "barcode1",
+  "RNAWithUMIsPipeline.read_group_name": "RG1",
+
+  "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/star/STAR2.7.10a_genome_GRCh38_noALT_noHLA_noDecoy_v34_oh145.tar.gz",
+  "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode.v34.annotation_collapsed_only.gtf",
+  "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta",
+  "RNAWithUMIsPipeline.refIndex": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta.fai",
+  "RNAWithUMIsPipeline.refDict": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.dict",
+  "RNAWithUMIsPipeline.refFlat": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/hg38_GENCODE_v34_refFlat.txt",
+  "RNAWithUMIsPipeline.ribosomalIntervals": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode_v34_rRNA.interval_list",
+  "RNAWithUMIsPipeline.exonBedFile": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode.v34.GRCh38.insert_size_intervals_geq1000bp.bed",
+  "RNAWithUMIsPipeline.population_vcf": "gs://gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz",
+  "RNAWithUMIsPipeline.population_vcf_index": "gs://gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz.tbi"
+}
\ No newline at end of file
diff --git a/pipelines/broad/rna_seq/test_inputs/Scientific/SM-HFCB1_100ng_.65X_A3.hg19.json b/pipelines/broad/rna_seq/test_inputs/Scientific/SM-HFCB1_100ng_.65X_A3.hg19.json
index 59f1fbbbbc..a23cc817ba 100644
--- a/pipelines/broad/rna_seq/test_inputs/Scientific/SM-HFCB1_100ng_.65X_A3.hg19.json
+++ b/pipelines/broad/rna_seq/test_inputs/Scientific/SM-HFCB1_100ng_.65X_A3.hg19.json
@@ -5,6 +5,11 @@
 
   "RNAWithUMIsPipeline.output_basename": "SM-HFCB1_100ng_.65X_A3",
 
+  "RNAWithUMIsPipeline.library_name": "SM-HFCB1_100ng_.65X_A3",
+  "RNAWithUMIsPipeline.platform": "ILLUMINA",
+  "RNAWithUMIsPipeline.platform_unit": "barcode1",
+  "RNAWithUMIsPipeline.read_group_name": "RG1",
+
   "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/hg19/v0/star/STAR2.7.10a_genome_hg19_noALT_noHLA_noDecoy_v19_oh145.tar.gz",
   "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/hg19/v0/annotation/gencode.v19.genes.v7.collapsed_only.patched_contigs.gtf",
   "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta",
diff --git a/pipelines/broad/rna_seq/test_inputs/Scientific/SM-HFCB1_100ng_.65X_A3.hg38.json b/pipelines/broad/rna_seq/test_inputs/Scientific/SM-HFCB1_100ng_.65X_A3.hg38.json
index 064038cbe3..7325fc76c6 100644
--- a/pipelines/broad/rna_seq/test_inputs/Scientific/SM-HFCB1_100ng_.65X_A3.hg38.json
+++ b/pipelines/broad/rna_seq/test_inputs/Scientific/SM-HFCB1_100ng_.65X_A3.hg38.json
@@ -5,6 +5,11 @@
 
   "RNAWithUMIsPipeline.output_basename": "SM-HFCB1_100ng_.65X_A3",
 
+  "RNAWithUMIsPipeline.library_name": "SM-HFCB1_100ng_.65X_A3",
+  "RNAWithUMIsPipeline.platform": "ILLUMINA",
+  "RNAWithUMIsPipeline.platform_unit": "barcode1",
+  "RNAWithUMIsPipeline.read_group_name": "RG1",
+
   "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/star/STAR2.7.10a_genome_GRCh38_noALT_noHLA_noDecoy_v34_oh145.tar.gz",
   "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode.v34.annotation_collapsed_only.gtf",
   "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta",
diff --git a/pipelines/broad/rna_seq/test_inputs/Scientific/SM-K4Y2X_350ng_.65X_D3.hg19.json b/pipelines/broad/rna_seq/test_inputs/Scientific/SM-K4Y2X_350ng_.65X_D3.hg19.json
index 50c44e6e88..9c0e314166 100644
--- a/pipelines/broad/rna_seq/test_inputs/Scientific/SM-K4Y2X_350ng_.65X_D3.hg19.json
+++ b/pipelines/broad/rna_seq/test_inputs/Scientific/SM-K4Y2X_350ng_.65X_D3.hg19.json
@@ -5,6 +5,11 @@
 
   "RNAWithUMIsPipeline.output_basename": "SM-K4Y2X_350ng_.65X_D3",
 
+  "RNAWithUMIsPipeline.library_name": "SM-K4Y2X_350ng_.65X_D3",
+  "RNAWithUMIsPipeline.platform": "ILLUMINA",
+  "RNAWithUMIsPipeline.platform_unit": "barcode1",
+  "RNAWithUMIsPipeline.read_group_name": "RG1",
+
   "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/hg19/v0/star/STAR2.7.10a_genome_hg19_noALT_noHLA_noDecoy_v19_oh145.tar.gz",
   "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/hg19/v0/annotation/gencode.v19.genes.v7.collapsed_only.patched_contigs.gtf",
   "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta",
diff --git a/pipelines/broad/rna_seq/test_inputs/Scientific/SM-K4Y2X_350ng_.65X_D3.hg38.json b/pipelines/broad/rna_seq/test_inputs/Scientific/SM-K4Y2X_350ng_.65X_D3.hg38.json
index fe4daf1b28..3c42a6feb5 100644
--- a/pipelines/broad/rna_seq/test_inputs/Scientific/SM-K4Y2X_350ng_.65X_D3.hg38.json
+++ b/pipelines/broad/rna_seq/test_inputs/Scientific/SM-K4Y2X_350ng_.65X_D3.hg38.json
@@ -5,6 +5,11 @@
 
   "RNAWithUMIsPipeline.output_basename": "SM-K4Y2X_350ng_.65X_D3",
 
+  "RNAWithUMIsPipeline.library_name": "SM-K4Y2X_350ng_.65X_D3",
+  "RNAWithUMIsPipeline.platform": "ILLUMINA",
+  "RNAWithUMIsPipeline.platform_unit": "barcode1",
+  "RNAWithUMIsPipeline.read_group_name": "RG1",
+
   "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/star/STAR2.7.10a_genome_GRCh38_noALT_noHLA_noDecoy_v34_oh145.tar.gz",
   "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode.v34.annotation_collapsed_only.gtf",
   "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta",
diff --git a/tasks/broad/RNAWithUMIsTasks.wdl b/tasks/broad/RNAWithUMIsTasks.wdl
index 2772e96cda..7c5c66f91f 100644
--- a/tasks/broad/RNAWithUMIsTasks.wdl
+++ b/tasks/broad/RNAWithUMIsTasks.wdl
@@ -10,13 +10,6 @@ task VerifyPipelineInputs {
     File? r1_fastq
     File? r2_fastq
 
-    # fastq specific field
-    String? platform
-    String? library_name
-    String? platform_unit
-    String? read_group_name
-    String? sequencing_center = "BI"
-
     String docker = "us.gcr.io/broad-dsp-gcr-public/base/python:3.9-debian"
     Int cpu = 1
     Int memory_mb = 2000
@@ -31,19 +24,13 @@ task VerifyPipelineInputs {
     bam = "~{bam}"
     r1_fastq = "~{r1_fastq}"
     r2_fastq = "~{r2_fastq}"
-    platform = "~{platform}"
-    library_name = "~{library_name}"
-    platform_unit = "~{platform_unit}"
-    read_group_name = "~{read_group_name}"
-    sequencing_center = "~{sequencing_center}"
 
     if bam and not r1_fastq and not r2_fastq:
       pass
     elif r1_fastq and r2_fastq and not bam:
-      if platform and library_name and platform_unit and read_group_name and sequencing_center:
-        fastq_flag += 1
-      else:
-        raise ValueError("Invalid Input. Input must be either ubam or pair of fastqs with supplemental data")
+      fastq_flag += 1
+    else:
+      raise ValueError("Invalid Input. Input must be either ubam or a pair of fastqs")
 
     with open("output.txt", "w") as f:
       if fastq_flag == 1:
@@ -101,6 +88,42 @@ task ExtractUMIs {
   }
 }
 
+# Adapter clipping
+# https://github.com/OpenGene/fastp
+task Fastp {
+  input {
+    File fastq1
+    File fastq2
+    String output_prefix
+    File adapter_fasta = "gs://gcp-public-data--broad-references/RNA/resources/Illumina_adapters.fasta"
+
+    String docker = "us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500"
+    Int memory_mb =  "16384"
+    Int disk_size_gb = 5*ceil(size(fastq1, "GiB")) + 128
+  }
+
+  command {
+    fastp --in1 ~{fastq1} --in2 ~{fastq2} --out1 ~{output_prefix}_read1.fastq.gz --out2 ~{output_prefix}_read2.fastq.gz \
+    --disable_quality_filtering \
+    --disable_length_filtering \
+    --adapter_fasta ~{adapter_fasta}
+  }
+  
+
+  runtime {
+    docker: docker
+    memory: "~{memory_mb} MiB"
+    disks: "local-disk ~{disk_size_gb} HDD"
+    preemptible: 0
+  }
+
+  output {
+    File fastq1_clipped = output_prefix + "_read1.fastq.gz"
+    File fastq2_clipped = output_prefix + "_read2.fastq.gz"
+  }
+
+}
+
 task STAR {
   input {
     File bam
@@ -121,7 +144,7 @@ task STAR {
       --runMode alignReads \
       --runThreadN ~{cpu} \
       --genomeDir star_index \
-      --outSAMtype BAM Unsorted  \
+      --outSAMtype BAM Unsorted \
       --readFilesIn ~{bam} \
       --readFilesType SAM PE \
       --readFilesCommand samtools view -h \
@@ -144,7 +167,7 @@ task STAR {
       --chimOutJunctionFormat 0 \
       --twopassMode Basic \
       --quantMode TranscriptomeSAM \
-      --quantTranscriptomeBan Singleend \
+      --quantTranscriptomeBan IndelSoftclipSingleend \
       --alignEndsProtrude 20 ConcordantPair
   >>>
 
@@ -173,7 +196,7 @@ task FastqToUbam {
     String read_group_name
     String sequencing_center
 
-    String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.6"
+    String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11"
     Int cpu = 1
     Int memory_mb = 4000
     Int disk_size_gb = ceil(size(r1_fastq, "GiB")*2.2 + size(r2_fastq, "GiB")*2.2) + 50
@@ -245,7 +268,7 @@ task GetSampleName {
   input {
     File bam
 
-    String docker = "us.gcr.io/broad-gatk/gatk:4.2.5.0"
+    String docker = "us.gcr.io/broad-gatk/gatk:4.2.6.0"
     Int cpu = 1
     Int memory_mb = 1000
     Int disk_size_gb = ceil(2.0 * size(bam, "GiB")) + 10
@@ -322,7 +345,7 @@ task CollectRNASeqMetrics {
     File ref_flat
     File ribosomal_intervals
 
-    String docker =  "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.6"
+    String docker =  "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11"
     Int cpu = 1
     Int memory_mb = 7500
     Int disk_size_gb = ceil(size(input_bam, "GiB") + size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB")) + 20
@@ -361,7 +384,7 @@ task CollectMultipleMetrics {
     File ref_fasta
     File ref_fasta_index
 
-    String docker =  "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.6"
+    String docker =  "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11"
     Int cpu = 1
     Int memory_mb = 7500
     Int disk_size_gb = ceil(size(input_bam, "GiB") + size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB")) + 20
@@ -492,7 +515,7 @@ task SortSamByCoordinate {
     # more disk space.  Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier
     Float sort_sam_disk_multiplier = 4.0
 
-    String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.6"
+    String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11"
     Int cpu = 1
     Int memory_mb = 7500
     Int disk_size_gb = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + 20
@@ -534,7 +557,7 @@ task SortSamByQueryName {
     # more disk space.  Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier
     Float sort_sam_disk_multiplier = 6.0
 
-    String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.6"
+    String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11"
     Int cpu = 1
     Int memory_mb = 7500
     Int disk_size_gb = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + 20
@@ -598,8 +621,9 @@ task MarkDuplicatesUMIAware {
   input {
     File bam
     String output_basename
+    Boolean remove_duplicates
 
-    String docker = "us.gcr.io/broad-gatk/gatk:4.1.9.0"
+    String docker =  "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11"
     Int cpu = 1
     Int memory_mb = 16000
     Int disk_size_gb = ceil(3 * size(bam, "GiB")) + 60
@@ -608,7 +632,12 @@ task MarkDuplicatesUMIAware {
   String output_bam_basename = output_basename + ".duplicate_marked"
 
   command <<<
-    gatk MarkDuplicates -I ~{bam} --READ_ONE_BARCODE_TAG BX -O ~{output_bam_basename}.bam --METRICS_FILE ~{output_basename}.duplicate.metrics --ASSUME_SORT_ORDER queryname
+    java -jar /usr/picard/picard.jar MarkDuplicates \
+    INPUT=~{bam} \
+    OUTPUT=~{output_bam_basename}.bam \
+    METRICS_FILE=~{output_basename}.duplicate.metrics \
+    READ_ONE_BARCODE_TAG=BX \
+    REMOVE_DUPLICATES=~{remove_duplicates}
   >>>
 
   output {
@@ -653,6 +682,8 @@ task formatPipelineOutputs {
     File unified_metrics
     Float contamination
     Float contamination_error
+    String fastqc_html_report
+    String fastqc_percent_reads_with_adapter
 
     Int cpu = 1
     Int memory_mb = 2000
@@ -692,8 +723,10 @@ task formatPipelineOutputs {
     outputs_dict["picard_quality_distribution_pdf_file"]="~{picard_quality_distribution_pdf}"
     outputs_dict["fp_summary_metrics_file"]="~{picard_fingerprint_summary_metrics}"
     outputs_dict["fp_detail_metrics_file"]="~{picard_fingerprint_detail_metrics}"
+    outputs_dict["fastqc_html_report"]="~{fastqc_html_report}"
 
     # truncate to 5 digits
+    outputs_dict["fastqc_percent_reads_with_adapter"]='%.5f'%(~{fastqc_percent_reads_with_adapter})
     outputs_dict["contamination"]='%.5f'%(~{contamination})
     outputs_dict["contamination_error"]='%.5f'%(~{contamination_error})
 
@@ -714,7 +747,7 @@ task formatPipelineOutputs {
   >>>
 
   runtime {
-    docker: "broadinstitute/horsefish:twisttcap_scripts"
+    docker: "broadinstitute/horsefish:tdr_import_v1.1"
     cpu: cpu
     memory: "~{memory_mb} MiB"
     disks: "local-disk ~{disk_size_gb} HDD"
@@ -780,7 +813,7 @@ task CalculateContamination {
     File population_vcf
     File population_vcf_index
     # runtime
-    String docker = "us.gcr.io/broad-gatk/gatk:4.2.5.0"
+    String docker = "us.gcr.io/broad-gatk/gatk:4.2.6.0"
     Int cpu = 1
     Int memory_mb = 8192
     Int disk_size_gb = 256
@@ -813,17 +846,154 @@ task CalculateContamination {
     grep -v ^sample ~{base_name}_contamination.tsv | awk 'BEGIN{FS="\t"}{print($3)}' > contamination_error.txt
   >>>
 
-    runtime {
-      docker: docker
-      cpu: cpu
-      memory: "~{memory_mb} MiB"
-      disks: "local-disk ~{disk_size_gb} HDD"
-    }
+  runtime {
+    docker: docker
+    cpu: cpu
+    memory: "~{memory_mb} MiB"
+    disks: "local-disk ~{disk_size_gb} HDD"
+  }
 
-    output {
-        File pileups = "~{base_name}_pileups.tsv"
-        File contamination_table = "~{base_name}_contamination.tsv"
-        Float contamination = read_float("contamination.txt")
-        Float contamination_error = read_float("contamination_error.txt")
-    }
+  output {
+      File pileups = "~{base_name}_pileups.tsv"
+      File contamination_table = "~{base_name}_contamination.tsv"
+      Float contamination = read_float("contamination.txt")
+      Float contamination_error = read_float("contamination_error.txt")
+  }
+}
+
+task SamToFastq {
+  input {
+    File bam
+    String output_prefix
+    String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11"
+    Int memory_mb = 4096
+    Int disk_size_gb = 3*ceil(size(bam, "GiB")) + 128
+  }
+
+  command {
+    java -jar /usr/picard/picard.jar SamToFastq \
+    I=~{bam} \
+    FASTQ=~{output_prefix}_1.fastq.gz \
+    SECOND_END_FASTQ=~{output_prefix}_2.fastq.gz
+
+  }
+
+  runtime {
+    docker: docker
+    memory: "~{memory_mb} MiB"
+    disks: "local-disk ~{disk_size_gb} HDD"
+  }
+
+  output {
+    File fastq1 = output_prefix + "_1.fastq.gz"
+    File fastq2 = output_prefix + "_2.fastq.gz"
+  }
+}
+
+task FastQC {
+  input {
+    File unmapped_bam
+    Float? mem = 4
+    String docker = "us.gcr.io/tag-public/tag-tools:1.0.0" # sato: This likely needs to be made public
+    Int memory_mb = 4096
+    Int disk_size_gb = 3*ceil(size(unmapped_bam, "GiB")) + 128
+  }
+
+  String bam_basename = basename(unmapped_bam, ".bam")
+
+  command {
+    perl /usr/tag/scripts/FastQC/fastqc ~{unmapped_bam} --extract -o ./
+    mv ~{bam_basename}_fastqc/fastqc_data.txt ~{bam_basename}_fastqc_data.txt
+
+    tail -n 2 ~{bam_basename}_fastqc_data.txt | head -n 1 | cut -f 2 > ~{bam_basename}_adapter_content.txt
+  }
+  
+  runtime {
+    docker: docker
+    memory: "~{memory_mb} MiB"
+    disks: "local-disk ~{disk_size_gb} HDD"
+  }
+
+  output {
+    File fastqc_data = "~{bam_basename}_fastqc_data.txt"
+    File fastqc_html = "~{bam_basename}_fastqc.html"
+    Float fastqc_percent_reads_with_adapter = read_float("~{bam_basename}_adapter_content.txt")
+  }
+}
+
+task TransferReadTags {
+  input {
+    File aligned_bam
+    File ubam
+    String output_basename
+    String docker = "us.gcr.io/broad-gatk/gatk:4.2.6.0"
+    Int memory_mb = 16000
+    Int disk_size_gb = ceil(2 * size(aligned_bam, "GiB")) + ceil(2 * size(ubam, "GiB")) + 128
+  }
+
+  command <<<
+    gatk TransferReadTags \
+    -I ~{aligned_bam} \
+    --unmapped-sam ~{ubam} \
+    -O ~{output_basename}.bam \
+    --read-tags RX
+  >>>
+
+  output {
+    File output_bam = "~{output_basename}.bam"
+  }
+
+  runtime {
+    docker: docker
+    memory: "~{memory_mb} MiB"
+    disks: "local-disk ~{disk_size_gb} HDD"
+  }
+}
+
+task PostprocessTranscriptomeForRSEM {
+  input {
+    String prefix
+    File input_bam # the input must be queryname sorted
+    Int disk_size_gb = ceil(3*size(input_bam,"GiB")) + 128
+    String docker = "us.gcr.io/broad-gatk/gatk:4.2.6.0"
+    Int memory_mb = 16000
+  }
+
+  command {
+    gatk PostProcessReadsForRSEM \
+    -I ~{input_bam} \
+    -O ~{prefix}_RSEM_post_processed.bam
+  }
+
+  output {
+    File output_bam = "~{prefix}_RSEM_post_processed.bam"
+  }
+
+  runtime {
+    docker: docker 
+    disks: "local-disk ~{disk_size_gb} HDD"
+    memory: memory_mb
+  }
+}
+
+task CreateEmptyFile {
+  input {
+    Int disk_size_gb = 128
+    String docker = "us.gcr.io/broad-gatk/gatk:4.2.6.0"
+    Int memory_mb = 4096
+  }
+
+  command {
+    echo "place holder for a bam index file" > empty.txt
+  }
+
+  output {
+    File empty_file = "empty.txt"
+  }
+
+  runtime {
+    docker: docker 
+    disks: "local-disk ~{disk_size_gb} HDD"
+    memory: memory_mb
+  }
 }
\ No newline at end of file
diff --git a/tasks/broad/UMIAwareDuplicateMarking.wdl b/tasks/broad/UMIAwareDuplicateMarking.wdl
index 253159bf12..e28427f164 100644
--- a/tasks/broad/UMIAwareDuplicateMarking.wdl
+++ b/tasks/broad/UMIAwareDuplicateMarking.wdl
@@ -21,18 +21,44 @@ workflow UMIAwareDuplicateMarking {
 
   input {
     File aligned_bam 
+    File unaligned_bam
     String output_basename
+    Boolean remove_duplicates
+    Boolean coordinate_sort_output
   }
 
   parameter_meta {
-    aligned_bam: "Aligned bam sorted by the query (read) name"
+    aligned_bam: "Unsorted aligned bam (the output of STAR in multithread mode is not query-name sorted)"
+    unaligned_bam: "Query-name sorted unaligned bam; contains UMIs in the RX tag"
     output_basename: "Basename for file outputs from this workflow"
+    remove_duplicates: "If true, remove (rather than mark) duplicate reads from the output"
+    coordinate_sort: "If true, the output bam will be coordinate sorted. Else it will be query-name sorted."
+  }
+
+  call tasks.SortSamByQueryName as SortSamByQueryNameAfterAlignment {
+    input:
+      input_bam = aligned_bam,
+      output_bam_basename = output_basename + ".queryname_sorted"
+  }
+
+  # It appears we cannot assume that the unmapped bam/fastqs will be sorted
+  call tasks.SortSamByQueryName as SortSamByQueryNameUnmapped {
+    input:
+      input_bam = unaligned_bam,
+      output_bam_basename = output_basename + ".u.queryname_sorted"
+  }
+
+  call tasks.TransferReadTags {
+    input:
+      aligned_bam = SortSamByQueryNameAfterAlignment.output_bam,
+      ubam = SortSamByQueryNameUnmapped.output_bam,
+      output_basename = output_basename + ".queryname_sorted_with_RX"
   }
 
   # First sort the aligned bam by coordinate, so we can group duplicate sets using UMIs in the next step.
   call tasks.SortSamByCoordinate as SortSamByCoordinateFirstPass {
     input:
-      input_bam = aligned_bam,
+      input_bam = TransferReadTags.output_bam,
       output_bam_basename = output_basename + ".STAR_aligned.coordinate_sorted"
   }
 
@@ -49,7 +75,7 @@ workflow UMIAwareDuplicateMarking {
       output_bam_basename = output_basename + ".grouped_by_UMI"
   }
 
-  call tasks.SortSamByQueryName as SortSamByQueryName {
+  call tasks.SortSamByQueryName as SortSamByQueryNameBeforeDuplicateMarking {
     input:
       input_bam = GroupByUMIs.grouped_bam,
       output_bam_basename = output_basename + ".grouped.queryname_sorted"
@@ -57,19 +83,29 @@ workflow UMIAwareDuplicateMarking {
 
   call tasks.MarkDuplicatesUMIAware as MarkDuplicates {
     input:
-      bam = SortSamByQueryName.output_bam,
-      output_basename = output_basename
+      bam = SortSamByQueryNameBeforeDuplicateMarking.output_bam,
+      output_basename = output_basename,
+      remove_duplicates = remove_duplicates
   }
 
-  call tasks.SortSamByCoordinate as SortSamByCoordinateSecondPass {
-    input:
-      input_bam = MarkDuplicates.duplicate_marked_bam,
-      output_bam_basename = output_basename + ".duplicate_marked.coordinate_sorted"
+  if (coordinate_sort_output){
+    call tasks.SortSamByCoordinate as SortSamByCoordinateSecondPass {
+      input:
+        input_bam = MarkDuplicates.duplicate_marked_bam,
+        output_bam_basename = output_basename + ".duplicate_marked.coordinate_sorted"
+    }
+  }
+
+  # We won't have the index file if the output is query-name sorted.
+  # Until we remove the transcriptome index from the TDR schema,
+  # output a placeholder text file as the bam index as a temporary fix
+  if (!coordinate_sort_output){
+    call tasks.CreateEmptyFile {}
   }
 
   output {
-    File duplicate_marked_bam = SortSamByCoordinateSecondPass.output_bam
-    File duplicate_marked_bam_index = SortSamByCoordinateSecondPass.output_bam_index
+    File duplicate_marked_bam = select_first([SortSamByCoordinateSecondPass.output_bam, MarkDuplicates.duplicate_marked_bam])
+    File duplicate_marked_bam_index = select_first([SortSamByCoordinateSecondPass.output_bam_index, CreateEmptyFile.empty_file])
     File duplicate_metrics = MarkDuplicates.duplicate_metrics
   }
 }