diff --git a/dockers/broad/rna_seq/fastp/Dockerfile b/dockers/broad/rna_seq/fastp/Dockerfile new file mode 100644 index 0000000000..adf501195b --- /dev/null +++ b/dockers/broad/rna_seq/fastp/Dockerfile @@ -0,0 +1,36 @@ +FROM debian:bullseye-slim + +ARG FASTP_VERSION=0.20.1 + +ENV TERM=xterm-256color \ + FASTP_URL=http://opengene.org/fastp/fastp.${FASTP_VERSION} \ + TINI_VERSION=v0.19.0 + +LABEL MAINTAINER="Broad Institute DSDE " \ + FASTP_VERSION=${FASTP_VERSION} + +WORKDIR /usr/gitc + +# Install dependencies + +RUN set -eux; \ + apt-get update; \ + apt-get install -y \ + wget \ + ; \ +# Install fastp + wget ${FASTP_URL} ;\ + \ + mv fastp.${FASTP_VERSION} /usr/local/bin/fastp; \ + chmod a+x /usr/local/bin/fastp \ + ; \ +# Install tini + wget https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini -O /sbin/tini; \ + chmod +x /sbin/tini \ + ; \ +# Clean up cached files + apt-get clean && rm -rf /var/lib/apt/lists/* + +# Set tini as default entrypoint +ENTRYPOINT [ "/sbin/tini", "--" ] + diff --git a/dockers/broad/rna_seq/fastp/README.md b/dockers/broad/rna_seq/fastp/README.md new file mode 100644 index 0000000000..2bd0154518 --- /dev/null +++ b/dockers/broad/rna_seq/fastp/README.md @@ -0,0 +1,33 @@ +# RNA Seq fastp + +## Quick reference + +Copy and paste to pull this image + +#### `us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500` + +- __What is this image:__ This image is a lightweight debian based image for running the fastp tool set within our RNA sequencing pipeline. +- __What is fastp:__ fastp from OpenGene is a tool designed to provide fast all-in-one preprocessing for FastQ files. See [here](https://github.com/OpenGene/fastp) for more information. +- __How to see tool version used in image:__ Please see below. + +## Versioning + +fastp uses the following convention for versioning: + +#### `us.gcr.io/broad-gotc-prod/fastp:--` + +We keep track of all past versions in [docker_versions](docker_versions.tsv) with the last image listed being the currently used version in WARP. + +You can see more information about the image, including the tool versions, by running the following command: + +```bash +$ docker pull us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500 +$ docker inspect us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500 +``` + +## Usage + +```bash +$ docker run --rm -it \ + us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500 fastp +``` \ No newline at end of file diff --git a/dockers/broad/rna_seq/fastp/docker_build.sh b/dockers/broad/rna_seq/fastp/docker_build.sh new file mode 100755 index 0000000000..e3298dff0a --- /dev/null +++ b/dockers/broad/rna_seq/fastp/docker_build.sh @@ -0,0 +1,71 @@ +set -e + +# Update verson when changes to Dockerfile are made +DOCKER_IMAGE_VERSION=1.0.0 +TIMESTAMP=$(date +"%s") +DIR=$(cd $(dirname $0) && pwd) + +# Registries and tags +GCR_URL="us.gcr.io/broad-gotc-prod/fastp" +QUAY_URL="quay.io/broadinstitute/gotc-prod-fastp" + +# Fgbio version +FASTP_VERSION="0.20.1" + +# Necessary tools and help text +TOOLS=(docker gcloud) +HELP="$(basename "$0") [-h|--help] [-t|--tools] -- script to build the fastp image and push to GCR & Quay + +where: + -h|--help Show help text + -v|--version of fastp to use (default: $FGBIO_VERSION) + -t|--tools Show tools needed to run script + " + +function main(){ + for t in "${TOOLS[@]}"; do which $t >/dev/null || ok=no; done + if [[ $ok == no ]]; then + echo "Missing one of the following tools: " + for t in "${TOOLS[@]}"; do echo "$t"; done + exit 1 + fi + + while [[ $# -gt 0 ]] + do + key="$1" + case $key in + -v|--version) + FASTP_VERSION="$2" + shift + shift + ;; + -h|--help) + echo "$HELP" + exit 0 + ;; + -t|--tools) + for t in "${TOOLS[@]}"; do echo $t; done + exit 0 + ;; + *) + shift + ;; + esac + done + + IMAGE_TAG="$DOCKER_IMAGE_VERSION-$FASTP_VERSION-$TIMESTAMP" + + echo "building and pushing GCR Image - $GCR_URL:$IMAGE_TAG" + docker build --no-cache -t "$GCR_URL:$IMAGE_TAG" \ + --build-arg FASTP_VERSION="$FASTP_VERSION" "$DIR" + docker push "$GCR_URL:$IMAGE_TAG" + + #echo "tagging and pushing Quay Image" + #docker tag "$GCR_URL:$IMAGE_TAG" "$QUAY_URL:$IMAGE_TAG" + #docker push "$QUAY_URL:$IMAGE_TAG" + + echo "$GCR_URL:$IMAGE_TAG" >> "$DIR/docker_versions.tsv" + echo "done" +} + +main "$@" \ No newline at end of file diff --git a/dockers/broad/rna_seq/fastp/docker_versions.tsv b/dockers/broad/rna_seq/fastp/docker_versions.tsv new file mode 100644 index 0000000000..5a25da6143 --- /dev/null +++ b/dockers/broad/rna_seq/fastp/docker_versions.tsv @@ -0,0 +1 @@ +us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500 diff --git a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md index 9905f00e31..310f8c39f9 100644 --- a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md +++ b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md @@ -1,3 +1,8 @@ +# 1.0.7 +2022-04-12 (Date of Last Commit) + +* Clip adapter bases pre-alignment & associated updates for TDR ingest + # 1.0.6 2022-04-04 (Date of Last Commit) diff --git a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl index d2f1323e26..3b5c05ccaf 100644 --- a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl +++ b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl @@ -7,7 +7,7 @@ import "../../../../tasks/broad/Utilities.wdl" as utils workflow BroadInternalRNAWithUMIs { - String pipeline_version = "1.0.6" + String pipeline_version = "1.0.7" input { # input needs to be either "hg19" or "hg38" @@ -157,7 +157,9 @@ workflow BroadInternalRNAWithUMIs { picard_fingerprint_detail_metrics = CheckFingerprint.fingerprint_detail_metrics_file, unified_metrics = MergeMetrics.unified_metrics, contamination = RNAWithUMIs.contamination, - contamination_error = RNAWithUMIs.contamination_error + contamination_error = RNAWithUMIs.contamination_error, + fastqc_html_report = RNAWithUMIs.fastqc_html_report, + fastqc_percent_reads_with_adapter = RNAWithUMIs.fastqc_percent_reads_with_adapter } call tasks.updateOutputsInTDR { @@ -196,5 +198,7 @@ workflow BroadInternalRNAWithUMIs { File unified_metrics = MergeMetrics.unified_metrics Float contamination = RNAWithUMIs.contamination Float contamination_error = RNAWithUMIs.contamination_error + File fastqc_html_report = RNAWithUMIs.fastqc_html_report + Float fastqc_percent_reads_with_adapter = RNAWithUMIs.fastqc_percent_reads_with_adapter } } diff --git a/pipelines/broad/internal/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.fastqs.plumbing.json b/pipelines/broad/internal/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.fastqs.plumbing.json new file mode 100644 index 0000000000..1c5590a574 --- /dev/null +++ b/pipelines/broad/internal/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.fastqs.plumbing.json @@ -0,0 +1,18 @@ +{ + "BroadInternalRNAWithUMIs.sample_lsid": "broadinstitute.org:bsp.prod.sample:K4Y2X", + "BroadInternalRNAWithUMIs.output_basename": "SM-K4Y2X_350ng_.65X_D3", + + "BroadInternalRNAWithUMIs.reference_build": "hg19", + + "BroadInternalRNAWithUMIs.r1_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read1_plumbing.fastq", + "BroadInternalRNAWithUMIs.r2_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read2_plumbing.fastq", + "BroadInternalRNAWithUMIs.read1Structure": "3M2S146T", + "BroadInternalRNAWithUMIs.read2Structure": "3M2S146T", + "BroadInternalRNAWithUMIs.library_name": "SM-K4Y2X_350ng_.65X_D3", + "BroadInternalRNAWithUMIs.platform": "ILLUMINA", + "BroadInternalRNAWithUMIs.platform_unit": "barcode1", + "BroadInternalRNAWithUMIs.read_group_name": "RG1", + + "BroadInternalRNAWithUMIs.vault_token_path": "{VAULT_TOKEN_PATH}", + "BroadInternalRNAWithUMIs.environment": "{ENV}" +} \ No newline at end of file diff --git a/pipelines/broad/internal/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.fastqs.plumbing.json b/pipelines/broad/internal/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.fastqs.plumbing.json new file mode 100644 index 0000000000..09ac79d9d3 --- /dev/null +++ b/pipelines/broad/internal/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.fastqs.plumbing.json @@ -0,0 +1,18 @@ +{ + "BroadInternalRNAWithUMIs.sample_lsid": "broadinstitute.org:bsp.prod.sample:K4Y2X", + "BroadInternalRNAWithUMIs.output_basename": "SM-K4Y2X_350ng_.65X_D3", + + "BroadInternalRNAWithUMIs.reference_build": "hg38", + + "BroadInternalRNAWithUMIs.r1_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read1_plumbing.fastq", + "BroadInternalRNAWithUMIs.r2_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read2_plumbing.fastq", + "BroadInternalRNAWithUMIs.read1Structure": "3M2S146T", + "BroadInternalRNAWithUMIs.read2Structure": "3M2S146T", + "BroadInternalRNAWithUMIs.library_name": "SM-K4Y2X_350ng_.65X_D3", + "BroadInternalRNAWithUMIs.platform": "ILLUMINA", + "BroadInternalRNAWithUMIs.platform_unit": "barcode1", + "BroadInternalRNAWithUMIs.read_group_name": "RG1", + + "BroadInternalRNAWithUMIs.vault_token_path": "{VAULT_TOKEN_PATH}", + "BroadInternalRNAWithUMIs.environment": "{ENV}" +} \ No newline at end of file diff --git a/pipelines/broad/rna_seq/RNAWithUMIsPipeline.changelog.md b/pipelines/broad/rna_seq/RNAWithUMIsPipeline.changelog.md index 29e681384f..43eb1010a5 100644 --- a/pipelines/broad/rna_seq/RNAWithUMIsPipeline.changelog.md +++ b/pipelines/broad/rna_seq/RNAWithUMIsPipeline.changelog.md @@ -1,3 +1,8 @@ +# 1.0.4 +2022-04-08 (Date of Last Commit) + +* Clip adapter bases pre-alignment + # 1.0.3 2022-03-29 (Date of Last Commit) diff --git a/pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl b/pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl index 593ed94bb8..6645098bf2 100644 --- a/pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl +++ b/pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl @@ -20,7 +20,7 @@ import "../../../tasks/broad/RNAWithUMIsTasks.wdl" as tasks workflow RNAWithUMIsPipeline { - String pipeline_version = "1.0.3" + String pipeline_version = "1.0.4" input { File? bam @@ -30,12 +30,11 @@ workflow RNAWithUMIsPipeline { String read2Structure String output_basename - # The following inputs are only required if fastqs are given as input. - String? platform - String? library_name - String? platform_unit - String? read_group_name - String? sequencing_center = "BI" + String platform + String library_name + String platform_unit + String read_group_name + String sequencing_center = "BI" File starIndex File gtf @@ -60,11 +59,11 @@ workflow RNAWithUMIsPipeline { starIndex: "TAR file containing genome indices used for the STAR aligner" output_basename: "String used as a prefix in workflow output files" gtf: "Gene annotation file (GTF) used for the rnaseqc tool" - platform: "String used to describe the sequencing platform; only required when using FASTQ files as input" - library_name: "String used to describe the library; only required when using FASTQ files as input" - platform_unit: "String used to describe the platform unit; only required when using FASTQ files as input" - read_group_name: "String used to describe the read group name; only required when using FASTQ files as input" - sequencing_center: "String used to describe the sequencing center; only required when using FASTQ files as input; default is set to 'BI'" + platform: "String used to describe the sequencing platform" + library_name: "String used to describe the library" + platform_unit: "String used to describe the platform unit" + read_group_name: "String used to describe the read group name" + sequencing_center: "String used to describe the sequencing center; default is set to 'BI'" ref: "FASTA file used for metric collection with Picard tools" refIndex: "FASTA index file used for metric collection with Picard tools" refDict: "Dictionary file used for metric collection with Picard tools" @@ -79,12 +78,7 @@ workflow RNAWithUMIsPipeline { input: bam = bam, r1_fastq = r1_fastq, - r2_fastq = r2_fastq, - library_name = library_name, - platform = platform, - platform_unit = platform_unit, - read_group_name = read_group_name, - sequencing_center = sequencing_center + r2_fastq = r2_fastq } if (VerifyPipelineInputs.fastq_run) { @@ -93,11 +87,11 @@ workflow RNAWithUMIsPipeline { r1_fastq = select_first([r1_fastq]), r2_fastq = select_first([r2_fastq]), bam_filename = output_basename, - library_name = select_first([library_name]), - platform = select_first([platform]), - platform_unit = select_first([platform_unit]), - read_group_name = select_first([read_group_name]), - sequencing_center = select_first([sequencing_center]) + library_name = library_name, + platform = platform, + platform_unit = platform_unit, + read_group_name = read_group_name, + sequencing_center = sequencing_center } } @@ -110,9 +104,43 @@ workflow RNAWithUMIsPipeline { read2Structure = read2Structure } - call tasks.STAR { + # Convert SAM to fastq for adapter clipping + # This step also removes reads that fail platform/vendor quality checks + call tasks.SamToFastq { input: bam = ExtractUMIs.bam_umis_extracted, + output_prefix = output_basename + } + + # Adapter clipping + call tasks.Fastp { + input: + fastq1 = SamToFastq.fastq1, + fastq2 = SamToFastq.fastq2, + output_prefix = output_basename + ".adapter_clipped" + } + + # Back to SAM before alignment + call tasks.FastqToUbam as FastqToUbamAfterClipping { + input: + r1_fastq = Fastp.fastq1_clipped, + r2_fastq = Fastp.fastq2_clipped, + bam_filename = output_basename + ".adapter_clipped", + library_name = library_name, + platform = platform, + platform_unit = platform_unit, + read_group_name = read_group_name, + sequencing_center = sequencing_center + } + + call tasks.FastQC { + input: + unmapped_bam = FastqToUbamAfterClipping.unmapped_bam + } + + call tasks.STAR { + input: + bam = FastqToUbamAfterClipping.unmapped_bam, starIndex = starIndex } @@ -122,19 +150,32 @@ workflow RNAWithUMIsPipeline { bam_without_readgroups = STAR.transcriptome_bam } + # Mark duplicate reads in the genome-aligned bam. The output is coordinate-sorted. call UmiMD.UMIAwareDuplicateMarking { input: aligned_bam = STAR.aligned_bam, - output_basename = output_basename + unaligned_bam = ExtractUMIs.bam_umis_extracted, + output_basename = output_basename, + remove_duplicates = false, + coordinate_sort_output = true } + # Remove, rather than mark, duplicates reads from the transcriptome-aligned bam + # The output is query-name-sorted. call UmiMD.UMIAwareDuplicateMarking as UMIAwareDuplicateMarkingTranscriptome { input: aligned_bam = CopyReadGroupsToHeader.output_bam, - output_basename = output_basename + ".transcriptome" + unaligned_bam = ExtractUMIs.bam_umis_extracted, + output_basename = output_basename + ".transcriptome", + remove_duplicates = true, + coordinate_sort_output = false } - ### PLACEHOLDER for CROSSCHECK ### + call tasks.PostprocessTranscriptomeForRSEM { + input: + prefix = output_basename + ".transcriptome.RSEM", + input_bam = UMIAwareDuplicateMarkingTranscriptome.duplicate_marked_bam + } call tasks.GetSampleName { input: @@ -208,6 +249,8 @@ workflow RNAWithUMIsPipeline { File picard_quality_distribution_pdf = CollectMultipleMetrics.quality_distribution_pdf Float contamination = CalculateContamination.contamination Float contamination_error = CalculateContamination.contamination_error + File fastqc_html_report = FastQC.fastqc_html + Float fastqc_percent_reads_with_adapter = FastQC.fastqc_percent_reads_with_adapter } } diff --git a/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-HFCB1_100ng_.65X_A3.hg19.plumbing.json b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-HFCB1_100ng_.65X_A3.hg19.plumbing.json index 0e7643e620..6e299bf072 100644 --- a/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-HFCB1_100ng_.65X_A3.hg19.plumbing.json +++ b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-HFCB1_100ng_.65X_A3.hg19.plumbing.json @@ -5,6 +5,11 @@ "RNAWithUMIsPipeline.output_basename": "SM-HFCB1_100ng_.65X_A3", + "RNAWithUMIsPipeline.library_name": "SM-HFCB1_100ng_.65X_A3", + "RNAWithUMIsPipeline.platform": "ILLUMINA", + "RNAWithUMIsPipeline.platform_unit": "barcode1", + "RNAWithUMIsPipeline.read_group_name": "RG1", + "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/hg19/v0/star/STAR2.7.10a_genome_hg19_noALT_noHLA_noDecoy_v19_oh145.tar.gz", "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/hg19/v0/annotation/gencode.v19.genes.v7.collapsed_only.patched_contigs.gtf", "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta", diff --git a/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-HFCB1_100ng_.65X_A3.hg38.plumbing.json b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-HFCB1_100ng_.65X_A3.hg38.plumbing.json index 516f4c3879..eeee337a52 100644 --- a/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-HFCB1_100ng_.65X_A3.hg38.plumbing.json +++ b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-HFCB1_100ng_.65X_A3.hg38.plumbing.json @@ -5,6 +5,11 @@ "RNAWithUMIsPipeline.output_basename": "SM-HFCB1_100ng_.65X_A3", + "RNAWithUMIsPipeline.library_name": "SM-HFCB1_100ng_.65X_A3", + "RNAWithUMIsPipeline.platform": "ILLUMINA", + "RNAWithUMIsPipeline.platform_unit": "barcode1", + "RNAWithUMIsPipeline.read_group_name": "RG1", + "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/star/STAR2.7.10a_genome_GRCh38_noALT_noHLA_noDecoy_v34_oh145.tar.gz", "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode.v34.annotation_collapsed_only.gtf", "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta", diff --git a/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.fastqs.plumbing.json b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.fastqs.plumbing.json new file mode 100644 index 0000000000..923fa59a61 --- /dev/null +++ b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.fastqs.plumbing.json @@ -0,0 +1,24 @@ +{ + "RNAWithUMIsPipeline.r1_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read1_plumbing.fastq", + "RNAWithUMIsPipeline.r2_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read2_plumbing.fastq", + "RNAWithUMIsPipeline.read1Structure": "3M2S146T", + "RNAWithUMIsPipeline.read2Structure": "3M2S146T", + + "RNAWithUMIsPipeline.output_basename": "SM-K4Y2X_350ng_.65X_D3", + + "RNAWithUMIsPipeline.library_name": "SM-K4Y2X_350ng_.65X_D3", + "RNAWithUMIsPipeline.platform": "ILLUMINA", + "RNAWithUMIsPipeline.platform_unit": "barcode1", + "RNAWithUMIsPipeline.read_group_name": "RG1", + + "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/hg19/v0/star/STAR2.7.10a_genome_hg19_noALT_noHLA_noDecoy_v19_oh145.tar.gz", + "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/hg19/v0/annotation/gencode.v19.genes.v7.collapsed_only.patched_contigs.gtf", + "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta", + "RNAWithUMIsPipeline.refIndex": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta.fai", + "RNAWithUMIsPipeline.refDict": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.dict", + "RNAWithUMIsPipeline.refFlat": "gs://gcp-public-data--broad-references/hg19/v0/annotation/Homo_sapiens_assembly19.refFlat.txt", + "RNAWithUMIsPipeline.ribosomalIntervals": "gs://gcp-public-data--broad-references/hg19/v0/annotation/Homo_sapiens_assembly19.rRNA.interval_list", + "RNAWithUMIsPipeline.exonBedFile": "gs://gcp-public-data--broad-references/hg19/v0/annotation/gencode.v19.hg19.insert_size_intervals_geq1000bp.bed", + "RNAWithUMIsPipeline.population_vcf": "gs://gatk-best-practices/somatic-b37/small_exac_common_3.vcf", + "RNAWithUMIsPipeline.population_vcf_index": "gs://gatk-best-practices/somatic-b37/small_exac_common_3.vcf.idx" +} \ No newline at end of file diff --git a/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.plumbing.json b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.plumbing.json new file mode 100644 index 0000000000..d5e704de07 --- /dev/null +++ b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg19.plumbing.json @@ -0,0 +1,23 @@ +{ + "RNAWithUMIsPipeline.bam": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_plumbing.u.bam", + "RNAWithUMIsPipeline.read1Structure": "3M2S146T", + "RNAWithUMIsPipeline.read2Structure": "3M2S146T", + + "RNAWithUMIsPipeline.output_basename": "SM-K4Y2X_350ng_.65X_D3", + + "RNAWithUMIsPipeline.library_name": "SM-K4Y2X_350ng_.65X_D3", + "RNAWithUMIsPipeline.platform": "ILLUMINA", + "RNAWithUMIsPipeline.platform_unit": "barcode1", + "RNAWithUMIsPipeline.read_group_name": "RG1", + + "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/hg19/v0/star/STAR2.7.10a_genome_hg19_noALT_noHLA_noDecoy_v19_oh145.tar.gz", + "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/hg19/v0/annotation/gencode.v19.genes.v7.collapsed_only.patched_contigs.gtf", + "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta", + "RNAWithUMIsPipeline.refIndex": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta.fai", + "RNAWithUMIsPipeline.refDict": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.dict", + "RNAWithUMIsPipeline.refFlat": "gs://gcp-public-data--broad-references/hg19/v0/annotation/Homo_sapiens_assembly19.refFlat.txt", + "RNAWithUMIsPipeline.ribosomalIntervals": "gs://gcp-public-data--broad-references/hg19/v0/annotation/Homo_sapiens_assembly19.rRNA.interval_list", + "RNAWithUMIsPipeline.exonBedFile": "gs://gcp-public-data--broad-references/hg19/v0/annotation/gencode.v19.hg19.insert_size_intervals_geq1000bp.bed", + "RNAWithUMIsPipeline.population_vcf": "gs://gatk-best-practices/somatic-b37/small_exac_common_3.vcf", + "RNAWithUMIsPipeline.population_vcf_index": "gs://gatk-best-practices/somatic-b37/small_exac_common_3.vcf.idx" +} \ No newline at end of file diff --git a/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.fastqs.plumbing.json b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.fastqs.plumbing.json new file mode 100644 index 0000000000..a3d23e61cb --- /dev/null +++ b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.fastqs.plumbing.json @@ -0,0 +1,24 @@ +{ + "RNAWithUMIsPipeline.r1_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read1_plumbing.fastq", + "RNAWithUMIsPipeline.r2_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read2_plumbing.fastq", + "RNAWithUMIsPipeline.read1Structure": "3M2S146T", + "RNAWithUMIsPipeline.read2Structure": "3M2S146T", + + "RNAWithUMIsPipeline.output_basename": "SM-K4Y2X_350ng_.65X_D3", + + "RNAWithUMIsPipeline.library_name": "SM-K4Y2X_350ng_.65X_D3", + "RNAWithUMIsPipeline.platform": "ILLUMINA", + "RNAWithUMIsPipeline.platform_unit": "barcode1", + "RNAWithUMIsPipeline.read_group_name": "RG1", + + "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/star/STAR2.7.10a_genome_GRCh38_noALT_noHLA_noDecoy_v34_oh145.tar.gz", + "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode.v34.annotation_collapsed_only.gtf", + "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta", + "RNAWithUMIsPipeline.refIndex": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta.fai", + "RNAWithUMIsPipeline.refDict": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.dict", + "RNAWithUMIsPipeline.refFlat": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/hg38_GENCODE_v34_refFlat.txt", + "RNAWithUMIsPipeline.ribosomalIntervals": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode_v34_rRNA.interval_list", + "RNAWithUMIsPipeline.exonBedFile": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode.v34.GRCh38.insert_size_intervals_geq1000bp.bed", + "RNAWithUMIsPipeline.population_vcf": "gs://gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz", + "RNAWithUMIsPipeline.population_vcf_index": "gs://gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz.tbi" +} \ No newline at end of file diff --git a/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.plumbing.json b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.plumbing.json new file mode 100644 index 0000000000..b1287095ea --- /dev/null +++ b/pipelines/broad/rna_seq/test_inputs/Plumbing/SM-K4Y2X_350ng_.65X_D3.hg38.plumbing.json @@ -0,0 +1,23 @@ +{ + "RNAWithUMIsPipeline.bam": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_plumbing.u.bam", + "RNAWithUMIsPipeline.read1Structure": "3M2S146T", + "RNAWithUMIsPipeline.read2Structure": "3M2S146T", + + "RNAWithUMIsPipeline.output_basename": "SM-K4Y2X_350ng_.65X_D3", + + "RNAWithUMIsPipeline.library_name": "SM-K4Y2X_350ng_.65X_D3", + "RNAWithUMIsPipeline.platform": "ILLUMINA", + "RNAWithUMIsPipeline.platform_unit": "barcode1", + "RNAWithUMIsPipeline.read_group_name": "RG1", + + "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/star/STAR2.7.10a_genome_GRCh38_noALT_noHLA_noDecoy_v34_oh145.tar.gz", + "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode.v34.annotation_collapsed_only.gtf", + "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta", + "RNAWithUMIsPipeline.refIndex": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta.fai", + "RNAWithUMIsPipeline.refDict": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.dict", + "RNAWithUMIsPipeline.refFlat": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/hg38_GENCODE_v34_refFlat.txt", + "RNAWithUMIsPipeline.ribosomalIntervals": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode_v34_rRNA.interval_list", + "RNAWithUMIsPipeline.exonBedFile": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode.v34.GRCh38.insert_size_intervals_geq1000bp.bed", + "RNAWithUMIsPipeline.population_vcf": "gs://gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz", + "RNAWithUMIsPipeline.population_vcf_index": "gs://gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz.tbi" +} \ No newline at end of file diff --git a/pipelines/broad/rna_seq/test_inputs/Scientific/SM-HFCB1_100ng_.65X_A3.hg19.json b/pipelines/broad/rna_seq/test_inputs/Scientific/SM-HFCB1_100ng_.65X_A3.hg19.json index 59f1fbbbbc..a23cc817ba 100644 --- a/pipelines/broad/rna_seq/test_inputs/Scientific/SM-HFCB1_100ng_.65X_A3.hg19.json +++ b/pipelines/broad/rna_seq/test_inputs/Scientific/SM-HFCB1_100ng_.65X_A3.hg19.json @@ -5,6 +5,11 @@ "RNAWithUMIsPipeline.output_basename": "SM-HFCB1_100ng_.65X_A3", + "RNAWithUMIsPipeline.library_name": "SM-HFCB1_100ng_.65X_A3", + "RNAWithUMIsPipeline.platform": "ILLUMINA", + "RNAWithUMIsPipeline.platform_unit": "barcode1", + "RNAWithUMIsPipeline.read_group_name": "RG1", + "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/hg19/v0/star/STAR2.7.10a_genome_hg19_noALT_noHLA_noDecoy_v19_oh145.tar.gz", "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/hg19/v0/annotation/gencode.v19.genes.v7.collapsed_only.patched_contigs.gtf", "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta", diff --git a/pipelines/broad/rna_seq/test_inputs/Scientific/SM-HFCB1_100ng_.65X_A3.hg38.json b/pipelines/broad/rna_seq/test_inputs/Scientific/SM-HFCB1_100ng_.65X_A3.hg38.json index 064038cbe3..7325fc76c6 100644 --- a/pipelines/broad/rna_seq/test_inputs/Scientific/SM-HFCB1_100ng_.65X_A3.hg38.json +++ b/pipelines/broad/rna_seq/test_inputs/Scientific/SM-HFCB1_100ng_.65X_A3.hg38.json @@ -5,6 +5,11 @@ "RNAWithUMIsPipeline.output_basename": "SM-HFCB1_100ng_.65X_A3", + "RNAWithUMIsPipeline.library_name": "SM-HFCB1_100ng_.65X_A3", + "RNAWithUMIsPipeline.platform": "ILLUMINA", + "RNAWithUMIsPipeline.platform_unit": "barcode1", + "RNAWithUMIsPipeline.read_group_name": "RG1", + "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/star/STAR2.7.10a_genome_GRCh38_noALT_noHLA_noDecoy_v34_oh145.tar.gz", "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode.v34.annotation_collapsed_only.gtf", "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta", diff --git a/pipelines/broad/rna_seq/test_inputs/Scientific/SM-K4Y2X_350ng_.65X_D3.hg19.json b/pipelines/broad/rna_seq/test_inputs/Scientific/SM-K4Y2X_350ng_.65X_D3.hg19.json index 50c44e6e88..9c0e314166 100644 --- a/pipelines/broad/rna_seq/test_inputs/Scientific/SM-K4Y2X_350ng_.65X_D3.hg19.json +++ b/pipelines/broad/rna_seq/test_inputs/Scientific/SM-K4Y2X_350ng_.65X_D3.hg19.json @@ -5,6 +5,11 @@ "RNAWithUMIsPipeline.output_basename": "SM-K4Y2X_350ng_.65X_D3", + "RNAWithUMIsPipeline.library_name": "SM-K4Y2X_350ng_.65X_D3", + "RNAWithUMIsPipeline.platform": "ILLUMINA", + "RNAWithUMIsPipeline.platform_unit": "barcode1", + "RNAWithUMIsPipeline.read_group_name": "RG1", + "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/hg19/v0/star/STAR2.7.10a_genome_hg19_noALT_noHLA_noDecoy_v19_oh145.tar.gz", "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/hg19/v0/annotation/gencode.v19.genes.v7.collapsed_only.patched_contigs.gtf", "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta", diff --git a/pipelines/broad/rna_seq/test_inputs/Scientific/SM-K4Y2X_350ng_.65X_D3.hg38.json b/pipelines/broad/rna_seq/test_inputs/Scientific/SM-K4Y2X_350ng_.65X_D3.hg38.json index fe4daf1b28..3c42a6feb5 100644 --- a/pipelines/broad/rna_seq/test_inputs/Scientific/SM-K4Y2X_350ng_.65X_D3.hg38.json +++ b/pipelines/broad/rna_seq/test_inputs/Scientific/SM-K4Y2X_350ng_.65X_D3.hg38.json @@ -5,6 +5,11 @@ "RNAWithUMIsPipeline.output_basename": "SM-K4Y2X_350ng_.65X_D3", + "RNAWithUMIsPipeline.library_name": "SM-K4Y2X_350ng_.65X_D3", + "RNAWithUMIsPipeline.platform": "ILLUMINA", + "RNAWithUMIsPipeline.platform_unit": "barcode1", + "RNAWithUMIsPipeline.read_group_name": "RG1", + "RNAWithUMIsPipeline.starIndex": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/star/STAR2.7.10a_genome_GRCh38_noALT_noHLA_noDecoy_v34_oh145.tar.gz", "RNAWithUMIsPipeline.gtf": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode.v34.annotation_collapsed_only.gtf", "RNAWithUMIsPipeline.ref": "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta", diff --git a/tasks/broad/RNAWithUMIsTasks.wdl b/tasks/broad/RNAWithUMIsTasks.wdl index 2772e96cda..7c5c66f91f 100644 --- a/tasks/broad/RNAWithUMIsTasks.wdl +++ b/tasks/broad/RNAWithUMIsTasks.wdl @@ -10,13 +10,6 @@ task VerifyPipelineInputs { File? r1_fastq File? r2_fastq - # fastq specific field - String? platform - String? library_name - String? platform_unit - String? read_group_name - String? sequencing_center = "BI" - String docker = "us.gcr.io/broad-dsp-gcr-public/base/python:3.9-debian" Int cpu = 1 Int memory_mb = 2000 @@ -31,19 +24,13 @@ task VerifyPipelineInputs { bam = "~{bam}" r1_fastq = "~{r1_fastq}" r2_fastq = "~{r2_fastq}" - platform = "~{platform}" - library_name = "~{library_name}" - platform_unit = "~{platform_unit}" - read_group_name = "~{read_group_name}" - sequencing_center = "~{sequencing_center}" if bam and not r1_fastq and not r2_fastq: pass elif r1_fastq and r2_fastq and not bam: - if platform and library_name and platform_unit and read_group_name and sequencing_center: - fastq_flag += 1 - else: - raise ValueError("Invalid Input. Input must be either ubam or pair of fastqs with supplemental data") + fastq_flag += 1 + else: + raise ValueError("Invalid Input. Input must be either ubam or a pair of fastqs") with open("output.txt", "w") as f: if fastq_flag == 1: @@ -101,6 +88,42 @@ task ExtractUMIs { } } +# Adapter clipping +# https://github.com/OpenGene/fastp +task Fastp { + input { + File fastq1 + File fastq2 + String output_prefix + File adapter_fasta = "gs://gcp-public-data--broad-references/RNA/resources/Illumina_adapters.fasta" + + String docker = "us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500" + Int memory_mb = "16384" + Int disk_size_gb = 5*ceil(size(fastq1, "GiB")) + 128 + } + + command { + fastp --in1 ~{fastq1} --in2 ~{fastq2} --out1 ~{output_prefix}_read1.fastq.gz --out2 ~{output_prefix}_read2.fastq.gz \ + --disable_quality_filtering \ + --disable_length_filtering \ + --adapter_fasta ~{adapter_fasta} + } + + + runtime { + docker: docker + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + preemptible: 0 + } + + output { + File fastq1_clipped = output_prefix + "_read1.fastq.gz" + File fastq2_clipped = output_prefix + "_read2.fastq.gz" + } + +} + task STAR { input { File bam @@ -121,7 +144,7 @@ task STAR { --runMode alignReads \ --runThreadN ~{cpu} \ --genomeDir star_index \ - --outSAMtype BAM Unsorted \ + --outSAMtype BAM Unsorted \ --readFilesIn ~{bam} \ --readFilesType SAM PE \ --readFilesCommand samtools view -h \ @@ -144,7 +167,7 @@ task STAR { --chimOutJunctionFormat 0 \ --twopassMode Basic \ --quantMode TranscriptomeSAM \ - --quantTranscriptomeBan Singleend \ + --quantTranscriptomeBan IndelSoftclipSingleend \ --alignEndsProtrude 20 ConcordantPair >>> @@ -173,7 +196,7 @@ task FastqToUbam { String read_group_name String sequencing_center - String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.6" + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11" Int cpu = 1 Int memory_mb = 4000 Int disk_size_gb = ceil(size(r1_fastq, "GiB")*2.2 + size(r2_fastq, "GiB")*2.2) + 50 @@ -245,7 +268,7 @@ task GetSampleName { input { File bam - String docker = "us.gcr.io/broad-gatk/gatk:4.2.5.0" + String docker = "us.gcr.io/broad-gatk/gatk:4.2.6.0" Int cpu = 1 Int memory_mb = 1000 Int disk_size_gb = ceil(2.0 * size(bam, "GiB")) + 10 @@ -322,7 +345,7 @@ task CollectRNASeqMetrics { File ref_flat File ribosomal_intervals - String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.6" + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11" Int cpu = 1 Int memory_mb = 7500 Int disk_size_gb = ceil(size(input_bam, "GiB") + size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB")) + 20 @@ -361,7 +384,7 @@ task CollectMultipleMetrics { File ref_fasta File ref_fasta_index - String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.6" + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11" Int cpu = 1 Int memory_mb = 7500 Int disk_size_gb = ceil(size(input_bam, "GiB") + size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB")) + 20 @@ -492,7 +515,7 @@ task SortSamByCoordinate { # more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier Float sort_sam_disk_multiplier = 4.0 - String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.6" + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11" Int cpu = 1 Int memory_mb = 7500 Int disk_size_gb = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + 20 @@ -534,7 +557,7 @@ task SortSamByQueryName { # more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier Float sort_sam_disk_multiplier = 6.0 - String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.6" + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11" Int cpu = 1 Int memory_mb = 7500 Int disk_size_gb = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + 20 @@ -598,8 +621,9 @@ task MarkDuplicatesUMIAware { input { File bam String output_basename + Boolean remove_duplicates - String docker = "us.gcr.io/broad-gatk/gatk:4.1.9.0" + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11" Int cpu = 1 Int memory_mb = 16000 Int disk_size_gb = ceil(3 * size(bam, "GiB")) + 60 @@ -608,7 +632,12 @@ task MarkDuplicatesUMIAware { String output_bam_basename = output_basename + ".duplicate_marked" command <<< - gatk MarkDuplicates -I ~{bam} --READ_ONE_BARCODE_TAG BX -O ~{output_bam_basename}.bam --METRICS_FILE ~{output_basename}.duplicate.metrics --ASSUME_SORT_ORDER queryname + java -jar /usr/picard/picard.jar MarkDuplicates \ + INPUT=~{bam} \ + OUTPUT=~{output_bam_basename}.bam \ + METRICS_FILE=~{output_basename}.duplicate.metrics \ + READ_ONE_BARCODE_TAG=BX \ + REMOVE_DUPLICATES=~{remove_duplicates} >>> output { @@ -653,6 +682,8 @@ task formatPipelineOutputs { File unified_metrics Float contamination Float contamination_error + String fastqc_html_report + String fastqc_percent_reads_with_adapter Int cpu = 1 Int memory_mb = 2000 @@ -692,8 +723,10 @@ task formatPipelineOutputs { outputs_dict["picard_quality_distribution_pdf_file"]="~{picard_quality_distribution_pdf}" outputs_dict["fp_summary_metrics_file"]="~{picard_fingerprint_summary_metrics}" outputs_dict["fp_detail_metrics_file"]="~{picard_fingerprint_detail_metrics}" + outputs_dict["fastqc_html_report"]="~{fastqc_html_report}" # truncate to 5 digits + outputs_dict["fastqc_percent_reads_with_adapter"]='%.5f'%(~{fastqc_percent_reads_with_adapter}) outputs_dict["contamination"]='%.5f'%(~{contamination}) outputs_dict["contamination_error"]='%.5f'%(~{contamination_error}) @@ -714,7 +747,7 @@ task formatPipelineOutputs { >>> runtime { - docker: "broadinstitute/horsefish:twisttcap_scripts" + docker: "broadinstitute/horsefish:tdr_import_v1.1" cpu: cpu memory: "~{memory_mb} MiB" disks: "local-disk ~{disk_size_gb} HDD" @@ -780,7 +813,7 @@ task CalculateContamination { File population_vcf File population_vcf_index # runtime - String docker = "us.gcr.io/broad-gatk/gatk:4.2.5.0" + String docker = "us.gcr.io/broad-gatk/gatk:4.2.6.0" Int cpu = 1 Int memory_mb = 8192 Int disk_size_gb = 256 @@ -813,17 +846,154 @@ task CalculateContamination { grep -v ^sample ~{base_name}_contamination.tsv | awk 'BEGIN{FS="\t"}{print($3)}' > contamination_error.txt >>> - runtime { - docker: docker - cpu: cpu - memory: "~{memory_mb} MiB" - disks: "local-disk ~{disk_size_gb} HDD" - } + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } - output { - File pileups = "~{base_name}_pileups.tsv" - File contamination_table = "~{base_name}_contamination.tsv" - Float contamination = read_float("contamination.txt") - Float contamination_error = read_float("contamination_error.txt") - } + output { + File pileups = "~{base_name}_pileups.tsv" + File contamination_table = "~{base_name}_contamination.tsv" + Float contamination = read_float("contamination.txt") + Float contamination_error = read_float("contamination_error.txt") + } +} + +task SamToFastq { + input { + File bam + String output_prefix + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11" + Int memory_mb = 4096 + Int disk_size_gb = 3*ceil(size(bam, "GiB")) + 128 + } + + command { + java -jar /usr/picard/picard.jar SamToFastq \ + I=~{bam} \ + FASTQ=~{output_prefix}_1.fastq.gz \ + SECOND_END_FASTQ=~{output_prefix}_2.fastq.gz + + } + + runtime { + docker: docker + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File fastq1 = output_prefix + "_1.fastq.gz" + File fastq2 = output_prefix + "_2.fastq.gz" + } +} + +task FastQC { + input { + File unmapped_bam + Float? mem = 4 + String docker = "us.gcr.io/tag-public/tag-tools:1.0.0" # sato: This likely needs to be made public + Int memory_mb = 4096 + Int disk_size_gb = 3*ceil(size(unmapped_bam, "GiB")) + 128 + } + + String bam_basename = basename(unmapped_bam, ".bam") + + command { + perl /usr/tag/scripts/FastQC/fastqc ~{unmapped_bam} --extract -o ./ + mv ~{bam_basename}_fastqc/fastqc_data.txt ~{bam_basename}_fastqc_data.txt + + tail -n 2 ~{bam_basename}_fastqc_data.txt | head -n 1 | cut -f 2 > ~{bam_basename}_adapter_content.txt + } + + runtime { + docker: docker + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File fastqc_data = "~{bam_basename}_fastqc_data.txt" + File fastqc_html = "~{bam_basename}_fastqc.html" + Float fastqc_percent_reads_with_adapter = read_float("~{bam_basename}_adapter_content.txt") + } +} + +task TransferReadTags { + input { + File aligned_bam + File ubam + String output_basename + String docker = "us.gcr.io/broad-gatk/gatk:4.2.6.0" + Int memory_mb = 16000 + Int disk_size_gb = ceil(2 * size(aligned_bam, "GiB")) + ceil(2 * size(ubam, "GiB")) + 128 + } + + command <<< + gatk TransferReadTags \ + -I ~{aligned_bam} \ + --unmapped-sam ~{ubam} \ + -O ~{output_basename}.bam \ + --read-tags RX + >>> + + output { + File output_bam = "~{output_basename}.bam" + } + + runtime { + docker: docker + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } +} + +task PostprocessTranscriptomeForRSEM { + input { + String prefix + File input_bam # the input must be queryname sorted + Int disk_size_gb = ceil(3*size(input_bam,"GiB")) + 128 + String docker = "us.gcr.io/broad-gatk/gatk:4.2.6.0" + Int memory_mb = 16000 + } + + command { + gatk PostProcessReadsForRSEM \ + -I ~{input_bam} \ + -O ~{prefix}_RSEM_post_processed.bam + } + + output { + File output_bam = "~{prefix}_RSEM_post_processed.bam" + } + + runtime { + docker: docker + disks: "local-disk ~{disk_size_gb} HDD" + memory: memory_mb + } +} + +task CreateEmptyFile { + input { + Int disk_size_gb = 128 + String docker = "us.gcr.io/broad-gatk/gatk:4.2.6.0" + Int memory_mb = 4096 + } + + command { + echo "place holder for a bam index file" > empty.txt + } + + output { + File empty_file = "empty.txt" + } + + runtime { + docker: docker + disks: "local-disk ~{disk_size_gb} HDD" + memory: memory_mb + } } \ No newline at end of file diff --git a/tasks/broad/UMIAwareDuplicateMarking.wdl b/tasks/broad/UMIAwareDuplicateMarking.wdl index 253159bf12..e28427f164 100644 --- a/tasks/broad/UMIAwareDuplicateMarking.wdl +++ b/tasks/broad/UMIAwareDuplicateMarking.wdl @@ -21,18 +21,44 @@ workflow UMIAwareDuplicateMarking { input { File aligned_bam + File unaligned_bam String output_basename + Boolean remove_duplicates + Boolean coordinate_sort_output } parameter_meta { - aligned_bam: "Aligned bam sorted by the query (read) name" + aligned_bam: "Unsorted aligned bam (the output of STAR in multithread mode is not query-name sorted)" + unaligned_bam: "Query-name sorted unaligned bam; contains UMIs in the RX tag" output_basename: "Basename for file outputs from this workflow" + remove_duplicates: "If true, remove (rather than mark) duplicate reads from the output" + coordinate_sort: "If true, the output bam will be coordinate sorted. Else it will be query-name sorted." + } + + call tasks.SortSamByQueryName as SortSamByQueryNameAfterAlignment { + input: + input_bam = aligned_bam, + output_bam_basename = output_basename + ".queryname_sorted" + } + + # It appears we cannot assume that the unmapped bam/fastqs will be sorted + call tasks.SortSamByQueryName as SortSamByQueryNameUnmapped { + input: + input_bam = unaligned_bam, + output_bam_basename = output_basename + ".u.queryname_sorted" + } + + call tasks.TransferReadTags { + input: + aligned_bam = SortSamByQueryNameAfterAlignment.output_bam, + ubam = SortSamByQueryNameUnmapped.output_bam, + output_basename = output_basename + ".queryname_sorted_with_RX" } # First sort the aligned bam by coordinate, so we can group duplicate sets using UMIs in the next step. call tasks.SortSamByCoordinate as SortSamByCoordinateFirstPass { input: - input_bam = aligned_bam, + input_bam = TransferReadTags.output_bam, output_bam_basename = output_basename + ".STAR_aligned.coordinate_sorted" } @@ -49,7 +75,7 @@ workflow UMIAwareDuplicateMarking { output_bam_basename = output_basename + ".grouped_by_UMI" } - call tasks.SortSamByQueryName as SortSamByQueryName { + call tasks.SortSamByQueryName as SortSamByQueryNameBeforeDuplicateMarking { input: input_bam = GroupByUMIs.grouped_bam, output_bam_basename = output_basename + ".grouped.queryname_sorted" @@ -57,19 +83,29 @@ workflow UMIAwareDuplicateMarking { call tasks.MarkDuplicatesUMIAware as MarkDuplicates { input: - bam = SortSamByQueryName.output_bam, - output_basename = output_basename + bam = SortSamByQueryNameBeforeDuplicateMarking.output_bam, + output_basename = output_basename, + remove_duplicates = remove_duplicates } - call tasks.SortSamByCoordinate as SortSamByCoordinateSecondPass { - input: - input_bam = MarkDuplicates.duplicate_marked_bam, - output_bam_basename = output_basename + ".duplicate_marked.coordinate_sorted" + if (coordinate_sort_output){ + call tasks.SortSamByCoordinate as SortSamByCoordinateSecondPass { + input: + input_bam = MarkDuplicates.duplicate_marked_bam, + output_bam_basename = output_basename + ".duplicate_marked.coordinate_sorted" + } + } + + # We won't have the index file if the output is query-name sorted. + # Until we remove the transcriptome index from the TDR schema, + # output a placeholder text file as the bam index as a temporary fix + if (!coordinate_sort_output){ + call tasks.CreateEmptyFile {} } output { - File duplicate_marked_bam = SortSamByCoordinateSecondPass.output_bam - File duplicate_marked_bam_index = SortSamByCoordinateSecondPass.output_bam_index + File duplicate_marked_bam = select_first([SortSamByCoordinateSecondPass.output_bam, MarkDuplicates.duplicate_marked_bam]) + File duplicate_marked_bam_index = select_first([SortSamByCoordinateSecondPass.output_bam_index, CreateEmptyFile.empty_file]) File duplicate_metrics = MarkDuplicates.duplicate_metrics } }