Skip to content

Commit

Permalink
RNA pipeline with adapter clipping (#662)
Browse files Browse the repository at this point in the history
* enable adapter clipping
* Bring fastp tool into warp managed docker images
* add fastQC as output to pipeline
* Update to public version of Illumina_adapters
* add new outputs to TDR, update docker tag for ingest script
* sort unmapped just in case and fastp disable length filtering
* round fastqc_percent_reads_with_adapter to 5 digits
  • Loading branch information
takutosato authored Apr 13, 2022
1 parent 956a82d commit 000b278
Show file tree
Hide file tree
Showing 22 changed files with 644 additions and 80 deletions.
36 changes: 36 additions & 0 deletions dockers/broad/rna_seq/fastp/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
FROM debian:bullseye-slim

ARG FASTP_VERSION=0.20.1

ENV TERM=xterm-256color \
FASTP_URL=http://opengene.org/fastp/fastp.${FASTP_VERSION} \
TINI_VERSION=v0.19.0

LABEL MAINTAINER="Broad Institute DSDE <dsde-engineering@broadinstitute.org>" \
FASTP_VERSION=${FASTP_VERSION}

WORKDIR /usr/gitc

# Install dependencies

RUN set -eux; \
apt-get update; \
apt-get install -y \
wget \
; \
# Install fastp
wget ${FASTP_URL} ;\
\
mv fastp.${FASTP_VERSION} /usr/local/bin/fastp; \
chmod a+x /usr/local/bin/fastp \
; \
# Install tini
wget https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini -O /sbin/tini; \
chmod +x /sbin/tini \
; \
# Clean up cached files
apt-get clean && rm -rf /var/lib/apt/lists/*

# Set tini as default entrypoint
ENTRYPOINT [ "/sbin/tini", "--" ]

33 changes: 33 additions & 0 deletions dockers/broad/rna_seq/fastp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# RNA Seq fastp

## Quick reference

Copy and paste to pull this image

#### `us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500`

- __What is this image:__ This image is a lightweight debian based image for running the fastp tool set within our RNA sequencing pipeline.
- __What is fastp:__ fastp from OpenGene is a tool designed to provide fast all-in-one preprocessing for FastQ files. See [here](https://github.com/OpenGene/fastp) for more information.
- __How to see tool version used in image:__ Please see below.

## Versioning

fastp uses the following convention for versioning:

#### `us.gcr.io/broad-gotc-prod/fastp:<image-version>-<fastp-version>-<unix-timestamp>`

We keep track of all past versions in [docker_versions](docker_versions.tsv) with the last image listed being the currently used version in WARP.

You can see more information about the image, including the tool versions, by running the following command:

```bash
$ docker pull us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500
$ docker inspect us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500
```

## Usage

```bash
$ docker run --rm -it \
us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500 fastp
```
71 changes: 71 additions & 0 deletions dockers/broad/rna_seq/fastp/docker_build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
set -e

# Update verson when changes to Dockerfile are made
DOCKER_IMAGE_VERSION=1.0.0
TIMESTAMP=$(date +"%s")
DIR=$(cd $(dirname $0) && pwd)

# Registries and tags
GCR_URL="us.gcr.io/broad-gotc-prod/fastp"
QUAY_URL="quay.io/broadinstitute/gotc-prod-fastp"

# Fgbio version
FASTP_VERSION="0.20.1"

# Necessary tools and help text
TOOLS=(docker gcloud)
HELP="$(basename "$0") [-h|--help] [-t|--tools] -- script to build the fastp image and push to GCR & Quay
where:
-h|--help Show help text
-v|--version of fastp to use (default: $FGBIO_VERSION)
-t|--tools Show tools needed to run script
"

function main(){
for t in "${TOOLS[@]}"; do which $t >/dev/null || ok=no; done
if [[ $ok == no ]]; then
echo "Missing one of the following tools: "
for t in "${TOOLS[@]}"; do echo "$t"; done
exit 1
fi

while [[ $# -gt 0 ]]
do
key="$1"
case $key in
-v|--version)
FASTP_VERSION="$2"
shift
shift
;;
-h|--help)
echo "$HELP"
exit 0
;;
-t|--tools)
for t in "${TOOLS[@]}"; do echo $t; done
exit 0
;;
*)
shift
;;
esac
done

IMAGE_TAG="$DOCKER_IMAGE_VERSION-$FASTP_VERSION-$TIMESTAMP"

echo "building and pushing GCR Image - $GCR_URL:$IMAGE_TAG"
docker build --no-cache -t "$GCR_URL:$IMAGE_TAG" \
--build-arg FASTP_VERSION="$FASTP_VERSION" "$DIR"
docker push "$GCR_URL:$IMAGE_TAG"

#echo "tagging and pushing Quay Image"
#docker tag "$GCR_URL:$IMAGE_TAG" "$QUAY_URL:$IMAGE_TAG"
#docker push "$QUAY_URL:$IMAGE_TAG"

echo "$GCR_URL:$IMAGE_TAG" >> "$DIR/docker_versions.tsv"
echo "done"
}

main "$@"
1 change: 1 addition & 0 deletions dockers/broad/rna_seq/fastp/docker_versions.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# 1.0.7
2022-04-12 (Date of Last Commit)

* Clip adapter bases pre-alignment & associated updates for TDR ingest

# 1.0.6
2022-04-04 (Date of Last Commit)

Expand Down
8 changes: 6 additions & 2 deletions pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import "../../../../tasks/broad/Utilities.wdl" as utils

workflow BroadInternalRNAWithUMIs {

String pipeline_version = "1.0.6"
String pipeline_version = "1.0.7"

input {
# input needs to be either "hg19" or "hg38"
Expand Down Expand Up @@ -157,7 +157,9 @@ workflow BroadInternalRNAWithUMIs {
picard_fingerprint_detail_metrics = CheckFingerprint.fingerprint_detail_metrics_file,
unified_metrics = MergeMetrics.unified_metrics,
contamination = RNAWithUMIs.contamination,
contamination_error = RNAWithUMIs.contamination_error
contamination_error = RNAWithUMIs.contamination_error,
fastqc_html_report = RNAWithUMIs.fastqc_html_report,
fastqc_percent_reads_with_adapter = RNAWithUMIs.fastqc_percent_reads_with_adapter
}

call tasks.updateOutputsInTDR {
Expand Down Expand Up @@ -196,5 +198,7 @@ workflow BroadInternalRNAWithUMIs {
File unified_metrics = MergeMetrics.unified_metrics
Float contamination = RNAWithUMIs.contamination
Float contamination_error = RNAWithUMIs.contamination_error
File fastqc_html_report = RNAWithUMIs.fastqc_html_report
Float fastqc_percent_reads_with_adapter = RNAWithUMIs.fastqc_percent_reads_with_adapter
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"BroadInternalRNAWithUMIs.sample_lsid": "broadinstitute.org:bsp.prod.sample:K4Y2X",
"BroadInternalRNAWithUMIs.output_basename": "SM-K4Y2X_350ng_.65X_D3",

"BroadInternalRNAWithUMIs.reference_build": "hg19",

"BroadInternalRNAWithUMIs.r1_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read1_plumbing.fastq",
"BroadInternalRNAWithUMIs.r2_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read2_plumbing.fastq",
"BroadInternalRNAWithUMIs.read1Structure": "3M2S146T",
"BroadInternalRNAWithUMIs.read2Structure": "3M2S146T",
"BroadInternalRNAWithUMIs.library_name": "SM-K4Y2X_350ng_.65X_D3",
"BroadInternalRNAWithUMIs.platform": "ILLUMINA",
"BroadInternalRNAWithUMIs.platform_unit": "barcode1",
"BroadInternalRNAWithUMIs.read_group_name": "RG1",

"BroadInternalRNAWithUMIs.vault_token_path": "{VAULT_TOKEN_PATH}",
"BroadInternalRNAWithUMIs.environment": "{ENV}"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"BroadInternalRNAWithUMIs.sample_lsid": "broadinstitute.org:bsp.prod.sample:K4Y2X",
"BroadInternalRNAWithUMIs.output_basename": "SM-K4Y2X_350ng_.65X_D3",

"BroadInternalRNAWithUMIs.reference_build": "hg38",

"BroadInternalRNAWithUMIs.r1_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read1_plumbing.fastq",
"BroadInternalRNAWithUMIs.r2_fastq": "gs://broad-gotc-test-storage/rna_seq/rna_with_umis/plumbing/inputs/SM-K4Y2X_350ng_.65X_D3_read2_plumbing.fastq",
"BroadInternalRNAWithUMIs.read1Structure": "3M2S146T",
"BroadInternalRNAWithUMIs.read2Structure": "3M2S146T",
"BroadInternalRNAWithUMIs.library_name": "SM-K4Y2X_350ng_.65X_D3",
"BroadInternalRNAWithUMIs.platform": "ILLUMINA",
"BroadInternalRNAWithUMIs.platform_unit": "barcode1",
"BroadInternalRNAWithUMIs.read_group_name": "RG1",

"BroadInternalRNAWithUMIs.vault_token_path": "{VAULT_TOKEN_PATH}",
"BroadInternalRNAWithUMIs.environment": "{ENV}"
}
5 changes: 5 additions & 0 deletions pipelines/broad/rna_seq/RNAWithUMIsPipeline.changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# 1.0.4
2022-04-08 (Date of Last Commit)

* Clip adapter bases pre-alignment

# 1.0.3
2022-03-29 (Date of Last Commit)

Expand Down
97 changes: 70 additions & 27 deletions pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import "../../../tasks/broad/RNAWithUMIsTasks.wdl" as tasks
workflow RNAWithUMIsPipeline {

String pipeline_version = "1.0.3"
String pipeline_version = "1.0.4"

input {
File? bam
Expand All @@ -30,12 +30,11 @@ workflow RNAWithUMIsPipeline {
String read2Structure
String output_basename

# The following inputs are only required if fastqs are given as input.
String? platform
String? library_name
String? platform_unit
String? read_group_name
String? sequencing_center = "BI"
String platform
String library_name
String platform_unit
String read_group_name
String sequencing_center = "BI"

File starIndex
File gtf
Expand All @@ -60,11 +59,11 @@ workflow RNAWithUMIsPipeline {
starIndex: "TAR file containing genome indices used for the STAR aligner"
output_basename: "String used as a prefix in workflow output files"
gtf: "Gene annotation file (GTF) used for the rnaseqc tool"
platform: "String used to describe the sequencing platform; only required when using FASTQ files as input"
library_name: "String used to describe the library; only required when using FASTQ files as input"
platform_unit: "String used to describe the platform unit; only required when using FASTQ files as input"
read_group_name: "String used to describe the read group name; only required when using FASTQ files as input"
sequencing_center: "String used to describe the sequencing center; only required when using FASTQ files as input; default is set to 'BI'"
platform: "String used to describe the sequencing platform"
library_name: "String used to describe the library"
platform_unit: "String used to describe the platform unit"
read_group_name: "String used to describe the read group name"
sequencing_center: "String used to describe the sequencing center; default is set to 'BI'"
ref: "FASTA file used for metric collection with Picard tools"
refIndex: "FASTA index file used for metric collection with Picard tools"
refDict: "Dictionary file used for metric collection with Picard tools"
Expand All @@ -79,12 +78,7 @@ workflow RNAWithUMIsPipeline {
input:
bam = bam,
r1_fastq = r1_fastq,
r2_fastq = r2_fastq,
library_name = library_name,
platform = platform,
platform_unit = platform_unit,
read_group_name = read_group_name,
sequencing_center = sequencing_center
r2_fastq = r2_fastq
}

if (VerifyPipelineInputs.fastq_run) {
Expand All @@ -93,11 +87,11 @@ workflow RNAWithUMIsPipeline {
r1_fastq = select_first([r1_fastq]),
r2_fastq = select_first([r2_fastq]),
bam_filename = output_basename,
library_name = select_first([library_name]),
platform = select_first([platform]),
platform_unit = select_first([platform_unit]),
read_group_name = select_first([read_group_name]),
sequencing_center = select_first([sequencing_center])
library_name = library_name,
platform = platform,
platform_unit = platform_unit,
read_group_name = read_group_name,
sequencing_center = sequencing_center
}
}

Expand All @@ -110,9 +104,43 @@ workflow RNAWithUMIsPipeline {
read2Structure = read2Structure
}

call tasks.STAR {
# Convert SAM to fastq for adapter clipping
# This step also removes reads that fail platform/vendor quality checks
call tasks.SamToFastq {
input:
bam = ExtractUMIs.bam_umis_extracted,
output_prefix = output_basename
}

# Adapter clipping
call tasks.Fastp {
input:
fastq1 = SamToFastq.fastq1,
fastq2 = SamToFastq.fastq2,
output_prefix = output_basename + ".adapter_clipped"
}

# Back to SAM before alignment
call tasks.FastqToUbam as FastqToUbamAfterClipping {
input:
r1_fastq = Fastp.fastq1_clipped,
r2_fastq = Fastp.fastq2_clipped,
bam_filename = output_basename + ".adapter_clipped",
library_name = library_name,
platform = platform,
platform_unit = platform_unit,
read_group_name = read_group_name,
sequencing_center = sequencing_center
}

call tasks.FastQC {
input:
unmapped_bam = FastqToUbamAfterClipping.unmapped_bam
}

call tasks.STAR {
input:
bam = FastqToUbamAfterClipping.unmapped_bam,
starIndex = starIndex
}

Expand All @@ -122,19 +150,32 @@ workflow RNAWithUMIsPipeline {
bam_without_readgroups = STAR.transcriptome_bam
}

# Mark duplicate reads in the genome-aligned bam. The output is coordinate-sorted.
call UmiMD.UMIAwareDuplicateMarking {
input:
aligned_bam = STAR.aligned_bam,
output_basename = output_basename
unaligned_bam = ExtractUMIs.bam_umis_extracted,
output_basename = output_basename,
remove_duplicates = false,
coordinate_sort_output = true
}

# Remove, rather than mark, duplicates reads from the transcriptome-aligned bam
# The output is query-name-sorted.
call UmiMD.UMIAwareDuplicateMarking as UMIAwareDuplicateMarkingTranscriptome {
input:
aligned_bam = CopyReadGroupsToHeader.output_bam,
output_basename = output_basename + ".transcriptome"
unaligned_bam = ExtractUMIs.bam_umis_extracted,
output_basename = output_basename + ".transcriptome",
remove_duplicates = true,
coordinate_sort_output = false
}

### PLACEHOLDER for CROSSCHECK ###
call tasks.PostprocessTranscriptomeForRSEM {
input:
prefix = output_basename + ".transcriptome.RSEM",
input_bam = UMIAwareDuplicateMarkingTranscriptome.duplicate_marked_bam
}

call tasks.GetSampleName {
input:
Expand Down Expand Up @@ -208,6 +249,8 @@ workflow RNAWithUMIsPipeline {
File picard_quality_distribution_pdf = CollectMultipleMetrics.quality_distribution_pdf
Float contamination = CalculateContamination.contamination
Float contamination_error = CalculateContamination.contamination_error
File fastqc_html_report = FastQC.fastqc_html
Float fastqc_percent_reads_with_adapter = FastQC.fastqc_percent_reads_with_adapter
}
}

Loading

0 comments on commit 000b278

Please sign in to comment.