From e61eac0646784582cf8b1b2c1a22db07bf456b8d Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Tue, 10 Dec 2024 12:13:31 -0500 Subject: [PATCH] fix: seperate paired and single ended rules, conditionally import --- workflow/Snakefile | 13 +- workflow/rules/paired/peakcall.smk | 132 ++++++++ workflow/rules/paired/qc.smk | 116 +++++++ .../rules/{ => paired}/trim_align_dedup.smk | 282 +++++----------- workflow/rules/peakcall.smk | 244 ++------------ workflow/rules/qc.smk | 147 +------- workflow/rules/single/peakcall.smk | 131 ++++++++ workflow/rules/single/qc.smk | 108 ++++++ workflow/rules/single/trim_align_dedup.smk | 316 ++++++++++++++++++ workflow/scripts/common.py | 102 +++--- workflow/scripts/grouping.py | 98 ++---- workflow/scripts/peakcall.py | 25 -- 12 files changed, 1023 insertions(+), 691 deletions(-) create mode 100644 workflow/rules/paired/peakcall.smk create mode 100644 workflow/rules/paired/qc.smk rename workflow/rules/{ => paired}/trim_align_dedup.smk (52%) create mode 100644 workflow/rules/single/peakcall.smk create mode 100644 workflow/rules/single/qc.smk create mode 100644 workflow/rules/single/trim_align_dedup.smk diff --git a/workflow/Snakefile b/workflow/Snakefile index a820a07..52e7d4c 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -221,9 +221,18 @@ rule all: input: rule_all_ins -# Include child rules +# Include endedness rules +if paired_end: + include: join("rules", "paired", "peakcall.smk") + include: join("rules", "paired", "qc.smk") + include: join("rules", "paired", "trim.smk") +else: + include: join("rules", "paired", "peakcall.smk") + include: join("rules", "paired", "qc.smk") + include: join("rules", "paired", "trim.smk") + +# Include general rules include: join("rules", "hooks.smk") -include: join("rules", "trim_align_dedup.smk") include: join("rules", "qc.smk") include: join("rules", "peakcall.smk") include: join("rules", "dba.smk") diff --git a/workflow/rules/paired/peakcall.smk b/workflow/rules/paired/peakcall.smk new file mode 100644 index 0000000..c40a4a5 --- /dev/null +++ b/workflow/rules/paired/peakcall.smk @@ -0,0 +1,132 @@ +# Calling read coverage as peaks +# ~~~~ +# Paired-end peak calling rules + + +rule MACS2_broad: + input: + chip = join(bam_dir, "{name}.Q5DD_tagAlign.gz"), + txt = join(ppqt_dir, "{name}.Q5DD_tagAlign.ppqt.txt"), + c_option = lambda w: join(bam_dir, f"{chip2input[w.name]}.Q5DD.bam"), + output: + join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), + params: + rname = 'MACS2_broad', + gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], + macsver = config['tools']['MACSVER'], + flag = lambda w: "-c" if chip2input[w.name] else "", + shell: + """ + module load {params.macsver}; + macs2 callpeak \\ + -t {input.chip} {params.flag} {input.c_option} \\ + -g {params.gsize} \\ + -n {wildcards.name} \\ + --outdir {macsB_dir}/{wildcards.name} \\ + --broad \\ + --broad-cutoff 0.01 \\ + --keep-dup="all" \\ + -f "BAMPE" + """ + + +rule SICER: + input: + chip = lambda w: join(bam_dir, w.name + ".Q5DD.bam"), + fragLen = lambda w: join(qc_dir, name + ".Q5DD.insert_size_metrics.txt"), + c_option = lambda w: join(bam_dir, f"{chip2input[w.name]}.Q5DD.bam"), + output: + bed = join(sicer_dir, "{name}", "{name}_broadpeaks.bed") if has_inputs else [], + params: + rname = 'SICER', + name = "{name}", + sicerver = config['tools']['SICERVER'], + bedtoolsver = config['tools']['BEDTOOLSVER'], + genomever = config['options']['genome'], + this_sicer_dir = join(sicer_dir,"{name}"), + frac = config['references'][genome]['FRAC'], + flag = lambda w: "-c" if chip2input[w.name] else "", + shell: + """ + module load {params.sicerver} + module load {params.bedtoolsver} + if [ ! -d \"""" + str(tmpdir) + """\" ]; then mkdir -p \"""" + str(tmpdir) + """\"; fi + tmp=$(mktemp -d -p \"""" + str(tmpdir) + """\") + trap 'rm -rf "${{tmp}}"' EXIT + + MEAN_INSERT_SIZE=$(cat {input.fragLen} | awk '/MEDIAN_INSERT_SIZE/{{f=1;next}} /## HISTOGRAM/{{f=0}} f' | cut -f 6) + mean_insert_size=$(printf "%.0f" $MEAN_INSERT_SIZE) + + echo "printing out value of mean-insert-size ${{mean_insert_size}}" + a={input.c_option} + echo "Printing input.c_option ${{a}}" + + if [ -f "{input.c_option}" ]; then + # Copying input to tmpdir due to SICER2 + # bam2bed file conversion, if more than + # one sample shares the same IP sample + # than a race condition can occur where + # two jobs can concurrent try to write + # to the same BED file (during bedtools + # bam2bed that sicer calls). + input_bam="$(basename "{input.c_option}")" + cp {input.c_option} ${{tmp}} + echo "paired-end with input... ${{tmp}}/${{input_bam}}" + sicer \\ + -t {input.chip} \\ + -c "${{tmp}}/${{input_bam}}" \\ + -s {params.genomever} \\ + -rt 100 \\ + -w 300 \\ + -f ${{mean_insert_size}} \\ + -egf {params.frac} \\ + -g 600 \\ + -fdr 1E-2 \\ + -cpu 30 \\ + -o ${{tmp}} + + mv ${{tmp}}/{params.name}.Q5DD-W300-G600-FDR0.01-island.bed {output.bed}; + mv ${{tmp}}/{params.name}.Q5DD-W300-G600-islands-summary {params.this_sicer_dir} + else + echo "paired-end without input" + sicer \\ + -t {input.chip} \\ + -s {params.genomever} \\ + -rt 100 \\ + -w 300 \\ + -f ${{mean_insert_size}} \\ + -egf {params.frac} \\ + -g 600 \\ + -e 100 \\ + -cpu 30 \\ + -o ${{tmp}} + + mv ${{tmp}}/{params.name}.Q5DD-W300-G600.scoreisland {params.this_sicer_dir} + fi + """ + + +rule MACS2_narrow: + input: + chip = join(bam_dir, "{name}.Q5DD_tagAlign.gz"), + txt = join(ppqt_dir, "{name}.Q5DD_tagAlign.ppqt.txt"), + c_option = lambda w: join(bam_dir, f"{chip2input[w.name]}.Q5DD.bam"), + output: + join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"), + params: + rname = 'MACS2_narrow', + gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], + macsver = config['tools']['MACSVER'], + flag = lambda w: "-c" if chip2input[w.name] else "", + shell: + """ + module load {params.macsver}; + macs2 callpeak \\ + -t {input.chip} {params.flag} {input.c_option} \\ + -g {params.gsize} \\ + -n {wildcards.name} \\ + --outdir {macsN_dir}/{wildcards.name} \\ + -q 0.01 \\ + --keep-dup="all" \\ + -f "BAMPE" + """ \ No newline at end of file diff --git a/workflow/rules/paired/qc.smk b/workflow/rules/paired/qc.smk new file mode 100644 index 0000000..ed5f3e7 --- /dev/null +++ b/workflow/rules/paired/qc.smk @@ -0,0 +1,116 @@ +# Quality control metrics and reports +# ~~~~ +# Paired-end peak qc rules + +rule fastq_screen: + """ + Quality-control step to screen for different sources of contamination. + FastQ Screen compares your sequencing data to a set of different reference + genomes to determine if there is contamination. It allows a user to see if + the composition of your library matches what you expect. + @Input: + Trimmed FastQ files (scatter) + @Output: + FastQ Screen report and logfiles + """ + input: + expand(join(trim_dir, "{name}.R{rn}.trim.fastq.gz"), name=samples, rn=ends) + output: + first_txt = expand(join( + qc_dir, "FQscreen", "{name}.R{rn}.trim_screen.txt"), name=samples, rn=[1, 2] + ), + first_png = expand(join( + qc_dir, "FQscreen", "{name}.R{rn}.trim_screen.png"), name=samples, rn=[1, 2] + ), + second_txt = expand(join( + qc_dir, "FQscreen2", "{name}.R{rn}.trim_screen.txt"), name=samples, rn=[1, 2] + ), + second_png = expand(join( + qc_dir, "FQscreen2", "{name}.R{rn}.trim_screen.png"), name=samples, rn=[1, 2] + ) + params: + rname = 'fqscreen', + outdir = join(qc_dir, "FQscreen"), + outdir2 = join(qc_dir, "FQscreen2"), + fastq_screen = config['bin']['FASTQ_SCREEN'], + fastq_screen_config1 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P1'], + fastq_screen_config2 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P2'], + envmodules: + config['tools']['BOWTIE2VER'], + config['tools']['PERLVER'], + threads: + int(allocated("threads", "fastq_screen", cluster)) + shell: + """ + # First pass of contamination screening + {params.fastq_screen} \\ + --conf {params.fastq_screen_config1} \\ + --outdir {params.outdir} \\ + --threads {threads} \\ + --subset 1000000 \\ + --aligner bowtie2 \\ + --force \\ + {input} + # Second pass of contamination screening + {params.fastq_screen} \\ + --conf {params.fastq_screen_config2} \\ + --outdir {params.outdir2} \\ + --threads {threads} \\ + --subset 1000000 \\ + --aligner bowtie2 \\ + --force \\ + {input} + """ + +rule kraken: + """ + Quality-control step to assess for potential sources of microbial contamination. + If there are high levels of microbial contamination, Kraken will provide an + estimation of the taxonomic composition. Kraken is used in conjunction with + Krona to produce an interactive reports. + @Input: + Trimmed FastQ files (scatter) + @Output: + Kraken logfile and interative krona report + """ + input: + fq1 = join(trim_dir, "{name}.R1.trim.fastq.gz"), + fq2 = join(trim_dir, "{name}.R2.trim.fastq.gz") + output: + krakenout = join(kraken_dir, "{name}.trim.kraken_bacteria.out.txt"), + krakentaxa = join(kraken_dir, "{name}.trim.kraken_bacteria.taxa.txt"), + kronahtml = join(kraken_dir, "{name}.trim.kraken_bacteria.krona.html"), + params: + rname = 'kraken', + outdir = kraken_dir, + bacdb = config['shared_resources']['KRAKENBACDB'], + tmpdir = tmpdir + threads: + int(allocated("threads", "kraken_pe", cluster)), + envmodules: + config['tools']['KRAKENVER'], + config['tools']['KRONATOOLSVER'], + shell: + """ + # Setups temporary directory for + # intermediate files with built-in + # mechanism for deletion on exit + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT + + # Copy kraken2 db to /lscratch or temp + # location to reduce filesystem strain + cp -rv {params.bacdb} ${{tmp}}/; + kdb_base=$(basename {params.bacdb}) + + kraken2 --db ${{tmp}}/${{kdb_base}} \\ + --threads {threads} --report {output.krakentaxa} \\ + --output {output.krakenout} \\ + --gzip-compressed \\ + --paired {input.fq1} {input.fq2} + + # Generate Krona Report + cut -f2,3 {output.krakenout} | \\ + ktImportTaxonomy - -o {output.kronahtml} + """ \ No newline at end of file diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/paired/trim_align_dedup.smk similarity index 52% rename from workflow/rules/trim_align_dedup.smk rename to workflow/rules/paired/trim_align_dedup.smk index 1a809f7..3ae4943 100644 --- a/workflow/rules/trim_align_dedup.smk +++ b/workflow/rules/paired/trim_align_dedup.smk @@ -1,27 +1,17 @@ # Trimming, alignment, and redundancy reduction rules # ~~~~ -# Common paired-end rules: trim_pe, kraken_pe, BWA_PE, insert_size -import snakemake -from os.path import join -from scripts.common import allocated, get_bam_ext -from scripts.grouping import dedup_out7, get_bam_input, get_ppqt_input, \ - ctrl_test +# Rules for paired-ended reads -# ~~ workflow configuration +# ~~ workflow endpoints workpath = config['project']['workpath'] bin_path = config['project']['binpath'] genome = config['options']['genome'] -paired_end = False if config['project']['nends'] == 1 else True -ends = [1] if not paired_end else [1, 2] -chip2input = config['project']['peaks']['inputs'] -has_inputs = False if set(chip2input.values()) == {''} else True # ~~ directories trim_dir = join(workpath, 'trim') tmpdir = config['options']['tmp_dir'] bam_dir = join(workpath, "bam") bw_dir = join(workpath, "bigwig") -qc_dir = join(workpath, "QC") ppqt_dir = join(bam_dir, "ppqt") @@ -44,10 +34,10 @@ rule trim: """ input: file1 = join(workpath, "{name}.R1.fastq.gz"), - file2 = provided(join(workpath, "{name}.R2.fastq.gz"), paired_end) + file2 = join(workpath, "{name}.R2.fastq.gz"), output: outfq1 = temp(join(trim_dir, "{name}.R1.trim.fastq.gz")), - outfq2 = provided(temp(join(trim_dir, "{name}.R2.trim.fastq.gz")), paired_end) + outfq2 = temp(join(trim_dir, "{name}.R2.trim.fastq.gz")), params: rname = "trim", cutadaptver = config['tools']['CUTADAPTVER'], @@ -62,6 +52,7 @@ rule trim: trailingquality = 10, javaram = "64g", sample = "{name}", + tmpdir = tmpdir, threads: 32 shell: @@ -70,28 +61,27 @@ rule trim: module load {params.bwaver}; module load {params.samtoolsver}; module load {params.picardver}; - if [ ! -d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi - tmp=$(mktemp -d -p \"""" + tmpdir + """\") + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") trap 'rm -rf "${{tmp}}"' EXIT - if [ \"""" + str(paired_end) + """\" == True ]; then - cutadapt \\ - --pair-filter=any \\ - --nextseq-trim=2 \\ - --trim-n \\ - -n 5 \\ - -O 5 \\ - -q {params.leadingquality},{params.trailingquality} \\ - -m {params.minlen}:{params.minlen} \\ - -b file:{params.fastawithadaptersetd} \\ - -B file:{params.fastawithadaptersetd} \\ - -j {threads} \\ - -o ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ - -p ${{tmp}}/{params.sample}.R2.trim.fastq.gz \\ - {input.file1} {input.file2} - - if [ "{params.blacklistbwaindex}" != "" ]; - then bwa mem -t {threads} \\ + cutadapt \\ + --pair-filter=any \\ + --nextseq-trim=2 \\ + --trim-n \\ + -n 5 \\ + -O 5 \\ + -q {params.leadingquality},{params.trailingquality} \\ + -m {params.minlen}:{params.minlen} \\ + -b file:{params.fastawithadaptersetd} \\ + -B file:{params.fastawithadaptersetd} \\ + -j {threads} \\ + -o ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ + -p ${{tmp}}/{params.sample}.R2.trim.fastq.gz \\ + {input.file1} {input.file2} + + if [ "{params.blacklistbwaindex}" != "" ]; then + bwa mem -t {threads} \\ {params.blacklistbwaindex} \\ ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ ${{tmp}}/{params.sample}.R2.trim.fastq.gz \\ @@ -101,56 +91,19 @@ rule trim: -o ${{tmp}}/{params.sample}.bam; rm ${{tmp}}/{params.sample}.R1.trim.fastq.gz; rm ${{tmp}}/{params.sample}.R2.trim.fastq.gz; - java -Xmx{params.javaram} -jar $PICARDJARPATH/picard.jar SamToFastq \\ -VALIDATION_STRINGENCY SILENT \\ -INPUT ${{tmp}}/{params.sample}.bam \\ -FASTQ ${{tmp}}/{params.sample}.R1.trim.fastq \\ -SECOND_END_FASTQ ${{tmp}}/{params.sample}.R2.trim.fastq \\ -UNPAIRED_FASTQ ${{tmp}}/{params.sample}.unpaired.noBL.fastq - rm ${{tmp}}/{params.sample}.bam; - pigz -p {threads} ${{tmp}}/{params.sample}.R1.trim.fastq; pigz -p {threads} ${{tmp}}/{params.sample}.R2.trim.fastq; - fi - mv ${{tmp}}/{params.sample}.R1.trim.fastq.gz {output.outfq1}; - mv ${{tmp}}/{params.sample}.R2.trim.fastq.gz {output.outfq2}; - else - cutadapt \\ - --nextseq-trim=2 \\ - --trim-n \\ - -n 5 \\ - -O 5 \\ - -q {params.leadingquality},{params.trailingquality} \\ - -m {params.minlen} \\ - -b file:{params.fastawithadaptersetd} \\ - -j {threads} \\ - -o ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ - {input.file1} - - if [ "{params.blacklistbwaindex}" != "" ]; - then bwa mem -t {threads} \\ - {params.blacklistbwaindex} \\ - ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ - | samtools view -@{threads} \\ - -f4 \\ - -b \\ - -o ${{tmp}}/{params.sample}.bam; - rm ${{tmp}}/{params.sample}.R1.trim.fastq.gz; - - java -Xmx{params.javaram} -jar $PICARDJARPATH/picard.jar SamToFastq \\ - -VALIDATION_STRINGENCY SILENT \\ - -INPUT ${{tmp}}/{params.sample}.bam \\ - -FASTQ ${{tmp}}/{params.sample}.R1.trim.fastq - - rm ${{tmp}}/{params.sample}.bam; - - pigz -p {threads} ${{tmp}}/{params.sample}.R1.trim.fastq; - - fi - mv ${{tmp}}/{params.sample}.R1.trim.fastq.gz {output.outfq1}; fi + + mv ${{tmp}}/{params.sample}.R1.trim.fastq.gz {output.outfq1}; + mv ${{tmp}}/{params.sample}.R2.trim.fastq.gz {output.outfq2}; """ @@ -171,7 +124,7 @@ rule BWA: """ input: infq1 = join(trim_dir, "{name}.R1.trim.fastq.gz"), - infq2 = join(trim_dir, "{name}.R2.trim.fastq.gz") if paired_end else [], + infq2 = join(trim_dir, "{name}.R2.trim.fastq.gz"), params: d = join(bam_dir), rname = 'bwa', @@ -194,30 +147,18 @@ rule BWA: module load {params.bwaver}; module load {params.samtoolsver}; module load {params.pythonver}; - if [ '""" + str(paired_end) + """' == True ];then - bwa mem -t {threads} {params.reference} {input.infq1} {input.infq2} \\ - | samtools sort -@{threads} -o {output.outbam1} - - samtools index {output.outbam1} - samtools flagstat {output.outbam1} > {output.flagstat1} - samtools idxstats {output.outbam1} > {output.idxstat1} - - python {params.script} -i {output.outbam1} -o {output.outbam2} -q 6 - samtools index {output.outbam2} - samtools flagstat {output.outbam2} > {output.flagstat2} - samtools idxstats {output.outbam2} > {output.idxstat2} - else - bwa mem -t {threads} {params.reference} {input.infq1} \\ - | samtools sort -@{threads} -o {output.outbam1} - samtools index {output.outbam1} - samtools flagstat {output.outbam1} > {output.flagstat1} - samtools idxstats {output.outbam1} > {output.idxstat1} - samtools view -b -q 6 {output.outbam1} -o {output.outbam2} - - samtools index {output.outbam2} - samtools flagstat {output.outbam2} > {output.flagstat2} - samtools idxstats {output.outbam2} > {output.idxstat2} - fi + + bwa mem -t {threads} {params.reference} {input.infq1} {input.infq2} \\ + | samtools sort -@{threads} -o {output.outbam1} + + samtools index {output.outbam1} + samtools flagstat {output.outbam1} > {output.flagstat1} + samtools idxstats {output.outbam1} > {output.idxstat1} + + python {params.script} -i {output.outbam1} -o {output.outbam2} -q 6 + samtools index {output.outbam2} + samtools flagstat {output.outbam2} > {output.flagstat2} + samtools idxstats {output.outbam2} > {output.idxstat2} """ @@ -239,13 +180,14 @@ rule dedup: """ input: - bam2 = join(bam_dir,"{name}.Q5.bam") + join(bam_dir, "{name}.Q5.bam") output: - out5 = join(bam_dir, "{name}.Q5DD.bam"), - out5f = join(bam_dir, "{name}.Q5DD.bam.flagstat"), - out5i = join(bam_dir, "{name}.Q5DD.bam.idxstat"), - out6 = provided(join(bam_dir, "{name}.bwa.Q5.duplic"), paired_end), - out7 = dedup_out7(join(bam_dir, "{name}"), assay, paired_end) + out_bam = join(bam_dir, "{name}.Q5DD.bam"), + flagstat = join(bam_dir, "{name}.Q5DD.bam.flagstat"), + idxstat = join(bam_dir, "{name}.Q5DD.bam.idxstat"), + bwadups = join(bam_dir, "{name}.bwa.Q5.duplic"), + tagalign = join(bam_dir, "{name}.Q5DD_tagAlign") if assay == "cfchip" \ + else join(bam_dir, "{name}.Q5DD_tagAlign.gz") params: rname = 'dedup', picardver = config['tools']['PICARDVER'], @@ -266,56 +208,45 @@ rule dedup: module load {params.bedtoolsver}; module load {params.macsver}; module load {params.rver}; - if [ ! -d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi - tmp=$(mktemp -d -p \"""" + tmpdir + """\") + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") trap 'rm -rf "${{tmp}}"' EXIT - + if [ "{assay}" == "cfchip" ]; then java -Xmx{params.javaram} \\ -jar $PICARDJARPATH/picard.jar MarkDuplicates \\ - -I {input.bam2} \\ + -I {input} \\ -O {params.tmpBam} \\ -TMP_DIR ${{tmp}} \\ -VALIDATION_STRINGENCY SILENT \\ -REMOVE_DUPLICATES true \\ - -METRICS_FILE {output.out6}; + -METRICS_FILE {output.bwadups}; samtools index {params.tmpBam}; - samtools view -b {params.tmpBam} chr{{1..22}} > {output.out5}; - Rscript {params.rscript} {params.tmpBam} {output.out7}; + samtools view -b {params.tmpBam} chr{{1..22}} > {output.out_bam}; + Rscript {params.rscript} {params.tmpBam} {output.tagalign}; rm {params.tmpBam} {params.tmpBam}.bai; - samtools index {output.out5}; - samtools flagstat {output.out5} > {output.out5f}; - samtools idxstats {output.out5} > {output.out5i}; - elif [ '""" + str(paired_end) + """' == False ];then - macs2 filterdup -i {input} -g {params.gsize} --keep-dup="auto" -o ${{tmp}}/TmpTagAlign; - awk -F"\\t" -v OFS="\\t" '{{if ($2>0 && $3>0) {{print}}}}' ${{tmp}}/TmpTagAlign > ${{tmp}}/TmpTagAlign2; - awk -F"\\t" -v OFS="\\t" '{{print $1,1,$2}}' {params.genomefile} | sort -k1,1 -k2,2n > ${{tmp}}/GenomeFileBed; - bedtools intersect -wa -f 1.0 -a ${{tmp}}/TmpTagAlign2 -b ${{tmp}}/GenomeFileBed > ${{tmp}}/TmpTagAlign3; - bedtools bedtobam -i ${{tmp}}/TmpTagAlign3 -g {params.genomefile} | samtools sort -@4 -o {output.out5}; - gzip ${{tmp}}/TmpTagAlign3; - mv ${{tmp}}/TmpTagAlign3.gz {output.out7}; - samtools index {output.out5}; - samtools flagstat {output.out5} > {output.out5f} - samtools idxstats {output.out5} > {output.out5i} + samtools index {output.out_bam}; + samtools flagstat {output.out_bam} > {output.flagstat}; + samtools idxstats {output.out_bam} > {output.idxstat}; else java -Xmx{params.javaram} \\ -jar $PICARDJARPATH/picard.jar MarkDuplicates \\ - -I {input.bam2} \\ - -O {output.out5} \\ + -I {input} \\ + -O {output.out_bam} \\ -TMP_DIR ${{tmp}} \\ -VALIDATION_STRINGENCY SILENT \\ -REMOVE_DUPLICATES true \\ - -METRICS_FILE {output.out6}; - samtools index {output.out5}; - samtools flagstat {output.out5} > {output.out5f}; - samtools idxstats {output.out5} > {output.out5i}; + -METRICS_FILE {output.bwadups}; + samtools index {output.out_bam}; + samtools flagstat {output.out_bam} > {output.flagstat}; + samtools idxstats {output.out_bam} > {output.idxstat}; fi """ rule ppqt: input: - bam = lambda w : join(bam_dir, w.name + "." + w.ext + "." + get_bam_ext(w.ext, paired_end)) + lambda w : join(bam_dir, f"{w.name}.{w.ext}.bam") output: ppqt = join(ppqt_dir, "{name}.{ext}.ppqt"), pdf = join(ppqt_dir, "{name}.{ext}.pdf"), @@ -325,35 +256,24 @@ rule ppqt: samtoolsver = config['tools']['SAMTOOLSVER'], rver = config['tools']['RVER'], scriptPy = join(bin_path, "ppqt_process.py"), - file_name = "{name}" + file_name = "{name}", + tmpdir = tmpdir, container: config['images']['ppqt'] shell: """ - if [ ! -d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi - tmp=$(mktemp -d -p \"""" + tmpdir + """\") + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") trap 'rm -rf "${{tmp}}"' EXIT - if [ '""" + str(paired_end) + """' == True ]; then - samtools view -b \\ + samtools view -b \\ -f 66 \\ - -o ${{tmp}}/bam1.f66.bam {input.bam}; - samtools index ${{tmp}}/bam1.f66.bam; - run_spp.R -c=${{tmp}}/bam1.f66.bam \ - -savp={output.pdf} -out={output.ppqt} \ - -tmpdir=${{tmp}} -rf - else - if [[ '{input.bam}' == *.gz ]]; then - ln -s {input.bam} ${{tmp}}/{params.file_name}.Q5DD.tagAlign.gz; - run_spp.R -c=${{tmp}}/{params.file_name}.Q5DD.tagAlign.gz \ - -savp={output.pdf} -out={output.ppqt} \ - -tmpdir=/lscratch/$SLURM_JOBID -rf - else - run_spp.R -c={input.bam} \ - -savp={output.pdf} -out={output.ppqt} \ - -tmpdir=/lscratch/$SLURM_JOBID -rf - fi - fi + -o ${{tmp}}/bam1.f66.bam {input}; + samtools index ${{tmp}}/bam1.f66.bam; + run_spp.R -c=${{tmp}}/bam1.f66.bam \ + -savp={output.pdf} -out={output.ppqt} \ + -tmpdir=${{tmp}} -rf + python {params.scriptPy} -i {output.ppqt} -o {output.txt} """ @@ -370,31 +290,28 @@ rule bam2bw: an associated score, RPGC """ input: - bam = lambda w: get_bam_input(bam_dir, w, paired_end), - ppqt = lambda w: get_ppqt_input(ppqt_dir, w, paired_end), + bam = lambda w: join(bam_dir, f"{w.name}.{w.ext}.bam"), + ppqt = lambda w: join(ppqt_dir, f"{w.name}.{w.ext}.ppqt.txt"), output: outbw = join(bw_dir, "{name}.{ext}.RPGC.bw"), params: rname = "bam2bw", name = "{name}", effectivegenomesize = config['references'][genome]['EFFECTIVEGENOMESIZE'], + tmpdir = tmpdir, threads: int(allocated("threads", "bam2bw", cluster)), envmodules: config['tools']['DEEPTOOLSVER'], shell: """ - if [ ! -d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi - tmp=$(mktemp -d -p \"""" + tmpdir + """\") + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") trap 'rm -rf "${{tmp}}"' EXIT bam_cov_option={input.ppqt} - if [ \"""" + str(paired_end) + """\" == False ]; then - ppqt_len=$(awk '{{print $1}}' {input.ppqt}) - bam_cov_option="-e ${{ppqt_len}}" - else - bam_cov_option=--centerReads - fi + ppqt_len=$(awk '{{print $1}}' {input.ppqt}) + bam_cov_option="-e ${{ppqt_len}}" echo "printing out value of bam-cov-option $bam_cov_option" bamCoverage \\ @@ -406,39 +323,4 @@ rule bam2bw: --normalizeUsing RPGC \\ --effectiveGenomeSize {params.effectivegenomesize} \\ ${{bam_cov_option}}; - """ - - -rule inputnorm: - """ - bigwigCompare (deepTools) subracts input control from treatment bigWig file, - normalizing treatment bigWig. - @Input: - Treatment sample, which has input control: extension Q5DD.RPGC.bw - and input its control: extension Q5DD.RPGC.bw - @Output: - bigWig file of treatmment sample normalizes with its input control - """ - input: - chip = lambda wc: ctrl_test(chip2input, wc.name, bw_dir, 'chip'), - ctrl = lambda wc: ctrl_test(chip2input, wc.name, bw_dir, 'ctrl') - output: - join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw") - params: - rname = "inputnorm", - bigwig_declare = lambda wc, input: f"--bigwig1 {input.chip} --bigwig2 {input.ctrl}", - threads: - int(allocated("threads", "inputnorm", cluster)), - envmodules: - config['tools']['DEEPTOOLSVER'], - shell: - """ - bigwigCompare \\ - --binSize 25 \\ - --outFileName {output} \\ - --outFileFormat 'bigwig' \\ - {params.bigwig_declare} \\ - --operation 'subtract' \\ - --skipNonCoveredRegions \\ - -p {threads} """ \ No newline at end of file diff --git a/workflow/rules/peakcall.smk b/workflow/rules/peakcall.smk index 5b56e0c..0a3ec94 100644 --- a/workflow/rules/peakcall.smk +++ b/workflow/rules/peakcall.smk @@ -1,8 +1,7 @@ -# Quality control rules +# Calling read coverage as peaks # ~~~~ -# Common quality-control rules: preseq, NRF, rawfastqc, -# fastqc, fastq_screen, multiQC +# Genrally applicable rules from os.path import join from scripts.peakcall import get_control_input, \ getSicerChips, getSicerFragLen, get_control_input @@ -24,6 +23,41 @@ macsB_dir = join(workpath, "macsBroad") sicer_dir = join(workpath, "sicer") +rule inputnorm: + """ + bigwigCompare (deepTools) subracts input control from treatment bigWig file, + normalizing treatment bigWig. + @Input: + Treatment sample, which has input control: extension Q5DD.RPGC.bw + and input its control: extension Q5DD.RPGC.bw + @Output: + bigWig file of treatmment sample normalizes with its input control + """ + input: + chip = lambda wc: ctrl_test(chip2input, wc.name, bw_dir, 'chip'), + ctrl = lambda wc: ctrl_test(chip2input, wc.name, bw_dir, 'ctrl') + output: + join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw") + params: + rname = "inputnorm", + bigwig_declare = lambda wc, input: f"--bigwig1 {input.chip} --bigwig2 {input.ctrl}", + threads: + int(allocated("threads", "inputnorm", cluster)), + envmodules: + config['tools']['DEEPTOOLSVER'], + shell: + """ + bigwigCompare \\ + --binSize 25 \\ + --outFileName {output} \\ + --outFileFormat 'bigwig' \\ + {params.bigwig_declare} \\ + --operation 'subtract' \\ + --skipNonCoveredRegions \\ + -p {threads} + """ + + rule sortByRead: """ Bam files(extension: sorted.bam) need to be sorted by read name @@ -87,210 +121,6 @@ rule genrich: """ -rule MACS2_narrow: - input: - chip = join(bam_dir, "{name}.Q5DD.bam") if paired_end \ - else join(bam_dir, "{name}.Q5DD_tagAlign.gz"), - txt = join(ppqt_dir, "{name}.Q5DD.ppqt.txt") if paired_end \ - else join(ppqt_dir, "{name}.Q5DD_tagAlign.ppqt.txt"), - c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir), - output: - join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"), - params: - rname = 'MACS2_narrow', - gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], - macsver = config['tools']['MACSVER'], - flag = lambda w: "-c" if chip2input[w.name] else "", - shell: - """ - module load {params.macsver}; - if [ '""" + str(paired_end) + """' == True ]; then - macs2 callpeak \\ - -t {input.chip} {params.flag} {input.c_option} \\ - -g {params.gsize} \\ - -n {wildcards.name} \\ - --outdir {macsN_dir}/{wildcards.name} \\ - -q 0.01 \\ - --keep-dup="all" \\ - -f "BAMPE" - else - ppqt_len=$(awk '{{print $1}}' {input.txt}) - macs2 callpeak \\ - -t {input.chip} {params.flag} {input.c_option} \\ - -g {params.gsize} \\ - -n {wildcards.name} \\ - --outdir {macsN_dir}/{wildcards.name} \\ - -q 0.01 \\ - --keep-dup="all" \\ - --nomodel \\ - --extsize $ppqt_len - fi - """ - - -rule MACS2_broad: - input: - chip = join(bam_dir, "{name}.Q5DD.bam") if paired_end \ - else join(bam_dir, "{name}.Q5DD_tagAlign.gz"), - txt = join(ppqt_dir, "{name}.Q5DD.ppqt.txt") if paired_end \ - else join(ppqt_dir, "{name}.Q5DD_tagAlign.ppqt.txt"), - c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir), - output: - join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), - params: - rname = 'MACS2_broad', - gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], - macsver = config['tools']['MACSVER'], - flag = lambda w: "-c" if chip2input[w.name] else "", - shell: - """ - module load {params.macsver}; - if [ '""" + str(paired_end) + """' == True ]; then - macs2 callpeak \\ - -t {input.chip} {params.flag} {input.c_option} \\ - -g {params.gsize} \\ - -n {wildcards.name} \\ - --outdir {macsB_dir}/{wildcards.name} \\ - --broad \\ - --broad-cutoff 0.01 \\ - --keep-dup="all" \\ - -f "BAMPE" - else - ppqt_len=$(awk '{{print $1}}' {input.txt}) - macs2 callpeak \\ - -t {input.chip} {params.flag} {input.c_option} \\ - -g {params.gsize} \\ - -n {wildcards.name} \\ - --outdir {macsB_dir}/{wildcards.name} \\ - --broad \\ - --broad-cutoff 0.01 \\ - --keep-dup="all" \\ - --nomodel \\ - --extsize $ppqt_len - fi - """ - - -rule SICER: - input: - chip = lambda w: getSicerChips(bam_dir, w.name, paired_end), - fragLen = lambda w: getSicerFragLen(ppqt_dir, qc_dir, w.name, paired_end), - c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir), - output: - bed = join(sicer_dir, "{name}", "{name}_broadpeaks.bed") if has_inputs else [], - params: - rname = 'SICER', - name = "{name}", - sicerver = config['tools']['SICERVER'], - bedtoolsver = config['tools']['BEDTOOLSVER'], - genomever = config['options']['genome'], - this_sicer_dir = join(sicer_dir,"{name}"), - frac = config['references'][genome]['FRAC'], - flag = lambda w: "-c" if chip2input[w.name] else "", - shell: - """ - module load {params.sicerver} - module load {params.bedtoolsver} - if [ ! -d \"""" + str(tmpdir) + """\" ]; then mkdir -p \"""" + str(tmpdir) + """\"; fi - tmp=$(mktemp -d -p \"""" + str(tmpdir) + """\") - trap 'rm -rf "${{tmp}}"' EXIT - - if [ '""" + str(paired_end) + """' == True ]; then - MEAN_INSERT_SIZE=$(cat {input.fragLen} | awk '/MEDIAN_INSERT_SIZE/{{f=1;next}} /## HISTOGRAM/{{f=0}} f' | cut -f 6) - mean_insert_size=$(printf "%.0f" $MEAN_INSERT_SIZE) - else - mean_insert_size=$(awk '{{print $1}}' {input.fragLen}) - fi - echo "printing out value of mean-insert-size ${{mean_insert_size}}" - a={input.c_option} - echo "Printing input.c_option ${{a}}" - if [ '""" + str(paired_end) + """' == True ]; then - if [ -f "{input.c_option}" ]; then - # Copying input to tmpdir due to SICER2 - # bam2bed file conversion, if more than - # one sample shares the same IP sample - # than a race condition can occur where - # two jobs can concurrent try to write - # to the same BED file (during bedtools - # bam2bed that sicer calls). - input_bam="$(basename "{input.c_option}")" - cp {input.c_option} ${{tmp}} - echo "paired-end with input... ${{tmp}}/${{input_bam}}" - sicer \\ - -t {input.chip} \\ - -c "${{tmp}}/${{input_bam}}" \\ - -s {params.genomever} \\ - -rt 100 \\ - -w 300 \\ - -f ${{mean_insert_size}} \\ - -egf {params.frac} \\ - -g 600 \\ - -fdr 1E-2 \\ - -cpu 30 \\ - -o ${{tmp}} - - mv ${{tmp}}/{params.name}.Q5DD-W300-G600-FDR0.01-island.bed {output.bed}; - mv ${{tmp}}/{params.name}.Q5DD-W300-G600-islands-summary {params.this_sicer_dir} - else - echo "paired-end without input" - sicer \\ - -t {input.chip} \\ - -s {params.genomever} \\ - -rt 100 \\ - -w 300 \\ - -f ${{mean_insert_size}} \\ - -egf {params.frac} \\ - -g 600 \\ - -e 100 \\ - -cpu 30 \\ - -o ${{tmp}} - - mv ${{tmp}}/{params.name}.Q5DD-W300-G600.scoreisland {params.this_sicer_dir} - fi - else - if [ -f "{input.c_option}" ]; then - echo "single-end with input" - cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz; - awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed; - - cp {input.c_option} ${{tmp}}/input.bed.gz; gzip -d ${{tmp}}/input.bed.gz; - awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/input.bed > ${{tmp}}/inputV2.bed; - - sicer \\ - -t ${{tmp}}/{params.name}.bed \\ - -c ${{tmp}}/inputV2.bed \\ - -s {params.genomever} \\ - -rt 100 \\ - -w 300 \\ - -f ${{mean_insert_size}} \\ - -egf {params.frac} \\ - -g 600 \\ - -fdr 1E-2 \\ - -cpu 30 \\ - -o ${{tmp}} - mv ${{tmp}}/{params.name}-W300-G600-FDR0.01-island.bed {output.bed}; - mv ${{tmp}}/{params.name}-W300-G600-islands-summary {params.this_sicer_dir} - else - echo "single-end without input" - cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz; - awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed; - sicer \\ - -t ${{tmp}}/{params.name}.bed \\ - -s {params.genomever} \\ - -rt 100 \\ - -w 300 \\ - -f ${{mean_insert_size}} \\ - -egf {params.frac} \\ - -g 600 \\ - -e 100 \\ - -cpu 30 \\ - -o ${{tmp}} - mv ${{tmp}}/{params.name}-W300-G600.scoreisland {output.bed} - fi - fi - """ - - rule MEME: input: bed = lambda w: join(workpath, w.PeakTool, w.name, w.name + PeakExtensions[w.PeakTool]) diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index fa7c99d..e7bd61a 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -1,7 +1,7 @@ -# Quality control rules +# Quality control metrics and reports # ~~~~ -# Common quality-control rules: preseq, NRF, rawfastqc, -# fastqc, fastq_screen, multiQC +# Generally applicable quality control rules + from os.path import join from scripts.common import get_bam_ext, get_fqscreen_outputs @@ -186,118 +186,6 @@ rule fastqc: """ -rule fastq_screen: - """ - Quality-control step to screen for different sources of contamination. - FastQ Screen compares your sequencing data to a set of different reference - genomes to determine if there is contamination. It allows a user to see if - the composition of your library matches what you expect. - @Input: - Trimmed FastQ files (scatter) - @Output: - FastQ Screen report and logfiles - """ - input: - expand(join(trim_dir, "{name}.R{rn}.trim.fastq.gz"), name=samples, rn=ends) - output: - get_fqscreen_outputs(paired_end, samples, qc_dir) - params: - rname = 'fqscreen', - outdir = join(qc_dir, "FQscreen"), - outdir2 = join(qc_dir, "FQscreen2"), - fastq_screen = config['bin']['FASTQ_SCREEN'], - fastq_screen_config1 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P1'], - fastq_screen_config2 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P2'], - envmodules: - config['tools']['BOWTIE2VER'], - config['tools']['PERLVER'], - threads: - int(allocated("threads", "fastq_screen", cluster)) - shell: - """ - # First pass of contamination screening - {params.fastq_screen} \\ - --conf {params.fastq_screen_config1} \\ - --outdir {params.outdir} \\ - --threads {threads} \\ - --subset 1000000 \\ - --aligner bowtie2 \\ - --force \\ - {input} - # Second pass of contamination screening - {params.fastq_screen} \\ - --conf {params.fastq_screen_config2} \\ - --outdir {params.outdir2} \\ - --threads {threads} \\ - --subset 1000000 \\ - --aligner bowtie2 \\ - --force \\ - {input} - """ - - -rule kraken: - """ - Quality-control step to assess for potential sources of microbial contamination. - If there are high levels of microbial contamination, Kraken will provide an - estimation of the taxonomic composition. Kraken is used in conjunction with - Krona to produce an interactive reports. - @Input: - Trimmed FastQ files (scatter) - @Output: - Kraken logfile and interative krona report - """ - input: - fq1 = join(trim_dir, "{name}.R1.trim.fastq.gz"), - fq2 = provided(join(trim_dir,"{name}.R2.trim.fastq.gz"), paired_end) - output: - krakenout = join(kraken_dir, "{name}.trim.kraken_bacteria.out.txt"), - krakentaxa = join(kraken_dir, "{name}.trim.kraken_bacteria.taxa.txt"), - kronahtml = join(kraken_dir, "{name}.trim.kraken_bacteria.krona.html"), - params: - rname = 'kraken', - outdir = kraken_dir, - bacdb = config['shared_resources']['KRAKENBACDB'], - tmpdir = tmpdir, - paired_end = paired_end - threads: - int(allocated("threads", "kraken_pe", cluster)), - envmodules: - config['tools']['KRAKENVER'], - config['tools']['KRONATOOLSVER'], - shell: - """ - # Setups temporary directory for - # intermediate files with built-in - # mechanism for deletion on exit - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") - trap 'rm -rf "${{tmp}}"' EXIT - - # Copy kraken2 db to /lscratch or temp - # location to reduce filesystem strain - cp -rv {params.bacdb} ${{tmp}}/; - kdb_base=$(basename {params.bacdb}) - if [ '{params.paired_end}' == True ]; then - kraken2 --db ${{tmp}}/${{kdb_base}} \\ - --threads {threads} --report {output.krakentaxa} \\ - --output {output.krakenout} \\ - --gzip-compressed \\ - --paired {input.fq1} {input.fq2} - else - kraken2 --db ${{tmp}}/${{kdb_base}} \\ - --threads {threads} --report {output.krakentaxa} \\ - --output {output.krakenout} \\ - --gzip-compressed \\ - {input.fq1} - fi - - # Generate Krona Report - cut -f2,3 {output.krakenout} | \\ - ktImportTaxonomy - -o {output.kronahtml} - """ - - rule multiqc: """ Reporting step to aggregate sample statistics and quality-control information @@ -377,8 +265,8 @@ rule deeptools_QC: heatmap = join(deeptools_dir, "spearman_heatmap.Q5DD.pdf"), pca = join(deeptools_dir, "pca.Q5DD.pdf"), npz = temp(join(deeptools_dir, "Q5DD.npz")), - mqc = join(deeptools_dir, "spearman_readcounts.Q5DD.tab"), - png = join(deeptools_dir, "spearman_heatmap.Q5DD_mqc.png") + mqc = join(deeptools_dir, "spearman_readcounts.Q5DD.tab"), + png = join(deeptools_dir, "spearman_heatmap.Q5DD_mqc.png") params: rname = "deeptools_QC", parent_dir = deeptools_dir, @@ -397,6 +285,7 @@ rule deeptools_QC: plotPCA -in {output.npz} -o {output.pca} """ + rule deeptools_fingerprint: input: [ join(bam_dir, name + ".Q5DD.bam") for name in samples ] @@ -410,20 +299,16 @@ rule deeptools_fingerprint: deeptoolsver = config['tools']['DEEPTOOLSVER'], # this should be the sample names to match the bigwigs in the same order labels = samples + ext = "" if paired_end else "-e 200" threads: int(allocated("threads", "deeptools_fingerprint", cluster)), shell: """ module load {params.deeptoolsver} if [ ! -d "{params.parent_dir}" ]; then mkdir "{params.parent_dir}"; fi - if [ \"""" + str(paired_end) + """\" == False ]; then - extension_option="-e 200" - else - extension_option="" - fi - +\ plotFingerprint -b {input} --labels {params.labels} -p {threads} --skipZeros \\ --outQualityMetrics {output.metrics} --plotFile {output.image} --outRawCounts {output.raw} \\ - ${{extension_option}} + {params.ext} """ @@ -434,7 +319,7 @@ rule deeptools_gene_all: TSSline=join(deeptools_dir,"TSS_profile.Q5DD.pdf"), TSSmat=temp(join(deeptools_dir,"TSS.Q5DD.mat.gz")), bed=temp(join(deeptools_dir,"geneinfo.Q5DD.bed")), - mqc=join(deeptools_dir,"TSS_profile.Q5DD.tab") + mqc=join(deeptools_dir,"TSS_profile.Q5DD.tab") params: rname = "deeptools_gene_all", parent_dir = deeptools_dir, @@ -449,12 +334,12 @@ rule deeptools_gene_all: module load {params.deeptoolsver} if [ ! -d "{params.parent_dir}" ]; then mkdir "{params.parent_dir}"; fi grep --line-buffered 'protein_coding' {params.prebed} | awk -v OFS='\t' -F'\t' '{{print $1, $2, $3, $5, ".", $4}}' > {output.bed} - computeMatrix reference-point -S {input} -R {output.bed} -p {threads} \\ - --referencePoint TSS --upstream 3000 --downstream 3000 --skipZeros \\ - -o {output.TSSmat} --samplesLabel {params.labels} - plotProfile -m {output.TSSmat} -out {output.TSSline} \\ - --yAxisLabel 'average RPGC' --plotType 'se' --legendLocation upper-left \\ - --numPlotsPerRow 5 --outFileNameData {output.mqc} + computeMatrix reference-point -S {input} -R {output.bed} -p {threads} \\ + --referencePoint TSS --upstream 3000 --downstream 3000 --skipZeros \\ + -o {output.TSSmat} --samplesLabel {params.labels} + plotProfile -m {output.TSSmat} -out {output.TSSline} \\ + --yAxisLabel 'average RPGC' --plotType 'se' --legendLocation upper-left \\ + --numPlotsPerRow 5 --outFileNameData {output.mqc} """ diff --git a/workflow/rules/single/peakcall.smk b/workflow/rules/single/peakcall.smk new file mode 100644 index 0000000..2ab553d --- /dev/null +++ b/workflow/rules/single/peakcall.smk @@ -0,0 +1,131 @@ +# Calling read coverage as peaks +# ~~~~ +# Single-end peak calling rules + +rule MACS2_broad: + input: + chip = join(bam_dir, "{name}.Q5DD.bam") + txt = join(ppqt_dir, "{name}.Q5DD.ppqt.txt") + c_option = lambda w: join(bam_dir, f"{chip2input[w.name]}.Q5DD_tagAlign.gz"), + output: + join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), + params: + rname = 'MACS2_broad', + gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], + macsver = config['tools']['MACSVER'], + flag = lambda w: "-c" if chip2input[w.name] else "", + shell: + """ + module load {params.macsver}; + ppqt_len=$(awk '{{print $1}}' {input.txt}) + macs2 callpeak \\ + -t {input.chip} {params.flag} {input.c_option} \\ + -g {params.gsize} \\ + -n {wildcards.name} \\ + --outdir {macsB_dir}/{wildcards.name} \\ + --broad \\ + --broad-cutoff 0.01 \\ + --keep-dup="all" \\ + --nomodel \\ + --extsize $ppqt_len + """ + + +rule SICER: + input: + chip = lambda w: join(bam_dir, w.name + ".Q5DD_tagAlign.gz"), + fragLen = lambda w: join(ppqt_dir, w.name + ".Q5DD_tagAlign.ppqt.txt"), + c_option = lambda w: join(bam_dir, f"{chip2input[w.name]}.Q5DD_tagAlign.gz"), + output: + bed = join(sicer_dir, "{name}", "{name}_broadpeaks.bed") if has_inputs else [], + params: + rname = 'SICER', + name = "{name}", + sicerver = config['tools']['SICERVER'], + bedtoolsver = config['tools']['BEDTOOLSVER'], + genomever = config['options']['genome'], + this_sicer_dir = join(sicer_dir,"{name}"), + frac = config['references'][genome]['FRAC'], + flag = lambda w: "-c" if chip2input[w.name] else "", + shell: + """ + module load {params.sicerver} + module load {params.bedtoolsver} + if [ ! -d \"""" + str(tmpdir) + """\" ]; then mkdir -p \"""" + str(tmpdir) + """\"; fi + tmp=$(mktemp -d -p \"""" + str(tmpdir) + """\") + trap 'rm -rf "${{tmp}}"' EXIT + + mean_insert_size=$(awk '{{print $1}}' {input.fragLen}) + + echo "printing out value of mean-insert-size ${{mean_insert_size}}" + a={input.c_option} + echo "Printing input.c_option ${{a}}" + + if [ -f "{input.c_option}" ]; then + echo "single-end with input" + cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz; + awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed; + + cp {input.c_option} ${{tmp}}/input.bed.gz; gzip -d ${{tmp}}/input.bed.gz; + awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/input.bed > ${{tmp}}/inputV2.bed; + + sicer \\ + -t ${{tmp}}/{params.name}.bed \\ + -c ${{tmp}}/inputV2.bed \\ + -s {params.genomever} \\ + -rt 100 \\ + -w 300 \\ + -f ${{mean_insert_size}} \\ + -egf {params.frac} \\ + -g 600 \\ + -fdr 1E-2 \\ + -cpu 30 \\ + -o ${{tmp}} + mv ${{tmp}}/{params.name}-W300-G600-FDR0.01-island.bed {output.bed}; + mv ${{tmp}}/{params.name}-W300-G600-islands-summary {params.this_sicer_dir} + else + echo "single-end without input" + cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz; + awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed; + sicer \\ + -t ${{tmp}}/{params.name}.bed \\ + -s {params.genomever} \\ + -rt 100 \\ + -w 300 \\ + -f ${{mean_insert_size}} \\ + -egf {params.frac} \\ + -g 600 \\ + -e 100 \\ + -cpu 30 \\ + -o ${{tmp}} + mv ${{tmp}}/{params.name}-W300-G600.scoreisland {output.bed} + fi + """ + + +rule MACS2_narrow: + input: + chip = join(bam_dir, "{name}.Q5DD.bam"), + txt = join(ppqt_dir, "{name}.Q5DD.ppqt.txt"), + c_option = lambda w: join(bam_dir, f"{chip2input[w.name]}.Q5DD_tagAlign.gz"), + output: + join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"), + params: + rname = 'MACS2_narrow', + gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], + macsver = config['tools']['MACSVER'], + flag = lambda w: "-c" if chip2input[w.name] else "", + shell: + """ + module load {params.macsver}; + ppqt_len=$(awk '{{print $1}}' {input.txt}) + macs2 callpeak \\ + -t {input.chip} {params.flag} {input.c_option} \\ + -g {params.gsize} \\ + -n {wildcards.name} \\ + --outdir {macsN_dir}/{wildcards.name} \\ + -q 0.01 \\ + --keep-dup="all" \\ + --nomodel \\ + --extsize $ppqt_len + """ \ No newline at end of file diff --git a/workflow/rules/single/qc.smk b/workflow/rules/single/qc.smk new file mode 100644 index 0000000..43fdda0 --- /dev/null +++ b/workflow/rules/single/qc.smk @@ -0,0 +1,108 @@ +# Quality control metrics and reports +# ~~~~ +# Single-ended quality control rules + +rule fastq_screen: + """ + Quality-control step to screen for different sources of contamination. + FastQ Screen compares your sequencing data to a set of different reference + genomes to determine if there is contamination. It allows a user to see if + the composition of your library matches what you expect. + @Input: + Trimmed FastQ files (scatter) + @Output: + FastQ Screen report and logfiles + """ + input: + expand(join(trim_dir, "{name}.R{rn}.trim.fastq.gz"), name=samples, rn=ends) + output: + first_txt = expand(join(qc_dir, "FQscreen", "{name}.R1.trim_screen.txt"), name=samples), + first_png = expand(join(qc_dir, "FQscreen", "{name}.R1.trim_screen.png"), name=samples), + second_txt = expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.txt"), name=samples), + second_png = expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.png"), name=samples), + params: + rname = 'fqscreen', + outdir = join(qc_dir, "FQscreen"), + outdir2 = join(qc_dir, "FQscreen2"), + fastq_screen = config['bin']['FASTQ_SCREEN'], + fastq_screen_config1 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P1'], + fastq_screen_config2 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P2'], + envmodules: + config['tools']['BOWTIE2VER'], + config['tools']['PERLVER'], + threads: + int(allocated("threads", "fastq_screen", cluster)) + shell: + """ + # First pass of contamination screening + {params.fastq_screen} \\ + --conf {params.fastq_screen_config1} \\ + --outdir {params.outdir} \\ + --threads {threads} \\ + --subset 1000000 \\ + --aligner bowtie2 \\ + --force \\ + {input} + # Second pass of contamination screening + {params.fastq_screen} \\ + --conf {params.fastq_screen_config2} \\ + --outdir {params.outdir2} \\ + --threads {threads} \\ + --subset 1000000 \\ + --aligner bowtie2 \\ + --force \\ + {input} + """ + +rule kraken: + """ + Quality-control step to assess for potential sources of microbial contamination. + If there are high levels of microbial contamination, Kraken will provide an + estimation of the taxonomic composition. Kraken is used in conjunction with + Krona to produce an interactive reports. + @Input: + Trimmed FastQ files (scatter) + @Output: + Kraken logfile and interative krona report + """ + input: + fq1 = join(trim_dir, "{name}.R1.trim.fastq.gz"), + output: + krakenout = join(kraken_dir, "{name}.trim.kraken_bacteria.out.txt"), + krakentaxa = join(kraken_dir, "{name}.trim.kraken_bacteria.taxa.txt"), + kronahtml = join(kraken_dir, "{name}.trim.kraken_bacteria.krona.html"), + params: + rname = 'kraken', + outdir = kraken_dir, + bacdb = config['shared_resources']['KRAKENBACDB'], + tmpdir = tmpdir, + threads: + int(allocated("threads", "kraken_pe", cluster)), + envmodules: + config['tools']['KRAKENVER'], + config['tools']['KRONATOOLSVER'], + shell: + """ + # Setups temporary directory for + # intermediate files with built-in + # mechanism for deletion on exit + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT + + # Copy kraken2 db to /lscratch or temp + # location to reduce filesystem strain + cp -rv {params.bacdb} ${{tmp}}/; + kdb_base=$(basename {params.bacdb}) + + kraken2 --db ${{tmp}}/${{kdb_base}} \\ + --threads {threads} --report {output.krakentaxa} \\ + --output {output.krakenout} \\ + --gzip-compressed \\ + {input.fq1} + fi + + # Generate Krona Report + cut -f2,3 {output.krakenout} | \\ + ktImportTaxonomy - -o {output.kronahtml} + """ \ No newline at end of file diff --git a/workflow/rules/single/trim_align_dedup.smk b/workflow/rules/single/trim_align_dedup.smk new file mode 100644 index 0000000..cb6fe52 --- /dev/null +++ b/workflow/rules/single/trim_align_dedup.smk @@ -0,0 +1,316 @@ +# Trimming, alignment, and redundancy reduction rules +# ~~~~ +# Rules for single-ended reads + +# ~~ workflow endpoints +workpath = config['project']['workpath'] +bin_path = config['project']['binpath'] +genome = config['options']['genome'] + +# ~~ directories +trim_dir = join(workpath, 'trim') +tmpdir = config['options']['tmp_dir'] +bam_dir = join(workpath, "bam") +bw_dir = join(workpath, "bigwig") +ppqt_dir = join(bam_dir, "ppqt") + + +rule trim: + """ + Data-processing step to remove adapter sequences and perform quality trimming + prior to alignment the reference genome. Adapters are composed of synthetic + sequences and should be removed prior to alignment. Bwa mem aligns adapter-free + fastqs to ba lacklist that has anomalous, unstructured, or high signal in + next-generation sequencing experiments independent of cell line or experiment. + Samtools view -f4 selects for reads unmapped and outpute a blacklist-sequences-free bam file. + SamToFastq convers BAM file to FASTQs. All file processing is done in + "tmp_dir": "/lscratch/$SLURM_JOBID/", meaning all files are lost at job completion + except for final R1.trim.fastq.gz and R2.trim.fastq.gz + + @Input: + Raw FastQ files + @Output: + Trimmed and blacklist-sequences-free FastQ files + """ + input: + file1 = join(workpath, "{name}.R1.fastq.gz"), + file2 = provided(join(workpath, "{name}.R2.fastq.gz"), paired_end) + output: + outfq1 = temp(join(trim_dir, "{name}.R1.trim.fastq.gz")), + outfq2 = provided(temp(join(trim_dir, "{name}.R2.trim.fastq.gz")), paired_end) + params: + rname = "trim", + cutadaptver = config['tools']['CUTADAPTVER'], + workpath = config['project']['workpath'], + fastawithadaptersetd = join(workpath, config['shared_resources']['ADAPTERS_FASTA']), + blacklistbwaindex = config['references'][genome]['BLACKLISTBWAINDEX'], + picardver = config['tools']['PICARDVER'], + bwaver = config['tools']['BWAVER'], + samtoolsver = config['tools']['SAMTOOLSVER'], + minlen = 35, + leadingquality = 10, + trailingquality = 10, + javaram = "64g", + sample = "{name}", + tmpdir = tmpdir, + threads: + 32 + shell: + """ + module load {params.cutadaptver}; + module load {params.bwaver}; + module load {params.samtoolsver}; + module load {params.picardver}; + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT + + cutadapt \\ + --nextseq-trim=2 \\ + --trim-n \\ + -n 5 \\ + -O 5 \\ + -q {params.leadingquality},{params.trailingquality} \\ + -m {params.minlen} \\ + -b file:{params.fastawithadaptersetd} \\ + -j {threads} \\ + -o ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ + {input.file1} + + if [ "{params.blacklistbwaindex}" != "" ]; then + bwa mem -t {threads} \\ + {params.blacklistbwaindex} \\ + ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ + | samtools view -@{threads} \\ + -f4 \\ + -b \\ + -o ${{tmp}}/{params.sample}.bam; + rm ${{tmp}}/{params.sample}.R1.trim.fastq.gz; + java -Xmx{params.javaram} -jar $PICARDJARPATH/picard.jar SamToFastq \\ + -VALIDATION_STRINGENCY SILENT \\ + -INPUT ${{tmp}}/{params.sample}.bam \\ + -FASTQ ${{tmp}}/{params.sample}.R1.trim.fastq + rm ${{tmp}}/{params.sample}.bam; + pigz -p {threads} ${{tmp}}/{params.sample}.R1.trim.fastq; + fi + mv ${{tmp}}/{params.sample}.R1.trim.fastq.gz {output.outfq1}; + """ + + +rule BWA: + """ + Data processing rule to align trimmed and blacklist-sequences-free reads + to the reference genome using bwa mem aligner. Samtools sort sort alignments + by coordinates to produce bam file. Samtools flagstats summarizes reads by QC + and number of alignments for each FLAG type. Samtools idxstats outputs + stats by chromosomesequence,length, # mapped read-segments and + # unmapped read-segments. Resulting bam file is filtered by a mapQ value. + + @Input: + Trimmed and blacklist-sequences-free FastQ files + @Output: + Bam file that have read aligned: sorted.bam + Bam file that has reads aligned and filted by mapQ a value: Q5.bam + """ + input: + join(trim_dir, "{name}.R1.trim.fastq.gz"), + params: + d = join(bam_dir), + rname = 'bwa', + reference = config['references'][genome]['BWA'], + bwaver = config['tools']['BWAVER'], + samtoolsver = config['tools']['SAMTOOLSVER'], + script = join(workpath, "bin", "bam_filter_by_mapq.py"), + pythonver = config['tools']['PYTHONVER'], + output: + outbam1 = join(bam_dir, "{name}.sorted.bam"), + outbam2 = temp(join(bam_dir, "{name}.Q5.bam")), + flagstat1 = join(bam_dir, "{name}.sorted.bam.flagstat"), + idxstat1 = join(bam_dir, "{name}.sorted.bam.idxstat"), + flagstat2 = join(bam_dir, "{name}.Q5.bam.flagstat"), + idxstat2 = join(bam_dir, "{name}.Q5.bam.idxstat"), + threads: + 32 + shell: + """ + module load {params.bwaver}; + module load {params.samtoolsver}; + module load {params.pythonver}; + + bwa mem -t {threads} {params.reference} {input} \\ + | samtools sort -@{threads} -o {output.outbam1} + samtools index {output.outbam1} + samtools flagstat {output.outbam1} > {output.flagstat1} + samtools idxstats {output.outbam1} > {output.idxstat1} + samtools view -b -q 6 {output.outbam1} -o {output.outbam2} + + samtools index {output.outbam2} + samtools flagstat {output.outbam2} > {output.flagstat2} + samtools idxstats {output.outbam2} > {output.idxstat2} + """ + + +rule dedup: + """ + Picard MarkDuplicates removes duplicates from bam file. + Samtools flagstats summarizes reads by QC + and number of alignments for each FLAG type. Samtools idxstats outputs + stats by chromosomesequence,length, # mapped read-segments and + # unmapped read-segments. If assay is cfchip, deduplicated bam file is + filtered for chromosomes 1 through 22 to produce: Q5DD.bam; + the original deduplicated file also convereted to bed file of fragments + via granges and rtracklayer: Q5DD.tagAlign. + + @Input: + Bam file that has reads aligned and filted by mapQ a value: Q5.bam + @Output: + Deduplicated Q5DD.bam for all assays, plus Q5DD.tagAlign if cfchip assay + + """ + input: + join(bam_dir,"{name}.Q5.bam") + output: + out_bam = join(bam_dir, "{name}.Q5DD.bam"), + flagstat = join(bam_dir, "{name}.Q5DD.bam.flagstat"), + idxstat = join(bam_dir, "{name}.Q5DD.bam.idxstat"), + bwadups = join(bam_dir, "{name}.bwa.Q5.duplic"), + tagalign = join(bam_dir, "{name}.Q5DD_tagAlign") if assay == "cfchip" \ + else join(bam_dir, "{name}.Q5DD_tagAlign.gz") + params: + rname = 'dedup', + picardver = config['tools']['PICARDVER'], + samtoolsver = config['tools']['SAMTOOLSVER'], + bedtoolsver = config['tools']['BEDTOOLSVER'], + macsver = config['tools']['MACSVER'], + gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], + genomefile = config['references'][genome]['REFLEN'], + rver = config['tools']['RVER'], + javaram = '16g', + tmpdir = tmpdir, + tmpBam = "{name}.Q5DD.withXY.bam", + rscript = join(config['references'][genome]['cfChIP_TOOLS_SRC'], "bam2fragment.R"), + shell: + """ + module load {params.samtoolsver}; + module load {params.picardver}; + module load {params.bedtoolsver}; + module load {params.macsver}; + module load {params.rver}; + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT + + if [ "{assay}" == "cfchip" ]; then + java -Xmx{params.javaram} \\ + -jar $PICARDJARPATH/picard.jar MarkDuplicates \\ + -I {input} \\ + -O {params.tmpBam} \\ + -TMP_DIR ${{tmp}} \\ + -VALIDATION_STRINGENCY SILENT \\ + -REMOVE_DUPLICATES true + -METRICS_FILE {output.bwadups}; + samtools index {params.tmpBam}; + samtools view -b {params.tmpBam} chr{{1..22}} > {output.out_bam}; + Rscript {params.rscript} {params.tmpBam} {output.tagalign}; + rm {params.tmpBam} {params.tmpBam}.bai; + samtools index {output.out_bam}; + samtools flagstat {output.out_bam} > {output.flagstat}; + samtools idxstats {output.out_bam} > {output.idxstat}; + else + java -Xmx{params.javaram} \\ + -jar $PICARDJARPATH/picard.jar MarkDuplicates \\ + -I {input} \\ + -O {output.out_bam} \\ + -TMP_DIR ${{tmp}} \\ + -VALIDATION_STRINGENCY SILENT \\ + -REMOVE_DUPLICATES true \\ + -METRICS_FILE {output.bwadups}; + samtools index {output.out_bam}; + samtools flagstat {output.outout_bam5} > {output.flagstat}; + samtools idxstats {output.out_bam} > {output.idxstat}; + fi + """ + + +rule ppqt: + input: + bam = lambda w : join(bam_dir, f"{w.name}.{w.ext}." + + "bam" if w.ext.lower() == 'sorted' else "gz"), + output: + ppqt = join(ppqt_dir, "{name}.{ext}.ppqt"), + pdf = join(ppqt_dir, "{name}.{ext}.pdf"), + txt = join(ppqt_dir, "{name}.{ext}.ppqt.txt"), + params: + rname = "ppqt", + samtoolsver = config['tools']['SAMTOOLSVER'], + rver = config['tools']['RVER'], + scriptPy = join(bin_path, "ppqt_process.py"), + file_name = "{name}" + container: + config['images']['ppqt'] + shell: + """ + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT + + if [[ '{input.bam}' == *.gz ]]; then + ln -s {input.bam} ${{tmp}}/{params.file_name}.Q5DD.tagAlign.gz; + run_spp.R -c=${{tmp}}/{params.file_name}.Q5DD.tagAlign.gz \ + -savp={output.pdf} -out={output.ppqt} \ + -tmpdir=/lscratch/$SLURM_JOBID -rf + else + run_spp.R -c={input.bam} \ + -savp={output.pdf} -out={output.ppqt} \ + -tmpdir=/lscratch/$SLURM_JOBID -rf + fi + python {params.scriptPy} -i {output.ppqt} -o {output.txt} + """ + + +rule bam2bw: + """ + bamCoverage converts bams to bigwig files for read visialization + in UCSC Genome Browser or IGV. + @Input: + Files with extensions: sorted.bam and Q5DD.bam, + for all samples (treatment and input controls) + @Output: + bigWig file with contains coordinates for an interval and + an associated score, RPGC + """ + input: + bam = lambda w: join(bam_dir, "{w.name}." + + "Q5DD.bam" if w.ext == "Q5DD" else "sorted.bam"), + ppqt = lambda w: join(ppqt_dir, "{w.name}." + + "Q5DD_tagAlign.ppqt.txt" if w.ext == "Q5DD" else + "sorted.ppqt.txt") + output: + outbw = join(bw_dir, "{name}.{ext}.RPGC.bw"), + params: + rname = "bam2bw", + name = "{name}", + effectivegenomesize = config['references'][genome]['EFFECTIVEGENOMESIZE'], + threads: + int(allocated("threads", "bam2bw", cluster)), + envmodules: + config['tools']['DEEPTOOLSVER'], + shell: + """ + if [ ! -d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi + tmp=$(mktemp -d -p \"""" + tmpdir + """\") + trap 'rm -rf "${{tmp}}"' EXIT + + bam_cov_option=--centerReads + echo "printing out value of bam-cov-option $bam_cov_option" + + bamCoverage \\ + --bam {input.bam} \\ + -o {output.outbw} \\ + --binSize 25 \\ + --smoothLength 75 \\ + --numberOfProcessors {threads} \\ + --normalizeUsing RPGC \\ + --effectiveGenomeSize {params.effectivegenomesize} \\ + ${{bam_cov_option}}; + """ \ No newline at end of file diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py index 6feba1c..156065d 100644 --- a/workflow/scripts/common.py +++ b/workflow/scripts/common.py @@ -1,4 +1,3 @@ - #!/usr/bin/env python3 # ~~~ Common helper functions shared across the entire workflow import os @@ -14,9 +13,9 @@ def provided(samplelist, condition): """ if not condition: - # If condition is False, - # returns an empty list - # to prevent rule from + # If condition is False, + # returns an empty list + # to prevent rule from # running samplelist = [] @@ -27,14 +26,14 @@ def ignore(samplelist, condition): """ Determines if optional rules should run. If an empty list is provided to rule all, snakemake will not try to generate that set of target files. If a given condition - is met (i.e. True) then it will not try to run that rule. This function is the - inverse to provided(). + is met (i.e. True) then it will not try to run that rule. This function is the + inverse to provided(). """ if condition: - # If condition is True, - # returns an empty list - # to prevent rule from + # If condition is True, + # returns an empty list + # to prevent rule from # running samplelist = [] @@ -58,7 +57,7 @@ def s3_configured(uri): import re # Get bucket and key from s3 uri - parsed = re.match(r's3:\/\/(.+?)\/(.+)', uri) + parsed = re.match(r"s3:\/\/(.+?)\/(.+)", uri) bucket, key = parsed.groups() accessible = True @@ -88,7 +87,11 @@ def abstract_location(file_address, *args, **kwargs): # Check if user provided any input if not file_address or file_address is None: - raise IOError("Failed to provide any input files! Input(s) are required to resolve required files.".format(file_address)) + raise IOError( + "Failed to provide any input files! Input(s) are required to resolve required files.".format( + file_address + ) + ) # If given file path to one file, convert it a list[] file_list = [file_address] if isinstance(file_address, str) else file_address @@ -96,7 +99,7 @@ def abstract_location(file_address, *args, **kwargs): # Loop through list of provided files, and if a remote storage option # is given, convert its index to a remote file object. for i, uri in enumerate(file_list): - if uri.lower().startswith('s3://'): + if uri.lower().startswith("s3://"): # Remote option for S3 storage import snakemake.remote.S3 import botocore.session @@ -112,18 +115,22 @@ def abstract_location(file_address, *args, **kwargs): # s3 bucket are configured correctly. # If a file in provieded as input to a Snakemake rule, only read # access is needed to access the remote S3 object. - remote_provider = snakemake.remote.S3.RemoteProvider(config=botocore.client.Config(signature_version=botocore.UNSIGNED)) + remote_provider = snakemake.remote.S3.RemoteProvider( + config=botocore.client.Config(signature_version=botocore.UNSIGNED) + ) file_list[i] = remote_provider.remote(uri, *args, **kwargs) - elif uri.lower().startswith('gs://'): + elif uri.lower().startswith("gs://"): # Remote option for Google Cloud Storage import snakemake.remote.GS + remote_provider = snakemake.remote.GS.RemoteProvider() file_list[i] = remote_provider.remote(uri, *args, **kwargs) - elif uri.lower().startswith('sftp://'): + elif uri.lower().startswith("sftp://"): # Remote option for SFTP transfers import snakemake.remote.SFTP + remote_provider = snakemake.remote.SFTP.RemoteProvider() file_list[i] = remote_provider.remote(uri, *args, **kwargs) @@ -142,60 +149,62 @@ def references(config, reflist): _all = True for ref in reflist: - try: tmp = config['references'][ref] + try: + tmp = config["references"][ref] # Check if ref exists in config except KeyError: _all = False break # Check if ref is empty key string - if not tmp: _all = False + if not tmp: + _all = False return _all def allocated(resource, rule, lookup, default="__default__"): - """Pulls resource information for a given rule. If a rule does not have any information + """Pulls resource information for a given rule. If a rule does not have any information for a given resource type, then it will pull from the default. Information is pulled from - definitions in the cluster.json (which is used a job submission). This ensures that any + definitions in the cluster.json (which is used a job submission). This ensures that any resources used at runtime mirror the resources that were allocated. :param resource : resource type to look in cluster.json (i.e. threads, mem, time, gres) :param rule : rule to lookup its information :param lookup : Lookup containing allocation information (i.e. cluster.json) :param default : default information to use if rule information cannot be found - :return allocation : + :return allocation : allocation information for a given resource type for a given rule """ - try: + try: # Try to get allocation information # for a given rule allocation = lookup[rule][resource] except KeyError: # Use default allocation information allocation = lookup[default][resource] - + return allocation def str_bool(s): """Converts a string to boolean. It is dangerous to try to - typecast a string into a boolean value using the built-in + typecast a string into a boolean value using the built-in `bool()` function. This function avoids any issues that can - arise when using `bool()`. + arise when using `bool()`. Example: boolean('True') returns True boolean('False') returns False boolean('asdas') raises TypeError """ val = s.lower() - if val in ['true', '1', 'y', 'yes']: + if val in ["true", "1", "y", "yes"]: return True - elif val in ['false', '0', 'n', 'no', '']: + elif val in ["false", "0", "n", "no", ""]: return False else: # Provided value could not be # type casted into a boolean - raise TypeError('Fatal: cannot type cast {} into a boolean'.format(val)) + raise TypeError("Fatal: cannot type cast {} into a boolean".format(val)) def joint_option(prefix, valueslist): @@ -214,7 +223,7 @@ def joint_option(prefix, valueslist): def mk_dir_if_not_exist(dirs): if isinstance(dirs, str): dirs = [dirs] - assert isinstance(dirs, list), 'Supplied directories should be in a list' + assert isinstance(dirs, list), "Supplied directories should be in a list" for _dir in dirs: if not os.path.exists(_dir): os.mkdir(_dir, mode=0o775) @@ -224,38 +233,9 @@ def mk_dir_if_not_exist(dirs): def get_file_components(pair_ended): alnexts = [] if pair_ended: - alnexts.extend(['sorted.bam', 'Q5DD.bam']) + alnexts.extend(["sorted.bam", "Q5DD.bam"]) else: - alnexts.extend(['sorted.bam', 'Q5DD_tagAlign.gz']) - stems = list(map(lambda x: x.split('.')[0], alnexts)) - rpgc_exts = list(map(lambda x: x.split('.')[0] + '.RPGC', alnexts)) + alnexts.extend(["sorted.bam", "Q5DD_tagAlign.gz"]) + stems = list(map(lambda x: x.split(".")[0], alnexts)) + rpgc_exts = list(map(lambda x: x.split(".")[0] + ".RPGC", alnexts)) return stems, rpgc_exts, alnexts - - -def get_bam_ext(ext, pair_ended): - if pair_ended: - if ext.lower() == 'sorted': - return "bam" - elif ext == 'Q5DD': - return "bam" - else: - if ext.lower() == 'sorted': - return "bam" - elif ext == "Q5DD_tagAlign": - return "gz" - raise ValueError(f'Unknown file component. Pair ended: {str(pair_ended)}. Ext: {str(ext)}') - - -def get_fqscreen_outputs(paired_end, samples, qc_dir): - outs = [] - if paired_end: - outs.extend(expand(join(qc_dir, "FQscreen", "{name}.R{rn}.trim_screen.txt"), name=samples, rn=[1, 2])), - outs.extend(expand(join(qc_dir, "FQscreen", "{name}.R{rn}.trim_screen.png"), name=samples, rn=[1, 2])), - outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R{rn}.trim_screen.txt"), name=samples, rn=[1, 2])), - outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R{rn}.trim_screen.png"), name=samples, rn=[1, 2])), - else: - outs.extend(expand(join(qc_dir, "FQscreen", "{name}.R1.trim_screen.txt"), name=samples)), - outs.extend(expand(join(qc_dir, "FQscreen", "{name}.R1.trim_screen.png"), name=samples)), - outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.txt"), name=samples)), - outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.png"), name=samples)), - return outs \ No newline at end of file diff --git a/workflow/scripts/grouping.py b/workflow/scripts/grouping.py index 6739fe6..b3a3395 100644 --- a/workflow/scripts/grouping.py +++ b/workflow/scripts/grouping.py @@ -2,22 +2,23 @@ # ~~~ Common helper functions for grouping of outputs from os.path import join + # common functions related to sample grouping or group meta-information def group_samples_by_reps(groupdata, samples, chip2input): groupdatawinput = {} groupswreps = [] - for group, chipsamples in groupdata.items() : - tmp = [ ] + for group, chipsamples in groupdata.items(): + tmp = [] if len(chipsamples) > 1: groupswreps.append(group) - for chip in chipsamples : + for chip in chipsamples: if chip in samples: tmp.append(chip) input = chip2input[chip] - if input != 'NA' and input != '': + if input != "NA" and input != "": tmp.append(input) if len(tmp) != 0: - groupdatawinput[group]=set(tmp) + groupdatawinput[group] = set(tmp) return groupdatawinput, groupswreps @@ -31,30 +32,30 @@ def group_output_files(extensions, groupslist, inputnorm): Note: Inputnorm will only be included when there are input samples. """ dtoolgroups, dtoolext = [], [] - + if len(inputnorm) == 2: - dtoolgroups.extend(["InputNorm"]) - dtoolext.extend([extensions[1]]) - + dtoolgroups.extend(["InputNorm"]) + dtoolext.extend([extensions[1]]) + for group in groupslist: - dtoolgroups.extend([group] * 2) - dtoolext.extend([extensions[1], extensions[0]]) - + dtoolgroups.extend([group] * 2) + dtoolext.extend([extensions[1], extensions[0]]) + if len(inputnorm) == 2: - dtoolgroups.extend(["InputNorm.prot"]) - dtoolext.extend([extensions[1]]) - + dtoolgroups.extend(["InputNorm.prot"]) + dtoolext.extend([extensions[1]]) + for group in groupslist: - dtoolgroups.extend([group + ".prot"] * 2) - dtoolext.extend([extensions[1], extensions[0]]) - + dtoolgroups.extend([group + ".prot"] * 2) + dtoolext.extend([extensions[1], extensions[0]]) + return dtoolgroups, dtoolext def get_peaktools(assay_type): tools = ["macsNarrow"] - if assay_type == "atac": - tools.append("Genrich") + if assay_type == "atac": + tools.append("Genrich") elif assay_type == "chip": tools.append("macsBroad") # turn sicer off for now @@ -62,50 +63,15 @@ def get_peaktools(assay_type): return tools -def dedup_out7(input, assay, paired_end): - dd = [] - if assay == "cfchip": - dd.append(input + ".Q5DD_tagAlign") - elif paired_end == False and assay == "chip": - dd.append(input + ".Q5DD_tagAlign.gz") - return dd - - -def get_ppqt_input(ppqt_dir, wildcards, paired_end): - ppqt = [] - if paired_end: - ppqt.append(join(ppqt_dir, "{0}.{1}.ppqt.txt".format(wildcards.name, wildcards.ext))) - else: - if wildcards.ext == "Q5DD": - ppqt.append(join(ppqt_dir, "{0}.Q5DD_tagAlign.ppqt.txt".format(wildcards.name))) - elif wildcards.ext == "sorted": - ppqt.append(join(ppqt_dir, "{0}.sorted.ppqt.txt".format(wildcards.name))) - else: - raise ValueError(f'Unknown alignment file extension, name: {wildcards.name}, ext: {wildcards.ext}.') - return ppqt - - -def get_bam_input(bam_dir, wildcards, paired_end): - bams = [] - if paired_end: - bams.append(join(bam_dir, "{0}.{1}.bam".format(wildcards.name, wildcards.ext))) - else: - if wildcards.ext == "Q5DD": - bams.append(join(bam_dir, "{0}.Q5DD.bam".format(wildcards.name))) - elif wildcards.ext == "sorted": - bams.append(join(bam_dir, "{0}.sorted.bam".format(wildcards.name))) - return bams - - def test_for_block(groupdata, contrast, blocks): - """ only want to run blocking on contrasts where all - individuals are on both sides of the contrast """ - contrastBlock = [ ] + """only want to run blocking on contrasts where all + individuals are on both sides of the contrast""" + contrastBlock = [] for con in contrast: group1 = con[0] group2 = con[1] - block1 = [ blocks[sample] for sample in groupdata[group1] ] - block2 = [ blocks[sample] for sample in groupdata[group2] ] + block1 = [blocks[sample] for sample in groupdata[group1]] + block2 = [blocks[sample] for sample in groupdata[group2]] if len(block1) == len(block2): if len(set(block1).intersection(block2)) == len(block1): contrastBlock.append(con) @@ -114,11 +80,13 @@ def test_for_block(groupdata, contrast, blocks): def ctrl_test(ctrl_dict, input_name, in_dir, mode=None): sample = join(in_dir, f"{input_name}.Q5DD.RPGC.bw") - assert mode in ('chip', 'ctrl'), 'Unrecognized input file mode.' - + assert mode in ("chip", "ctrl"), "Unrecognized input file mode." + if input_name in ctrl_dict: norm = join(in_dir, ctrl_dict[input_name] + ".Q5DD.RPGC.bw") else: - raise ValueError(f'ChIP sample {input_name} missing from input lookup: \n{str(ctrl_dict)}') - outs = {'chip': sample, 'ctrl': norm} - return outs[mode] \ No newline at end of file + raise ValueError( + f"ChIP sample {input_name} missing from input lookup: \n{str(ctrl_dict)}" + ) + outs = {"chip": sample, "ctrl": norm} + return outs[mode] diff --git a/workflow/scripts/peakcall.py b/workflow/scripts/peakcall.py index 1567a55..6cc4c00 100644 --- a/workflow/scripts/peakcall.py +++ b/workflow/scripts/peakcall.py @@ -13,15 +13,6 @@ def get_input_bam(input_sample, bam_dir): return [] -def get_control_input(ext, paired_end, bam_dir): - i = [] - if paired_end and ext != "": - i = [join(bam_dir, "{0}.Q5DD.bam".format(ext))] - elif not paired_end and ext != "": - i = [join(bam_dir, "{0}.Q5DD_tagAlign.gz".format(ext))] - return i - - def outputIDR(groupswreps, groupdata, chip2input, tools): """ Produces the correct output files for IDR. All supposed replicates @@ -73,22 +64,6 @@ def calc_effective_genome_fraction(effectivesize, genomefile): return(str(float(effectivesize)/ genomelen)) -def getSicerChips(bam_dir, name, paired_end): - if paired_end: - chip = join(bam_dir, name + ".Q5DD.bam") - else: - chip = join(bam_dir, name + ".Q5DD_tagAlign.gz") - return chip - - -def getSicerFragLen(ppqt_dir, qc_dir, name, paired_end): - if paired_end: - fragLen = join(qc_dir, name + ".Q5DD.insert_size_metrics.txt") - else: - fragLen = join(ppqt_dir, name + ".Q5DD_tagAlign.ppqt.txt") - return fragLen - - def get_manorm_sizes(g1, g2, group_data, ppqt_in): if not ppqt_in: return ""