diff --git a/workflow/Snakefile b/workflow/Snakefile index 52e7d4c..a820a07 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -221,18 +221,9 @@ rule all: input: rule_all_ins -# Include endedness rules -if paired_end: - include: join("rules", "paired", "peakcall.smk") - include: join("rules", "paired", "qc.smk") - include: join("rules", "paired", "trim.smk") -else: - include: join("rules", "paired", "peakcall.smk") - include: join("rules", "paired", "qc.smk") - include: join("rules", "paired", "trim.smk") - -# Include general rules +# Include child rules include: join("rules", "hooks.smk") +include: join("rules", "trim_align_dedup.smk") include: join("rules", "qc.smk") include: join("rules", "peakcall.smk") include: join("rules", "dba.smk") diff --git a/workflow/rules/paired/peakcall.smk b/workflow/rules/paired/peakcall.smk deleted file mode 100644 index c40a4a5..0000000 --- a/workflow/rules/paired/peakcall.smk +++ /dev/null @@ -1,132 +0,0 @@ -# Calling read coverage as peaks -# ~~~~ -# Paired-end peak calling rules - - -rule MACS2_broad: - input: - chip = join(bam_dir, "{name}.Q5DD_tagAlign.gz"), - txt = join(ppqt_dir, "{name}.Q5DD_tagAlign.ppqt.txt"), - c_option = lambda w: join(bam_dir, f"{chip2input[w.name]}.Q5DD.bam"), - output: - join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), - params: - rname = 'MACS2_broad', - gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], - macsver = config['tools']['MACSVER'], - flag = lambda w: "-c" if chip2input[w.name] else "", - shell: - """ - module load {params.macsver}; - macs2 callpeak \\ - -t {input.chip} {params.flag} {input.c_option} \\ - -g {params.gsize} \\ - -n {wildcards.name} \\ - --outdir {macsB_dir}/{wildcards.name} \\ - --broad \\ - --broad-cutoff 0.01 \\ - --keep-dup="all" \\ - -f "BAMPE" - """ - - -rule SICER: - input: - chip = lambda w: join(bam_dir, w.name + ".Q5DD.bam"), - fragLen = lambda w: join(qc_dir, name + ".Q5DD.insert_size_metrics.txt"), - c_option = lambda w: join(bam_dir, f"{chip2input[w.name]}.Q5DD.bam"), - output: - bed = join(sicer_dir, "{name}", "{name}_broadpeaks.bed") if has_inputs else [], - params: - rname = 'SICER', - name = "{name}", - sicerver = config['tools']['SICERVER'], - bedtoolsver = config['tools']['BEDTOOLSVER'], - genomever = config['options']['genome'], - this_sicer_dir = join(sicer_dir,"{name}"), - frac = config['references'][genome]['FRAC'], - flag = lambda w: "-c" if chip2input[w.name] else "", - shell: - """ - module load {params.sicerver} - module load {params.bedtoolsver} - if [ ! -d \"""" + str(tmpdir) + """\" ]; then mkdir -p \"""" + str(tmpdir) + """\"; fi - tmp=$(mktemp -d -p \"""" + str(tmpdir) + """\") - trap 'rm -rf "${{tmp}}"' EXIT - - MEAN_INSERT_SIZE=$(cat {input.fragLen} | awk '/MEDIAN_INSERT_SIZE/{{f=1;next}} /## HISTOGRAM/{{f=0}} f' | cut -f 6) - mean_insert_size=$(printf "%.0f" $MEAN_INSERT_SIZE) - - echo "printing out value of mean-insert-size ${{mean_insert_size}}" - a={input.c_option} - echo "Printing input.c_option ${{a}}" - - if [ -f "{input.c_option}" ]; then - # Copying input to tmpdir due to SICER2 - # bam2bed file conversion, if more than - # one sample shares the same IP sample - # than a race condition can occur where - # two jobs can concurrent try to write - # to the same BED file (during bedtools - # bam2bed that sicer calls). - input_bam="$(basename "{input.c_option}")" - cp {input.c_option} ${{tmp}} - echo "paired-end with input... ${{tmp}}/${{input_bam}}" - sicer \\ - -t {input.chip} \\ - -c "${{tmp}}/${{input_bam}}" \\ - -s {params.genomever} \\ - -rt 100 \\ - -w 300 \\ - -f ${{mean_insert_size}} \\ - -egf {params.frac} \\ - -g 600 \\ - -fdr 1E-2 \\ - -cpu 30 \\ - -o ${{tmp}} - - mv ${{tmp}}/{params.name}.Q5DD-W300-G600-FDR0.01-island.bed {output.bed}; - mv ${{tmp}}/{params.name}.Q5DD-W300-G600-islands-summary {params.this_sicer_dir} - else - echo "paired-end without input" - sicer \\ - -t {input.chip} \\ - -s {params.genomever} \\ - -rt 100 \\ - -w 300 \\ - -f ${{mean_insert_size}} \\ - -egf {params.frac} \\ - -g 600 \\ - -e 100 \\ - -cpu 30 \\ - -o ${{tmp}} - - mv ${{tmp}}/{params.name}.Q5DD-W300-G600.scoreisland {params.this_sicer_dir} - fi - """ - - -rule MACS2_narrow: - input: - chip = join(bam_dir, "{name}.Q5DD_tagAlign.gz"), - txt = join(ppqt_dir, "{name}.Q5DD_tagAlign.ppqt.txt"), - c_option = lambda w: join(bam_dir, f"{chip2input[w.name]}.Q5DD.bam"), - output: - join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"), - params: - rname = 'MACS2_narrow', - gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], - macsver = config['tools']['MACSVER'], - flag = lambda w: "-c" if chip2input[w.name] else "", - shell: - """ - module load {params.macsver}; - macs2 callpeak \\ - -t {input.chip} {params.flag} {input.c_option} \\ - -g {params.gsize} \\ - -n {wildcards.name} \\ - --outdir {macsN_dir}/{wildcards.name} \\ - -q 0.01 \\ - --keep-dup="all" \\ - -f "BAMPE" - """ \ No newline at end of file diff --git a/workflow/rules/paired/qc.smk b/workflow/rules/paired/qc.smk deleted file mode 100644 index ed5f3e7..0000000 --- a/workflow/rules/paired/qc.smk +++ /dev/null @@ -1,116 +0,0 @@ -# Quality control metrics and reports -# ~~~~ -# Paired-end peak qc rules - -rule fastq_screen: - """ - Quality-control step to screen for different sources of contamination. - FastQ Screen compares your sequencing data to a set of different reference - genomes to determine if there is contamination. It allows a user to see if - the composition of your library matches what you expect. - @Input: - Trimmed FastQ files (scatter) - @Output: - FastQ Screen report and logfiles - """ - input: - expand(join(trim_dir, "{name}.R{rn}.trim.fastq.gz"), name=samples, rn=ends) - output: - first_txt = expand(join( - qc_dir, "FQscreen", "{name}.R{rn}.trim_screen.txt"), name=samples, rn=[1, 2] - ), - first_png = expand(join( - qc_dir, "FQscreen", "{name}.R{rn}.trim_screen.png"), name=samples, rn=[1, 2] - ), - second_txt = expand(join( - qc_dir, "FQscreen2", "{name}.R{rn}.trim_screen.txt"), name=samples, rn=[1, 2] - ), - second_png = expand(join( - qc_dir, "FQscreen2", "{name}.R{rn}.trim_screen.png"), name=samples, rn=[1, 2] - ) - params: - rname = 'fqscreen', - outdir = join(qc_dir, "FQscreen"), - outdir2 = join(qc_dir, "FQscreen2"), - fastq_screen = config['bin']['FASTQ_SCREEN'], - fastq_screen_config1 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P1'], - fastq_screen_config2 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P2'], - envmodules: - config['tools']['BOWTIE2VER'], - config['tools']['PERLVER'], - threads: - int(allocated("threads", "fastq_screen", cluster)) - shell: - """ - # First pass of contamination screening - {params.fastq_screen} \\ - --conf {params.fastq_screen_config1} \\ - --outdir {params.outdir} \\ - --threads {threads} \\ - --subset 1000000 \\ - --aligner bowtie2 \\ - --force \\ - {input} - # Second pass of contamination screening - {params.fastq_screen} \\ - --conf {params.fastq_screen_config2} \\ - --outdir {params.outdir2} \\ - --threads {threads} \\ - --subset 1000000 \\ - --aligner bowtie2 \\ - --force \\ - {input} - """ - -rule kraken: - """ - Quality-control step to assess for potential sources of microbial contamination. - If there are high levels of microbial contamination, Kraken will provide an - estimation of the taxonomic composition. Kraken is used in conjunction with - Krona to produce an interactive reports. - @Input: - Trimmed FastQ files (scatter) - @Output: - Kraken logfile and interative krona report - """ - input: - fq1 = join(trim_dir, "{name}.R1.trim.fastq.gz"), - fq2 = join(trim_dir, "{name}.R2.trim.fastq.gz") - output: - krakenout = join(kraken_dir, "{name}.trim.kraken_bacteria.out.txt"), - krakentaxa = join(kraken_dir, "{name}.trim.kraken_bacteria.taxa.txt"), - kronahtml = join(kraken_dir, "{name}.trim.kraken_bacteria.krona.html"), - params: - rname = 'kraken', - outdir = kraken_dir, - bacdb = config['shared_resources']['KRAKENBACDB'], - tmpdir = tmpdir - threads: - int(allocated("threads", "kraken_pe", cluster)), - envmodules: - config['tools']['KRAKENVER'], - config['tools']['KRONATOOLSVER'], - shell: - """ - # Setups temporary directory for - # intermediate files with built-in - # mechanism for deletion on exit - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") - trap 'rm -rf "${{tmp}}"' EXIT - - # Copy kraken2 db to /lscratch or temp - # location to reduce filesystem strain - cp -rv {params.bacdb} ${{tmp}}/; - kdb_base=$(basename {params.bacdb}) - - kraken2 --db ${{tmp}}/${{kdb_base}} \\ - --threads {threads} --report {output.krakentaxa} \\ - --output {output.krakenout} \\ - --gzip-compressed \\ - --paired {input.fq1} {input.fq2} - - # Generate Krona Report - cut -f2,3 {output.krakenout} | \\ - ktImportTaxonomy - -o {output.kronahtml} - """ \ No newline at end of file diff --git a/workflow/rules/peakcall.smk b/workflow/rules/peakcall.smk index 0a3ec94..5b56e0c 100644 --- a/workflow/rules/peakcall.smk +++ b/workflow/rules/peakcall.smk @@ -1,7 +1,8 @@ -# Calling read coverage as peaks +# Quality control rules # ~~~~ -# Genrally applicable rules +# Common quality-control rules: preseq, NRF, rawfastqc, +# fastqc, fastq_screen, multiQC from os.path import join from scripts.peakcall import get_control_input, \ getSicerChips, getSicerFragLen, get_control_input @@ -23,41 +24,6 @@ macsB_dir = join(workpath, "macsBroad") sicer_dir = join(workpath, "sicer") -rule inputnorm: - """ - bigwigCompare (deepTools) subracts input control from treatment bigWig file, - normalizing treatment bigWig. - @Input: - Treatment sample, which has input control: extension Q5DD.RPGC.bw - and input its control: extension Q5DD.RPGC.bw - @Output: - bigWig file of treatmment sample normalizes with its input control - """ - input: - chip = lambda wc: ctrl_test(chip2input, wc.name, bw_dir, 'chip'), - ctrl = lambda wc: ctrl_test(chip2input, wc.name, bw_dir, 'ctrl') - output: - join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw") - params: - rname = "inputnorm", - bigwig_declare = lambda wc, input: f"--bigwig1 {input.chip} --bigwig2 {input.ctrl}", - threads: - int(allocated("threads", "inputnorm", cluster)), - envmodules: - config['tools']['DEEPTOOLSVER'], - shell: - """ - bigwigCompare \\ - --binSize 25 \\ - --outFileName {output} \\ - --outFileFormat 'bigwig' \\ - {params.bigwig_declare} \\ - --operation 'subtract' \\ - --skipNonCoveredRegions \\ - -p {threads} - """ - - rule sortByRead: """ Bam files(extension: sorted.bam) need to be sorted by read name @@ -121,6 +87,210 @@ rule genrich: """ +rule MACS2_narrow: + input: + chip = join(bam_dir, "{name}.Q5DD.bam") if paired_end \ + else join(bam_dir, "{name}.Q5DD_tagAlign.gz"), + txt = join(ppqt_dir, "{name}.Q5DD.ppqt.txt") if paired_end \ + else join(ppqt_dir, "{name}.Q5DD_tagAlign.ppqt.txt"), + c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir), + output: + join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"), + params: + rname = 'MACS2_narrow', + gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], + macsver = config['tools']['MACSVER'], + flag = lambda w: "-c" if chip2input[w.name] else "", + shell: + """ + module load {params.macsver}; + if [ '""" + str(paired_end) + """' == True ]; then + macs2 callpeak \\ + -t {input.chip} {params.flag} {input.c_option} \\ + -g {params.gsize} \\ + -n {wildcards.name} \\ + --outdir {macsN_dir}/{wildcards.name} \\ + -q 0.01 \\ + --keep-dup="all" \\ + -f "BAMPE" + else + ppqt_len=$(awk '{{print $1}}' {input.txt}) + macs2 callpeak \\ + -t {input.chip} {params.flag} {input.c_option} \\ + -g {params.gsize} \\ + -n {wildcards.name} \\ + --outdir {macsN_dir}/{wildcards.name} \\ + -q 0.01 \\ + --keep-dup="all" \\ + --nomodel \\ + --extsize $ppqt_len + fi + """ + + +rule MACS2_broad: + input: + chip = join(bam_dir, "{name}.Q5DD.bam") if paired_end \ + else join(bam_dir, "{name}.Q5DD_tagAlign.gz"), + txt = join(ppqt_dir, "{name}.Q5DD.ppqt.txt") if paired_end \ + else join(ppqt_dir, "{name}.Q5DD_tagAlign.ppqt.txt"), + c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir), + output: + join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), + params: + rname = 'MACS2_broad', + gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], + macsver = config['tools']['MACSVER'], + flag = lambda w: "-c" if chip2input[w.name] else "", + shell: + """ + module load {params.macsver}; + if [ '""" + str(paired_end) + """' == True ]; then + macs2 callpeak \\ + -t {input.chip} {params.flag} {input.c_option} \\ + -g {params.gsize} \\ + -n {wildcards.name} \\ + --outdir {macsB_dir}/{wildcards.name} \\ + --broad \\ + --broad-cutoff 0.01 \\ + --keep-dup="all" \\ + -f "BAMPE" + else + ppqt_len=$(awk '{{print $1}}' {input.txt}) + macs2 callpeak \\ + -t {input.chip} {params.flag} {input.c_option} \\ + -g {params.gsize} \\ + -n {wildcards.name} \\ + --outdir {macsB_dir}/{wildcards.name} \\ + --broad \\ + --broad-cutoff 0.01 \\ + --keep-dup="all" \\ + --nomodel \\ + --extsize $ppqt_len + fi + """ + + +rule SICER: + input: + chip = lambda w: getSicerChips(bam_dir, w.name, paired_end), + fragLen = lambda w: getSicerFragLen(ppqt_dir, qc_dir, w.name, paired_end), + c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir), + output: + bed = join(sicer_dir, "{name}", "{name}_broadpeaks.bed") if has_inputs else [], + params: + rname = 'SICER', + name = "{name}", + sicerver = config['tools']['SICERVER'], + bedtoolsver = config['tools']['BEDTOOLSVER'], + genomever = config['options']['genome'], + this_sicer_dir = join(sicer_dir,"{name}"), + frac = config['references'][genome]['FRAC'], + flag = lambda w: "-c" if chip2input[w.name] else "", + shell: + """ + module load {params.sicerver} + module load {params.bedtoolsver} + if [ ! -d \"""" + str(tmpdir) + """\" ]; then mkdir -p \"""" + str(tmpdir) + """\"; fi + tmp=$(mktemp -d -p \"""" + str(tmpdir) + """\") + trap 'rm -rf "${{tmp}}"' EXIT + + if [ '""" + str(paired_end) + """' == True ]; then + MEAN_INSERT_SIZE=$(cat {input.fragLen} | awk '/MEDIAN_INSERT_SIZE/{{f=1;next}} /## HISTOGRAM/{{f=0}} f' | cut -f 6) + mean_insert_size=$(printf "%.0f" $MEAN_INSERT_SIZE) + else + mean_insert_size=$(awk '{{print $1}}' {input.fragLen}) + fi + echo "printing out value of mean-insert-size ${{mean_insert_size}}" + a={input.c_option} + echo "Printing input.c_option ${{a}}" + if [ '""" + str(paired_end) + """' == True ]; then + if [ -f "{input.c_option}" ]; then + # Copying input to tmpdir due to SICER2 + # bam2bed file conversion, if more than + # one sample shares the same IP sample + # than a race condition can occur where + # two jobs can concurrent try to write + # to the same BED file (during bedtools + # bam2bed that sicer calls). + input_bam="$(basename "{input.c_option}")" + cp {input.c_option} ${{tmp}} + echo "paired-end with input... ${{tmp}}/${{input_bam}}" + sicer \\ + -t {input.chip} \\ + -c "${{tmp}}/${{input_bam}}" \\ + -s {params.genomever} \\ + -rt 100 \\ + -w 300 \\ + -f ${{mean_insert_size}} \\ + -egf {params.frac} \\ + -g 600 \\ + -fdr 1E-2 \\ + -cpu 30 \\ + -o ${{tmp}} + + mv ${{tmp}}/{params.name}.Q5DD-W300-G600-FDR0.01-island.bed {output.bed}; + mv ${{tmp}}/{params.name}.Q5DD-W300-G600-islands-summary {params.this_sicer_dir} + else + echo "paired-end without input" + sicer \\ + -t {input.chip} \\ + -s {params.genomever} \\ + -rt 100 \\ + -w 300 \\ + -f ${{mean_insert_size}} \\ + -egf {params.frac} \\ + -g 600 \\ + -e 100 \\ + -cpu 30 \\ + -o ${{tmp}} + + mv ${{tmp}}/{params.name}.Q5DD-W300-G600.scoreisland {params.this_sicer_dir} + fi + else + if [ -f "{input.c_option}" ]; then + echo "single-end with input" + cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz; + awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed; + + cp {input.c_option} ${{tmp}}/input.bed.gz; gzip -d ${{tmp}}/input.bed.gz; + awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/input.bed > ${{tmp}}/inputV2.bed; + + sicer \\ + -t ${{tmp}}/{params.name}.bed \\ + -c ${{tmp}}/inputV2.bed \\ + -s {params.genomever} \\ + -rt 100 \\ + -w 300 \\ + -f ${{mean_insert_size}} \\ + -egf {params.frac} \\ + -g 600 \\ + -fdr 1E-2 \\ + -cpu 30 \\ + -o ${{tmp}} + mv ${{tmp}}/{params.name}-W300-G600-FDR0.01-island.bed {output.bed}; + mv ${{tmp}}/{params.name}-W300-G600-islands-summary {params.this_sicer_dir} + else + echo "single-end without input" + cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz; + awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed; + sicer \\ + -t ${{tmp}}/{params.name}.bed \\ + -s {params.genomever} \\ + -rt 100 \\ + -w 300 \\ + -f ${{mean_insert_size}} \\ + -egf {params.frac} \\ + -g 600 \\ + -e 100 \\ + -cpu 30 \\ + -o ${{tmp}} + mv ${{tmp}}/{params.name}-W300-G600.scoreisland {output.bed} + fi + fi + """ + + rule MEME: input: bed = lambda w: join(workpath, w.PeakTool, w.name, w.name + PeakExtensions[w.PeakTool]) diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index e7bd61a..fa7c99d 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -1,7 +1,7 @@ -# Quality control metrics and reports +# Quality control rules # ~~~~ -# Generally applicable quality control rules - +# Common quality-control rules: preseq, NRF, rawfastqc, +# fastqc, fastq_screen, multiQC from os.path import join from scripts.common import get_bam_ext, get_fqscreen_outputs @@ -186,6 +186,118 @@ rule fastqc: """ +rule fastq_screen: + """ + Quality-control step to screen for different sources of contamination. + FastQ Screen compares your sequencing data to a set of different reference + genomes to determine if there is contamination. It allows a user to see if + the composition of your library matches what you expect. + @Input: + Trimmed FastQ files (scatter) + @Output: + FastQ Screen report and logfiles + """ + input: + expand(join(trim_dir, "{name}.R{rn}.trim.fastq.gz"), name=samples, rn=ends) + output: + get_fqscreen_outputs(paired_end, samples, qc_dir) + params: + rname = 'fqscreen', + outdir = join(qc_dir, "FQscreen"), + outdir2 = join(qc_dir, "FQscreen2"), + fastq_screen = config['bin']['FASTQ_SCREEN'], + fastq_screen_config1 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P1'], + fastq_screen_config2 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P2'], + envmodules: + config['tools']['BOWTIE2VER'], + config['tools']['PERLVER'], + threads: + int(allocated("threads", "fastq_screen", cluster)) + shell: + """ + # First pass of contamination screening + {params.fastq_screen} \\ + --conf {params.fastq_screen_config1} \\ + --outdir {params.outdir} \\ + --threads {threads} \\ + --subset 1000000 \\ + --aligner bowtie2 \\ + --force \\ + {input} + # Second pass of contamination screening + {params.fastq_screen} \\ + --conf {params.fastq_screen_config2} \\ + --outdir {params.outdir2} \\ + --threads {threads} \\ + --subset 1000000 \\ + --aligner bowtie2 \\ + --force \\ + {input} + """ + + +rule kraken: + """ + Quality-control step to assess for potential sources of microbial contamination. + If there are high levels of microbial contamination, Kraken will provide an + estimation of the taxonomic composition. Kraken is used in conjunction with + Krona to produce an interactive reports. + @Input: + Trimmed FastQ files (scatter) + @Output: + Kraken logfile and interative krona report + """ + input: + fq1 = join(trim_dir, "{name}.R1.trim.fastq.gz"), + fq2 = provided(join(trim_dir,"{name}.R2.trim.fastq.gz"), paired_end) + output: + krakenout = join(kraken_dir, "{name}.trim.kraken_bacteria.out.txt"), + krakentaxa = join(kraken_dir, "{name}.trim.kraken_bacteria.taxa.txt"), + kronahtml = join(kraken_dir, "{name}.trim.kraken_bacteria.krona.html"), + params: + rname = 'kraken', + outdir = kraken_dir, + bacdb = config['shared_resources']['KRAKENBACDB'], + tmpdir = tmpdir, + paired_end = paired_end + threads: + int(allocated("threads", "kraken_pe", cluster)), + envmodules: + config['tools']['KRAKENVER'], + config['tools']['KRONATOOLSVER'], + shell: + """ + # Setups temporary directory for + # intermediate files with built-in + # mechanism for deletion on exit + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT + + # Copy kraken2 db to /lscratch or temp + # location to reduce filesystem strain + cp -rv {params.bacdb} ${{tmp}}/; + kdb_base=$(basename {params.bacdb}) + if [ '{params.paired_end}' == True ]; then + kraken2 --db ${{tmp}}/${{kdb_base}} \\ + --threads {threads} --report {output.krakentaxa} \\ + --output {output.krakenout} \\ + --gzip-compressed \\ + --paired {input.fq1} {input.fq2} + else + kraken2 --db ${{tmp}}/${{kdb_base}} \\ + --threads {threads} --report {output.krakentaxa} \\ + --output {output.krakenout} \\ + --gzip-compressed \\ + {input.fq1} + fi + + # Generate Krona Report + cut -f2,3 {output.krakenout} | \\ + ktImportTaxonomy - -o {output.kronahtml} + """ + + rule multiqc: """ Reporting step to aggregate sample statistics and quality-control information @@ -265,8 +377,8 @@ rule deeptools_QC: heatmap = join(deeptools_dir, "spearman_heatmap.Q5DD.pdf"), pca = join(deeptools_dir, "pca.Q5DD.pdf"), npz = temp(join(deeptools_dir, "Q5DD.npz")), - mqc = join(deeptools_dir, "spearman_readcounts.Q5DD.tab"), - png = join(deeptools_dir, "spearman_heatmap.Q5DD_mqc.png") + mqc = join(deeptools_dir, "spearman_readcounts.Q5DD.tab"), + png = join(deeptools_dir, "spearman_heatmap.Q5DD_mqc.png") params: rname = "deeptools_QC", parent_dir = deeptools_dir, @@ -285,7 +397,6 @@ rule deeptools_QC: plotPCA -in {output.npz} -o {output.pca} """ - rule deeptools_fingerprint: input: [ join(bam_dir, name + ".Q5DD.bam") for name in samples ] @@ -299,16 +410,20 @@ rule deeptools_fingerprint: deeptoolsver = config['tools']['DEEPTOOLSVER'], # this should be the sample names to match the bigwigs in the same order labels = samples - ext = "" if paired_end else "-e 200" threads: int(allocated("threads", "deeptools_fingerprint", cluster)), shell: """ module load {params.deeptoolsver} if [ ! -d "{params.parent_dir}" ]; then mkdir "{params.parent_dir}"; fi -\ + if [ \"""" + str(paired_end) + """\" == False ]; then + extension_option="-e 200" + else + extension_option="" + fi + plotFingerprint -b {input} --labels {params.labels} -p {threads} --skipZeros \\ --outQualityMetrics {output.metrics} --plotFile {output.image} --outRawCounts {output.raw} \\ - {params.ext} + ${{extension_option}} """ @@ -319,7 +434,7 @@ rule deeptools_gene_all: TSSline=join(deeptools_dir,"TSS_profile.Q5DD.pdf"), TSSmat=temp(join(deeptools_dir,"TSS.Q5DD.mat.gz")), bed=temp(join(deeptools_dir,"geneinfo.Q5DD.bed")), - mqc=join(deeptools_dir,"TSS_profile.Q5DD.tab") + mqc=join(deeptools_dir,"TSS_profile.Q5DD.tab") params: rname = "deeptools_gene_all", parent_dir = deeptools_dir, @@ -334,12 +449,12 @@ rule deeptools_gene_all: module load {params.deeptoolsver} if [ ! -d "{params.parent_dir}" ]; then mkdir "{params.parent_dir}"; fi grep --line-buffered 'protein_coding' {params.prebed} | awk -v OFS='\t' -F'\t' '{{print $1, $2, $3, $5, ".", $4}}' > {output.bed} - computeMatrix reference-point -S {input} -R {output.bed} -p {threads} \\ - --referencePoint TSS --upstream 3000 --downstream 3000 --skipZeros \\ - -o {output.TSSmat} --samplesLabel {params.labels} - plotProfile -m {output.TSSmat} -out {output.TSSline} \\ - --yAxisLabel 'average RPGC' --plotType 'se' --legendLocation upper-left \\ - --numPlotsPerRow 5 --outFileNameData {output.mqc} + computeMatrix reference-point -S {input} -R {output.bed} -p {threads} \\ + --referencePoint TSS --upstream 3000 --downstream 3000 --skipZeros \\ + -o {output.TSSmat} --samplesLabel {params.labels} + plotProfile -m {output.TSSmat} -out {output.TSSline} \\ + --yAxisLabel 'average RPGC' --plotType 'se' --legendLocation upper-left \\ + --numPlotsPerRow 5 --outFileNameData {output.mqc} """ diff --git a/workflow/rules/single/peakcall.smk b/workflow/rules/single/peakcall.smk deleted file mode 100644 index 2ab553d..0000000 --- a/workflow/rules/single/peakcall.smk +++ /dev/null @@ -1,131 +0,0 @@ -# Calling read coverage as peaks -# ~~~~ -# Single-end peak calling rules - -rule MACS2_broad: - input: - chip = join(bam_dir, "{name}.Q5DD.bam") - txt = join(ppqt_dir, "{name}.Q5DD.ppqt.txt") - c_option = lambda w: join(bam_dir, f"{chip2input[w.name]}.Q5DD_tagAlign.gz"), - output: - join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), - params: - rname = 'MACS2_broad', - gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], - macsver = config['tools']['MACSVER'], - flag = lambda w: "-c" if chip2input[w.name] else "", - shell: - """ - module load {params.macsver}; - ppqt_len=$(awk '{{print $1}}' {input.txt}) - macs2 callpeak \\ - -t {input.chip} {params.flag} {input.c_option} \\ - -g {params.gsize} \\ - -n {wildcards.name} \\ - --outdir {macsB_dir}/{wildcards.name} \\ - --broad \\ - --broad-cutoff 0.01 \\ - --keep-dup="all" \\ - --nomodel \\ - --extsize $ppqt_len - """ - - -rule SICER: - input: - chip = lambda w: join(bam_dir, w.name + ".Q5DD_tagAlign.gz"), - fragLen = lambda w: join(ppqt_dir, w.name + ".Q5DD_tagAlign.ppqt.txt"), - c_option = lambda w: join(bam_dir, f"{chip2input[w.name]}.Q5DD_tagAlign.gz"), - output: - bed = join(sicer_dir, "{name}", "{name}_broadpeaks.bed") if has_inputs else [], - params: - rname = 'SICER', - name = "{name}", - sicerver = config['tools']['SICERVER'], - bedtoolsver = config['tools']['BEDTOOLSVER'], - genomever = config['options']['genome'], - this_sicer_dir = join(sicer_dir,"{name}"), - frac = config['references'][genome]['FRAC'], - flag = lambda w: "-c" if chip2input[w.name] else "", - shell: - """ - module load {params.sicerver} - module load {params.bedtoolsver} - if [ ! -d \"""" + str(tmpdir) + """\" ]; then mkdir -p \"""" + str(tmpdir) + """\"; fi - tmp=$(mktemp -d -p \"""" + str(tmpdir) + """\") - trap 'rm -rf "${{tmp}}"' EXIT - - mean_insert_size=$(awk '{{print $1}}' {input.fragLen}) - - echo "printing out value of mean-insert-size ${{mean_insert_size}}" - a={input.c_option} - echo "Printing input.c_option ${{a}}" - - if [ -f "{input.c_option}" ]; then - echo "single-end with input" - cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz; - awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed; - - cp {input.c_option} ${{tmp}}/input.bed.gz; gzip -d ${{tmp}}/input.bed.gz; - awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/input.bed > ${{tmp}}/inputV2.bed; - - sicer \\ - -t ${{tmp}}/{params.name}.bed \\ - -c ${{tmp}}/inputV2.bed \\ - -s {params.genomever} \\ - -rt 100 \\ - -w 300 \\ - -f ${{mean_insert_size}} \\ - -egf {params.frac} \\ - -g 600 \\ - -fdr 1E-2 \\ - -cpu 30 \\ - -o ${{tmp}} - mv ${{tmp}}/{params.name}-W300-G600-FDR0.01-island.bed {output.bed}; - mv ${{tmp}}/{params.name}-W300-G600-islands-summary {params.this_sicer_dir} - else - echo "single-end without input" - cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz; - awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed; - sicer \\ - -t ${{tmp}}/{params.name}.bed \\ - -s {params.genomever} \\ - -rt 100 \\ - -w 300 \\ - -f ${{mean_insert_size}} \\ - -egf {params.frac} \\ - -g 600 \\ - -e 100 \\ - -cpu 30 \\ - -o ${{tmp}} - mv ${{tmp}}/{params.name}-W300-G600.scoreisland {output.bed} - fi - """ - - -rule MACS2_narrow: - input: - chip = join(bam_dir, "{name}.Q5DD.bam"), - txt = join(ppqt_dir, "{name}.Q5DD.ppqt.txt"), - c_option = lambda w: join(bam_dir, f"{chip2input[w.name]}.Q5DD_tagAlign.gz"), - output: - join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"), - params: - rname = 'MACS2_narrow', - gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], - macsver = config['tools']['MACSVER'], - flag = lambda w: "-c" if chip2input[w.name] else "", - shell: - """ - module load {params.macsver}; - ppqt_len=$(awk '{{print $1}}' {input.txt}) - macs2 callpeak \\ - -t {input.chip} {params.flag} {input.c_option} \\ - -g {params.gsize} \\ - -n {wildcards.name} \\ - --outdir {macsN_dir}/{wildcards.name} \\ - -q 0.01 \\ - --keep-dup="all" \\ - --nomodel \\ - --extsize $ppqt_len - """ \ No newline at end of file diff --git a/workflow/rules/single/qc.smk b/workflow/rules/single/qc.smk deleted file mode 100644 index 43fdda0..0000000 --- a/workflow/rules/single/qc.smk +++ /dev/null @@ -1,108 +0,0 @@ -# Quality control metrics and reports -# ~~~~ -# Single-ended quality control rules - -rule fastq_screen: - """ - Quality-control step to screen for different sources of contamination. - FastQ Screen compares your sequencing data to a set of different reference - genomes to determine if there is contamination. It allows a user to see if - the composition of your library matches what you expect. - @Input: - Trimmed FastQ files (scatter) - @Output: - FastQ Screen report and logfiles - """ - input: - expand(join(trim_dir, "{name}.R{rn}.trim.fastq.gz"), name=samples, rn=ends) - output: - first_txt = expand(join(qc_dir, "FQscreen", "{name}.R1.trim_screen.txt"), name=samples), - first_png = expand(join(qc_dir, "FQscreen", "{name}.R1.trim_screen.png"), name=samples), - second_txt = expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.txt"), name=samples), - second_png = expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.png"), name=samples), - params: - rname = 'fqscreen', - outdir = join(qc_dir, "FQscreen"), - outdir2 = join(qc_dir, "FQscreen2"), - fastq_screen = config['bin']['FASTQ_SCREEN'], - fastq_screen_config1 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P1'], - fastq_screen_config2 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P2'], - envmodules: - config['tools']['BOWTIE2VER'], - config['tools']['PERLVER'], - threads: - int(allocated("threads", "fastq_screen", cluster)) - shell: - """ - # First pass of contamination screening - {params.fastq_screen} \\ - --conf {params.fastq_screen_config1} \\ - --outdir {params.outdir} \\ - --threads {threads} \\ - --subset 1000000 \\ - --aligner bowtie2 \\ - --force \\ - {input} - # Second pass of contamination screening - {params.fastq_screen} \\ - --conf {params.fastq_screen_config2} \\ - --outdir {params.outdir2} \\ - --threads {threads} \\ - --subset 1000000 \\ - --aligner bowtie2 \\ - --force \\ - {input} - """ - -rule kraken: - """ - Quality-control step to assess for potential sources of microbial contamination. - If there are high levels of microbial contamination, Kraken will provide an - estimation of the taxonomic composition. Kraken is used in conjunction with - Krona to produce an interactive reports. - @Input: - Trimmed FastQ files (scatter) - @Output: - Kraken logfile and interative krona report - """ - input: - fq1 = join(trim_dir, "{name}.R1.trim.fastq.gz"), - output: - krakenout = join(kraken_dir, "{name}.trim.kraken_bacteria.out.txt"), - krakentaxa = join(kraken_dir, "{name}.trim.kraken_bacteria.taxa.txt"), - kronahtml = join(kraken_dir, "{name}.trim.kraken_bacteria.krona.html"), - params: - rname = 'kraken', - outdir = kraken_dir, - bacdb = config['shared_resources']['KRAKENBACDB'], - tmpdir = tmpdir, - threads: - int(allocated("threads", "kraken_pe", cluster)), - envmodules: - config['tools']['KRAKENVER'], - config['tools']['KRONATOOLSVER'], - shell: - """ - # Setups temporary directory for - # intermediate files with built-in - # mechanism for deletion on exit - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") - trap 'rm -rf "${{tmp}}"' EXIT - - # Copy kraken2 db to /lscratch or temp - # location to reduce filesystem strain - cp -rv {params.bacdb} ${{tmp}}/; - kdb_base=$(basename {params.bacdb}) - - kraken2 --db ${{tmp}}/${{kdb_base}} \\ - --threads {threads} --report {output.krakentaxa} \\ - --output {output.krakenout} \\ - --gzip-compressed \\ - {input.fq1} - fi - - # Generate Krona Report - cut -f2,3 {output.krakenout} | \\ - ktImportTaxonomy - -o {output.kronahtml} - """ \ No newline at end of file diff --git a/workflow/rules/single/trim_align_dedup.smk b/workflow/rules/single/trim_align_dedup.smk deleted file mode 100644 index cb6fe52..0000000 --- a/workflow/rules/single/trim_align_dedup.smk +++ /dev/null @@ -1,316 +0,0 @@ -# Trimming, alignment, and redundancy reduction rules -# ~~~~ -# Rules for single-ended reads - -# ~~ workflow endpoints -workpath = config['project']['workpath'] -bin_path = config['project']['binpath'] -genome = config['options']['genome'] - -# ~~ directories -trim_dir = join(workpath, 'trim') -tmpdir = config['options']['tmp_dir'] -bam_dir = join(workpath, "bam") -bw_dir = join(workpath, "bigwig") -ppqt_dir = join(bam_dir, "ppqt") - - -rule trim: - """ - Data-processing step to remove adapter sequences and perform quality trimming - prior to alignment the reference genome. Adapters are composed of synthetic - sequences and should be removed prior to alignment. Bwa mem aligns adapter-free - fastqs to ba lacklist that has anomalous, unstructured, or high signal in - next-generation sequencing experiments independent of cell line or experiment. - Samtools view -f4 selects for reads unmapped and outpute a blacklist-sequences-free bam file. - SamToFastq convers BAM file to FASTQs. All file processing is done in - "tmp_dir": "/lscratch/$SLURM_JOBID/", meaning all files are lost at job completion - except for final R1.trim.fastq.gz and R2.trim.fastq.gz - - @Input: - Raw FastQ files - @Output: - Trimmed and blacklist-sequences-free FastQ files - """ - input: - file1 = join(workpath, "{name}.R1.fastq.gz"), - file2 = provided(join(workpath, "{name}.R2.fastq.gz"), paired_end) - output: - outfq1 = temp(join(trim_dir, "{name}.R1.trim.fastq.gz")), - outfq2 = provided(temp(join(trim_dir, "{name}.R2.trim.fastq.gz")), paired_end) - params: - rname = "trim", - cutadaptver = config['tools']['CUTADAPTVER'], - workpath = config['project']['workpath'], - fastawithadaptersetd = join(workpath, config['shared_resources']['ADAPTERS_FASTA']), - blacklistbwaindex = config['references'][genome]['BLACKLISTBWAINDEX'], - picardver = config['tools']['PICARDVER'], - bwaver = config['tools']['BWAVER'], - samtoolsver = config['tools']['SAMTOOLSVER'], - minlen = 35, - leadingquality = 10, - trailingquality = 10, - javaram = "64g", - sample = "{name}", - tmpdir = tmpdir, - threads: - 32 - shell: - """ - module load {params.cutadaptver}; - module load {params.bwaver}; - module load {params.samtoolsver}; - module load {params.picardver}; - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") - trap 'rm -rf "${{tmp}}"' EXIT - - cutadapt \\ - --nextseq-trim=2 \\ - --trim-n \\ - -n 5 \\ - -O 5 \\ - -q {params.leadingquality},{params.trailingquality} \\ - -m {params.minlen} \\ - -b file:{params.fastawithadaptersetd} \\ - -j {threads} \\ - -o ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ - {input.file1} - - if [ "{params.blacklistbwaindex}" != "" ]; then - bwa mem -t {threads} \\ - {params.blacklistbwaindex} \\ - ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ - | samtools view -@{threads} \\ - -f4 \\ - -b \\ - -o ${{tmp}}/{params.sample}.bam; - rm ${{tmp}}/{params.sample}.R1.trim.fastq.gz; - java -Xmx{params.javaram} -jar $PICARDJARPATH/picard.jar SamToFastq \\ - -VALIDATION_STRINGENCY SILENT \\ - -INPUT ${{tmp}}/{params.sample}.bam \\ - -FASTQ ${{tmp}}/{params.sample}.R1.trim.fastq - rm ${{tmp}}/{params.sample}.bam; - pigz -p {threads} ${{tmp}}/{params.sample}.R1.trim.fastq; - fi - mv ${{tmp}}/{params.sample}.R1.trim.fastq.gz {output.outfq1}; - """ - - -rule BWA: - """ - Data processing rule to align trimmed and blacklist-sequences-free reads - to the reference genome using bwa mem aligner. Samtools sort sort alignments - by coordinates to produce bam file. Samtools flagstats summarizes reads by QC - and number of alignments for each FLAG type. Samtools idxstats outputs - stats by chromosomesequence,length, # mapped read-segments and - # unmapped read-segments. Resulting bam file is filtered by a mapQ value. - - @Input: - Trimmed and blacklist-sequences-free FastQ files - @Output: - Bam file that have read aligned: sorted.bam - Bam file that has reads aligned and filted by mapQ a value: Q5.bam - """ - input: - join(trim_dir, "{name}.R1.trim.fastq.gz"), - params: - d = join(bam_dir), - rname = 'bwa', - reference = config['references'][genome]['BWA'], - bwaver = config['tools']['BWAVER'], - samtoolsver = config['tools']['SAMTOOLSVER'], - script = join(workpath, "bin", "bam_filter_by_mapq.py"), - pythonver = config['tools']['PYTHONVER'], - output: - outbam1 = join(bam_dir, "{name}.sorted.bam"), - outbam2 = temp(join(bam_dir, "{name}.Q5.bam")), - flagstat1 = join(bam_dir, "{name}.sorted.bam.flagstat"), - idxstat1 = join(bam_dir, "{name}.sorted.bam.idxstat"), - flagstat2 = join(bam_dir, "{name}.Q5.bam.flagstat"), - idxstat2 = join(bam_dir, "{name}.Q5.bam.idxstat"), - threads: - 32 - shell: - """ - module load {params.bwaver}; - module load {params.samtoolsver}; - module load {params.pythonver}; - - bwa mem -t {threads} {params.reference} {input} \\ - | samtools sort -@{threads} -o {output.outbam1} - samtools index {output.outbam1} - samtools flagstat {output.outbam1} > {output.flagstat1} - samtools idxstats {output.outbam1} > {output.idxstat1} - samtools view -b -q 6 {output.outbam1} -o {output.outbam2} - - samtools index {output.outbam2} - samtools flagstat {output.outbam2} > {output.flagstat2} - samtools idxstats {output.outbam2} > {output.idxstat2} - """ - - -rule dedup: - """ - Picard MarkDuplicates removes duplicates from bam file. - Samtools flagstats summarizes reads by QC - and number of alignments for each FLAG type. Samtools idxstats outputs - stats by chromosomesequence,length, # mapped read-segments and - # unmapped read-segments. If assay is cfchip, deduplicated bam file is - filtered for chromosomes 1 through 22 to produce: Q5DD.bam; - the original deduplicated file also convereted to bed file of fragments - via granges and rtracklayer: Q5DD.tagAlign. - - @Input: - Bam file that has reads aligned and filted by mapQ a value: Q5.bam - @Output: - Deduplicated Q5DD.bam for all assays, plus Q5DD.tagAlign if cfchip assay - - """ - input: - join(bam_dir,"{name}.Q5.bam") - output: - out_bam = join(bam_dir, "{name}.Q5DD.bam"), - flagstat = join(bam_dir, "{name}.Q5DD.bam.flagstat"), - idxstat = join(bam_dir, "{name}.Q5DD.bam.idxstat"), - bwadups = join(bam_dir, "{name}.bwa.Q5.duplic"), - tagalign = join(bam_dir, "{name}.Q5DD_tagAlign") if assay == "cfchip" \ - else join(bam_dir, "{name}.Q5DD_tagAlign.gz") - params: - rname = 'dedup', - picardver = config['tools']['PICARDVER'], - samtoolsver = config['tools']['SAMTOOLSVER'], - bedtoolsver = config['tools']['BEDTOOLSVER'], - macsver = config['tools']['MACSVER'], - gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], - genomefile = config['references'][genome]['REFLEN'], - rver = config['tools']['RVER'], - javaram = '16g', - tmpdir = tmpdir, - tmpBam = "{name}.Q5DD.withXY.bam", - rscript = join(config['references'][genome]['cfChIP_TOOLS_SRC'], "bam2fragment.R"), - shell: - """ - module load {params.samtoolsver}; - module load {params.picardver}; - module load {params.bedtoolsver}; - module load {params.macsver}; - module load {params.rver}; - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") - trap 'rm -rf "${{tmp}}"' EXIT - - if [ "{assay}" == "cfchip" ]; then - java -Xmx{params.javaram} \\ - -jar $PICARDJARPATH/picard.jar MarkDuplicates \\ - -I {input} \\ - -O {params.tmpBam} \\ - -TMP_DIR ${{tmp}} \\ - -VALIDATION_STRINGENCY SILENT \\ - -REMOVE_DUPLICATES true - -METRICS_FILE {output.bwadups}; - samtools index {params.tmpBam}; - samtools view -b {params.tmpBam} chr{{1..22}} > {output.out_bam}; - Rscript {params.rscript} {params.tmpBam} {output.tagalign}; - rm {params.tmpBam} {params.tmpBam}.bai; - samtools index {output.out_bam}; - samtools flagstat {output.out_bam} > {output.flagstat}; - samtools idxstats {output.out_bam} > {output.idxstat}; - else - java -Xmx{params.javaram} \\ - -jar $PICARDJARPATH/picard.jar MarkDuplicates \\ - -I {input} \\ - -O {output.out_bam} \\ - -TMP_DIR ${{tmp}} \\ - -VALIDATION_STRINGENCY SILENT \\ - -REMOVE_DUPLICATES true \\ - -METRICS_FILE {output.bwadups}; - samtools index {output.out_bam}; - samtools flagstat {output.outout_bam5} > {output.flagstat}; - samtools idxstats {output.out_bam} > {output.idxstat}; - fi - """ - - -rule ppqt: - input: - bam = lambda w : join(bam_dir, f"{w.name}.{w.ext}." + - "bam" if w.ext.lower() == 'sorted' else "gz"), - output: - ppqt = join(ppqt_dir, "{name}.{ext}.ppqt"), - pdf = join(ppqt_dir, "{name}.{ext}.pdf"), - txt = join(ppqt_dir, "{name}.{ext}.ppqt.txt"), - params: - rname = "ppqt", - samtoolsver = config['tools']['SAMTOOLSVER'], - rver = config['tools']['RVER'], - scriptPy = join(bin_path, "ppqt_process.py"), - file_name = "{name}" - container: - config['images']['ppqt'] - shell: - """ - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") - trap 'rm -rf "${{tmp}}"' EXIT - - if [[ '{input.bam}' == *.gz ]]; then - ln -s {input.bam} ${{tmp}}/{params.file_name}.Q5DD.tagAlign.gz; - run_spp.R -c=${{tmp}}/{params.file_name}.Q5DD.tagAlign.gz \ - -savp={output.pdf} -out={output.ppqt} \ - -tmpdir=/lscratch/$SLURM_JOBID -rf - else - run_spp.R -c={input.bam} \ - -savp={output.pdf} -out={output.ppqt} \ - -tmpdir=/lscratch/$SLURM_JOBID -rf - fi - python {params.scriptPy} -i {output.ppqt} -o {output.txt} - """ - - -rule bam2bw: - """ - bamCoverage converts bams to bigwig files for read visialization - in UCSC Genome Browser or IGV. - @Input: - Files with extensions: sorted.bam and Q5DD.bam, - for all samples (treatment and input controls) - @Output: - bigWig file with contains coordinates for an interval and - an associated score, RPGC - """ - input: - bam = lambda w: join(bam_dir, "{w.name}." + - "Q5DD.bam" if w.ext == "Q5DD" else "sorted.bam"), - ppqt = lambda w: join(ppqt_dir, "{w.name}." + - "Q5DD_tagAlign.ppqt.txt" if w.ext == "Q5DD" else - "sorted.ppqt.txt") - output: - outbw = join(bw_dir, "{name}.{ext}.RPGC.bw"), - params: - rname = "bam2bw", - name = "{name}", - effectivegenomesize = config['references'][genome]['EFFECTIVEGENOMESIZE'], - threads: - int(allocated("threads", "bam2bw", cluster)), - envmodules: - config['tools']['DEEPTOOLSVER'], - shell: - """ - if [ ! -d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi - tmp=$(mktemp -d -p \"""" + tmpdir + """\") - trap 'rm -rf "${{tmp}}"' EXIT - - bam_cov_option=--centerReads - echo "printing out value of bam-cov-option $bam_cov_option" - - bamCoverage \\ - --bam {input.bam} \\ - -o {output.outbw} \\ - --binSize 25 \\ - --smoothLength 75 \\ - --numberOfProcessors {threads} \\ - --normalizeUsing RPGC \\ - --effectiveGenomeSize {params.effectivegenomesize} \\ - ${{bam_cov_option}}; - """ \ No newline at end of file diff --git a/workflow/rules/paired/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk similarity index 52% rename from workflow/rules/paired/trim_align_dedup.smk rename to workflow/rules/trim_align_dedup.smk index 3ae4943..1a809f7 100644 --- a/workflow/rules/paired/trim_align_dedup.smk +++ b/workflow/rules/trim_align_dedup.smk @@ -1,17 +1,27 @@ # Trimming, alignment, and redundancy reduction rules # ~~~~ -# Rules for paired-ended reads +# Common paired-end rules: trim_pe, kraken_pe, BWA_PE, insert_size +import snakemake +from os.path import join +from scripts.common import allocated, get_bam_ext +from scripts.grouping import dedup_out7, get_bam_input, get_ppqt_input, \ + ctrl_test -# ~~ workflow endpoints +# ~~ workflow configuration workpath = config['project']['workpath'] bin_path = config['project']['binpath'] genome = config['options']['genome'] +paired_end = False if config['project']['nends'] == 1 else True +ends = [1] if not paired_end else [1, 2] +chip2input = config['project']['peaks']['inputs'] +has_inputs = False if set(chip2input.values()) == {''} else True # ~~ directories trim_dir = join(workpath, 'trim') tmpdir = config['options']['tmp_dir'] bam_dir = join(workpath, "bam") bw_dir = join(workpath, "bigwig") +qc_dir = join(workpath, "QC") ppqt_dir = join(bam_dir, "ppqt") @@ -34,10 +44,10 @@ rule trim: """ input: file1 = join(workpath, "{name}.R1.fastq.gz"), - file2 = join(workpath, "{name}.R2.fastq.gz"), + file2 = provided(join(workpath, "{name}.R2.fastq.gz"), paired_end) output: outfq1 = temp(join(trim_dir, "{name}.R1.trim.fastq.gz")), - outfq2 = temp(join(trim_dir, "{name}.R2.trim.fastq.gz")), + outfq2 = provided(temp(join(trim_dir, "{name}.R2.trim.fastq.gz")), paired_end) params: rname = "trim", cutadaptver = config['tools']['CUTADAPTVER'], @@ -52,7 +62,6 @@ rule trim: trailingquality = 10, javaram = "64g", sample = "{name}", - tmpdir = tmpdir, threads: 32 shell: @@ -61,27 +70,28 @@ rule trim: module load {params.bwaver}; module load {params.samtoolsver}; module load {params.picardver}; - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") + if [ ! -d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi + tmp=$(mktemp -d -p \"""" + tmpdir + """\") trap 'rm -rf "${{tmp}}"' EXIT - cutadapt \\ - --pair-filter=any \\ - --nextseq-trim=2 \\ - --trim-n \\ - -n 5 \\ - -O 5 \\ - -q {params.leadingquality},{params.trailingquality} \\ - -m {params.minlen}:{params.minlen} \\ - -b file:{params.fastawithadaptersetd} \\ - -B file:{params.fastawithadaptersetd} \\ - -j {threads} \\ - -o ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ - -p ${{tmp}}/{params.sample}.R2.trim.fastq.gz \\ - {input.file1} {input.file2} - - if [ "{params.blacklistbwaindex}" != "" ]; then - bwa mem -t {threads} \\ + if [ \"""" + str(paired_end) + """\" == True ]; then + cutadapt \\ + --pair-filter=any \\ + --nextseq-trim=2 \\ + --trim-n \\ + -n 5 \\ + -O 5 \\ + -q {params.leadingquality},{params.trailingquality} \\ + -m {params.minlen}:{params.minlen} \\ + -b file:{params.fastawithadaptersetd} \\ + -B file:{params.fastawithadaptersetd} \\ + -j {threads} \\ + -o ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ + -p ${{tmp}}/{params.sample}.R2.trim.fastq.gz \\ + {input.file1} {input.file2} + + if [ "{params.blacklistbwaindex}" != "" ]; + then bwa mem -t {threads} \\ {params.blacklistbwaindex} \\ ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ ${{tmp}}/{params.sample}.R2.trim.fastq.gz \\ @@ -91,19 +101,56 @@ rule trim: -o ${{tmp}}/{params.sample}.bam; rm ${{tmp}}/{params.sample}.R1.trim.fastq.gz; rm ${{tmp}}/{params.sample}.R2.trim.fastq.gz; + java -Xmx{params.javaram} -jar $PICARDJARPATH/picard.jar SamToFastq \\ -VALIDATION_STRINGENCY SILENT \\ -INPUT ${{tmp}}/{params.sample}.bam \\ -FASTQ ${{tmp}}/{params.sample}.R1.trim.fastq \\ -SECOND_END_FASTQ ${{tmp}}/{params.sample}.R2.trim.fastq \\ -UNPAIRED_FASTQ ${{tmp}}/{params.sample}.unpaired.noBL.fastq + rm ${{tmp}}/{params.sample}.bam; + pigz -p {threads} ${{tmp}}/{params.sample}.R1.trim.fastq; pigz -p {threads} ${{tmp}}/{params.sample}.R2.trim.fastq; - fi + fi + mv ${{tmp}}/{params.sample}.R1.trim.fastq.gz {output.outfq1}; + mv ${{tmp}}/{params.sample}.R2.trim.fastq.gz {output.outfq2}; + else + cutadapt \\ + --nextseq-trim=2 \\ + --trim-n \\ + -n 5 \\ + -O 5 \\ + -q {params.leadingquality},{params.trailingquality} \\ + -m {params.minlen} \\ + -b file:{params.fastawithadaptersetd} \\ + -j {threads} \\ + -o ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ + {input.file1} + + if [ "{params.blacklistbwaindex}" != "" ]; + then bwa mem -t {threads} \\ + {params.blacklistbwaindex} \\ + ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ + | samtools view -@{threads} \\ + -f4 \\ + -b \\ + -o ${{tmp}}/{params.sample}.bam; + rm ${{tmp}}/{params.sample}.R1.trim.fastq.gz; + + java -Xmx{params.javaram} -jar $PICARDJARPATH/picard.jar SamToFastq \\ + -VALIDATION_STRINGENCY SILENT \\ + -INPUT ${{tmp}}/{params.sample}.bam \\ + -FASTQ ${{tmp}}/{params.sample}.R1.trim.fastq + + rm ${{tmp}}/{params.sample}.bam; + + pigz -p {threads} ${{tmp}}/{params.sample}.R1.trim.fastq; - mv ${{tmp}}/{params.sample}.R1.trim.fastq.gz {output.outfq1}; - mv ${{tmp}}/{params.sample}.R2.trim.fastq.gz {output.outfq2}; + fi + mv ${{tmp}}/{params.sample}.R1.trim.fastq.gz {output.outfq1}; + fi """ @@ -124,7 +171,7 @@ rule BWA: """ input: infq1 = join(trim_dir, "{name}.R1.trim.fastq.gz"), - infq2 = join(trim_dir, "{name}.R2.trim.fastq.gz"), + infq2 = join(trim_dir, "{name}.R2.trim.fastq.gz") if paired_end else [], params: d = join(bam_dir), rname = 'bwa', @@ -147,18 +194,30 @@ rule BWA: module load {params.bwaver}; module load {params.samtoolsver}; module load {params.pythonver}; - - bwa mem -t {threads} {params.reference} {input.infq1} {input.infq2} \\ - | samtools sort -@{threads} -o {output.outbam1} - - samtools index {output.outbam1} - samtools flagstat {output.outbam1} > {output.flagstat1} - samtools idxstats {output.outbam1} > {output.idxstat1} - - python {params.script} -i {output.outbam1} -o {output.outbam2} -q 6 - samtools index {output.outbam2} - samtools flagstat {output.outbam2} > {output.flagstat2} - samtools idxstats {output.outbam2} > {output.idxstat2} + if [ '""" + str(paired_end) + """' == True ];then + bwa mem -t {threads} {params.reference} {input.infq1} {input.infq2} \\ + | samtools sort -@{threads} -o {output.outbam1} + + samtools index {output.outbam1} + samtools flagstat {output.outbam1} > {output.flagstat1} + samtools idxstats {output.outbam1} > {output.idxstat1} + + python {params.script} -i {output.outbam1} -o {output.outbam2} -q 6 + samtools index {output.outbam2} + samtools flagstat {output.outbam2} > {output.flagstat2} + samtools idxstats {output.outbam2} > {output.idxstat2} + else + bwa mem -t {threads} {params.reference} {input.infq1} \\ + | samtools sort -@{threads} -o {output.outbam1} + samtools index {output.outbam1} + samtools flagstat {output.outbam1} > {output.flagstat1} + samtools idxstats {output.outbam1} > {output.idxstat1} + samtools view -b -q 6 {output.outbam1} -o {output.outbam2} + + samtools index {output.outbam2} + samtools flagstat {output.outbam2} > {output.flagstat2} + samtools idxstats {output.outbam2} > {output.idxstat2} + fi """ @@ -180,14 +239,13 @@ rule dedup: """ input: - join(bam_dir, "{name}.Q5.bam") + bam2 = join(bam_dir,"{name}.Q5.bam") output: - out_bam = join(bam_dir, "{name}.Q5DD.bam"), - flagstat = join(bam_dir, "{name}.Q5DD.bam.flagstat"), - idxstat = join(bam_dir, "{name}.Q5DD.bam.idxstat"), - bwadups = join(bam_dir, "{name}.bwa.Q5.duplic"), - tagalign = join(bam_dir, "{name}.Q5DD_tagAlign") if assay == "cfchip" \ - else join(bam_dir, "{name}.Q5DD_tagAlign.gz") + out5 = join(bam_dir, "{name}.Q5DD.bam"), + out5f = join(bam_dir, "{name}.Q5DD.bam.flagstat"), + out5i = join(bam_dir, "{name}.Q5DD.bam.idxstat"), + out6 = provided(join(bam_dir, "{name}.bwa.Q5.duplic"), paired_end), + out7 = dedup_out7(join(bam_dir, "{name}"), assay, paired_end) params: rname = 'dedup', picardver = config['tools']['PICARDVER'], @@ -208,45 +266,56 @@ rule dedup: module load {params.bedtoolsver}; module load {params.macsver}; module load {params.rver}; - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") + if [ ! -d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi + tmp=$(mktemp -d -p \"""" + tmpdir + """\") trap 'rm -rf "${{tmp}}"' EXIT - + if [ "{assay}" == "cfchip" ]; then java -Xmx{params.javaram} \\ -jar $PICARDJARPATH/picard.jar MarkDuplicates \\ - -I {input} \\ + -I {input.bam2} \\ -O {params.tmpBam} \\ -TMP_DIR ${{tmp}} \\ -VALIDATION_STRINGENCY SILENT \\ -REMOVE_DUPLICATES true \\ - -METRICS_FILE {output.bwadups}; + -METRICS_FILE {output.out6}; samtools index {params.tmpBam}; - samtools view -b {params.tmpBam} chr{{1..22}} > {output.out_bam}; - Rscript {params.rscript} {params.tmpBam} {output.tagalign}; + samtools view -b {params.tmpBam} chr{{1..22}} > {output.out5}; + Rscript {params.rscript} {params.tmpBam} {output.out7}; rm {params.tmpBam} {params.tmpBam}.bai; - samtools index {output.out_bam}; - samtools flagstat {output.out_bam} > {output.flagstat}; - samtools idxstats {output.out_bam} > {output.idxstat}; + samtools index {output.out5}; + samtools flagstat {output.out5} > {output.out5f}; + samtools idxstats {output.out5} > {output.out5i}; + elif [ '""" + str(paired_end) + """' == False ];then + macs2 filterdup -i {input} -g {params.gsize} --keep-dup="auto" -o ${{tmp}}/TmpTagAlign; + awk -F"\\t" -v OFS="\\t" '{{if ($2>0 && $3>0) {{print}}}}' ${{tmp}}/TmpTagAlign > ${{tmp}}/TmpTagAlign2; + awk -F"\\t" -v OFS="\\t" '{{print $1,1,$2}}' {params.genomefile} | sort -k1,1 -k2,2n > ${{tmp}}/GenomeFileBed; + bedtools intersect -wa -f 1.0 -a ${{tmp}}/TmpTagAlign2 -b ${{tmp}}/GenomeFileBed > ${{tmp}}/TmpTagAlign3; + bedtools bedtobam -i ${{tmp}}/TmpTagAlign3 -g {params.genomefile} | samtools sort -@4 -o {output.out5}; + gzip ${{tmp}}/TmpTagAlign3; + mv ${{tmp}}/TmpTagAlign3.gz {output.out7}; + samtools index {output.out5}; + samtools flagstat {output.out5} > {output.out5f} + samtools idxstats {output.out5} > {output.out5i} else java -Xmx{params.javaram} \\ -jar $PICARDJARPATH/picard.jar MarkDuplicates \\ - -I {input} \\ - -O {output.out_bam} \\ + -I {input.bam2} \\ + -O {output.out5} \\ -TMP_DIR ${{tmp}} \\ -VALIDATION_STRINGENCY SILENT \\ -REMOVE_DUPLICATES true \\ - -METRICS_FILE {output.bwadups}; - samtools index {output.out_bam}; - samtools flagstat {output.out_bam} > {output.flagstat}; - samtools idxstats {output.out_bam} > {output.idxstat}; + -METRICS_FILE {output.out6}; + samtools index {output.out5}; + samtools flagstat {output.out5} > {output.out5f}; + samtools idxstats {output.out5} > {output.out5i}; fi """ rule ppqt: input: - lambda w : join(bam_dir, f"{w.name}.{w.ext}.bam") + bam = lambda w : join(bam_dir, w.name + "." + w.ext + "." + get_bam_ext(w.ext, paired_end)) output: ppqt = join(ppqt_dir, "{name}.{ext}.ppqt"), pdf = join(ppqt_dir, "{name}.{ext}.pdf"), @@ -256,24 +325,35 @@ rule ppqt: samtoolsver = config['tools']['SAMTOOLSVER'], rver = config['tools']['RVER'], scriptPy = join(bin_path, "ppqt_process.py"), - file_name = "{name}", - tmpdir = tmpdir, + file_name = "{name}" container: config['images']['ppqt'] shell: """ - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") + if [ ! -d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi + tmp=$(mktemp -d -p \"""" + tmpdir + """\") trap 'rm -rf "${{tmp}}"' EXIT - samtools view -b \\ + if [ '""" + str(paired_end) + """' == True ]; then + samtools view -b \\ -f 66 \\ - -o ${{tmp}}/bam1.f66.bam {input}; - samtools index ${{tmp}}/bam1.f66.bam; - run_spp.R -c=${{tmp}}/bam1.f66.bam \ - -savp={output.pdf} -out={output.ppqt} \ - -tmpdir=${{tmp}} -rf - + -o ${{tmp}}/bam1.f66.bam {input.bam}; + samtools index ${{tmp}}/bam1.f66.bam; + run_spp.R -c=${{tmp}}/bam1.f66.bam \ + -savp={output.pdf} -out={output.ppqt} \ + -tmpdir=${{tmp}} -rf + else + if [[ '{input.bam}' == *.gz ]]; then + ln -s {input.bam} ${{tmp}}/{params.file_name}.Q5DD.tagAlign.gz; + run_spp.R -c=${{tmp}}/{params.file_name}.Q5DD.tagAlign.gz \ + -savp={output.pdf} -out={output.ppqt} \ + -tmpdir=/lscratch/$SLURM_JOBID -rf + else + run_spp.R -c={input.bam} \ + -savp={output.pdf} -out={output.ppqt} \ + -tmpdir=/lscratch/$SLURM_JOBID -rf + fi + fi python {params.scriptPy} -i {output.ppqt} -o {output.txt} """ @@ -290,28 +370,31 @@ rule bam2bw: an associated score, RPGC """ input: - bam = lambda w: join(bam_dir, f"{w.name}.{w.ext}.bam"), - ppqt = lambda w: join(ppqt_dir, f"{w.name}.{w.ext}.ppqt.txt"), + bam = lambda w: get_bam_input(bam_dir, w, paired_end), + ppqt = lambda w: get_ppqt_input(ppqt_dir, w, paired_end), output: outbw = join(bw_dir, "{name}.{ext}.RPGC.bw"), params: rname = "bam2bw", name = "{name}", effectivegenomesize = config['references'][genome]['EFFECTIVEGENOMESIZE'], - tmpdir = tmpdir, threads: int(allocated("threads", "bam2bw", cluster)), envmodules: config['tools']['DEEPTOOLSVER'], shell: """ - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") + if [ ! -d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi + tmp=$(mktemp -d -p \"""" + tmpdir + """\") trap 'rm -rf "${{tmp}}"' EXIT bam_cov_option={input.ppqt} - ppqt_len=$(awk '{{print $1}}' {input.ppqt}) - bam_cov_option="-e ${{ppqt_len}}" + if [ \"""" + str(paired_end) + """\" == False ]; then + ppqt_len=$(awk '{{print $1}}' {input.ppqt}) + bam_cov_option="-e ${{ppqt_len}}" + else + bam_cov_option=--centerReads + fi echo "printing out value of bam-cov-option $bam_cov_option" bamCoverage \\ @@ -323,4 +406,39 @@ rule bam2bw: --normalizeUsing RPGC \\ --effectiveGenomeSize {params.effectivegenomesize} \\ ${{bam_cov_option}}; + """ + + +rule inputnorm: + """ + bigwigCompare (deepTools) subracts input control from treatment bigWig file, + normalizing treatment bigWig. + @Input: + Treatment sample, which has input control: extension Q5DD.RPGC.bw + and input its control: extension Q5DD.RPGC.bw + @Output: + bigWig file of treatmment sample normalizes with its input control + """ + input: + chip = lambda wc: ctrl_test(chip2input, wc.name, bw_dir, 'chip'), + ctrl = lambda wc: ctrl_test(chip2input, wc.name, bw_dir, 'ctrl') + output: + join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw") + params: + rname = "inputnorm", + bigwig_declare = lambda wc, input: f"--bigwig1 {input.chip} --bigwig2 {input.ctrl}", + threads: + int(allocated("threads", "inputnorm", cluster)), + envmodules: + config['tools']['DEEPTOOLSVER'], + shell: + """ + bigwigCompare \\ + --binSize 25 \\ + --outFileName {output} \\ + --outFileFormat 'bigwig' \\ + {params.bigwig_declare} \\ + --operation 'subtract' \\ + --skipNonCoveredRegions \\ + -p {threads} """ \ No newline at end of file diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py index 156065d..6feba1c 100644 --- a/workflow/scripts/common.py +++ b/workflow/scripts/common.py @@ -1,3 +1,4 @@ + #!/usr/bin/env python3 # ~~~ Common helper functions shared across the entire workflow import os @@ -13,9 +14,9 @@ def provided(samplelist, condition): """ if not condition: - # If condition is False, - # returns an empty list - # to prevent rule from + # If condition is False, + # returns an empty list + # to prevent rule from # running samplelist = [] @@ -26,14 +27,14 @@ def ignore(samplelist, condition): """ Determines if optional rules should run. If an empty list is provided to rule all, snakemake will not try to generate that set of target files. If a given condition - is met (i.e. True) then it will not try to run that rule. This function is the - inverse to provided(). + is met (i.e. True) then it will not try to run that rule. This function is the + inverse to provided(). """ if condition: - # If condition is True, - # returns an empty list - # to prevent rule from + # If condition is True, + # returns an empty list + # to prevent rule from # running samplelist = [] @@ -57,7 +58,7 @@ def s3_configured(uri): import re # Get bucket and key from s3 uri - parsed = re.match(r"s3:\/\/(.+?)\/(.+)", uri) + parsed = re.match(r's3:\/\/(.+?)\/(.+)', uri) bucket, key = parsed.groups() accessible = True @@ -87,11 +88,7 @@ def abstract_location(file_address, *args, **kwargs): # Check if user provided any input if not file_address or file_address is None: - raise IOError( - "Failed to provide any input files! Input(s) are required to resolve required files.".format( - file_address - ) - ) + raise IOError("Failed to provide any input files! Input(s) are required to resolve required files.".format(file_address)) # If given file path to one file, convert it a list[] file_list = [file_address] if isinstance(file_address, str) else file_address @@ -99,7 +96,7 @@ def abstract_location(file_address, *args, **kwargs): # Loop through list of provided files, and if a remote storage option # is given, convert its index to a remote file object. for i, uri in enumerate(file_list): - if uri.lower().startswith("s3://"): + if uri.lower().startswith('s3://'): # Remote option for S3 storage import snakemake.remote.S3 import botocore.session @@ -115,22 +112,18 @@ def abstract_location(file_address, *args, **kwargs): # s3 bucket are configured correctly. # If a file in provieded as input to a Snakemake rule, only read # access is needed to access the remote S3 object. - remote_provider = snakemake.remote.S3.RemoteProvider( - config=botocore.client.Config(signature_version=botocore.UNSIGNED) - ) + remote_provider = snakemake.remote.S3.RemoteProvider(config=botocore.client.Config(signature_version=botocore.UNSIGNED)) file_list[i] = remote_provider.remote(uri, *args, **kwargs) - elif uri.lower().startswith("gs://"): + elif uri.lower().startswith('gs://'): # Remote option for Google Cloud Storage import snakemake.remote.GS - remote_provider = snakemake.remote.GS.RemoteProvider() file_list[i] = remote_provider.remote(uri, *args, **kwargs) - elif uri.lower().startswith("sftp://"): + elif uri.lower().startswith('sftp://'): # Remote option for SFTP transfers import snakemake.remote.SFTP - remote_provider = snakemake.remote.SFTP.RemoteProvider() file_list[i] = remote_provider.remote(uri, *args, **kwargs) @@ -149,62 +142,60 @@ def references(config, reflist): _all = True for ref in reflist: - try: - tmp = config["references"][ref] + try: tmp = config['references'][ref] # Check if ref exists in config except KeyError: _all = False break # Check if ref is empty key string - if not tmp: - _all = False + if not tmp: _all = False return _all def allocated(resource, rule, lookup, default="__default__"): - """Pulls resource information for a given rule. If a rule does not have any information + """Pulls resource information for a given rule. If a rule does not have any information for a given resource type, then it will pull from the default. Information is pulled from - definitions in the cluster.json (which is used a job submission). This ensures that any + definitions in the cluster.json (which is used a job submission). This ensures that any resources used at runtime mirror the resources that were allocated. :param resource : resource type to look in cluster.json (i.e. threads, mem, time, gres) :param rule : rule to lookup its information :param lookup : Lookup containing allocation information (i.e. cluster.json) :param default : default information to use if rule information cannot be found - :return allocation : + :return allocation : allocation information for a given resource type for a given rule """ - try: + try: # Try to get allocation information # for a given rule allocation = lookup[rule][resource] except KeyError: # Use default allocation information allocation = lookup[default][resource] - + return allocation def str_bool(s): """Converts a string to boolean. It is dangerous to try to - typecast a string into a boolean value using the built-in + typecast a string into a boolean value using the built-in `bool()` function. This function avoids any issues that can - arise when using `bool()`. + arise when using `bool()`. Example: boolean('True') returns True boolean('False') returns False boolean('asdas') raises TypeError """ val = s.lower() - if val in ["true", "1", "y", "yes"]: + if val in ['true', '1', 'y', 'yes']: return True - elif val in ["false", "0", "n", "no", ""]: + elif val in ['false', '0', 'n', 'no', '']: return False else: # Provided value could not be # type casted into a boolean - raise TypeError("Fatal: cannot type cast {} into a boolean".format(val)) + raise TypeError('Fatal: cannot type cast {} into a boolean'.format(val)) def joint_option(prefix, valueslist): @@ -223,7 +214,7 @@ def joint_option(prefix, valueslist): def mk_dir_if_not_exist(dirs): if isinstance(dirs, str): dirs = [dirs] - assert isinstance(dirs, list), "Supplied directories should be in a list" + assert isinstance(dirs, list), 'Supplied directories should be in a list' for _dir in dirs: if not os.path.exists(_dir): os.mkdir(_dir, mode=0o775) @@ -233,9 +224,38 @@ def mk_dir_if_not_exist(dirs): def get_file_components(pair_ended): alnexts = [] if pair_ended: - alnexts.extend(["sorted.bam", "Q5DD.bam"]) + alnexts.extend(['sorted.bam', 'Q5DD.bam']) else: - alnexts.extend(["sorted.bam", "Q5DD_tagAlign.gz"]) - stems = list(map(lambda x: x.split(".")[0], alnexts)) - rpgc_exts = list(map(lambda x: x.split(".")[0] + ".RPGC", alnexts)) + alnexts.extend(['sorted.bam', 'Q5DD_tagAlign.gz']) + stems = list(map(lambda x: x.split('.')[0], alnexts)) + rpgc_exts = list(map(lambda x: x.split('.')[0] + '.RPGC', alnexts)) return stems, rpgc_exts, alnexts + + +def get_bam_ext(ext, pair_ended): + if pair_ended: + if ext.lower() == 'sorted': + return "bam" + elif ext == 'Q5DD': + return "bam" + else: + if ext.lower() == 'sorted': + return "bam" + elif ext == "Q5DD_tagAlign": + return "gz" + raise ValueError(f'Unknown file component. Pair ended: {str(pair_ended)}. Ext: {str(ext)}') + + +def get_fqscreen_outputs(paired_end, samples, qc_dir): + outs = [] + if paired_end: + outs.extend(expand(join(qc_dir, "FQscreen", "{name}.R{rn}.trim_screen.txt"), name=samples, rn=[1, 2])), + outs.extend(expand(join(qc_dir, "FQscreen", "{name}.R{rn}.trim_screen.png"), name=samples, rn=[1, 2])), + outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R{rn}.trim_screen.txt"), name=samples, rn=[1, 2])), + outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R{rn}.trim_screen.png"), name=samples, rn=[1, 2])), + else: + outs.extend(expand(join(qc_dir, "FQscreen", "{name}.R1.trim_screen.txt"), name=samples)), + outs.extend(expand(join(qc_dir, "FQscreen", "{name}.R1.trim_screen.png"), name=samples)), + outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.txt"), name=samples)), + outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.png"), name=samples)), + return outs \ No newline at end of file diff --git a/workflow/scripts/grouping.py b/workflow/scripts/grouping.py index b3a3395..6739fe6 100644 --- a/workflow/scripts/grouping.py +++ b/workflow/scripts/grouping.py @@ -2,23 +2,22 @@ # ~~~ Common helper functions for grouping of outputs from os.path import join - # common functions related to sample grouping or group meta-information def group_samples_by_reps(groupdata, samples, chip2input): groupdatawinput = {} groupswreps = [] - for group, chipsamples in groupdata.items(): - tmp = [] + for group, chipsamples in groupdata.items() : + tmp = [ ] if len(chipsamples) > 1: groupswreps.append(group) - for chip in chipsamples: + for chip in chipsamples : if chip in samples: tmp.append(chip) input = chip2input[chip] - if input != "NA" and input != "": + if input != 'NA' and input != '': tmp.append(input) if len(tmp) != 0: - groupdatawinput[group] = set(tmp) + groupdatawinput[group]=set(tmp) return groupdatawinput, groupswreps @@ -32,30 +31,30 @@ def group_output_files(extensions, groupslist, inputnorm): Note: Inputnorm will only be included when there are input samples. """ dtoolgroups, dtoolext = [], [] - + if len(inputnorm) == 2: - dtoolgroups.extend(["InputNorm"]) - dtoolext.extend([extensions[1]]) - + dtoolgroups.extend(["InputNorm"]) + dtoolext.extend([extensions[1]]) + for group in groupslist: - dtoolgroups.extend([group] * 2) - dtoolext.extend([extensions[1], extensions[0]]) - + dtoolgroups.extend([group] * 2) + dtoolext.extend([extensions[1], extensions[0]]) + if len(inputnorm) == 2: - dtoolgroups.extend(["InputNorm.prot"]) - dtoolext.extend([extensions[1]]) - + dtoolgroups.extend(["InputNorm.prot"]) + dtoolext.extend([extensions[1]]) + for group in groupslist: - dtoolgroups.extend([group + ".prot"] * 2) - dtoolext.extend([extensions[1], extensions[0]]) - + dtoolgroups.extend([group + ".prot"] * 2) + dtoolext.extend([extensions[1], extensions[0]]) + return dtoolgroups, dtoolext def get_peaktools(assay_type): tools = ["macsNarrow"] - if assay_type == "atac": - tools.append("Genrich") + if assay_type == "atac": + tools.append("Genrich") elif assay_type == "chip": tools.append("macsBroad") # turn sicer off for now @@ -63,15 +62,50 @@ def get_peaktools(assay_type): return tools +def dedup_out7(input, assay, paired_end): + dd = [] + if assay == "cfchip": + dd.append(input + ".Q5DD_tagAlign") + elif paired_end == False and assay == "chip": + dd.append(input + ".Q5DD_tagAlign.gz") + return dd + + +def get_ppqt_input(ppqt_dir, wildcards, paired_end): + ppqt = [] + if paired_end: + ppqt.append(join(ppqt_dir, "{0}.{1}.ppqt.txt".format(wildcards.name, wildcards.ext))) + else: + if wildcards.ext == "Q5DD": + ppqt.append(join(ppqt_dir, "{0}.Q5DD_tagAlign.ppqt.txt".format(wildcards.name))) + elif wildcards.ext == "sorted": + ppqt.append(join(ppqt_dir, "{0}.sorted.ppqt.txt".format(wildcards.name))) + else: + raise ValueError(f'Unknown alignment file extension, name: {wildcards.name}, ext: {wildcards.ext}.') + return ppqt + + +def get_bam_input(bam_dir, wildcards, paired_end): + bams = [] + if paired_end: + bams.append(join(bam_dir, "{0}.{1}.bam".format(wildcards.name, wildcards.ext))) + else: + if wildcards.ext == "Q5DD": + bams.append(join(bam_dir, "{0}.Q5DD.bam".format(wildcards.name))) + elif wildcards.ext == "sorted": + bams.append(join(bam_dir, "{0}.sorted.bam".format(wildcards.name))) + return bams + + def test_for_block(groupdata, contrast, blocks): - """only want to run blocking on contrasts where all - individuals are on both sides of the contrast""" - contrastBlock = [] + """ only want to run blocking on contrasts where all + individuals are on both sides of the contrast """ + contrastBlock = [ ] for con in contrast: group1 = con[0] group2 = con[1] - block1 = [blocks[sample] for sample in groupdata[group1]] - block2 = [blocks[sample] for sample in groupdata[group2]] + block1 = [ blocks[sample] for sample in groupdata[group1] ] + block2 = [ blocks[sample] for sample in groupdata[group2] ] if len(block1) == len(block2): if len(set(block1).intersection(block2)) == len(block1): contrastBlock.append(con) @@ -80,13 +114,11 @@ def test_for_block(groupdata, contrast, blocks): def ctrl_test(ctrl_dict, input_name, in_dir, mode=None): sample = join(in_dir, f"{input_name}.Q5DD.RPGC.bw") - assert mode in ("chip", "ctrl"), "Unrecognized input file mode." - + assert mode in ('chip', 'ctrl'), 'Unrecognized input file mode.' + if input_name in ctrl_dict: norm = join(in_dir, ctrl_dict[input_name] + ".Q5DD.RPGC.bw") else: - raise ValueError( - f"ChIP sample {input_name} missing from input lookup: \n{str(ctrl_dict)}" - ) - outs = {"chip": sample, "ctrl": norm} - return outs[mode] + raise ValueError(f'ChIP sample {input_name} missing from input lookup: \n{str(ctrl_dict)}') + outs = {'chip': sample, 'ctrl': norm} + return outs[mode] \ No newline at end of file diff --git a/workflow/scripts/peakcall.py b/workflow/scripts/peakcall.py index 6cc4c00..1567a55 100644 --- a/workflow/scripts/peakcall.py +++ b/workflow/scripts/peakcall.py @@ -13,6 +13,15 @@ def get_input_bam(input_sample, bam_dir): return [] +def get_control_input(ext, paired_end, bam_dir): + i = [] + if paired_end and ext != "": + i = [join(bam_dir, "{0}.Q5DD.bam".format(ext))] + elif not paired_end and ext != "": + i = [join(bam_dir, "{0}.Q5DD_tagAlign.gz".format(ext))] + return i + + def outputIDR(groupswreps, groupdata, chip2input, tools): """ Produces the correct output files for IDR. All supposed replicates @@ -64,6 +73,22 @@ def calc_effective_genome_fraction(effectivesize, genomefile): return(str(float(effectivesize)/ genomelen)) +def getSicerChips(bam_dir, name, paired_end): + if paired_end: + chip = join(bam_dir, name + ".Q5DD.bam") + else: + chip = join(bam_dir, name + ".Q5DD_tagAlign.gz") + return chip + + +def getSicerFragLen(ppqt_dir, qc_dir, name, paired_end): + if paired_end: + fragLen = join(qc_dir, name + ".Q5DD.insert_size_metrics.txt") + else: + fragLen = join(ppqt_dir, name + ".Q5DD_tagAlign.ppqt.txt") + return fragLen + + def get_manorm_sizes(g1, g2, group_data, ppqt_in): if not ppqt_in: return ""