From 34109afa0ae6d1b7ad81c60573d80eaf4b195112 Mon Sep 17 00:00:00 2001 From: Asya Khleborodova Date: Tue, 7 Nov 2023 10:39:53 -0500 Subject: [PATCH] Kraken and Peakcallers for single-end data (#24) * updating kraken rule to work with single and paired end data * updating MACS2_narrow rule to work with single end data * removing print statements * updating MACS2_broad rule wot work with single-end data * updating SICER rule to work with single-end data * fixing spaces in trim rule * SICER to have -egf effectivesize/ genomelen fraction instead of default 0.75 * fix to kraken rule in params * updating path to mm10.blacklist * fix for SICER rule * fixing param.frac iin Sicer rule --- config/cluster.json | 2 +- config/genome.json | 14 +- workflow/rules/peakcall.smk | 231 ++++++++++++++++++++-------- workflow/rules/qc.smk | 23 ++- workflow/rules/trim_align_dedup.smk | 4 +- 5 files changed, 194 insertions(+), 80 deletions(-) diff --git a/config/cluster.json b/config/cluster.json index 9eb7b5b..626a796 100644 --- a/config/cluster.json +++ b/config/cluster.json @@ -10,7 +10,7 @@ "mem": "64g", "threads": "32" }, - "kraken_pe": { + "kraken": { "cpus-per-task": "24", "gres": "lscratch:256", "mem": "110g" diff --git a/config/genome.json b/config/genome.json index 6868e83..df021ff 100644 --- a/config/genome.json +++ b/config/genome.json @@ -12,7 +12,8 @@ "GENOME": "/data/OpenOmics/references/cfChIP-seek/hg19_basic/ref.fa", "GENOMECHR": "/data/OpenOmics/references/cfChIP-seek/hg19_basic/Chromsomes/", "GTFFILE": "/data/OpenOmics/references/cfChIP-seek/hg19_basic/genes.gtf", - "REFLEN": "/data/OpenOmics/references/cfChIP-seek/hg19_basic/indexes/hg19.fa.sizes" + "REFLEN": "/data/OpenOmics/references/cfChIP-seek/hg19_basic/indexes/hg19.fa.sizes", + "FRAC": "0.8970265870106838" }, "hg38": { "ALIAS": "hg38", @@ -26,12 +27,13 @@ "GENOME": "/data/OpenOmics/references/chrom-seek/hg38_basic/ref.fa", "GENOMECHR": "/data/OpenOmics/references/chrom-seek/hg38_basic/Chromsomes/", "GTFFILE": "/data/OpenOmics/references/chrom-seek/hg38_basic/genes.gtf", - "REFLEN": "/data/OpenOmics/references/chrom-seek/hg38_basic/indexes/hg38.fa.sizes" + "REFLEN": "/data/OpenOmics/references/chrom-seek/hg38_basic/indexes/hg38.fa.sizes", + "FRAC": "0.9084767300375779" }, "mm10": { "ALIAS": "mm10", "SUPPORTED_PIPELINES": ["chip", "atac"], - "BLACKLISTBWAINDEX": "/data/OpenOmics/references/chrom-seek/mm10_basic/indexes/mm10.blacklist.chrM", + "BLACKLISTBWAINDEX": "/data/OpenOmics/references/chrom-seek/mm10_basic/indexes/mm10.blacklist.chrM.chr_rDNA", "BLACKLISTGENRICH": "/data/OpenOmics/references/chrom-seek/mm10_basic/indexes/mm10.blacklist.bed.gz", "BWA": "/data/OpenOmics/references/chrom-seek/mm10_basic/indexes/mm10", "cfChIP_TOOLS_SRC": "", @@ -40,7 +42,8 @@ "GENOME": "/data/OpenOmics/references/chrom-seek/mm10_basic/ref.fa", "GENOMECHR": "/data/OpenOmics/references/chrom-seek/mm10_basic/Chromsomes/", "GTFFILE": "/data/OpenOmics/references/chrom-seek/mm10_basic/genes.gtf", - "REFLEN": "/data/OpenOmics/references/chrom-seek/mm10_basic/indexes/mm10.fa.sizes" + "REFLEN": "/data/OpenOmics/references/chrom-seek/mm10_basic/indexes/mm10.fa.sizes", + "FRAC": "0.9053190260640642" }, "rheMac10": { "ALIAS": "rheMac10", @@ -54,7 +57,8 @@ "GENOME": "/data/OpenOmics/references/chrom-seek/rheMac10/Macaca_mulatta.Mmul_10.dna.chr.fa.gz", "GENOMECHR": "", "GTFFILE": "/data/OpenOmics/references/chrom-seek/rheMac10/Macaca_mulatta.Mmul_10.110.chr.gtf", - "REFLEN": "/data/OpenOmics/references/chrom-seek/rheMac10/Macaca_mulatta.Mmul_10.dna.chr.sizes" + "REFLEN": "/data/OpenOmics/references/chrom-seek/rheMac10/Macaca_mulatta.Mmul_10.dna.chr.sizes", + "FRAC": "0.9643910116868353" } } } diff --git a/workflow/rules/peakcall.smk b/workflow/rules/peakcall.smk index c4ad1fd..5064dd6 100644 --- a/workflow/rules/peakcall.smk +++ b/workflow/rules/peakcall.smk @@ -74,64 +74,108 @@ rule genrich: # INDIVIDUAL RULES rule MACS2_narrow: input: - chip = join(workpath,bam_dir,"{name}.Q5DD.bam"), - ctrl = get_input_bam, + chip = lambda w: join(workpath,bam_dir, w.name+".Q5DD.bam") \ + if paired_end else join(workpath,bam_dir, w.name+".Q5DD_tagAlign.gz"), + txt = lambda w: join(workpath,bam_dir, ppqt_dir, w.name+".Q5DD.ppqt.txt") \ + if paired_end else join(workpath,bam_dir, ppqt_dir, w.name+".Q5DD_tagAlign.ppqt.txt"), + c_option_pe = provided( lambda w: "{0}.Q5DD.bam".format( + join(workpath, bam_dir, chip2input[w.name]) + ) if chip2input[w.name] else "", paired_end == True), + c_option_se = provided(lambda w: "{0}.Q5DD_tagAlign.gz".format( + join(workpath, bam_dir, chip2input[w.name]) + ) if chip2input[w.name] else "", paired_end == False) # Building optional argument for paired input option, input: '-c /path/to/input.Q5DD.bam', No input: '' output: join(workpath,macsN_dir,"{name}","{name}_peaks.narrowPeak"), params: rname='MACS2_narrow', gsize=config['references'][genome]['EFFECTIVEGENOMESIZE'], macsver=config['tools']['MACSVER'], - # Building optional argument for paired input option, - # input: '-c /path/to/input.Q5DD.bam', No input: '' - c_option = lambda w: "-c {0}.Q5DD.bam".format( - join(workpath, bam_dir, chip2input[w.name]) - ) if chip2input[w.name] else "", + paired_end = paired_end, + flag= lambda w: "-c" if chip2input[w.name] else "" shell: """ module load {params.macsver}; - macs2 callpeak \\ - -t {input.chip} {params.c_option} \\ - -g {params.gsize} \\ - -n {wildcards.name} \\ - --outdir {workpath}/{macsN_dir}/{wildcards.name} \\ - -q 0.01 \\ - --keep-dup="all" \\ - -f "BAMPE" + if [ '{params.paired_end}' == True ]; then + macs2 callpeak \\ + -t {input.chip} {params.flag} {input.c_option_pe} \\ + -g {params.gsize} \\ + -n {wildcards.name} \\ + --outdir {workpath}/{macsN_dir}/{wildcards.name} \\ + -q 0.01 \\ + --keep-dup="all" \\ + -f "BAMPE" + else + ppqt_len=$(awk '{{print $1}}' {input.txt}) + macs2 callpeak \\ + -t {input.chip} {params.flag} {input.c_option_se} \\ + -g {params.gsize} \\ + -n {wildcards.name} \\ + --outdir {workpath}/{macsN_dir}/{wildcards.name} \\ + -q 0.01 \\ + --keep-dup="all" \\ + --nomodel \\ + --extsize $ppqt_len + fi """ rule MACS2_broad: input: - chip = join(workpath,bam_dir,"{name}.Q5DD.bam"), - ctrl = get_input_bam, + chip = lambda w: join(workpath,bam_dir, w.name+".Q5DD.bam") \ + if paired_end else join(workpath,bam_dir, w.name+".Q5DD_tagAlign.gz"), + txt = lambda w: join(workpath,bam_dir, ppqt_dir, w.name+".Q5DD.ppqt.txt") \ + if paired_end else join(workpath,bam_dir, ppqt_dir, w.name+".Q5DD_tagAlign.ppqt.txt"), + c_option_pe = provided(lambda w: "{0}.Q5DD.bam".format( + join(workpath, bam_dir, chip2input[w.name]) + ) if chip2input[w.name] else "", paired_end == True), + c_option_se = provided(lambda w: "{0}.Q5DD_tagAlign.gz".format( + join(workpath, bam_dir, chip2input[w.name]) + ) if chip2input[w.name] else "", paired_end ==False) output: join(workpath,macsB_dir,"{name}","{name}_peaks.broadPeak"), params: rname='MACS2_broad', gsize=config['references'][genome]['EFFECTIVEGENOMESIZE'], macsver=config['tools']['MACSVER'], - # Building optional argument for paired input option, - # input: '-c /path/to/input.Q5DD.bam', No input: '' - c_option = lambda w: "-c {0}.Q5DD.bam".format( - join(workpath, bam_dir, chip2input[w.name]) - ) if chip2input[w.name] else "", + paired_end = paired_end, + flag= lambda w: "-c" if chip2input[w.name] else "" shell: """ module load {params.macsver}; - macs2 callpeak \\ - -t {input.chip} {params.c_option} \\ - -g {params.gsize} \\ - -n {wildcards.name} \\ - --outdir {workpath}/{macsB_dir}/{wildcards.name} \\ - --broad \\ - --broad-cutoff 0.01 \\ - --keep-dup="all" \\ - -f "BAMPE" + if [ '{params.paired_end}' == True ]; then + macs2 callpeak \\ + -t {input.chip} {params.flag} {input.c_option_pe} \\ + -g {params.gsize} \\ + -n {wildcards.name} \\ + --outdir {workpath}/{macsB_dir}/{wildcards.name} \\ + --broad \\ + --broad-cutoff 0.01 \\ + --keep-dup="all" \\ + -f "BAMPE" + else + ppqt_len=$(awk '{{print $1}}' {input.txt}) + macs2 callpeak \\ + -t {input.chip} {params.flag} {input.c_option_se} \\ + -g {params.gsize} \\ + -n {wildcards.name} \\ + --outdir {workpath}/{macsB_dir}/{wildcards.name} \\ + --broad \\ + --broad-cutoff 0.01 \\ + --keep-dup="all" \\ + --nomodel \\ + --extsize $ppqt_len + fi """ rule SICER: input: - chip = join(workpath,bam_dir,"{name}.Q5DD.bam"), - ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".Q5DD.bam"), - fragLen= join(workpath,"QC","{name}.Q5DD.insert_size_metrics.txt"), + chip = lambda w: join(workpath,bam_dir, w.name+".Q5DD.bam") \ + if paired_end else join(workpath,bam_dir, w.name+".Q5DD_tagAlign.gz"), + fragLen =lambda w: join(workpath,bam_dir, ppqt_dir, w.name+".Q5DD_tagAlign.ppqt.txt") if \ + not paired_end else join(workpath,"QC", w.name+".Q5DD.insert_size_metrics.txt"), + c_option_pe = provided(lambda w: "{0}.Q5DD.bam".format( + join(workpath, bam_dir, chip2input[w.name]) + ) if chip2input[w.name] else "", paired_end==True), + c_option_se = provided(lambda w: "{0}.Q5DD_tagAlign.gz".format( + join(workpath, bam_dir, chip2input[w.name]) + ) if chip2input[w.name] else "", paired_end==False) output: bed = join(workpath,sicer_dir,"{name}","{name}_broadpeaks.bed"), params: @@ -141,7 +185,10 @@ rule SICER: genomever = config['options']['genome'], name="{name}", sicer_dir=join(workpath,sicer_dir,"{name}"), - tmpdir=tmpdir + tmpdir=tmpdir, + paired_end = paired_end, + frac=config['references'][genome]['FRAC'], + flag= lambda w: "-c" if chip2input[w.name] else "" shell: """ module load {params.sicerver} module load {params.bedtoolsver} @@ -149,37 +196,91 @@ rule SICER: tmp=$(mktemp -d -p "{params.tmpdir}") trap 'rm -rf "${{tmp}}"' EXIT - MEAN_INSERT_SIZE=$(cat {input.fragLen} | awk '/MEDIAN_INSERT_SIZE/{{f=1;next}} /## HISTOGRAM/{{f=0}} f' | cut -f 6) - mean_insert_size=$(printf "%.0f" $MEAN_INSERT_SIZE) + if [ '{params.paired_end}' == True ]; then + MEAN_INSERT_SIZE=$(cat {input.fragLen} | awk '/MEDIAN_INSERT_SIZE/{{f=1;next}} /## HISTOGRAM/{{f=0}} f' | cut -f 6) + mean_insert_size=$(printf "%.0f" $MEAN_INSERT_SIZE) + else + mean_insert_size=$(awk '{{print $1}}' {input.fragLen}) + fi + echo "printing out value of mean-insert-size ${{mean_insert_size}}" + a={input.c_option_se} + echo "Printing input.c_option_se ${{a}}" + if [ '{params.paired_end}' == True ]; then + echo "praired-end with input" + if [ -f "{input.c_option_pe}" ]; + then + echo "praired-end with input" + sicer \\ + -t {input.chip} \\ + -c {input.c_option_pe} \\ + -s {params.genomever} \\ + -rt 100 \\ + -w 300 \\ + -f ${{mean_insert_size}} \\ + -egf {params.frac} \\ + -g 600 \\ + -fdr 1E-2 \\ + -cpu 30 \\ + -o ${{tmp}} + + mv ${{tmp}}/{params.name}.Q5DD-W300-G600-FDR0.01-island.bed {output.bed}; + mv ${{tmp}}/{params.name}.Q5DD-W300-G600-islands-summary {params.sicer_dir} + else + echo "praired-end without input" + sicer \\ + -t {input.chip} \\ + -s {params.genomever} \\ + -rt 100 \\ + -w 300 \\ + -f ${{mean_insert_size}} \\ + -egf {params.frac} \\ + -g 600 \\ + -e 100 \\ + -cpu 30 \\ + -o ${{tmp}} - if [ -f "{input.ctrl}" ]; - then - sicer \\ - -t {input.chip} \\ - -c {input.ctrl} \\ - -s {params.genomever} \\ - -rt 100 \\ - -w 300 \\ - -f ${{mean_insert_size}} \\ - -egf 0.75 \\ - -g 600 \\ - -fdr 1E-2 \\ - -cpu 30 \\ - -o . - mv {params.name}.Q5DD-W300-G600-FDR0.01-island.bed {output.bed}; - mv {params.name}.Q5DD-W300-G600-islands-summary {params.sicer_dir} + mv ${{tmp}}/{params.name}.Q5DD-W300-G600.scoreisland {params.sicer_dir} + fi else - sicer \\ - -t {input.chip} \\ - -s {params.genomever} \\ - -rt 100 \\ - -w 300 \\ - -f ${{mean_insert_size}} \\ - -egf 0.75 \\ - -g 600 \\ - -e 100 \\ - -cpu 30 \\ - -o . - mv {params.name}.Q5DD-W300-G600.scoreisland {output.bed} + if [ -f "{input.c_option_se}" ]; + then + echo "single-end with input" + cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz; + awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed; + + cp {input.c_option_se} ${{tmp}}/input.bed.gz; gzip -d ${{tmp}}/input.bed.gz; + awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/input.bed > ${{tmp}}/inputV2.bed; + + sicer \\ + -t ${{tmp}}/{params.name}.bed \\ + -c ${{tmp}}/inputV2.bed \\ + -s {params.genomever} \\ + -rt 100 \\ + -w 300 \\ + -f ${{mean_insert_size}} \\ + -egf {params.frac} \\ + -g 600 \\ + -fdr 1E-2 \\ + -cpu 30 \\ + -o ${{tmp}} + mv ${{tmp}}/{params.name}-W300-G600-FDR0.01-island.bed {output.bed}; + mv ${{tmp}}/{params.name}-W300-G600-islands-summary {params.sicer_dir} + else + echo "single-end without input" + cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz; + awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed; + sicer \\ + -t ${{tmp}}/{params.name}.bed \\ + -s {params.genomever} \\ + -rt 100 \\ + -w 300 \\ + -f ${{mean_insert_size}} \\ + -egf {params.frac} \\ + -g 600 \\ + -e 100 \\ + -cpu 30 \\ + -o ${{tmp}} + mv ${{tmp}}/{params.name}-W300-G600.scoreisland {output.bed} + fi fi """ \ No newline at end of file diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 7654e9a..6c6adf0 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -195,7 +195,7 @@ rule fastq_screen: {input} """ -rule kraken_pe: +rule kraken: """ Quality-control step to assess for potential sources of microbial contamination. If there are high levels of microbial contamination, Kraken will provide an @@ -208,7 +208,7 @@ rule kraken_pe: """ input: fq1=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"), - fq2=join(workpath,trim_dir,"{name}.R2.trim.fastq.gz"), + fq2=provided(join(workpath,trim_dir,"{name}.R2.trim.fastq.gz"), paired_end) output: krakenout = join(workpath,kraken_dir,"{name}.trim.kraken_bacteria.out.txt"), krakentaxa = join(workpath,kraken_dir,"{name}.trim.kraken_bacteria.taxa.txt"), @@ -218,6 +218,7 @@ rule kraken_pe: outdir=join(workpath,kraken_dir), bacdb=config['shared_resources']['KRAKENBACDB'], tmpdir=tmpdir, + paired_end = paired_end threads: int(allocated("threads", "kraken_pe", cluster)), envmodules: config['tools']['KRAKENVER'], @@ -234,11 +235,19 @@ rule kraken_pe: # location to reduce filesystem strain cp -rv {params.bacdb} ${{tmp}}/; kdb_base=$(basename {params.bacdb}) - kraken2 --db ${{tmp}}/${{kdb_base}} \\ - --threads {threads} --report {output.krakentaxa} \\ - --output {output.krakenout} \\ - --gzip-compressed \\ - --paired {input.fq1} {input.fq2} + if [ '{params.paired_end}' == True ]; then + kraken2 --db ${{tmp}}/${{kdb_base}} \\ + --threads {threads} --report {output.krakentaxa} \\ + --output {output.krakenout} \\ + --gzip-compressed \\ + --paired {input.fq1} {input.fq2} + else + kraken2 --db ${{tmp}}/${{kdb_base}} \\ + --threads {threads} --report {output.krakentaxa} \\ + --output {output.krakenout} \\ + --gzip-compressed \\ + {input.fq1} + fi # Generate Krona Report cut -f2,3 {output.krakenout} | \\ diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk index fa85086..078701b 100644 --- a/workflow/rules/trim_align_dedup.smk +++ b/workflow/rules/trim_align_dedup.smk @@ -126,7 +126,7 @@ rule trim: -p ${{tmp}}/{params.sample}.R2.trim.fastq.gz \\ {input.file1} {input.file2} - if [ "{params.blacklistbwaindex}" != ""]; + if [ "{params.blacklistbwaindex}" != "" ]; then bwa mem -t {threads} \\ {params.blacklistbwaindex} \\ ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ @@ -165,7 +165,7 @@ rule trim: -o ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ {input.file1} - if [ "{params.blacklistbwaindex}" != ""]; + if [ "{params.blacklistbwaindex}" != "" ]; then bwa mem -t {threads} \\ {params.blacklistbwaindex} \\ ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\