Skip to content

Commit

Permalink
Kraken and Peakcallers for single-end data (#24)
Browse files Browse the repository at this point in the history
* updating kraken rule to work with single and paired end data

* updating MACS2_narrow rule to work with single end data

* removing print statements

* updating MACS2_broad rule wot work with single-end data

* updating SICER rule to work with single-end data

* fixing spaces in trim rule

* SICER to have -egf effectivesize/ genomelen fraction instead of default 0.75

* fix to kraken rule in params

* updating path to mm10.blacklist

* fix for SICER rule

* fixing param.frac iin Sicer rule
  • Loading branch information
asyakhl authored Nov 7, 2023
1 parent 89df4ac commit 34109af
Show file tree
Hide file tree
Showing 5 changed files with 194 additions and 80 deletions.
2 changes: 1 addition & 1 deletion config/cluster.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"mem": "64g",
"threads": "32"
},
"kraken_pe": {
"kraken": {
"cpus-per-task": "24",
"gres": "lscratch:256",
"mem": "110g"
Expand Down
14 changes: 9 additions & 5 deletions config/genome.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
"GENOME": "/data/OpenOmics/references/cfChIP-seek/hg19_basic/ref.fa",
"GENOMECHR": "/data/OpenOmics/references/cfChIP-seek/hg19_basic/Chromsomes/",
"GTFFILE": "/data/OpenOmics/references/cfChIP-seek/hg19_basic/genes.gtf",
"REFLEN": "/data/OpenOmics/references/cfChIP-seek/hg19_basic/indexes/hg19.fa.sizes"
"REFLEN": "/data/OpenOmics/references/cfChIP-seek/hg19_basic/indexes/hg19.fa.sizes",
"FRAC": "0.8970265870106838"
},
"hg38": {
"ALIAS": "hg38",
Expand All @@ -26,12 +27,13 @@
"GENOME": "/data/OpenOmics/references/chrom-seek/hg38_basic/ref.fa",
"GENOMECHR": "/data/OpenOmics/references/chrom-seek/hg38_basic/Chromsomes/",
"GTFFILE": "/data/OpenOmics/references/chrom-seek/hg38_basic/genes.gtf",
"REFLEN": "/data/OpenOmics/references/chrom-seek/hg38_basic/indexes/hg38.fa.sizes"
"REFLEN": "/data/OpenOmics/references/chrom-seek/hg38_basic/indexes/hg38.fa.sizes",
"FRAC": "0.9084767300375779"
},
"mm10": {
"ALIAS": "mm10",
"SUPPORTED_PIPELINES": ["chip", "atac"],
"BLACKLISTBWAINDEX": "/data/OpenOmics/references/chrom-seek/mm10_basic/indexes/mm10.blacklist.chrM",
"BLACKLISTBWAINDEX": "/data/OpenOmics/references/chrom-seek/mm10_basic/indexes/mm10.blacklist.chrM.chr_rDNA",
"BLACKLISTGENRICH": "/data/OpenOmics/references/chrom-seek/mm10_basic/indexes/mm10.blacklist.bed.gz",
"BWA": "/data/OpenOmics/references/chrom-seek/mm10_basic/indexes/mm10",
"cfChIP_TOOLS_SRC": "",
Expand All @@ -40,7 +42,8 @@
"GENOME": "/data/OpenOmics/references/chrom-seek/mm10_basic/ref.fa",
"GENOMECHR": "/data/OpenOmics/references/chrom-seek/mm10_basic/Chromsomes/",
"GTFFILE": "/data/OpenOmics/references/chrom-seek/mm10_basic/genes.gtf",
"REFLEN": "/data/OpenOmics/references/chrom-seek/mm10_basic/indexes/mm10.fa.sizes"
"REFLEN": "/data/OpenOmics/references/chrom-seek/mm10_basic/indexes/mm10.fa.sizes",
"FRAC": "0.9053190260640642"
},
"rheMac10": {
"ALIAS": "rheMac10",
Expand All @@ -54,7 +57,8 @@
"GENOME": "/data/OpenOmics/references/chrom-seek/rheMac10/Macaca_mulatta.Mmul_10.dna.chr.fa.gz",
"GENOMECHR": "",
"GTFFILE": "/data/OpenOmics/references/chrom-seek/rheMac10/Macaca_mulatta.Mmul_10.110.chr.gtf",
"REFLEN": "/data/OpenOmics/references/chrom-seek/rheMac10/Macaca_mulatta.Mmul_10.dna.chr.sizes"
"REFLEN": "/data/OpenOmics/references/chrom-seek/rheMac10/Macaca_mulatta.Mmul_10.dna.chr.sizes",
"FRAC": "0.9643910116868353"
}
}
}
231 changes: 166 additions & 65 deletions workflow/rules/peakcall.smk
Original file line number Diff line number Diff line change
Expand Up @@ -74,64 +74,108 @@ rule genrich:
# INDIVIDUAL RULES
rule MACS2_narrow:
input:
chip = join(workpath,bam_dir,"{name}.Q5DD.bam"),
ctrl = get_input_bam,
chip = lambda w: join(workpath,bam_dir, w.name+".Q5DD.bam") \
if paired_end else join(workpath,bam_dir, w.name+".Q5DD_tagAlign.gz"),
txt = lambda w: join(workpath,bam_dir, ppqt_dir, w.name+".Q5DD.ppqt.txt") \
if paired_end else join(workpath,bam_dir, ppqt_dir, w.name+".Q5DD_tagAlign.ppqt.txt"),
c_option_pe = provided( lambda w: "{0}.Q5DD.bam".format(
join(workpath, bam_dir, chip2input[w.name])
) if chip2input[w.name] else "", paired_end == True),
c_option_se = provided(lambda w: "{0}.Q5DD_tagAlign.gz".format(
join(workpath, bam_dir, chip2input[w.name])
) if chip2input[w.name] else "", paired_end == False) # Building optional argument for paired input option, input: '-c /path/to/input.Q5DD.bam', No input: ''
output:
join(workpath,macsN_dir,"{name}","{name}_peaks.narrowPeak"),
params:
rname='MACS2_narrow',
gsize=config['references'][genome]['EFFECTIVEGENOMESIZE'],
macsver=config['tools']['MACSVER'],
# Building optional argument for paired input option,
# input: '-c /path/to/input.Q5DD.bam', No input: ''
c_option = lambda w: "-c {0}.Q5DD.bam".format(
join(workpath, bam_dir, chip2input[w.name])
) if chip2input[w.name] else "",
paired_end = paired_end,
flag= lambda w: "-c" if chip2input[w.name] else ""
shell: """
module load {params.macsver};
macs2 callpeak \\
-t {input.chip} {params.c_option} \\
-g {params.gsize} \\
-n {wildcards.name} \\
--outdir {workpath}/{macsN_dir}/{wildcards.name} \\
-q 0.01 \\
--keep-dup="all" \\
-f "BAMPE"
if [ '{params.paired_end}' == True ]; then
macs2 callpeak \\
-t {input.chip} {params.flag} {input.c_option_pe} \\
-g {params.gsize} \\
-n {wildcards.name} \\
--outdir {workpath}/{macsN_dir}/{wildcards.name} \\
-q 0.01 \\
--keep-dup="all" \\
-f "BAMPE"
else
ppqt_len=$(awk '{{print $1}}' {input.txt})
macs2 callpeak \\
-t {input.chip} {params.flag} {input.c_option_se} \\
-g {params.gsize} \\
-n {wildcards.name} \\
--outdir {workpath}/{macsN_dir}/{wildcards.name} \\
-q 0.01 \\
--keep-dup="all" \\
--nomodel \\
--extsize $ppqt_len
fi
"""

rule MACS2_broad:
input:
chip = join(workpath,bam_dir,"{name}.Q5DD.bam"),
ctrl = get_input_bam,
chip = lambda w: join(workpath,bam_dir, w.name+".Q5DD.bam") \
if paired_end else join(workpath,bam_dir, w.name+".Q5DD_tagAlign.gz"),
txt = lambda w: join(workpath,bam_dir, ppqt_dir, w.name+".Q5DD.ppqt.txt") \
if paired_end else join(workpath,bam_dir, ppqt_dir, w.name+".Q5DD_tagAlign.ppqt.txt"),
c_option_pe = provided(lambda w: "{0}.Q5DD.bam".format(
join(workpath, bam_dir, chip2input[w.name])
) if chip2input[w.name] else "", paired_end == True),
c_option_se = provided(lambda w: "{0}.Q5DD_tagAlign.gz".format(
join(workpath, bam_dir, chip2input[w.name])
) if chip2input[w.name] else "", paired_end ==False)
output:
join(workpath,macsB_dir,"{name}","{name}_peaks.broadPeak"),
params:
rname='MACS2_broad',
gsize=config['references'][genome]['EFFECTIVEGENOMESIZE'],
macsver=config['tools']['MACSVER'],
# Building optional argument for paired input option,
# input: '-c /path/to/input.Q5DD.bam', No input: ''
c_option = lambda w: "-c {0}.Q5DD.bam".format(
join(workpath, bam_dir, chip2input[w.name])
) if chip2input[w.name] else "",
paired_end = paired_end,
flag= lambda w: "-c" if chip2input[w.name] else ""
shell: """
module load {params.macsver};
macs2 callpeak \\
-t {input.chip} {params.c_option} \\
-g {params.gsize} \\
-n {wildcards.name} \\
--outdir {workpath}/{macsB_dir}/{wildcards.name} \\
--broad \\
--broad-cutoff 0.01 \\
--keep-dup="all" \\
-f "BAMPE"
if [ '{params.paired_end}' == True ]; then
macs2 callpeak \\
-t {input.chip} {params.flag} {input.c_option_pe} \\
-g {params.gsize} \\
-n {wildcards.name} \\
--outdir {workpath}/{macsB_dir}/{wildcards.name} \\
--broad \\
--broad-cutoff 0.01 \\
--keep-dup="all" \\
-f "BAMPE"
else
ppqt_len=$(awk '{{print $1}}' {input.txt})
macs2 callpeak \\
-t {input.chip} {params.flag} {input.c_option_se} \\
-g {params.gsize} \\
-n {wildcards.name} \\
--outdir {workpath}/{macsB_dir}/{wildcards.name} \\
--broad \\
--broad-cutoff 0.01 \\
--keep-dup="all" \\
--nomodel \\
--extsize $ppqt_len
fi
"""

rule SICER:
input:
chip = join(workpath,bam_dir,"{name}.Q5DD.bam"),
ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".Q5DD.bam"),
fragLen= join(workpath,"QC","{name}.Q5DD.insert_size_metrics.txt"),
chip = lambda w: join(workpath,bam_dir, w.name+".Q5DD.bam") \
if paired_end else join(workpath,bam_dir, w.name+".Q5DD_tagAlign.gz"),
fragLen =lambda w: join(workpath,bam_dir, ppqt_dir, w.name+".Q5DD_tagAlign.ppqt.txt") if \
not paired_end else join(workpath,"QC", w.name+".Q5DD.insert_size_metrics.txt"),
c_option_pe = provided(lambda w: "{0}.Q5DD.bam".format(
join(workpath, bam_dir, chip2input[w.name])
) if chip2input[w.name] else "", paired_end==True),
c_option_se = provided(lambda w: "{0}.Q5DD_tagAlign.gz".format(
join(workpath, bam_dir, chip2input[w.name])
) if chip2input[w.name] else "", paired_end==False)
output:
bed = join(workpath,sicer_dir,"{name}","{name}_broadpeaks.bed"),
params:
Expand All @@ -141,45 +185,102 @@ rule SICER:
genomever = config['options']['genome'],
name="{name}",
sicer_dir=join(workpath,sicer_dir,"{name}"),
tmpdir=tmpdir
tmpdir=tmpdir,
paired_end = paired_end,
frac=config['references'][genome]['FRAC'],
flag= lambda w: "-c" if chip2input[w.name] else ""
shell: """
module load {params.sicerver}
module load {params.bedtoolsver}
if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
tmp=$(mktemp -d -p "{params.tmpdir}")
trap 'rm -rf "${{tmp}}"' EXIT
MEAN_INSERT_SIZE=$(cat {input.fragLen} | awk '/MEDIAN_INSERT_SIZE/{{f=1;next}} /## HISTOGRAM/{{f=0}} f' | cut -f 6)
mean_insert_size=$(printf "%.0f" $MEAN_INSERT_SIZE)
if [ '{params.paired_end}' == True ]; then
MEAN_INSERT_SIZE=$(cat {input.fragLen} | awk '/MEDIAN_INSERT_SIZE/{{f=1;next}} /## HISTOGRAM/{{f=0}} f' | cut -f 6)
mean_insert_size=$(printf "%.0f" $MEAN_INSERT_SIZE)
else
mean_insert_size=$(awk '{{print $1}}' {input.fragLen})
fi
echo "printing out value of mean-insert-size ${{mean_insert_size}}"
a={input.c_option_se}
echo "Printing input.c_option_se ${{a}}"
if [ '{params.paired_end}' == True ]; then
echo "praired-end with input"
if [ -f "{input.c_option_pe}" ];
then
echo "praired-end with input"
sicer \\
-t {input.chip} \\
-c {input.c_option_pe} \\
-s {params.genomever} \\
-rt 100 \\
-w 300 \\
-f ${{mean_insert_size}} \\
-egf {params.frac} \\
-g 600 \\
-fdr 1E-2 \\
-cpu 30 \\
-o ${{tmp}}
mv ${{tmp}}/{params.name}.Q5DD-W300-G600-FDR0.01-island.bed {output.bed};
mv ${{tmp}}/{params.name}.Q5DD-W300-G600-islands-summary {params.sicer_dir}
else
echo "praired-end without input"
sicer \\
-t {input.chip} \\
-s {params.genomever} \\
-rt 100 \\
-w 300 \\
-f ${{mean_insert_size}} \\
-egf {params.frac} \\
-g 600 \\
-e 100 \\
-cpu 30 \\
-o ${{tmp}}
if [ -f "{input.ctrl}" ];
then
sicer \\
-t {input.chip} \\
-c {input.ctrl} \\
-s {params.genomever} \\
-rt 100 \\
-w 300 \\
-f ${{mean_insert_size}} \\
-egf 0.75 \\
-g 600 \\
-fdr 1E-2 \\
-cpu 30 \\
-o .
mv {params.name}.Q5DD-W300-G600-FDR0.01-island.bed {output.bed};
mv {params.name}.Q5DD-W300-G600-islands-summary {params.sicer_dir}
mv ${{tmp}}/{params.name}.Q5DD-W300-G600.scoreisland {params.sicer_dir}
fi
else
sicer \\
-t {input.chip} \\
-s {params.genomever} \\
-rt 100 \\
-w 300 \\
-f ${{mean_insert_size}} \\
-egf 0.75 \\
-g 600 \\
-e 100 \\
-cpu 30 \\
-o .
mv {params.name}.Q5DD-W300-G600.scoreisland {output.bed}
if [ -f "{input.c_option_se}" ];
then
echo "single-end with input"
cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz;
awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed;
cp {input.c_option_se} ${{tmp}}/input.bed.gz; gzip -d ${{tmp}}/input.bed.gz;
awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/input.bed > ${{tmp}}/inputV2.bed;
sicer \\
-t ${{tmp}}/{params.name}.bed \\
-c ${{tmp}}/inputV2.bed \\
-s {params.genomever} \\
-rt 100 \\
-w 300 \\
-f ${{mean_insert_size}} \\
-egf {params.frac} \\
-g 600 \\
-fdr 1E-2 \\
-cpu 30 \\
-o ${{tmp}}
mv ${{tmp}}/{params.name}-W300-G600-FDR0.01-island.bed {output.bed};
mv ${{tmp}}/{params.name}-W300-G600-islands-summary {params.sicer_dir}
else
echo "single-end without input"
cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz;
awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed;
sicer \\
-t ${{tmp}}/{params.name}.bed \\
-s {params.genomever} \\
-rt 100 \\
-w 300 \\
-f ${{mean_insert_size}} \\
-egf {params.frac} \\
-g 600 \\
-e 100 \\
-cpu 30 \\
-o ${{tmp}}
mv ${{tmp}}/{params.name}-W300-G600.scoreisland {output.bed}
fi
fi
"""
23 changes: 16 additions & 7 deletions workflow/rules/qc.smk
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ rule fastq_screen:
{input}
"""

rule kraken_pe:
rule kraken:
"""
Quality-control step to assess for potential sources of microbial contamination.
If there are high levels of microbial contamination, Kraken will provide an
Expand All @@ -208,7 +208,7 @@ rule kraken_pe:
"""
input:
fq1=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),
fq2=join(workpath,trim_dir,"{name}.R2.trim.fastq.gz"),
fq2=provided(join(workpath,trim_dir,"{name}.R2.trim.fastq.gz"), paired_end)
output:
krakenout = join(workpath,kraken_dir,"{name}.trim.kraken_bacteria.out.txt"),
krakentaxa = join(workpath,kraken_dir,"{name}.trim.kraken_bacteria.taxa.txt"),
Expand All @@ -218,6 +218,7 @@ rule kraken_pe:
outdir=join(workpath,kraken_dir),
bacdb=config['shared_resources']['KRAKENBACDB'],
tmpdir=tmpdir,
paired_end = paired_end
threads: int(allocated("threads", "kraken_pe", cluster)),
envmodules:
config['tools']['KRAKENVER'],
Expand All @@ -234,11 +235,19 @@ rule kraken_pe:
# location to reduce filesystem strain
cp -rv {params.bacdb} ${{tmp}}/;
kdb_base=$(basename {params.bacdb})
kraken2 --db ${{tmp}}/${{kdb_base}} \\
--threads {threads} --report {output.krakentaxa} \\
--output {output.krakenout} \\
--gzip-compressed \\
--paired {input.fq1} {input.fq2}
if [ '{params.paired_end}' == True ]; then
kraken2 --db ${{tmp}}/${{kdb_base}} \\
--threads {threads} --report {output.krakentaxa} \\
--output {output.krakenout} \\
--gzip-compressed \\
--paired {input.fq1} {input.fq2}
else
kraken2 --db ${{tmp}}/${{kdb_base}} \\
--threads {threads} --report {output.krakentaxa} \\
--output {output.krakenout} \\
--gzip-compressed \\
{input.fq1}
fi
# Generate Krona Report
cut -f2,3 {output.krakenout} | \\
Expand Down
Loading

0 comments on commit 34109af

Please sign in to comment.