Skip to content

Commit

Permalink
fix: seperate paired and single ended rules, conditionally import
Browse files Browse the repository at this point in the history
  • Loading branch information
rroutsong committed Dec 10, 2024
1 parent 9d89c2d commit e61eac0
Show file tree
Hide file tree
Showing 12 changed files with 1,023 additions and 691 deletions.
13 changes: 11 additions & 2 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -221,9 +221,18 @@ rule all:
input:
rule_all_ins

# Include child rules
# Include endedness rules
if paired_end:
include: join("rules", "paired", "peakcall.smk")
include: join("rules", "paired", "qc.smk")
include: join("rules", "paired", "trim.smk")
else:
include: join("rules", "paired", "peakcall.smk")
include: join("rules", "paired", "qc.smk")
include: join("rules", "paired", "trim.smk")

# Include general rules
include: join("rules", "hooks.smk")
include: join("rules", "trim_align_dedup.smk")
include: join("rules", "qc.smk")
include: join("rules", "peakcall.smk")
include: join("rules", "dba.smk")
Expand Down
132 changes: 132 additions & 0 deletions workflow/rules/paired/peakcall.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# Calling read coverage as peaks
# ~~~~
# Paired-end peak calling rules


rule MACS2_broad:
input:
chip = join(bam_dir, "{name}.Q5DD_tagAlign.gz"),
txt = join(ppqt_dir, "{name}.Q5DD_tagAlign.ppqt.txt"),
c_option = lambda w: join(bam_dir, f"{chip2input[w.name]}.Q5DD.bam"),
output:
join(macsB_dir, "{name}", "{name}_peaks.broadPeak"),
params:
rname = 'MACS2_broad',
gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'],
macsver = config['tools']['MACSVER'],
flag = lambda w: "-c" if chip2input[w.name] else "",
shell:
"""
module load {params.macsver};
macs2 callpeak \\
-t {input.chip} {params.flag} {input.c_option} \\
-g {params.gsize} \\
-n {wildcards.name} \\
--outdir {macsB_dir}/{wildcards.name} \\
--broad \\
--broad-cutoff 0.01 \\
--keep-dup="all" \\
-f "BAMPE"
"""


rule SICER:
input:
chip = lambda w: join(bam_dir, w.name + ".Q5DD.bam"),
fragLen = lambda w: join(qc_dir, name + ".Q5DD.insert_size_metrics.txt"),
c_option = lambda w: join(bam_dir, f"{chip2input[w.name]}.Q5DD.bam"),
output:
bed = join(sicer_dir, "{name}", "{name}_broadpeaks.bed") if has_inputs else [],
params:
rname = 'SICER',
name = "{name}",
sicerver = config['tools']['SICERVER'],
bedtoolsver = config['tools']['BEDTOOLSVER'],
genomever = config['options']['genome'],
this_sicer_dir = join(sicer_dir,"{name}"),
frac = config['references'][genome]['FRAC'],
flag = lambda w: "-c" if chip2input[w.name] else "",
shell:
"""
module load {params.sicerver}
module load {params.bedtoolsver}
if [ ! -d \"""" + str(tmpdir) + """\" ]; then mkdir -p \"""" + str(tmpdir) + """\"; fi
tmp=$(mktemp -d -p \"""" + str(tmpdir) + """\")
trap 'rm -rf "${{tmp}}"' EXIT
MEAN_INSERT_SIZE=$(cat {input.fragLen} | awk '/MEDIAN_INSERT_SIZE/{{f=1;next}} /## HISTOGRAM/{{f=0}} f' | cut -f 6)
mean_insert_size=$(printf "%.0f" $MEAN_INSERT_SIZE)
echo "printing out value of mean-insert-size ${{mean_insert_size}}"
a={input.c_option}
echo "Printing input.c_option ${{a}}"
if [ -f "{input.c_option}" ]; then
# Copying input to tmpdir due to SICER2
# bam2bed file conversion, if more than
# one sample shares the same IP sample
# than a race condition can occur where
# two jobs can concurrent try to write
# to the same BED file (during bedtools
# bam2bed that sicer calls).
input_bam="$(basename "{input.c_option}")"
cp {input.c_option} ${{tmp}}
echo "paired-end with input... ${{tmp}}/${{input_bam}}"
sicer \\
-t {input.chip} \\
-c "${{tmp}}/${{input_bam}}" \\
-s {params.genomever} \\
-rt 100 \\
-w 300 \\
-f ${{mean_insert_size}} \\
-egf {params.frac} \\
-g 600 \\
-fdr 1E-2 \\
-cpu 30 \\
-o ${{tmp}}
mv ${{tmp}}/{params.name}.Q5DD-W300-G600-FDR0.01-island.bed {output.bed};
mv ${{tmp}}/{params.name}.Q5DD-W300-G600-islands-summary {params.this_sicer_dir}
else
echo "paired-end without input"
sicer \\
-t {input.chip} \\
-s {params.genomever} \\
-rt 100 \\
-w 300 \\
-f ${{mean_insert_size}} \\
-egf {params.frac} \\
-g 600 \\
-e 100 \\
-cpu 30 \\
-o ${{tmp}}
mv ${{tmp}}/{params.name}.Q5DD-W300-G600.scoreisland {params.this_sicer_dir}
fi
"""


rule MACS2_narrow:
input:
chip = join(bam_dir, "{name}.Q5DD_tagAlign.gz"),
txt = join(ppqt_dir, "{name}.Q5DD_tagAlign.ppqt.txt"),
c_option = lambda w: join(bam_dir, f"{chip2input[w.name]}.Q5DD.bam"),
output:
join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"),
params:
rname = 'MACS2_narrow',
gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'],
macsver = config['tools']['MACSVER'],
flag = lambda w: "-c" if chip2input[w.name] else "",
shell:
"""
module load {params.macsver};
macs2 callpeak \\
-t {input.chip} {params.flag} {input.c_option} \\
-g {params.gsize} \\
-n {wildcards.name} \\
--outdir {macsN_dir}/{wildcards.name} \\
-q 0.01 \\
--keep-dup="all" \\
-f "BAMPE"
"""
116 changes: 116 additions & 0 deletions workflow/rules/paired/qc.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# Quality control metrics and reports
# ~~~~
# Paired-end peak qc rules

rule fastq_screen:
"""
Quality-control step to screen for different sources of contamination.
FastQ Screen compares your sequencing data to a set of different reference
genomes to determine if there is contamination. It allows a user to see if
the composition of your library matches what you expect.
@Input:
Trimmed FastQ files (scatter)
@Output:
FastQ Screen report and logfiles
"""
input:
expand(join(trim_dir, "{name}.R{rn}.trim.fastq.gz"), name=samples, rn=ends)
output:
first_txt = expand(join(
qc_dir, "FQscreen", "{name}.R{rn}.trim_screen.txt"), name=samples, rn=[1, 2]
),
first_png = expand(join(
qc_dir, "FQscreen", "{name}.R{rn}.trim_screen.png"), name=samples, rn=[1, 2]
),
second_txt = expand(join(
qc_dir, "FQscreen2", "{name}.R{rn}.trim_screen.txt"), name=samples, rn=[1, 2]
),
second_png = expand(join(
qc_dir, "FQscreen2", "{name}.R{rn}.trim_screen.png"), name=samples, rn=[1, 2]
)
params:
rname = 'fqscreen',
outdir = join(qc_dir, "FQscreen"),
outdir2 = join(qc_dir, "FQscreen2"),
fastq_screen = config['bin']['FASTQ_SCREEN'],
fastq_screen_config1 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P1'],
fastq_screen_config2 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P2'],
envmodules:
config['tools']['BOWTIE2VER'],
config['tools']['PERLVER'],
threads:
int(allocated("threads", "fastq_screen", cluster))
shell:
"""
# First pass of contamination screening
{params.fastq_screen} \\
--conf {params.fastq_screen_config1} \\
--outdir {params.outdir} \\
--threads {threads} \\
--subset 1000000 \\
--aligner bowtie2 \\
--force \\
{input}
# Second pass of contamination screening
{params.fastq_screen} \\
--conf {params.fastq_screen_config2} \\
--outdir {params.outdir2} \\
--threads {threads} \\
--subset 1000000 \\
--aligner bowtie2 \\
--force \\
{input}
"""

rule kraken:
"""
Quality-control step to assess for potential sources of microbial contamination.
If there are high levels of microbial contamination, Kraken will provide an
estimation of the taxonomic composition. Kraken is used in conjunction with
Krona to produce an interactive reports.
@Input:
Trimmed FastQ files (scatter)
@Output:
Kraken logfile and interative krona report
"""
input:
fq1 = join(trim_dir, "{name}.R1.trim.fastq.gz"),
fq2 = join(trim_dir, "{name}.R2.trim.fastq.gz")
output:
krakenout = join(kraken_dir, "{name}.trim.kraken_bacteria.out.txt"),
krakentaxa = join(kraken_dir, "{name}.trim.kraken_bacteria.taxa.txt"),
kronahtml = join(kraken_dir, "{name}.trim.kraken_bacteria.krona.html"),
params:
rname = 'kraken',
outdir = kraken_dir,
bacdb = config['shared_resources']['KRAKENBACDB'],
tmpdir = tmpdir
threads:
int(allocated("threads", "kraken_pe", cluster)),
envmodules:
config['tools']['KRAKENVER'],
config['tools']['KRONATOOLSVER'],
shell:
"""
# Setups temporary directory for
# intermediate files with built-in
# mechanism for deletion on exit
if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
tmp=$(mktemp -d -p "{params.tmpdir}")
trap 'rm -rf "${{tmp}}"' EXIT
# Copy kraken2 db to /lscratch or temp
# location to reduce filesystem strain
cp -rv {params.bacdb} ${{tmp}}/;
kdb_base=$(basename {params.bacdb})
kraken2 --db ${{tmp}}/${{kdb_base}} \\
--threads {threads} --report {output.krakentaxa} \\
--output {output.krakenout} \\
--gzip-compressed \\
--paired {input.fq1} {input.fq2}
# Generate Krona Report
cut -f2,3 {output.krakenout} | \\
ktImportTaxonomy - -o {output.kronahtml}
"""
Loading

0 comments on commit e61eac0

Please sign in to comment.