Skip to content

Commit

Permalink
fix: correct ci dry run failures
Browse files Browse the repository at this point in the history
  • Loading branch information
rroutsong committed Dec 11, 2024
1 parent 05d7fa0 commit e3da83c
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 123 deletions.
15 changes: 7 additions & 8 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ from os.path import join
from os import listdir

# Local imports
from scripts.common import provided, get_file_components
from scripts.common import provided
from scripts.grouping import group_samples_by_reps, \
group_output_files, get_peaktools

Expand Down Expand Up @@ -39,7 +39,6 @@ peak_types = config['options']['peak_type_base']
rule_all_ins = []
groupdatawinput, groupswreps = group_samples_by_reps(groupdata, samples, chip2input)
PeakTools = get_peaktools(assay)
file_stems, extRPGC, extaln = get_file_components(paired_end)
groups = list(groupdatawinput.keys())
reps = True if len(groupswreps) > 0 else False
uniq_inputs = list(sorted(set([v for v in chip2input.values() if v])))
Expand All @@ -48,7 +47,6 @@ sampleswinput = [
if chip_value != 'NA' and chip_value != ''
]
inputnorm = [""] if len(sampleswinput) == 0 else ["", ".inputnorm"]
deepgroups, deepexts = group_output_files(extRPGC, groups, inputnorm)
UropaCats = ["protTSS", "prot", "protSEC", "genes"]

# Directory end points
Expand Down Expand Up @@ -134,15 +132,16 @@ elif assay in ["atac", "chip"]:
# if has_inputs:
# rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips))
# rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips))
rule_all_ins.extend(expand(join(qc_dir, "{name}.{stem}.insert_size_metrics.txt"), name=samples, stem=file_stems))
rule_all_ins.extend(expand(join(qc_dir, "{name}.sorted.insert_size_metrics.txt"), name=samples))
if assay == "chip":
rule_all_ins.extend(expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips))
# sicer outputs turned off for now
# if has_inputs:
# rule_all_ins.extend(expand(join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), name=chips))
rule_all_ins.extend(expand(join(ppqt_dir, "{name}.sorted.ppqt"), name=samples))
rule_all_ins.extend(expand(join(ppqt_dir, "{name}.sorted.pdf"), name=samples))
rule_all_ins.extend(expand(join(ppqt_dir, "{name}.sorted.ppqt.txt"), name=samples))
# if not paired_end:
# rule_all_ins.extend(expand(join(ppqt_dir, "{name}.sorted.ppqt"), name=samples))
# rule_all_ins.extend(expand(join(ppqt_dir, "{name}.sorted.pdf"), name=samples))
# rule_all_ins.extend(expand(join(ppqt_dir, "{name}.sorted.ppqt.txt"), name=samples))

rule_all_ins.extend(expand(
join(uropa_dir, "{application}", "{name}_uropa_{_type}_allhits.txt"),
Expand Down Expand Up @@ -174,7 +173,7 @@ rule_all_ins.extend(expand(join(qc_dir, "{name}.preseq.dat"), name=samples))
rule_all_ins.extend(
expand(join(peakqc_dir, "{PeakTool}", "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=chips)
)
rule_all_ins.extend(expand(join(bam_dir, "{name}.{ext}"), name=samples, ext=extaln))
rule_all_ins.extend(expand(join(bam_dir, "{name}.sorted.bam"), name=samples))
rule_all_ins.extend(expand(join(bw_dir, "{name}.{ext}.RPGC.bw"), name=samples, ext=["sorted", "Q5DD"]))

if has_inputs:
Expand Down
2 changes: 1 addition & 1 deletion workflow/rules/paired/qc.smk
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ rule insert_size:
Number of reads per insert size and their histogram
"""
input:
bam = lambda w : join(bam_dir, f"{w.name}.{w.ext}.bam")
bam = join(bam_dir, "{name}.{ext}.bam")
output:
txt = join(qc_dir, "{name}.{ext}.insert_size_metrics.txt"),
pdf = join(qc_dir, "{name}.{ext}.insert_size_histogram.pdf"),
Expand Down
2 changes: 1 addition & 1 deletion workflow/rules/paired/trim_align_dedup.smk
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ rule dedup:

rule ppqt:
input:
lambda w : join(bam_dir, f"{w.name}.{w.ext}.bam")
join(bam_dir, "{name}.{ext}.bam")
output:
ppqt = join(ppqt_dir, "{name}.{ext}.ppqt"),
pdf = join(ppqt_dir, "{name}.{ext}.pdf"),
Expand Down
3 changes: 1 addition & 2 deletions workflow/rules/single/qc.smk
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,7 @@ rule insert_size:
Number of reads per insert size and their histogram
"""
input:
bam = lambda w : join(bam_dir, f"{w.name}.{w.ext}." +
"bam" if w.ext.lower() == 'sorted' else "gz"),
bam = join(bam_dir, "{name}.sorted.bam"),
output:
txt = join(qc_dir, "{name}.{ext}.insert_size_metrics.txt"),
pdf = join(qc_dir, "{name}.{ext}.insert_size_histogram.pdf"),
Expand Down
118 changes: 58 additions & 60 deletions workflow/rules/single/trim_align_dedup.smk
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,9 @@ rule trim:
Trimmed and blacklist-sequences-free FastQ files
"""
input:
file1 = join(workpath, "{name}.R1.fastq.gz"),
file2 = provided(join(workpath, "{name}.R2.fastq.gz"), paired_end)
join(workpath, "{name}.R1.fastq.gz"),
output:
outfq1 = temp(join(trim_dir, "{name}.R1.trim.fastq.gz")),
outfq2 = provided(temp(join(trim_dir, "{name}.R2.trim.fastq.gz")), paired_end)
temp(join(trim_dir, "{name}.R1.trim.fastq.gz")),
params:
rname = "trim",
cutadaptver = config['tools']['CUTADAPTVER'],
Expand Down Expand Up @@ -75,7 +73,7 @@ rule trim:
-b file:{params.fastawithadaptersetd} \\
-j {threads} \\
-o ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\
{input.file1}
{input}
if [ "{params.blacklistbwaindex}" != "" ]; then
bwa mem -t {threads} \\
Expand All @@ -93,7 +91,7 @@ rule trim:
rm ${{tmp}}/{params.sample}.bam;
pigz -p {threads} ${{tmp}}/{params.sample}.R1.trim.fastq;
fi
mv ${{tmp}}/{params.sample}.R1.trim.fastq.gz {output.outfq1};
mv ${{tmp}}/{params.sample}.R1.trim.fastq.gz {output};
"""


Expand Down Expand Up @@ -168,14 +166,13 @@ rule dedup:
"""
input:
join(bam_dir,"{name}.Q5.bam")
join(bam_dir, "{name}.Q5.bam")
output:
out_bam = join(bam_dir, "{name}.Q5DD.bam"),
flagstat = join(bam_dir, "{name}.Q5DD.bam.flagstat"),
idxstat = join(bam_dir, "{name}.Q5DD.bam.idxstat"),
bwadups = join(bam_dir, "{name}.bwa.Q5.duplic"),
tagalign = join(bam_dir, "{name}.Q5DD_tagAlign") if assay == "cfchip" \
else join(bam_dir, "{name}.Q5DD_tagAlign.gz")
tagalign = join(bam_dir, "{name}.Q5DD_tagAlign.gz")
params:
rname = 'dedup',
picardver = config['tools']['PICARDVER'],
Expand All @@ -200,52 +197,32 @@ rule dedup:
tmp=$(mktemp -d -p "{params.tmpdir}")
trap 'rm -rf "${{tmp}}"' EXIT
if [ "{assay}" == "cfchip" ]; then
java -Xmx{params.javaram} \\
-jar $PICARDJARPATH/picard.jar MarkDuplicates \\
-I {input} \\
-O {params.tmpBam} \\
-TMP_DIR ${{tmp}} \\
-VALIDATION_STRINGENCY SILENT \\
-REMOVE_DUPLICATES true
-METRICS_FILE {output.bwadups};
samtools index {params.tmpBam};
samtools view -b {params.tmpBam} chr{{1..22}} > {output.out_bam};
Rscript {params.rscript} {params.tmpBam} {output.tagalign};
rm {params.tmpBam} {params.tmpBam}.bai;
samtools index {output.out_bam};
samtools flagstat {output.out_bam} > {output.flagstat};
samtools idxstats {output.out_bam} > {output.idxstat};
else
java -Xmx{params.javaram} \\
-jar $PICARDJARPATH/picard.jar MarkDuplicates \\
-I {input} \\
-O {output.out_bam} \\
-TMP_DIR ${{tmp}} \\
-VALIDATION_STRINGENCY SILENT \\
-REMOVE_DUPLICATES true \\
-METRICS_FILE {output.bwadups};
samtools index {output.out_bam};
samtools flagstat {output.outout_bam5} > {output.flagstat};
samtools idxstats {output.out_bam} > {output.idxstat};
fi
macs2 filterdup -i {input} -g {params.gsize} --keep-dup="auto" -o ${{tmp}}/TmpTagAlign;
awk -F"\\t" -v OFS="\\t" '{{if ($2>0 && $3>0) {{print}}}}' ${{tmp}}/TmpTagAlign > ${{tmp}}/TmpTagAlign2;
awk -F"\\t" -v OFS="\\t" '{{print $1,1,$2}}' {params.genomefile} | sort -k1,1 -k2,2n > ${{tmp}}/GenomeFileBed;
bedtools intersect -wa -f 1.0 -a ${{tmp}}/TmpTagAlign2 -b ${{tmp}}/GenomeFileBed > ${{tmp}}/TmpTagAlign3;
bedtools bedtobam -i ${{tmp}}/TmpTagAlign3 -g {params.genomefile} | samtools sort -@4 -o {output.out_bam};
gzip ${{tmp}}/TmpTagAlign3;
mv ${{tmp}}/TmpTagAlign3.gz {output.tagalign};
samtools index {output.out_bam};
samtools flagstat {output.out_bam} > {output.flagstat}
samtools idxstats {output.out_bam} > {output.idxstat}
"""


rule ppqt:
input:
bam = lambda w : join(bam_dir, f"{w.name}.{w.ext}." +
"bam" if w.ext.lower() == 'sorted' else "gz"),
bam = join(bam_dir, "{name}.{ext}.bam")
output:
ppqt = join(ppqt_dir, "{name}.{ext}.ppqt"),
ppqt = join(ppqt_dir, "{name}.{ext}.ppqt.txt"),
pdf = join(ppqt_dir, "{name}.{ext}.pdf"),
txt = join(ppqt_dir, "{name}.{ext}.ppqt.txt"),
txt = join(ppqt_dir, "{name}.{ext}.txt"),
params:
rname = "ppqt",
samtoolsver = config['tools']['SAMTOOLSVER'],
rver = config['tools']['RVER'],
scriptPy = join(bin_path, "ppqt_process.py"),
file_name = "{name}"
tmpdir = tmpdir,
container:
config['images']['ppqt']
shell:
Expand All @@ -254,16 +231,39 @@ rule ppqt:
tmp=$(mktemp -d -p "{params.tmpdir}")
trap 'rm -rf "${{tmp}}"' EXIT
if [[ '{input.bam}' == *.gz ]]; then
ln -s {input.bam} ${{tmp}}/{params.file_name}.Q5DD.tagAlign.gz;
run_spp.R -c=${{tmp}}/{params.file_name}.Q5DD.tagAlign.gz \
-savp={output.pdf} -out={output.ppqt} \
-tmpdir=/lscratch/$SLURM_JOBID -rf
else
run_spp.R -c={input.bam} \
-savp={output.pdf} -out={output.ppqt} \
-tmpdir=/lscratch/$SLURM_JOBID -rf
fi
run_spp.R -c={input.bam} \
-savp={output.pdf} -out={output.ppqt} \
-tmpdir=/lscratch/$SLURM_JOBID -rf
python {params.scriptPy} -i {output.ppqt} -o {output.txt}
"""


rule ppqt_tagalign:
input:
bam = join(bam_dir, "{name}.Q5DD_tagAlign.gz")
output:
ppqt = join(ppqt_dir, "{name}.Q5DD_tagAlign.ppqt.txt"),
pdf = join(ppqt_dir, "{name}.Q5DD_tagAlign.pdf"),
txt = join(ppqt_dir, "{name}.Q5DD_tagAlign.txt"),
params:
rname = "ppqt",
samtoolsver = config['tools']['SAMTOOLSVER'],
rver = config['tools']['RVER'],
scriptPy = join(bin_path, "ppqt_process.py"),
tmpdir = tmpdir,
container:
config['images']['ppqt']
shell:
"""
if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
tmp=$(mktemp -d -p "{params.tmpdir}")
trap 'rm -rf "${{tmp}}"' EXIT
ln -s {input.bam} ${{tmp}}/{wildcards.name}.Q5DD_tagAlign.gz;
run_spp.R -c=${{tmp}}/{wildcards.name}.Q5DD_tagAlign.gz \
-savp={output.pdf} -out={output.ppqt} \
-tmpdir=/lscratch/$SLURM_JOBID -rf
python {params.scriptPy} -i {output.ppqt} -o {output.txt}
"""

Expand All @@ -280,25 +280,23 @@ rule bam2bw:
an associated score, RPGC
"""
input:
bam = lambda w: join(bam_dir, "{w.name}." +
"Q5DD.bam" if w.ext == "Q5DD" else "sorted.bam"),
ppqt = lambda w: join(ppqt_dir, "{w.name}." +
"Q5DD_tagAlign.ppqt.txt" if w.ext == "Q5DD" else
"sorted.ppqt.txt")
bam = lambda w: join(bam_dir, f"{w.name}.{w.ext}.bam"),
ppqt = lambda w: join(ppqt_dir, f"{w.name}.{w.ext}.ppqt.txt")
output:
outbw = join(bw_dir, "{name}.{ext}.RPGC.bw"),
params:
rname = "bam2bw",
name = "{name}",
effectivegenomesize = config['references'][genome]['EFFECTIVEGENOMESIZE'],
tmpdir = tmpdir,
threads:
int(allocated("threads", "bam2bw", cluster)),
envmodules:
config['tools']['DEEPTOOLSVER'],
shell:
"""
if [ ! -d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi
tmp=$(mktemp -d -p \"""" + tmpdir + """\")
if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
tmp=$(mktemp -d -p "{params.tmpdir}")
trap 'rm -rf "${{tmp}}"' EXIT
bam_cov_option=--centerReads
Expand Down
67 changes: 16 additions & 51 deletions workflow/scripts/peakcall.py
Original file line number Diff line number Diff line change
@@ -1,51 +1,30 @@
#!/usr/bin/env python3
from os.path import join

def get_input_bam(input_sample, bam_dir):
"""
Returns a ChIP samples input BAM file,
see chip2input for ChIP, Input pairs.
"""
if input_sample:
# Runs in a ChIP, input mode
return join(bam_dir, "{0}.Q5DD.bam".format(input_sample))
# Runs in ChIP-only mode
return []


def get_control_input(ext, paired_end, bam_dir):
i = []
if paired_end and ext != "":
i = [join(bam_dir, "{0}.Q5DD.bam".format(ext))]
elif not paired_end and ext != "":
i = [join(bam_dir, "{0}.Q5DD_tagAlign.gz".format(ext))]
return i


def outputIDR(groupswreps, groupdata, chip2input, tools):
"""
Produces the correct output files for IDR. All supposed replicates
should be directly compared when possible using IDR. IDR malfunctions
with bed files and GEM so it will not run with either of those.
Because there is no q-value calculated for SICER when there is no
Because there is no q-value calculated for SICER when there is no
input file, those samples are also ignored.
"""
IDRgroup, IDRsample1, IDRsample2, IDRpeaktool = [], [], [], []
for group in groupswreps:
nsamples = len(groupdata[group])
for i in range(nsamples):
ctrlTF = chip2input[groupdata[group][i]] != ""
for j in range(i+1,nsamples):
for j in range(i + 1, nsamples):
if ctrlTF == (chip2input[groupdata[group][j]] != ""):
if ctrlTF == False:
tooltmp = [ tool for tool in tools if tool != "sicer" ]
tooltmp = [tool for tool in tools if tool != "sicer"]
else:
tooltmp = tools
tooltmp = tools
IDRgroup.extend([group] * len(tooltmp))
IDRsample1.extend([groupdata[group][i]] * len(tooltmp))
IDRsample2.extend([groupdata[group][j]] * len(tooltmp))
IDRpeaktool.extend(tooltmp)
return( IDRgroup, IDRsample1, IDRsample2, IDRpeaktool )
return (IDRgroup, IDRsample1, IDRsample2, IDRpeaktool)


def zip_peak_files(chips, PeakTools, PeakExtensions):
Expand All @@ -56,7 +35,7 @@ def zip_peak_files(chips, PeakTools, PeakExtensions):
zipSample.append(chip)
zipTool.append(PeakTool)
zipExt.append(PeakExtensions[PeakTool])
return(zipSample, zipTool, zipExt)
return (zipSample, zipTool, zipExt)


def calc_effective_genome_fraction(effectivesize, genomefile):
Expand All @@ -65,34 +44,20 @@ def calc_effective_genome_fraction(effectivesize, genomefile):
actual genome size from a .genome-like file and then dividing
the effective genome size by that number
"""
lines=list(map(lambda x:x.strip().split("\t"),open(genomefile).readlines()))
genomelen=0
for chrom,l in lines:
if not "_" in chrom and chrom!="chrX" and chrom!="chrM" and chrom!="chrY":
genomelen+=int(l)
return(str(float(effectivesize)/ genomelen))


def getSicerChips(bam_dir, name, paired_end):
if paired_end:
chip = join(bam_dir, name + ".Q5DD.bam")
else:
chip = join(bam_dir, name + ".Q5DD_tagAlign.gz")
return chip


def getSicerFragLen(ppqt_dir, qc_dir, name, paired_end):
if paired_end:
fragLen = join(qc_dir, name + ".Q5DD.insert_size_metrics.txt")
else:
fragLen = join(ppqt_dir, name + ".Q5DD_tagAlign.ppqt.txt")
return fragLen
lines = list(map(lambda x: x.strip().split("\t"), open(genomefile).readlines()))
genomelen = 0
for chrom, l in lines:
if not "_" in chrom and chrom != "chrX" and chrom != "chrM" and chrom != "chrY":
genomelen += int(l)
return str(float(effectivesize) / genomelen)


def get_manorm_sizes(g1, g2, group_data, ppqt_in):
if not ppqt_in:
return ""
file = lambda w, _in: list(map(lambda z: z.strip().split(), open(ppqt_in, 'r').readlines()))
file = lambda w, _in: list(
map(lambda z: z.strip().split(), open(ppqt_in, "r").readlines())
)
extsize1 = [ppqt[1] for ppqt in file if ppqt[0] == group_data[g1]][0]
extsize2 = [ppqt[1] for ppqt in file if ppqt[0] == group_data[g2]][0]
return f"--s1 {extsize1} --s2 {extsize2}"
return f"--s1 {extsize1} --s2 {extsize2}"

0 comments on commit e3da83c

Please sign in to comment.