From 5677e4a6d18df8e89f570faabc81b40de1cc1664 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Mon, 4 Nov 2024 11:59:10 -0500 Subject: [PATCH 1/3] fix: handle tmpdir to frip rules, enforce tmpdir for pybedtools, multithread frip.py --- bin/frip.py | 193 +++++++++++++++++++++++++++--------------- config/cluster.json | 12 +++ workflow/rules/qc.smk | 12 ++- 3 files changed, 148 insertions(+), 69 deletions(-) diff --git a/bin/frip.py b/bin/frip.py index 113eb62..b341dd4 100755 --- a/bin/frip.py +++ b/bin/frip.py @@ -12,61 +12,75 @@ ########################################## # Modules import argparse -from argparse import RawTextHelpFormatter -from pybedtools import BedTool import pysam import pandas as pd +import os +from argparse import RawTextHelpFormatter +from pybedtools import BedTool +from pybedtools.helpers import set_tempdir +from textwrap import dedent ########################################## # Functions + def split_infiles(infiles): - """ - breaks the infile string with space-delimited file names and + """ + breaks the infile string with space-delimited file names and creates a list """ - infileList = infiles.strip("\'").strip('\"').split(" ") + infileList = infiles.strip("'").strip('"').split(" ") if len(infileList) == 1: infileList = infileList[0].split(";") - return(infileList) + return infileList + def count_reads_in_bed(bam, bedfile, genomefile): """ - some of this comes directly from the pybedtools site; read in - bed (or bed-like) file, sort it, and then count the number of + some of this comes directly from the pybedtools site; read in + bed (or bed-like) file, sort it, and then count the number of reads within the regions """ bedinfo = BedTool(bedfile) bedinfo.sort(g=genomefile) return ( - BedTool(bam).intersect( bedinfo, bed=True, stream=True, ) - ).count() + BedTool(bam).intersect( + bedinfo, + bed=True, + stream=True, + ) + ).count() + + +def count_reads_in_bam(bam, t=1): + """count the number of reads in a given bam file""" + return pysam.AlignmentFile(bam, threads=t).mapped -def count_reads_in_bam(bam): - """ count the number of reads in a given bam file """ - return( pysam.AlignmentFile(bam).mapped ) def calculate_frip(nreads, noverlaps): - """ calculate FRiP score from nreads and noverlaps """ - return( float(noverlaps) / nreads ) + """calculate FRiP score from nreads and noverlaps""" + return float(noverlaps) / nreads + def measure_bedfile_coverage(bedfile, genomefile): - """ calculate the number of bases covered by a given bed file """ + """calculate the number of bases covered by a given bed file""" bedinfo = BedTool(bedfile) - return( bedinfo.sort(g=genomefile).total_coverage() ) + return bedinfo.sort(g=genomefile).total_coverage() + def clip_bamfile_name(bamfile): - """ - clip bam file name for table/plotting purposes; assumes file + """ + clip bam file name for table/plotting purposes; assumes file naming system matches that of Pipeliner """ sample = bamfile.split("/")[-1].split(".")[0] - condition = ".".join(bamfile.split("/")[-1].split(".")[1:-1]) - return( sample, condition ) + condition = ".".join(bamfile.split("/")[-1].split(".")[1:-1]) + return (sample, condition) + -def clip_bedfile_name(bedfile,filetype): +def clip_bedfile_name(bedfile, filetype): """ - clip bed file name for table/plotting purposes; assumes file + clip bed file name for table/plotting purposes; assumes file naming system matches that of Pipeliner """ if filetype == "": @@ -74,20 +88,25 @@ def clip_bedfile_name(bedfile,filetype): sample = bedfile.split("/")[-2] else: toolused = filetype - sample = bedfile.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks") - return( toolused, sample ) + sample = ( + bedfile.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks") + ) + return (toolused, sample) + -def process_files(bamfile, bedfiles, genome, filetypes): - """ - this is the main function to take in list of input files and - put out an array containing key file name information, read +def process_files(bamfile, bedfiles, genome, filetypes, threads): + """ + this is the main function to take in list of input files and + put out an array containing key file name information, read counts, and FRiP scores """ bedfileL = bedfiles filetypesL = filetypes - out = [[ "bedtool", "bedsample", "bamsample", "bamcondition", - "n_reads", "n_overlap_reads", "FRiP", "n_basesM" ]] - nreads = count_reads_in_bam(bamfile) + out = [["bedtool", "bedsample", + "bamsample","bamcondition", + "n_reads","n_overlap_reads", + "FRiP","n_basesM",]] + nreads = count_reads_in_bam(bamfile, threads) (bamsample, condition) = clip_bamfile_name(bamfile) for i in range(len(bedfileL)): bed = bedfileL[i] @@ -95,51 +114,89 @@ def process_files(bamfile, bedfiles, genome, filetypes): filetype = filetypesL[i] else: filetype = filetypesL[0] - (bedtool, bedsample) = clip_bedfile_name(bed,filetype) + (bedtool, bedsample) = clip_bedfile_name(bed, filetype) noverlaps = count_reads_in_bed(bamfile, bed, genome) frip = calculate_frip(nreads, noverlaps) nbases = measure_bedfile_coverage(bed, genome) / 1000000 - out.append( [bedtool, bedsample, bamsample, condition, - nreads, noverlaps, frip, nbases] ) + out.append( + [bedtool, bedsample, bamsample, condition, nreads, noverlaps, frip, nbases] + ) out2 = pd.DataFrame(out[1:], columns=out[0]) - return(out2) + return out2 + def create_outfile_name(bamfile, outroot): - """ uses outroot to create the output file name """ + """uses outroot to create the output file name""" (bamsample, condition) = clip_bamfile_name(bamfile) outtable = bamsample + "." + condition + "." + "FRiP_table.txt" if outroot != "": outtable = outroot + "." + outtable - return(outtable) + return outtable + def write_table(out2, outtable): - out2.to_csv(outtable,sep='\t',index=False) + out2.to_csv(outtable, sep="\t", index=False) ############################################### # Main -def main(): - desc=""" -This function takes a space-delimited or semi-colon delimited list -of bed-like files (extensions must be recognizable by bedtools) -and a single bam file. It will then calculate the FRiP score for -all possible combinations of files and save the information in a -txt file. It will also calculate the number of bases covered by -each bed-like file. Note: this function assumes that the file -naming system of the input files matches that of Pipeliner. - """ - parser = argparse.ArgumentParser(description=desc, formatter_class=RawTextHelpFormatter) - parser.add_argument('-p', nargs = '+', required=True, type=str, help='A space- or semicolon-delimited list of peakfiles \ -(or bed-like files).') - parser.add_argument('-b', required=True, type=str, help='The name of a bamfile to analyze.') - parser.add_argument('-g', required=True, type=str, help='The name of the .genome file so bedtools knows the \ -size of every chromosome.') - parser.add_argument('-o', required=True, type=str, help='The root name of the multiple output files. Default:""') - parser.add_argument('-t', required=False, default=[""], type=list, help='A space- \ -or semicolon-delimited list of input file sources/types. Only needed when \ -source of bed file is not built into the script. Default: ""') +def main(): + desc = dedent(""" + This function takes a space-delimited or semi-colon delimited list + of bed-like files (extensions must be recognizable by bedtools) + and a single bam file. It will then calculate the FRiP score for + all possible combinations of files and save the information in a + txt file. It will also calculate the number of bases covered by + each bed-like file. Note: this function assumes that the file + naming system of the input files matches that of Pipeliner. + """) + + parser = argparse.ArgumentParser( + description=desc, formatter_class=RawTextHelpFormatter + ) + parser.add_argument( + "-p", + nargs="+", + required=True, + type=str, + help="A space- or semicolon-delimited list of peakfiles \ + (or bed-like files).", + ) + parser.add_argument( + "-b", required=True, type=str, help="The name of a bamfile to analyze." + ) + parser.add_argument( + "-g", + required=True, + type=str, + help="The name of the .genome file so bedtools knows the \ + size of every chromosome.", + ) + parser.add_argument( + "-o", + required=True, + type=str, + help='The root name of the multiple output files. Default:""', + ) + + parser.add_argument( + '-x', '--threads', + required=False, + type=int, + default=1, + help='Number of threads available to use for pysam.AlignmentFile', + ) + parser.add_argument( + "-t", + required=False, + default=[""], + type=list, + help='A space- \ + or semicolon-delimited list of input file sources/types. Only needed when \ + source of bed file is not built into the script. Default: ""', + ) args = parser.parse_args() bedfiles = args.p @@ -147,18 +204,22 @@ def main(): genomefile = args.g outroot = args.o filetypes = args.t - - out2 = process_files(bamfile, bedfiles, genomefile, filetypes) + threads = args.x + if 'TMPDIR' in os.environ: + set_tempdir(os.environ['TMPDIR']) + + out2 = process_files(bamfile, bedfiles, genomefile, filetypes, threads) outtable = create_outfile_name(bamfile, outroot) write_table(out2, outtable) -if __name__ == '__main__': + +if __name__ == "__main__": main() ############################################### # example cases -#bedfiles = "macs_broad/mWT_HCF1_mm_i81/mWT_HCF1_mm_i81_peaks.broadPeak macs_broad/mWT_HCF1_mm_i89/mWT_HCF1_mm_i89_peaks.broadPeak" -#bamfiles = "bam/Input_mm_i95.sorted.Q5DD.bam bam/mWT_HCF1_mm_i81.sorted.Q5DD.bam bam/mWT_HCF1_mm_i89.sorted.Q5DD.bam" -#genomefile = "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/indexes/mm10.fa.sizes" -#out2 = pd.read_csv("FRIP_test.txt",sep="\t") +# bedfiles = "macs_broad/mWT_HCF1_mm_i81/mWT_HCF1_mm_i81_peaks.broadPeak macs_broad/mWT_HCF1_mm_i89/mWT_HCF1_mm_i89_peaks.broadPeak" +# bamfiles = "bam/Input_mm_i95.sorted.Q5DD.bam bam/mWT_HCF1_mm_i81.sorted.Q5DD.bam bam/mWT_HCF1_mm_i89.sorted.Q5DD.bam" +# genomefile = "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/indexes/mm10.fa.sizes" +# out2 = pd.read_csv("FRIP_test.txt", sep="\t") diff --git a/config/cluster.json b/config/cluster.json index 5f8d805..b664964 100644 --- a/config/cluster.json +++ b/config/cluster.json @@ -143,5 +143,17 @@ "ntasks": "--ntasks=28", "ntasks_per_core": "--ntasks-per-core=1", "exclusive": "--exclusive" + }, + "FRiP_macsN": { + "threads": "16", + "gres": "lscratch:400" + }, + "FRiP_macsB": { + "threads": "16", + "gres": "lscratch:400" + }, + "FRiP_Genrich": { + "threads": "16", + "gres": "lscratch:400" } } diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 35eaaf5..f171dc3 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -478,6 +478,7 @@ rule FRiP_macsN: # intermediate files with built-in # mechanism for deletion on exit if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + export TMPDIR="{params.tmpdir}" tmp=$(mktemp -d -p "{params.tmpdir}") trap 'rm -rf "${{tmp}}"' EXIT @@ -485,7 +486,8 @@ rule FRiP_macsN: -p {input.bed} \\ -b {input.bam} \\ -g {params.genome} \\ - -o {params.outroot} + -o {params.outroot} \\ + -x {threads} """ @@ -510,13 +512,15 @@ rule FRiP_Genrich: # mechanism for deletion on exit if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi tmp=$(mktemp -d -p "{params.tmpdir}") + export TMPDIR="${{tmp}}" trap 'rm -rf "${{tmp}}"' EXIT python {params.script} \\ -p {input.bed} \\ -b {input.bam} \\ -g {params.genome} \\ - -o {params.outroot} + -o {params.outroot} \\ + -x {threads} """ @@ -541,13 +545,15 @@ rule FRiP_macsB: # mechanism for deletion on exit if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi tmp=$(mktemp -d -p "{params.tmpdir}") + export TMPDIR="{params.tmpdir}" trap 'rm -rf "${{tmp}}"' EXIT python {params.script} \\ -p {input.bed} \\ -b {input.bam} \\ -g {params.genome} \\ - -o {params.outroot} + -o {params.outroot} \\ + -x {threads} """ From bfac9898bbba796bbb0b380e4610a5b905567c2d Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Mon, 4 Nov 2024 12:11:40 -0500 Subject: [PATCH 2/3] fix: export to not params.tmpdir --- workflow/rules/qc.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index f171dc3..a460932 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -478,8 +478,8 @@ rule FRiP_macsN: # intermediate files with built-in # mechanism for deletion on exit if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - export TMPDIR="{params.tmpdir}" tmp=$(mktemp -d -p "{params.tmpdir}") + export TMPDIR="${{tmp}}" trap 'rm -rf "${{tmp}}"' EXIT python {params.script} \\ @@ -545,7 +545,7 @@ rule FRiP_macsB: # mechanism for deletion on exit if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi tmp=$(mktemp -d -p "{params.tmpdir}") - export TMPDIR="{params.tmpdir}" + export TMPDIR="${{tmp}}" trap 'rm -rf "${{tmp}}"' EXIT python {params.script} \\ From 78974d1d4cceb7f7f4c62d0f734d53c587b56799 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Mon, 4 Nov 2024 16:27:53 -0500 Subject: [PATCH 3/3] fix: correct frip table routing and frip output scripting --- bin/frip.py | 50 ++++++++++++++++++++++--------------------- workflow/Snakefile | 2 +- workflow/rules/qc.smk | 29 +++++++++++-------------- 3 files changed, 40 insertions(+), 41 deletions(-) diff --git a/bin/frip.py b/bin/frip.py index b341dd4..d5abd59 100755 --- a/bin/frip.py +++ b/bin/frip.py @@ -102,10 +102,18 @@ def process_files(bamfile, bedfiles, genome, filetypes, threads): """ bedfileL = bedfiles filetypesL = filetypes - out = [["bedtool", "bedsample", - "bamsample","bamcondition", - "n_reads","n_overlap_reads", - "FRiP","n_basesM",]] + out = [ + [ + "bedtool", + "bedsample", + "bamsample", + "bamcondition", + "n_reads", + "n_overlap_reads", + "FRiP", + "n_basesM", + ] + ] nreads = count_reads_in_bam(bamfile, threads) (bamsample, condition) = clip_bamfile_name(bamfile) for i in range(len(bedfileL)): @@ -125,15 +133,6 @@ def process_files(bamfile, bedfiles, genome, filetypes, threads): return out2 -def create_outfile_name(bamfile, outroot): - """uses outroot to create the output file name""" - (bamsample, condition) = clip_bamfile_name(bamfile) - outtable = bamsample + "." + condition + "." + "FRiP_table.txt" - if outroot != "": - outtable = outroot + "." + outtable - return outtable - - def write_table(out2, outtable): out2.to_csv(outtable, sep="\t", index=False) @@ -143,7 +142,8 @@ def write_table(out2, outtable): def main(): - desc = dedent(""" + desc = dedent( + """ This function takes a space-delimited or semi-colon delimited list of bed-like files (extensions must be recognizable by bedtools) and a single bam file. It will then calculate the FRiP score for @@ -151,7 +151,8 @@ def main(): txt file. It will also calculate the number of bases covered by each bed-like file. Note: this function assumes that the file naming system of the input files matches that of Pipeliner. - """) + """ + ) parser = argparse.ArgumentParser( description=desc, formatter_class=RawTextHelpFormatter @@ -182,11 +183,13 @@ def main(): ) parser.add_argument( - '-x', '--threads', + "-x", + "--threads", required=False, + dest="threads", type=int, default=1, - help='Number of threads available to use for pysam.AlignmentFile', + help="Number of threads available to use for pysam.AlignmentFile", ) parser.add_argument( "-t", @@ -202,15 +205,14 @@ def main(): bedfiles = args.p bamfile = args.b genomefile = args.g - outroot = args.o + outfile = args.o filetypes = args.t - threads = args.x - if 'TMPDIR' in os.environ: - set_tempdir(os.environ['TMPDIR']) - + threads = args.threads + if "TMPDIR" in os.environ: + set_tempdir(os.environ["TMPDIR"]) + out2 = process_files(bamfile, bedfiles, genomefile, filetypes, threads) - outtable = create_outfile_name(bamfile, outroot) - write_table(out2, outtable) + write_table(out2, outfile) if __name__ == "__main__": diff --git a/workflow/Snakefile b/workflow/Snakefile index 7512a93..f9224fd 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -169,7 +169,7 @@ elif assay in ["atac", "chip"]: rule_all_ins.append(join(workpath, "multiqc_report.html")) rule_all_ins.extend(expand(join(qc_dir, "{name}.preseq.dat"), name=samples)) rule_all_ins.extend( - expand(join(peakqc_dir, "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=chips) + expand(join(peakqc_dir, "{PeakTool}", "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=chips) ) rule_all_ins.extend(expand(join(bam_dir, "{name}.{ext}"), name=samples, ext=extaln)) rule_all_ins.extend(expand(join(bw_dir, "{name}.{ext}.RPGC.bw"), name=samples, ext=["sorted", "Q5DD"])) diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index a460932..fb1d66b 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -460,13 +460,12 @@ rule deeptools_gene_all: rule FRiP_macsN: input: - bed = expand(join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"), name=chips), + bed = join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"), bam = join(bam_dir, "{name}.Q5DD.bam"), output: - join(workpath, "PeakQC", "macsNarrow.{name}.Q5DD.FRiP_table.txt"), + join(peakqc_dir, "macsNarrow", "macsNarrow.{name}.Q5DD.FRiP_table.txt"), params: rname = "FRiP_macsN", - outroot = join(peakqc_dir, "macsNarrow"), script = join(bin_path, "frip.py"), genome = config['references'][genome]['REFLEN'], tmpdir = tmpdir, @@ -486,20 +485,19 @@ rule FRiP_macsN: -p {input.bed} \\ -b {input.bam} \\ -g {params.genome} \\ - -o {params.outroot} \\ - -x {threads} + -o {output} \\ + -x 16 """ rule FRiP_Genrich: input: - bed = expand(join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips), + bed = join(genrich_dir, "{name}", "{name}.narrowPeak"), bam = join(bam_dir, "{name}.Q5DD.bam"), output: - join(workpath, "PeakQC", "Genrich.{name}.Q5DD.FRiP_table.txt"), + join(peakqc_dir, "Genrich", "Genrich.{name}.Q5DD.FRiP_table.txt"), params: - rname = "FRiP_macsN", - outroot = join(peakqc_dir, "macsNarrow"), + rname = "FRiP_Genrich", script = join(bin_path, "frip.py"), genome = config['references'][genome]['REFLEN'], tmpdir = tmpdir, @@ -519,20 +517,19 @@ rule FRiP_Genrich: -p {input.bed} \\ -b {input.bam} \\ -g {params.genome} \\ - -o {params.outroot} \\ - -x {threads} + -o {output} \\ + -x 16 """ rule FRiP_macsB: input: - bed = expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips), + bed = join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), bam = join(bam_dir, "{name}.Q5DD.bam"), output: - join(workpath, "PeakQC", "macsBroad.{name}.Q5DD.FRiP_table.txt"), + join(peakqc_dir, "macsBroad", "macsBroad.{name}.Q5DD.FRiP_table.txt"), params: rname = "FRiP_macsB", - outroot = join(peakqc_dir, "macsBroad"), script = join(bin_path, "frip.py"), genome = config['references'][genome]['REFLEN'], tmpdir = tmpdir, @@ -552,8 +549,8 @@ rule FRiP_macsB: -p {input.bed} \\ -b {input.bam} \\ -g {params.genome} \\ - -o {params.outroot} \\ - -x {threads} + -o {output} \\ + -x 16 """