From 5677e4a6d18df8e89f570faabc81b40de1cc1664 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Mon, 4 Nov 2024 11:59:10 -0500
Subject: [PATCH 1/3] fix: handle tmpdir to frip rules, enforce tmpdir for
 pybedtools, multithread frip.py

---
 bin/frip.py           | 193 +++++++++++++++++++++++++++---------------
 config/cluster.json   |  12 +++
 workflow/rules/qc.smk |  12 ++-
 3 files changed, 148 insertions(+), 69 deletions(-)

diff --git a/bin/frip.py b/bin/frip.py
index 113eb62..b341dd4 100755
--- a/bin/frip.py
+++ b/bin/frip.py
@@ -12,61 +12,75 @@
 ##########################################
 # Modules
 import argparse
-from argparse import RawTextHelpFormatter
-from pybedtools import BedTool
 import pysam
 import pandas as pd
+import os
+from argparse import RawTextHelpFormatter
+from pybedtools import BedTool
+from pybedtools.helpers import set_tempdir
+from textwrap import dedent
 
 ##########################################
 # Functions
 
+
 def split_infiles(infiles):
-    """ 
-    breaks the infile string with space-delimited file names and 
+    """
+    breaks the infile string with space-delimited file names and
     creates a list
     """
-    infileList = infiles.strip("\'").strip('\"').split(" ")
+    infileList = infiles.strip("'").strip('"').split(" ")
     if len(infileList) == 1:
         infileList = infileList[0].split(";")
-    return(infileList)
+    return infileList
+
 
 def count_reads_in_bed(bam, bedfile, genomefile):
     """
-    some of this comes directly from the pybedtools site; read in 
-    bed (or bed-like) file, sort it, and then count the number of 
+    some of this comes directly from the pybedtools site; read in
+    bed (or bed-like) file, sort it, and then count the number of
     reads within the regions
     """
     bedinfo = BedTool(bedfile)
     bedinfo.sort(g=genomefile)
     return (
-        BedTool(bam).intersect( bedinfo, bed=True, stream=True, )
-        ).count()
+        BedTool(bam).intersect(
+            bedinfo,
+            bed=True,
+            stream=True,
+        )
+    ).count()
+
+
+def count_reads_in_bam(bam, t=1):
+    """count the number of reads in a given bam file"""
+    return pysam.AlignmentFile(bam, threads=t).mapped
 
-def count_reads_in_bam(bam):
-    """ count the number of reads in a given bam file """
-    return( pysam.AlignmentFile(bam).mapped )
 
 def calculate_frip(nreads, noverlaps):
-    """ calculate FRiP score from nreads and noverlaps """
-    return( float(noverlaps) / nreads )
+    """calculate FRiP score from nreads and noverlaps"""
+    return float(noverlaps) / nreads
+
 
 def measure_bedfile_coverage(bedfile, genomefile):
-    """ calculate the number of bases covered by a given bed file """
+    """calculate the number of bases covered by a given bed file"""
     bedinfo = BedTool(bedfile)
-    return( bedinfo.sort(g=genomefile).total_coverage() )
+    return bedinfo.sort(g=genomefile).total_coverage()
+
 
 def clip_bamfile_name(bamfile):
-    """ 
-    clip bam file name for table/plotting purposes; assumes file 
+    """
+    clip bam file name for table/plotting purposes; assumes file
     naming system matches that of Pipeliner
     """
     sample = bamfile.split("/")[-1].split(".")[0]
-    condition =  ".".join(bamfile.split("/")[-1].split(".")[1:-1])
-    return( sample, condition )
+    condition = ".".join(bamfile.split("/")[-1].split(".")[1:-1])
+    return (sample, condition)
+
 
-def clip_bedfile_name(bedfile,filetype):
+def clip_bedfile_name(bedfile, filetype):
     """
-    clip bed file name for table/plotting purposes; assumes file 
+    clip bed file name for table/plotting purposes; assumes file
     naming system matches that of Pipeliner
     """
     if filetype == "":
@@ -74,20 +88,25 @@ def clip_bedfile_name(bedfile,filetype):
         sample = bedfile.split("/")[-2]
     else:
         toolused = filetype
-        sample = bedfile.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks")
-    return( toolused, sample )
+        sample = (
+            bedfile.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks")
+        )
+    return (toolused, sample)
+
 
-def process_files(bamfile, bedfiles, genome, filetypes):
-    """ 
-    this is the main function to take in list of input files and 
-    put out an array containing key file name information, read 
+def process_files(bamfile, bedfiles, genome, filetypes, threads):
+    """
+    this is the main function to take in list of input files and
+    put out an array containing key file name information, read
     counts, and FRiP scores
     """
     bedfileL = bedfiles
     filetypesL = filetypes
-    out = [[ "bedtool", "bedsample", "bamsample", "bamcondition", 
-    "n_reads", "n_overlap_reads", "FRiP", "n_basesM" ]]
-    nreads = count_reads_in_bam(bamfile)
+    out = [["bedtool", "bedsample",
+            "bamsample","bamcondition",
+            "n_reads","n_overlap_reads",
+            "FRiP","n_basesM",]]
+    nreads = count_reads_in_bam(bamfile, threads)
     (bamsample, condition) = clip_bamfile_name(bamfile)
     for i in range(len(bedfileL)):
         bed = bedfileL[i]
@@ -95,51 +114,89 @@ def process_files(bamfile, bedfiles, genome, filetypes):
             filetype = filetypesL[i]
         else:
             filetype = filetypesL[0]
-        (bedtool, bedsample) = clip_bedfile_name(bed,filetype)
+        (bedtool, bedsample) = clip_bedfile_name(bed, filetype)
         noverlaps = count_reads_in_bed(bamfile, bed, genome)
         frip = calculate_frip(nreads, noverlaps)
         nbases = measure_bedfile_coverage(bed, genome) / 1000000
-        out.append( [bedtool, bedsample, bamsample, condition, 
-                nreads, noverlaps, frip, nbases] )
+        out.append(
+            [bedtool, bedsample, bamsample, condition, nreads, noverlaps, frip, nbases]
+        )
     out2 = pd.DataFrame(out[1:], columns=out[0])
-    return(out2)
+    return out2
+
 
 def create_outfile_name(bamfile, outroot):
-    """ uses outroot to create the output file name """
+    """uses outroot to create the output file name"""
     (bamsample, condition) = clip_bamfile_name(bamfile)
     outtable = bamsample + "." + condition + "." + "FRiP_table.txt"
     if outroot != "":
         outtable = outroot + "." + outtable
-    return(outtable)
+    return outtable
+
 
 def write_table(out2, outtable):
-    out2.to_csv(outtable,sep='\t',index=False)
+    out2.to_csv(outtable, sep="\t", index=False)
 
 
 ###############################################
 # Main
 
-def main():
-    desc="""
-This function takes a space-delimited or semi-colon delimited list
-of bed-like files (extensions must be recognizable by bedtools)
-and a single bam file. It will then calculate the FRiP score for
-all possible combinations of files and save the information in a
-txt file. It will also calculate the number of bases covered by 
-each bed-like file. Note: this function assumes that the file 
-naming system of the input files matches that of Pipeliner.
-    """
 
-    parser = argparse.ArgumentParser(description=desc, formatter_class=RawTextHelpFormatter)
-    parser.add_argument('-p', nargs = '+', required=True, type=str, help='A space- or semicolon-delimited list of peakfiles \
-(or bed-like files).')
-    parser.add_argument('-b', required=True, type=str, help='The name of a bamfile to analyze.')
-    parser.add_argument('-g', required=True, type=str, help='The name of the .genome file so bedtools knows the \
-size of every chromosome.')
-    parser.add_argument('-o', required=True, type=str, help='The root name of the multiple output files. Default:""')
-    parser.add_argument('-t', required=False, default=[""], type=list, help='A space- \
-or semicolon-delimited list of input file sources/types. Only needed when \
-source of bed file is not built into the script. Default: ""')
+def main():
+    desc = dedent("""
+                    This function takes a space-delimited or semi-colon delimited list
+                    of bed-like files (extensions must be recognizable by bedtools)
+                    and a single bam file. It will then calculate the FRiP score for
+                    all possible combinations of files and save the information in a
+                    txt file. It will also calculate the number of bases covered by 
+                    each bed-like file. Note: this function assumes that the file 
+                    naming system of the input files matches that of Pipeliner.
+                    """)
+
+    parser = argparse.ArgumentParser(
+        description=desc, formatter_class=RawTextHelpFormatter
+    )
+    parser.add_argument(
+        "-p",
+        nargs="+",
+        required=True,
+        type=str,
+        help="A space- or semicolon-delimited list of peakfiles \
+              (or bed-like files).",
+    )
+    parser.add_argument(
+        "-b", required=True, type=str, help="The name of a bamfile to analyze."
+    )
+    parser.add_argument(
+        "-g",
+        required=True,
+        type=str,
+        help="The name of the .genome file so bedtools knows the \
+              size of every chromosome.",
+    )
+    parser.add_argument(
+        "-o",
+        required=True,
+        type=str,
+        help='The root name of the multiple output files. Default:""',
+    )
+
+    parser.add_argument(
+        '-x', '--threads',
+        required=False,
+        type=int,
+        default=1,
+        help='Number of threads available to use for pysam.AlignmentFile',
+    )
+    parser.add_argument(
+        "-t",
+        required=False,
+        default=[""],
+        type=list,
+        help='A space- \
+              or semicolon-delimited list of input file sources/types. Only needed when \
+              source of bed file is not built into the script. Default: ""',
+    )
 
     args = parser.parse_args()
     bedfiles = args.p
@@ -147,18 +204,22 @@ def main():
     genomefile = args.g
     outroot = args.o
     filetypes = args.t
-
-    out2 = process_files(bamfile, bedfiles, genomefile, filetypes)
+    threads = args.x
+    if 'TMPDIR' in os.environ:
+        set_tempdir(os.environ['TMPDIR'])
+    
+    out2 = process_files(bamfile, bedfiles, genomefile, filetypes, threads)
     outtable = create_outfile_name(bamfile, outroot)
     write_table(out2, outtable)
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
 
 ###############################################
 # example cases
 
-#bedfiles = "macs_broad/mWT_HCF1_mm_i81/mWT_HCF1_mm_i81_peaks.broadPeak macs_broad/mWT_HCF1_mm_i89/mWT_HCF1_mm_i89_peaks.broadPeak"
-#bamfiles = "bam/Input_mm_i95.sorted.Q5DD.bam bam/mWT_HCF1_mm_i81.sorted.Q5DD.bam bam/mWT_HCF1_mm_i89.sorted.Q5DD.bam"
-#genomefile = "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/indexes/mm10.fa.sizes"
-#out2 = pd.read_csv("FRIP_test.txt",sep="\t")
+# bedfiles = "macs_broad/mWT_HCF1_mm_i81/mWT_HCF1_mm_i81_peaks.broadPeak macs_broad/mWT_HCF1_mm_i89/mWT_HCF1_mm_i89_peaks.broadPeak"
+# bamfiles = "bam/Input_mm_i95.sorted.Q5DD.bam bam/mWT_HCF1_mm_i81.sorted.Q5DD.bam bam/mWT_HCF1_mm_i89.sorted.Q5DD.bam"
+# genomefile = "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/indexes/mm10.fa.sizes"
+# out2 = pd.read_csv("FRIP_test.txt", sep="\t")
diff --git a/config/cluster.json b/config/cluster.json
index 5f8d805..b664964 100644
--- a/config/cluster.json
+++ b/config/cluster.json
@@ -143,5 +143,17 @@
         "ntasks": "--ntasks=28",
         "ntasks_per_core": "--ntasks-per-core=1",
         "exclusive": "--exclusive"
+    }, 
+    "FRiP_macsN": {
+        "threads": "16",
+        "gres": "lscratch:400"
+    },
+    "FRiP_macsB": {
+        "threads": "16",
+        "gres": "lscratch:400"
+    },
+    "FRiP_Genrich": {
+        "threads": "16",
+        "gres": "lscratch:400"
     }
 }
diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
index 35eaaf5..f171dc3 100644
--- a/workflow/rules/qc.smk
+++ b/workflow/rules/qc.smk
@@ -478,6 +478,7 @@ rule FRiP_macsN:
         # intermediate files with built-in 
         # mechanism for deletion on exit
         if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
+        export TMPDIR="{params.tmpdir}"
         tmp=$(mktemp -d -p "{params.tmpdir}")
         trap 'rm -rf "${{tmp}}"' EXIT
 
@@ -485,7 +486,8 @@ rule FRiP_macsN:
             -p {input.bed} \\
             -b {input.bam} \\
             -g {params.genome} \\
-            -o {params.outroot}
+            -o {params.outroot} \\
+            -x {threads}
         """
 
 
@@ -510,13 +512,15 @@ rule FRiP_Genrich:
         # mechanism for deletion on exit
         if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
         tmp=$(mktemp -d -p "{params.tmpdir}")
+        export TMPDIR="${{tmp}}"
         trap 'rm -rf "${{tmp}}"' EXIT
 
         python {params.script} \\
             -p {input.bed} \\
             -b {input.bam} \\
             -g {params.genome} \\
-            -o {params.outroot}
+            -o {params.outroot} \\
+            -x {threads}
         """
 
 
@@ -541,13 +545,15 @@ rule FRiP_macsB:
         # mechanism for deletion on exit
         if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
         tmp=$(mktemp -d -p "{params.tmpdir}")
+        export TMPDIR="{params.tmpdir}"
         trap 'rm -rf "${{tmp}}"' EXIT
 
         python {params.script} \\
             -p {input.bed} \\
             -b {input.bam} \\
             -g {params.genome} \\
-            -o {params.outroot}
+            -o {params.outroot} \\
+            -x {threads}
         """
 
 

From bfac9898bbba796bbb0b380e4610a5b905567c2d Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Mon, 4 Nov 2024 12:11:40 -0500
Subject: [PATCH 2/3] fix: export to  not params.tmpdir

---
 workflow/rules/qc.smk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
index f171dc3..a460932 100644
--- a/workflow/rules/qc.smk
+++ b/workflow/rules/qc.smk
@@ -478,8 +478,8 @@ rule FRiP_macsN:
         # intermediate files with built-in 
         # mechanism for deletion on exit
         if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
-        export TMPDIR="{params.tmpdir}"
         tmp=$(mktemp -d -p "{params.tmpdir}")
+        export TMPDIR="${{tmp}}"
         trap 'rm -rf "${{tmp}}"' EXIT
 
         python {params.script} \\
@@ -545,7 +545,7 @@ rule FRiP_macsB:
         # mechanism for deletion on exit
         if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
         tmp=$(mktemp -d -p "{params.tmpdir}")
-        export TMPDIR="{params.tmpdir}"
+        export TMPDIR="${{tmp}}"
         trap 'rm -rf "${{tmp}}"' EXIT
 
         python {params.script} \\

From 78974d1d4cceb7f7f4c62d0f734d53c587b56799 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Mon, 4 Nov 2024 16:27:53 -0500
Subject: [PATCH 3/3] fix: correct frip table routing and frip output scripting

---
 bin/frip.py           | 50 ++++++++++++++++++++++---------------------
 workflow/Snakefile    |  2 +-
 workflow/rules/qc.smk | 29 +++++++++++--------------
 3 files changed, 40 insertions(+), 41 deletions(-)

diff --git a/bin/frip.py b/bin/frip.py
index b341dd4..d5abd59 100755
--- a/bin/frip.py
+++ b/bin/frip.py
@@ -102,10 +102,18 @@ def process_files(bamfile, bedfiles, genome, filetypes, threads):
     """
     bedfileL = bedfiles
     filetypesL = filetypes
-    out = [["bedtool", "bedsample",
-            "bamsample","bamcondition",
-            "n_reads","n_overlap_reads",
-            "FRiP","n_basesM",]]
+    out = [
+        [
+            "bedtool",
+            "bedsample",
+            "bamsample",
+            "bamcondition",
+            "n_reads",
+            "n_overlap_reads",
+            "FRiP",
+            "n_basesM",
+        ]
+    ]
     nreads = count_reads_in_bam(bamfile, threads)
     (bamsample, condition) = clip_bamfile_name(bamfile)
     for i in range(len(bedfileL)):
@@ -125,15 +133,6 @@ def process_files(bamfile, bedfiles, genome, filetypes, threads):
     return out2
 
 
-def create_outfile_name(bamfile, outroot):
-    """uses outroot to create the output file name"""
-    (bamsample, condition) = clip_bamfile_name(bamfile)
-    outtable = bamsample + "." + condition + "." + "FRiP_table.txt"
-    if outroot != "":
-        outtable = outroot + "." + outtable
-    return outtable
-
-
 def write_table(out2, outtable):
     out2.to_csv(outtable, sep="\t", index=False)
 
@@ -143,7 +142,8 @@ def write_table(out2, outtable):
 
 
 def main():
-    desc = dedent("""
+    desc = dedent(
+        """
                     This function takes a space-delimited or semi-colon delimited list
                     of bed-like files (extensions must be recognizable by bedtools)
                     and a single bam file. It will then calculate the FRiP score for
@@ -151,7 +151,8 @@ def main():
                     txt file. It will also calculate the number of bases covered by 
                     each bed-like file. Note: this function assumes that the file 
                     naming system of the input files matches that of Pipeliner.
-                    """)
+                    """
+    )
 
     parser = argparse.ArgumentParser(
         description=desc, formatter_class=RawTextHelpFormatter
@@ -182,11 +183,13 @@ def main():
     )
 
     parser.add_argument(
-        '-x', '--threads',
+        "-x",
+        "--threads",
         required=False,
+        dest="threads",
         type=int,
         default=1,
-        help='Number of threads available to use for pysam.AlignmentFile',
+        help="Number of threads available to use for pysam.AlignmentFile",
     )
     parser.add_argument(
         "-t",
@@ -202,15 +205,14 @@ def main():
     bedfiles = args.p
     bamfile = args.b
     genomefile = args.g
-    outroot = args.o
+    outfile = args.o
     filetypes = args.t
-    threads = args.x
-    if 'TMPDIR' in os.environ:
-        set_tempdir(os.environ['TMPDIR'])
-    
+    threads = args.threads
+    if "TMPDIR" in os.environ:
+        set_tempdir(os.environ["TMPDIR"])
+
     out2 = process_files(bamfile, bedfiles, genomefile, filetypes, threads)
-    outtable = create_outfile_name(bamfile, outroot)
-    write_table(out2, outtable)
+    write_table(out2, outfile)
 
 
 if __name__ == "__main__":
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 7512a93..f9224fd 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -169,7 +169,7 @@ elif assay in ["atac", "chip"]:
 rule_all_ins.append(join(workpath, "multiqc_report.html"))
 rule_all_ins.extend(expand(join(qc_dir, "{name}.preseq.dat"), name=samples))
 rule_all_ins.extend(
-    expand(join(peakqc_dir, "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=chips)
+    expand(join(peakqc_dir, "{PeakTool}", "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=chips)
 )
 rule_all_ins.extend(expand(join(bam_dir, "{name}.{ext}"), name=samples, ext=extaln))
 rule_all_ins.extend(expand(join(bw_dir, "{name}.{ext}.RPGC.bw"), name=samples, ext=["sorted", "Q5DD"]))
diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
index a460932..fb1d66b 100644
--- a/workflow/rules/qc.smk
+++ b/workflow/rules/qc.smk
@@ -460,13 +460,12 @@ rule deeptools_gene_all:
 
 rule FRiP_macsN:
     input:
-        bed                     = expand(join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"), name=chips),
+        bed                     = join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"),
         bam                     = join(bam_dir, "{name}.Q5DD.bam"),
     output:
-        join(workpath, "PeakQC", "macsNarrow.{name}.Q5DD.FRiP_table.txt"),
+        join(peakqc_dir, "macsNarrow", "macsNarrow.{name}.Q5DD.FRiP_table.txt"),
     params:
         rname                   = "FRiP_macsN",
-        outroot                 = join(peakqc_dir, "macsNarrow"),
         script                  = join(bin_path, "frip.py"),
         genome                  = config['references'][genome]['REFLEN'],
         tmpdir                  = tmpdir,
@@ -486,20 +485,19 @@ rule FRiP_macsN:
             -p {input.bed} \\
             -b {input.bam} \\
             -g {params.genome} \\
-            -o {params.outroot} \\
-            -x {threads}
+            -o {output} \\
+            -x 16
         """
 
 
 rule FRiP_Genrich:
     input:
-        bed                     = expand(join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips),
+        bed                     = join(genrich_dir, "{name}", "{name}.narrowPeak"),
         bam                     = join(bam_dir, "{name}.Q5DD.bam"),
     output:
-        join(workpath, "PeakQC", "Genrich.{name}.Q5DD.FRiP_table.txt"),
+        join(peakqc_dir, "Genrich", "Genrich.{name}.Q5DD.FRiP_table.txt"),
     params:
-        rname                   = "FRiP_macsN",
-        outroot                 = join(peakqc_dir, "macsNarrow"),
+        rname                   = "FRiP_Genrich",
         script                  = join(bin_path, "frip.py"),
         genome                  = config['references'][genome]['REFLEN'],
         tmpdir                  = tmpdir,
@@ -519,20 +517,19 @@ rule FRiP_Genrich:
             -p {input.bed} \\
             -b {input.bam} \\
             -g {params.genome} \\
-            -o {params.outroot} \\
-            -x {threads}
+            -o {output} \\
+            -x 16
         """
 
 
 rule FRiP_macsB:
     input:
-        bed                     = expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips),
+        bed                     = join(macsB_dir, "{name}", "{name}_peaks.broadPeak"),
         bam                     = join(bam_dir, "{name}.Q5DD.bam"),
     output:
-        join(workpath, "PeakQC", "macsBroad.{name}.Q5DD.FRiP_table.txt"),
+        join(peakqc_dir, "macsBroad", "macsBroad.{name}.Q5DD.FRiP_table.txt"),
     params:
         rname                   = "FRiP_macsB",
-        outroot                 = join(peakqc_dir, "macsBroad"),
         script                  = join(bin_path, "frip.py"),
         genome                  = config['references'][genome]['REFLEN'],
         tmpdir                  = tmpdir,
@@ -552,8 +549,8 @@ rule FRiP_macsB:
             -p {input.bed} \\
             -b {input.bam} \\
             -g {params.genome} \\
-            -o {params.outroot} \\
-            -x {threads}
+            -o {output} \\
+            -x 16
         """