fix: ppqt routing and acceptance of tagalign

OpenOmics · Jan 2, 2025 · 5d05803 · 5d05803
1 parent f3bf548
commit 5d05803
Show file tree

Hide file tree

Showing 8 changed files with 84 additions and 46 deletions.
diff --git a/bin/DiffBind_v2_Deseq2_block.Rmd b/bin/DiffBind_v2_Deseq2_block.Rmd
diff --git a/bin/DiffBind_v2_EdgeR.Rmd b/bin/DiffBind_v2_EdgeR.Rmd
diff --git a/bin/DiffBind_v2_EdgeR_block.Rmd b/bin/DiffBind_v2_EdgeR_block.Rmd
diff --git a/bin/ppqt_process.py b/bin/ppqt_process.py
@@ -1,27 +1,32 @@
 #!/usr/bin/env python3
 
-#Purpose: To grab the estimated fragment length from the ppqt output and a small txt with that information. For input files, adding an extra value of 200bp as an alternative.
+# Purpose:
+#   To grab the estimated fragment length from the ppqt output and a
+#   small txt with that information. For input files, adding an extra
+#   value of 200bp as an alternative.
 import argparse
-parser = argparse.ArgumentParser(description='Script to extract the the estimated fragment length from the ppqt output.')
-parser.add_argument('-i', required=True,help='Name of the ppqt txt file')
-parser.add_argument('-o', required=True,help='Name of the output file')
-args = parser.parse_args()
 
-output = args.o
-inppqt = args.i
 
-o=open(output,'w')
+def main(args):
+    cln_file = lambda z: z.strip().split()
+    this_file = list(map(cln_file, open(args.ppqt, "r").readlines()))
+    ppqt_values = this_file[0][2].split(",")
+    frag_len = None
+    for ppqt_value in ppqt_values:
+        if int(ppqt_value) > 150:
+            frag_len = ppqt_value
+            break
 
-file = list(map(lambda z:z.strip().split(),open(inppqt,'r').readlines()))
+    if frag_len is None:
+        frag_len = 200
+    print(frag_len)
+    return
 
 
-ppqt_values = file[0][2].split(",")
-extenders = []
-for ppqt_value in ppqt_values:
-    if int(ppqt_value) > 150:
-        extenders.append(ppqt_value)
-if len(extenders) > 0:
-    o.write(extenders[0])
-else:
-    o.write("200")      
-o.close()
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Script to extract the the estimated fragment length from the ppqt output."
+    )
+    parser.add_argument("ppqt", help="Name of the ppqt txt file")
+    args = parser.parse_args()
+    main(args)
diff --git a/config/cluster.json b/config/cluster.json
@@ -29,7 +29,12 @@
         "mem": "96g",
         "threads": "2"
     },
+    "ppqt_tagalign": {
+        "threads": "16",
+        "mem": "24g"
+    },
     "ppqt": {
+        "threads": "16",
         "mem": "24g"
     },
     "bam2bw": {

diff --git a/src/run.py b/src/run.py
@@ -690,7 +690,7 @@ def runner(
     threads=2,
     jobname="pl:master",
     submission_script="run.sh",
-    tmp_dir="/lscratch/$SLURM_JOBID/",
+    tmp_dir="/lscratch/$SLURM_JOB_ID/",
 ):
     """Runs the pipeline via selected executor: local or slurm.
     If 'local' is selected, the pipeline is executed locally on a compute node/instance.

diff --git a/workflow/rules/single/peakcall.smk b/workflow/rules/single/peakcall.smk
@@ -9,11 +9,20 @@ genome                          = config['options']['genome']
 tmpdir                          = config['options']['tmp_dir']
 
 
+wildcard_constraints:
+    # - regex to avoid backslashes in name wild cards
+    #   this corrects routing for `ppqt` and 
+    #   `ppqt_tagalign` rules
+    # - also possible to use negative look around regex:
+    #   "^((?!\/).)*$"
+    name                                = "[A-Za-z0-9_-]+"
+
+
 rule MACS2_broad:
     input:
         chip                                = join(bam_dir, "{name}.Q5DD_tagAlign.gz"),
         txt                                 = join(ppqt_dir, "{name}.Q5DD_tagAlign.ppqt.txt"),
-        c_option                            = lambda w: join(bam_dir, f"{chip2input[w.name]}.Q5DD_tagAlign.gz")
+        c_option                            = lambda w: join(bam_dir, f"{chip2input[w.name]}.Q5DD.bam")
                                                 if w.name and chip2input[w.name] else [],
     output:
         join(macsB_dir, "{name}", "{name}_peaks.broadPeak"),
@@ -22,10 +31,11 @@ rule MACS2_broad:
         gsize                           = config['references'][genome]['EFFECTIVEGENOMESIZE'],
         macsver                         = config['tools']['MACSVER'],
         flag                            = lambda w: "-c" if chip2input[w.name] else "",
+        frag_len_script                     = join(bin_path, "ppqt_process.py"),
     shell: 
         """
         module load {params.macsver};
-        ppqt_len=$(awk '{{print $1}}' {input.txt})
+        ppqt_len=$({params.frag_len_script} {input.txt})
         macs2 callpeak \\
             -t {input.chip} {params.flag} {input.c_option} \\
             -g {params.gsize} \\
@@ -41,21 +51,22 @@ rule MACS2_broad:
 
 rule MACS2_narrow:
     input:
-        chip                            = join(bam_dir, "{name}.Q5DD.bam"),
-        txt                             = join(ppqt_dir, "{name}.Q5DD.ppqt.txt"),
-        c_option                        = lambda w: join(bam_dir, f"{chip2input[w.name]}.Q5DD_tagAlign.gz")
-                                            if w.name and chip2input[w.name] else [],
+        chip                                = join(bam_dir, "{name}.Q5DD.bam"),
+        txt                                 = join(ppqt_dir, "{name}.Q5DD.ppqt.txt"),
+        c_option                            = lambda w: join(bam_dir, f"{chip2input[w.name]}.Q5DD.bam")
+                                                if w.name and chip2input[w.name] else [],
     output:
         join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"),
     params:
-        rname                           = 'MACS2_narrow',
-        gsize                           = config['references'][genome]['EFFECTIVEGENOMESIZE'],
-        macsver                         = config['tools']['MACSVER'],
-        flag                            = lambda w: "-c" if chip2input[w.name] else "",
+        rname                               = 'MACS2_narrow',
+        gsize                               = config['references'][genome]['EFFECTIVEGENOMESIZE'],
+        macsver                             = config['tools']['MACSVER'],
+        flag                                = lambda w: "-c" if chip2input[w.name] else "",
+        frag_len_script                     = join(bin_path, "ppqt_process.py"),
     shell: 
         """
         module load {params.macsver};
-        ppqt_len=$(awk '{{print $1}}' {input.txt})
+        ppqt_len=$({params.frag_len_script} {input.txt})
         macs2 callpeak \\
             -t {input.chip} {params.flag} {input.c_option} \\
             -g {params.gsize} \\

diff --git a/workflow/rules/single/trim_align_dedup.smk b/workflow/rules/single/trim_align_dedup.smk
@@ -15,6 +15,15 @@ bw_dir                          = join(workpath, "bigwig")
 ppqt_dir                        = join(bam_dir, "ppqt")
 
 
+wildcard_constraints:
+    # - regex to avoid backslashes in name wild cards
+    #   this corrects routing for `ppqt` and 
+    #   `ppqt_tagalign` rules
+    # - also possible to use negative look around regex:
+    #   "^((?!\/).)*$"
+    name                                = "[A-Za-z0-9_-]+"
+
+
 rule trim:
     """
     Data-processing step to remove adapter sequences and perform quality trimming
@@ -215,25 +224,27 @@ rule ppqt:
     output:                                          
         ppqt                                = join(ppqt_dir, "{name}.{ext}.ppqt.txt"),
         pdf                                 = join(ppqt_dir, "{name}.{ext}.pdf"),
-        txt                                 = join(ppqt_dir, "{name}.{ext}.txt"),
     params:
         rname                               = "ppqt",
         samtoolsver                         = config['tools']['SAMTOOLSVER'],
         rver                                = config['tools']['RVER'],
-        scriptPy                            = join(bin_path, "ppqt_process.py"),
         tmpdir                              = tmpdir,
     container: 
         config['images']['ppqt']
+    threads:
+        cluster['ppqt'].get('threads', cluster['__default__']['threads'])
     shell: 
         """
         if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
         tmp=$(mktemp -d -p "{params.tmpdir}")
         trap 'rm -rf "${{tmp}}"' EXIT
 
-        run_spp.R -c={input.bam} \
-                -savp={output.pdf} -out={output.ppqt} \
-                -tmpdir=/lscratch/$SLURM_JOBID -rf
-        python {params.scriptPy} -i {output.ppqt} -o {output.txt}
+        run_spp.R \\
+            -c={input.bam} \\
+            -savp={output.pdf} \\
+            -out={output.ppqt} \\
+            -p={threads} \\
+            -tmpdir=${{tmp}} 
         """
 
 
@@ -245,32 +256,37 @@ rule ppqt_tagalign:
         pdf                                 = join(ppqt_dir, "{name}.Q5DD_tagAlign.pdf"),
         txt                                 = join(ppqt_dir, "{name}.Q5DD_tagAlign.txt"),
     params:
-        rname                               = "ppqt",
+        rname                               = "ppqt_tagalign",
         samtoolsver                         = config['tools']['SAMTOOLSVER'],
         rver                                = config['tools']['RVER'],
-        scriptPy                            = join(bin_path, "ppqt_process.py"),
         tmpdir                              = tmpdir,
     container: 
         config['images']['ppqt']
+        # "docker://seqeralabs/phantompeakqualtools:latest"
+    threads:
+        cluster['ppqt_tagalign'].get('threads', cluster['__default__']['threads'])
     shell: 
         """
         if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
         tmp=$(mktemp -d -p "{params.tmpdir}")
         trap 'rm -rf "${{tmp}}"' EXIT
 
-        ln -s {input.bam} ${{tmp}}/{wildcards.name}.Q5DD_tagAlign.gz;
-        run_spp.R -c=${{tmp}}/{wildcards.name}.Q5DD_tagAlign.gz \
-                -savp={output.pdf} -out={output.ppqt} \
-                -tmpdir=/lscratch/$SLURM_JOBID -rf
-
-        python {params.scriptPy} -i {output.ppqt} -o {output.txt}
+        ln -s {input.bam} ${{tmp}}/{wildcards.name}.Q5DD_tagAlign.gz
+        zcat ${{tmp}}/{wildcards.name}.Q5DD_tagAlign.gz | head
+        run_spp.R \\
+            -c=${{tmp}}/{wildcards.name}.Q5DD_tagAlign.gz \\
+            -savp={output.pdf} \\
+            -out={output.ppqt} \\
+            -p={threads} \\
+            -tmpdir=${{tmp}}
         """
 
 
 rule bam2bw:
     """
     bamCoverage converts bams to bigwig files for read visialization
     in UCSC Genome Browser or IGV.
+
     @Input:
         Files with extensions: sorted.bam and Q5DD.bam,
         for all samples (treatment and input controls)
@@ -288,6 +304,7 @@ rule bam2bw:
         name                                = "{name}",
         effectivegenomesize                 = config['references'][genome]['EFFECTIVEGENOMESIZE'],
         tmpdir                              = tmpdir,
+        frag_len_script                     = join(bin_path, "ppqt_process.py"),
     threads: 
         int(allocated("threads", "bam2bw", cluster)),
     envmodules: 
@@ -298,7 +315,7 @@ rule bam2bw:
         tmp=$(mktemp -d -p "{params.tmpdir}")
         trap 'rm -rf "${{tmp}}"' EXIT
 
-        ppqt_len=$(awk '{{print $1}}' {input.ppqt})
+        ppqt_len=$({params.frag_len_script} {input.ppqt})
         
         bamCoverage \\
             --bam {input.bam} \\