Skip to content

Commit

Permalink
fix: handle tmpdir to frip rules, enforce tmpdir for pybedtools, mult… (
Browse files Browse the repository at this point in the history
#51)

* fix: handle tmpdir to frip rules, enforce tmpdir for pybedtools, multithread frip.py

* fix: export to  not params.tmpdir

* fix: correct frip table routing and frip output scripting
  • Loading branch information
rroutsong authored Nov 4, 2024
1 parent d5c6d18 commit be2a788
Show file tree
Hide file tree
Showing 4 changed files with 165 additions and 87 deletions.
209 changes: 136 additions & 73 deletions bin/frip.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,153 +12,216 @@
##########################################
# Modules
import argparse
from argparse import RawTextHelpFormatter
from pybedtools import BedTool
import pysam
import pandas as pd
import os
from argparse import RawTextHelpFormatter
from pybedtools import BedTool
from pybedtools.helpers import set_tempdir
from textwrap import dedent

##########################################
# Functions


def split_infiles(infiles):
"""
breaks the infile string with space-delimited file names and
"""
breaks the infile string with space-delimited file names and
creates a list
"""
infileList = infiles.strip("\'").strip('\"').split(" ")
infileList = infiles.strip("'").strip('"').split(" ")
if len(infileList) == 1:
infileList = infileList[0].split(";")
return(infileList)
return infileList


def count_reads_in_bed(bam, bedfile, genomefile):
"""
some of this comes directly from the pybedtools site; read in
bed (or bed-like) file, sort it, and then count the number of
some of this comes directly from the pybedtools site; read in
bed (or bed-like) file, sort it, and then count the number of
reads within the regions
"""
bedinfo = BedTool(bedfile)
bedinfo.sort(g=genomefile)
return (
BedTool(bam).intersect( bedinfo, bed=True, stream=True, )
).count()
BedTool(bam).intersect(
bedinfo,
bed=True,
stream=True,
)
).count()


def count_reads_in_bam(bam, t=1):
"""count the number of reads in a given bam file"""
return pysam.AlignmentFile(bam, threads=t).mapped

def count_reads_in_bam(bam):
""" count the number of reads in a given bam file """
return( pysam.AlignmentFile(bam).mapped )

def calculate_frip(nreads, noverlaps):
""" calculate FRiP score from nreads and noverlaps """
return( float(noverlaps) / nreads )
"""calculate FRiP score from nreads and noverlaps"""
return float(noverlaps) / nreads


def measure_bedfile_coverage(bedfile, genomefile):
""" calculate the number of bases covered by a given bed file """
"""calculate the number of bases covered by a given bed file"""
bedinfo = BedTool(bedfile)
return( bedinfo.sort(g=genomefile).total_coverage() )
return bedinfo.sort(g=genomefile).total_coverage()


def clip_bamfile_name(bamfile):
"""
clip bam file name for table/plotting purposes; assumes file
"""
clip bam file name for table/plotting purposes; assumes file
naming system matches that of Pipeliner
"""
sample = bamfile.split("/")[-1].split(".")[0]
condition = ".".join(bamfile.split("/")[-1].split(".")[1:-1])
return( sample, condition )
condition = ".".join(bamfile.split("/")[-1].split(".")[1:-1])
return (sample, condition)


def clip_bedfile_name(bedfile,filetype):
def clip_bedfile_name(bedfile, filetype):
"""
clip bed file name for table/plotting purposes; assumes file
clip bed file name for table/plotting purposes; assumes file
naming system matches that of Pipeliner
"""
if filetype == "":
toolused = bedfile.split("/")[-3]
sample = bedfile.split("/")[-2]
else:
toolused = filetype
sample = bedfile.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks")
return( toolused, sample )
sample = (
bedfile.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks")
)
return (toolused, sample)

def process_files(bamfile, bedfiles, genome, filetypes):
"""
this is the main function to take in list of input files and
put out an array containing key file name information, read

def process_files(bamfile, bedfiles, genome, filetypes, threads):
"""
this is the main function to take in list of input files and
put out an array containing key file name information, read
counts, and FRiP scores
"""
bedfileL = bedfiles
filetypesL = filetypes
out = [[ "bedtool", "bedsample", "bamsample", "bamcondition",
"n_reads", "n_overlap_reads", "FRiP", "n_basesM" ]]
nreads = count_reads_in_bam(bamfile)
out = [
[
"bedtool",
"bedsample",
"bamsample",
"bamcondition",
"n_reads",
"n_overlap_reads",
"FRiP",
"n_basesM",
]
]
nreads = count_reads_in_bam(bamfile, threads)
(bamsample, condition) = clip_bamfile_name(bamfile)
for i in range(len(bedfileL)):
bed = bedfileL[i]
if len(filetypesL) > 1:
filetype = filetypesL[i]
else:
filetype = filetypesL[0]
(bedtool, bedsample) = clip_bedfile_name(bed,filetype)
(bedtool, bedsample) = clip_bedfile_name(bed, filetype)
noverlaps = count_reads_in_bed(bamfile, bed, genome)
frip = calculate_frip(nreads, noverlaps)
nbases = measure_bedfile_coverage(bed, genome) / 1000000
out.append( [bedtool, bedsample, bamsample, condition,
nreads, noverlaps, frip, nbases] )
out.append(
[bedtool, bedsample, bamsample, condition, nreads, noverlaps, frip, nbases]
)
out2 = pd.DataFrame(out[1:], columns=out[0])
return(out2)
return out2

def create_outfile_name(bamfile, outroot):
""" uses outroot to create the output file name """
(bamsample, condition) = clip_bamfile_name(bamfile)
outtable = bamsample + "." + condition + "." + "FRiP_table.txt"
if outroot != "":
outtable = outroot + "." + outtable
return(outtable)

def write_table(out2, outtable):
out2.to_csv(outtable,sep='\t',index=False)
out2.to_csv(outtable, sep="\t", index=False)


###############################################
# Main

def main():
desc="""
This function takes a space-delimited or semi-colon delimited list
of bed-like files (extensions must be recognizable by bedtools)
and a single bam file. It will then calculate the FRiP score for
all possible combinations of files and save the information in a
txt file. It will also calculate the number of bases covered by
each bed-like file. Note: this function assumes that the file
naming system of the input files matches that of Pipeliner.
"""

parser = argparse.ArgumentParser(description=desc, formatter_class=RawTextHelpFormatter)
parser.add_argument('-p', nargs = '+', required=True, type=str, help='A space- or semicolon-delimited list of peakfiles \
(or bed-like files).')
parser.add_argument('-b', required=True, type=str, help='The name of a bamfile to analyze.')
parser.add_argument('-g', required=True, type=str, help='The name of the .genome file so bedtools knows the \
size of every chromosome.')
parser.add_argument('-o', required=True, type=str, help='The root name of the multiple output files. Default:""')
parser.add_argument('-t', required=False, default=[""], type=list, help='A space- \
or semicolon-delimited list of input file sources/types. Only needed when \
source of bed file is not built into the script. Default: ""')
def main():
desc = dedent(
"""
This function takes a space-delimited or semi-colon delimited list
of bed-like files (extensions must be recognizable by bedtools)
and a single bam file. It will then calculate the FRiP score for
all possible combinations of files and save the information in a
txt file. It will also calculate the number of bases covered by
each bed-like file. Note: this function assumes that the file
naming system of the input files matches that of Pipeliner.
"""
)

parser = argparse.ArgumentParser(
description=desc, formatter_class=RawTextHelpFormatter
)
parser.add_argument(
"-p",
nargs="+",
required=True,
type=str,
help="A space- or semicolon-delimited list of peakfiles \
(or bed-like files).",
)
parser.add_argument(
"-b", required=True, type=str, help="The name of a bamfile to analyze."
)
parser.add_argument(
"-g",
required=True,
type=str,
help="The name of the .genome file so bedtools knows the \
size of every chromosome.",
)
parser.add_argument(
"-o",
required=True,
type=str,
help='The root name of the multiple output files. Default:""',
)

parser.add_argument(
"-x",
"--threads",
required=False,
dest="threads",
type=int,
default=1,
help="Number of threads available to use for pysam.AlignmentFile",
)
parser.add_argument(
"-t",
required=False,
default=[""],
type=list,
help='A space- \
or semicolon-delimited list of input file sources/types. Only needed when \
source of bed file is not built into the script. Default: ""',
)

args = parser.parse_args()
bedfiles = args.p
bamfile = args.b
genomefile = args.g
outroot = args.o
outfile = args.o
filetypes = args.t
threads = args.threads
if "TMPDIR" in os.environ:
set_tempdir(os.environ["TMPDIR"])

out2 = process_files(bamfile, bedfiles, genomefile, filetypes, threads)
write_table(out2, outfile)

out2 = process_files(bamfile, bedfiles, genomefile, filetypes)
outtable = create_outfile_name(bamfile, outroot)
write_table(out2, outtable)

if __name__ == '__main__':
if __name__ == "__main__":
main()

###############################################
# example cases

#bedfiles = "macs_broad/mWT_HCF1_mm_i81/mWT_HCF1_mm_i81_peaks.broadPeak macs_broad/mWT_HCF1_mm_i89/mWT_HCF1_mm_i89_peaks.broadPeak"
#bamfiles = "bam/Input_mm_i95.sorted.Q5DD.bam bam/mWT_HCF1_mm_i81.sorted.Q5DD.bam bam/mWT_HCF1_mm_i89.sorted.Q5DD.bam"
#genomefile = "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/indexes/mm10.fa.sizes"
#out2 = pd.read_csv("FRIP_test.txt",sep="\t")
# bedfiles = "macs_broad/mWT_HCF1_mm_i81/mWT_HCF1_mm_i81_peaks.broadPeak macs_broad/mWT_HCF1_mm_i89/mWT_HCF1_mm_i89_peaks.broadPeak"
# bamfiles = "bam/Input_mm_i95.sorted.Q5DD.bam bam/mWT_HCF1_mm_i81.sorted.Q5DD.bam bam/mWT_HCF1_mm_i89.sorted.Q5DD.bam"
# genomefile = "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/indexes/mm10.fa.sizes"
# out2 = pd.read_csv("FRIP_test.txt", sep="\t")
12 changes: 12 additions & 0 deletions config/cluster.json
Original file line number Diff line number Diff line change
Expand Up @@ -143,5 +143,17 @@
"ntasks": "--ntasks=28",
"ntasks_per_core": "--ntasks-per-core=1",
"exclusive": "--exclusive"
},
"FRiP_macsN": {
"threads": "16",
"gres": "lscratch:400"
},
"FRiP_macsB": {
"threads": "16",
"gres": "lscratch:400"
},
"FRiP_Genrich": {
"threads": "16",
"gres": "lscratch:400"
}
}
2 changes: 1 addition & 1 deletion workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ elif assay in ["atac", "chip"]:
rule_all_ins.append(join(workpath, "multiqc_report.html"))
rule_all_ins.extend(expand(join(qc_dir, "{name}.preseq.dat"), name=samples))
rule_all_ins.extend(
expand(join(peakqc_dir, "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=chips)
expand(join(peakqc_dir, "{PeakTool}", "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=chips)
)
rule_all_ins.extend(expand(join(bam_dir, "{name}.{ext}"), name=samples, ext=extaln))
rule_all_ins.extend(expand(join(bw_dir, "{name}.{ext}.RPGC.bw"), name=samples, ext=["sorted", "Q5DD"]))
Expand Down
Loading

0 comments on commit be2a788

Please sign in to comment.