Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

calmaxtag subcommand #39

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,5 @@ MACS2/mnumpy.pxd
MACS2/numpy.pxd

test/NOSPMR.ppois.bdg_ppois.bdg

johnChanges
125 changes: 123 additions & 2 deletions MACS2/OptValidator.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,8 +185,8 @@ def opt_validate ( options ):
options.argtxt += "# Larger dataset will be scaled towards smaller dataset.\n"

if options.ratio != 1.0:
options.argtxt += "# Using a custom scaling factor: %.2e\n" % (options.ratio)
options.argtxt += "# Using a custom scaling factor: %.2e\n" % (options.ratio)

if options.cfile:
options.argtxt += "# Range for calculating regional lambda is: %d bps and %d bps\n" % (options.smalllocal,options.largelocal)
else:
Expand Down Expand Up @@ -804,3 +804,124 @@ def opt_validate_bdgcmp ( options ):

return options



def opt_validate_calmaxtag ( options ):
"""Validate options from a OptParser object.

Ret: Validated options object.
"""
# gsize
try:
options.gsize = efgsize[options.gsize]
except:
try:
options.gsize = float(options.gsize)
except:
logging.error("Error when interpreting --gsize option: %s" % options.gsize)
logging.error("Available shortcuts of effective genome sizes are %s" % ",".join(efgsize.keys()))
sys.exit(1)

# format

options.gzip_flag = False # if the input is gzip file

options.format = options.format.upper()
if options.format == "ELAND":
options.parser = ELANDResultParser
elif options.format == "BED":
options.parser = BEDParser
elif options.format == "ELANDMULTI":
options.parser = ELANDMultiParser
elif options.format == "ELANDEXPORT":
options.parser = ELANDExportParser
elif options.format == "SAM":
options.parser = SAMParser
elif options.format == "BAM":
options.parser = BAMParser
options.gzip_flag = True
elif options.format == "BOWTIE":
options.parser = BowtieParser
elif options.format == "AUTO":
options.parser = guess_parser
else:
logging.error("Format \"%s\" cannot be recognized!" % (options.format))
sys.exit(1)

# uppercase the format string
options.format = options.format.upper()

# logging object
logging.basicConfig(level=10,
format='%(levelname)-5s @ %(asctime)s: %(message)s ',
datefmt='%a, %d %b %Y %H:%M:%S',
stream=sys.stderr,
filemode="w"
)

options.error = logging.critical # function alias
options.warn = logging.warning
options.debug = logging.debug
options.info = logging.info

return options

def opt_validate_calmaxtag ( options ):
"""Validate options from a OptParser object.

Ret: Validated options object.
"""
# gsize
try:
options.gsize = efgsize[options.gsize]
except:
try:
options.gsize = float(options.gsize)
except:
logging.error("Error when interpreting --gsize option: %s" % options.gsize)
logging.error("Available shortcuts of effective genome sizes are %s" % ",".join(efgsize.keys()))
sys.exit(1)

# format

options.gzip_flag = False # if the input is gzip file

options.format = options.format.upper()
if options.format == "ELAND":
options.parser = ELANDResultParser
elif options.format == "BED":
options.parser = BEDParser
elif options.format == "ELANDMULTI":
options.parser = ELANDMultiParser
elif options.format == "ELANDEXPORT":
options.parser = ELANDExportParser
elif options.format == "SAM":
options.parser = SAMParser
elif options.format == "BAM":
options.parser = BAMParser
options.gzip_flag = True
elif options.format == "BOWTIE":
options.parser = BowtieParser
elif options.format == "AUTO":
options.parser = guess_parser
else:
logging.error("Format \"%s\" cannot be recognized!" % (options.format))
sys.exit(1)

# uppercase the format string
options.format = options.format.upper()

# logging object
logging.basicConfig(level=10,
format='%(levelname)-5s @ %(asctime)s: %(message)s ',
datefmt='%a, %d %b %Y %H:%M:%S',
stream=sys.stderr,
filemode="w"
)

options.error = logging.critical # function alias
options.warn = logging.warning
options.debug = logging.debug
options.info = logging.info

return options
101 changes: 101 additions & 0 deletions MACS2/calmaxtag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Time-stamp: <2014-06-14 10:59:30 Tao Liu/JohnUrban>

"""Description: Calculate maximum duplicate (redundant) reads allowed depending on sequencing depth and genome size.

Copyright (c) 2011 Tao Liu <taoliu@jimmy.harvard.edu>

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file COPYING included
with the distribution).

@status: release candidate
@version: $Id$
@author: Yong Zhang, Tao Liu
@contact: taoliu@jimmy.harvard.edu
"""

# ------------------------------------
# python modules
# ------------------------------------

import os
import sys
import logging

# ------------------------------------
# own python modules
# ------------------------------------
from MACS2.OptValidator import opt_validate_calmaxtag as opt_validate
from MACS2.cProb import binomial_cdf_inv
from MACS2.Constants import *
# ------------------------------------
# Main function
# ------------------------------------
def run( o_options ):
"""The calculation based on the binomial distribution for how many tags to allow at one site.

"""
# Parse options...
options = opt_validate( o_options )
# end of parsing commandline options
info = options.info
warn = options.warn
debug = options.debug
error = options.error

if options.outputfile != "stdout":
outfhd = open( os.path.join( options.outdir, options.outputfile ) ,"w" )
else:
outfhd = sys.stdout

#1 Read tag files
if options.ifile:
if not options.quiet:
info("counting tags from input files...")
fwtrack = load_tag_files_options (options)
t0 = fwtrack.total
if not options.quiet:
info(" total tags in alignment file: %d" % (t0))
info("tag size = %d" % options.tsize)
fwtrack.fw = options.tsize
elif options.numTags:
t0 = options.numTags

if not options.quiet:
info("calculate max duplicate tags in single position based on binomal distribution...")
max_dup_tags = cal_max_dup_tags(options.gsize,t0)

if not options.quiet:
info(" max_dup_tags based on binomal = %d" % (max_dup_tags))
else:
print max_dup_tags


def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ):
"""Calculate the maximum duplicated tag number based on genome
size, total tag number and a p-value based on binomial
distribution. Brute force algorithm to calculate reverse CDF no
more than MAX_LAMBDA(100000).

"""
return binomial_cdf_inv(1-p,tags_number,1.0/genome_size)

def load_tag_files_options ( options ):
"""From the options, load alignment tags.

"""
if not options.quiet:
options.info("read alignment tags...")
tp = options.parser(options.ifile)

if not options.tsize: # override tsize if user specified --tsize
ttsize = tp.tsize()
options.tsize = ttsize

treat = tp.build_fwtrack()
treat.sort()

if not options.quiet:
options.info("tag size is determined as %d bps" % options.tsize)
return treat

33 changes: 32 additions & 1 deletion bin/macs2
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,10 @@ def main():
# pileup alignment results with a given extension method
from MACS2.pileup import run
run( args )
elif subcommand == "calmaxtag":
# user only wants to calculate max tags given input paramters
from MACS2.calmaxtag import run
run( args )

def prepare_argparser ():
"""Prepare optparser object. New options will be added in this
Expand Down Expand Up @@ -141,6 +145,9 @@ def prepare_argparser ():

# command for 'refinepeak'
add_refinepeak_parser( subparsers )

# command for 'calmaxtag'
add_calmaxtag_parser( subparsers )

return argparser

Expand Down Expand Up @@ -553,7 +560,31 @@ def add_pileup_parser( subparsers ):
help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" )
return


def add_calmaxtag_parser( subparsers ):
argparser_calmaxtag = subparsers.add_parser("calmaxtag",
help = "Simply calculate maximum duplicate tags allowed at a given site with given input paramters" )
input = argparser_calmaxtag.add_mutually_exclusive_group()
input.add_argument( "-i", dest = "ifile", type = str, required = False,
help = "Sequencing alignment file. Provide reads file or number of reads with -n." )
input.add_argument( "-n", dest = "numTags", type = int, required = False,
help = "Number of reads in input file (this allows macs2 to skip counting). Provide number of reads or reads file -i." )
argparser_calmaxtag.add_argument( "-f", "--format", dest = "format", type = str,
choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE"),
help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\". The default AUTO option will let '%(prog)s' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: \"AUTO\"",
default = "AUTO" )
argparser_calmaxtag.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs", required=True,
help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), DEFAULT:hs" )
argparser_calmaxtag.add_argument( "-p", "--pvalue", dest = "pvalue", type = float,
help = "Pvalue cutoff for binomial distribution test. DEFAULT:1e-5" )
argparser_calmaxtag.add_argument( "-s", "--tsize", dest = "tsize", type = int,
help = "Tag size. This will overide the auto detected tag size. DEFAULT: Not set" )
argparser_calmaxtag.add_argument( "-q", "--quiet", dest = "quiet", action = "store_true",
help = "When using -n, this flag will suppress all messages except answer. When using -i, it will suppress most messages except answer. DEFAULT: Not set" )
add_outdir_option( argparser_calmaxtag )
argparser_calmaxtag.add_argument( "-o", "--ofile", dest = "outputfile", type = str,
help = "Output BED file name. If not specified, will write to standard output. DEFAULT: stdout",
default = "stdout" )
return
if __name__ == '__main__':
try:
main()
Expand Down