Merge pull request #35 from epigen/dev
Version 0.5.0 release candidate 2
nsheff authored Jul 13, 2017
2 parents d252298 + 4f0b7c9 commit e4090e6
Showing 7 changed files with 80 additions and 30 deletions.
6 changes: 6 additions & 0 deletions doc/source/changelog.rst
@@ -10,6 +10,12 @@ Changelog

- Adds 'waiting' flag.

- Eliminates extra spaces in reported results

- Pypiper module is version aware

- Updates Success time format to eliminate space

- **v0.4** (*2017-01-23*):

- First major public release!
2 changes: 2 additions & 0 deletions doc/source/features.rst
@@ -21,6 +21,8 @@ Pypiper provides the following benefits:
Pypiper provides functions to put key-value pairs into an easy-to-parse stats file, making it easy to summarize your pipeline results.
- **Simplicity:**
It should only take you 15 minutes to run your first pipeline. The basic documentation is just a few pages long. The codebase itself is also only a few thousand lines of code, making it very lightweight.
- **Dynamic recovery:**
If a job is user-interrupted (with SIGINT or SIGTERM), for example by a cluster resource manager, it will get a dynamic recovery flag set, and the next time the run is started it will automatically pick up where it left off.


Furthermore, Pypiper includes a suite of commonly used pieces of code (toolkits) which the user may use to build pipelines.
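The dynamic-recovery feature described in features.rst above is implemented further down in this commit with a marker file: an interrupted run drops a `lock.recover.<lock_name>` file next to the step's lock file, and the next run treats that pair as permission to overwrite the target and continue. A minimal standalone sketch of that lifecycle, using hypothetical paths rather than Pypiper's actual API:

```python
import os

# Hypothetical locations used only for illustration; Pypiper derives these from
# the pipeline output folder and the lock name passed to run().
OUTFOLDER = "pipeline_output/"
LOCK_NAME = "example_step"

os.makedirs(OUTFOLDER, exist_ok=True)
lock_file = os.path.join(OUTFOLDER, "lock." + LOCK_NAME)
recover_file = os.path.join(OUTFOLDER, "lock.recover." + LOCK_NAME)

def flag_recoverable():
    """On SIGINT/SIGTERM: mark the interrupted step as safe to redo on the next run."""
    if os.path.isfile(lock_file):
        open(recover_file, "w").close()

def lock_is_recoverable():
    """On the next run: a lock plus a recovery flag means 'overwrite and continue'."""
    if os.path.isfile(lock_file) and os.path.isfile(recover_file):
        os.remove(recover_file)   # the flag is spent, so a genuine failure is not re-recovered
        return True
    return False
```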
1 change: 1 addition & 0 deletions pypiper/__init__.py
@@ -1,3 +1,4 @@
from ._version import __version__
from .pypiper import *
from .ngstk import *
from .AttributeDict import *
2 changes: 1 addition & 1 deletion pypiper/_version.py
@@ -1 +1 @@
__version__ = "0.5.0-rc1"
__version__ = "0.5.0-rc2"
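Bumping `_version.py` here, together with the new `from ._version import __version__` line in `__init__.py` above, is what the changelog entry "Pypiper module is version aware" refers to: the version string lives in one file and is re-exported by the package. A small illustrative check, assuming an installed pypiper:

```python
# Single-source version pattern: _version.py defines __version__,
# __init__.py re-exports it, so callers can inspect it at run time.
import pypiper

print(pypiper.__version__)   # e.g. "0.5.0-rc2" for this commit
```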
8 changes: 4 additions & 4 deletions pypiper/ngstk.py
@@ -1137,13 +1137,13 @@ def calculate_FRiP(self, inputBam, inputBed, output, cpus=4):
cmd += " | awk '{{sum+=$5}} END {{print sum}}' > {0}".format(output)
return cmd

def macs2CallPeaks(treatmentBams, outputDir, sampleName, genome, controlBams=None, broad=False, paired=False):
def macs2CallPeaks(self, treatmentBams, outputDir, sampleName, genome, controlBams=None, broad=False, paired=False):
"""
Use MACS2 to call peaks.
"""
sizes = {"hg38": 2.7e9, "hg19": 2.7e9, "mm10": 1.87e9, "dr7": 1.412e9}
sizes = {"hg38": 2.7e9, "hg19": 2.7e9, "mm10": 1.87e9, "dr7": 1.412e9, "mm9": 1.87e9}

cmd = "macs2 callpeak -t {0}".format(treatmentBams if type(treatmentBams) is str else " ".join(treatmentBams))
cmd = self.tools.macs2 + " callpeak -t {0}".format(treatmentBams if type(treatmentBams) is str else " ".join(treatmentBams))
if controlBams is not None:
cmd += " -c {0}".format(controlBams if type(controlBams) is str else " ".join(controlBams))
if paired:
@@ -1157,7 +1157,7 @@ def macs2CallPeaks(treatmentBams, outputDir, sampleName, genome, controlBams=Non
return cmd

def macs2CallPeaksATACSeq(self, treatmentBam, outputDir, sampleName, genome):
sizes = {"hg38": 2.7e9, "hg19": 2.7e9, "mm10": 1.87e9, "dr7": 1.412e9, "mm9": 2.7e9}
sizes = {"hg38": 2.7e9, "hg19": 2.7e9, "mm10": 1.87e9, "dr7": 1.412e9, "mm9": 1.87e9}
cmd = self.tools.macs2 + " callpeak -t {0}".format(treatmentBam)
cmd += " --nomodel --extsize 147 -g {0} -n {1} --outdir {2}".format(sizes[genome], sampleName, outputDir)
return cmd
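Note that `macs2CallPeaks` and `macs2CallPeaksATACSeq` only build a command string for the pipeline to execute later; the genome-size fix above changes the `-g` value that ends up in that string. A rough sketch of what the ATAC-seq variant assembles, assuming `self.tools.macs2` resolves to plain `macs2` and using made-up file names:

```python
# Mirrors macs2CallPeaksATACSeq from the hunk above, with illustrative inputs.
sizes = {"hg38": 2.7e9, "hg19": 2.7e9, "mm10": 1.87e9, "dr7": 1.412e9, "mm9": 1.87e9}

treatment_bam = "sample1.bam"   # hypothetical input
genome = "mm9"
sample_name = "sample1"
output_dir = "peaks/"

cmd = "macs2" + " callpeak -t {0}".format(treatment_bam)
cmd += " --nomodel --extsize 147 -g {0} -n {1} --outdir {2}".format(
    sizes[genome], sample_name, output_dir)
print(cmd)
# macs2 callpeak -t sample1.bam --nomodel --extsize 147 -g 1870000000.0 -n sample1 --outdir peaks/
```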
73 changes: 49 additions & 24 deletions pypiper/pypiper.py
@@ -267,19 +267,23 @@ def start_pipeline(self, args = None, multi = False):
# Wrapped in try blocks so that the code will not fail if the pipeline or pypiper are not git repositories
gitvars = {}
try:
gitvars['pypiper_dir'] = os.path.dirname(os.path.realpath(__file__))
gitvars['pypiper_hash'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(__file__)) + "; git rev-parse --verify HEAD 2>/dev/null", shell=True)
gitvars['pypiper_date'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(__file__)) + "; git show -s --format=%ai HEAD 2>/dev/null", shell=True)
gitvars['pypiper_diff'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(__file__)) + "; git diff --shortstat HEAD 2>/dev/null", shell=True)
gitvars['pypiper_branch'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(__file__)) + "; git branch | grep '*' 2>/dev/null", shell=True)
# pypiper dir
ppd = os.path.dirname(os.path.realpath(__file__))
gitvars['pypiper_dir'] = ppd
gitvars['pypiper_hash'] = subprocess.check_output("cd " + ppd + "; git rev-parse --verify HEAD 2>/dev/null", shell=True)
gitvars['pypiper_date'] = subprocess.check_output("cd " + ppd + "; git show -s --format=%ai HEAD 2>/dev/null", shell=True)
gitvars['pypiper_diff'] = subprocess.check_output("cd " + ppd + "; git diff --shortstat HEAD 2>/dev/null", shell=True)
gitvars['pypiper_branch'] = subprocess.check_output("cd " + ppd + "; git branch | grep '*' 2>/dev/null", shell=True)
except Exception:
pass
try:
gitvars['pipe_dir'] = os.path.dirname(os.path.realpath(sys.argv[0]))
gitvars['pipe_hash'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(sys.argv[0])) + "; git rev-parse --verify HEAD 2>/dev/null", shell=True)
gitvars['pipe_date'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(sys.argv[0])) + "; git show -s --format=%ai HEAD 2>/dev/null", shell=True)
gitvars['pipe_diff'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(sys.argv[0])) + "; git diff --shortstat HEAD 2>/dev/null", shell=True)
gitvars['pipe_branch'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(sys.argv[0])) + "; git branch | grep '*' 2>/dev/null", shell=True)
# pipeline dir
pld = os.path.dirname(os.path.realpath(sys.argv[0]))
gitvars['pipe_dir'] = pld
gitvars['pipe_hash'] = subprocess.check_output("cd " + pld + "; git rev-parse --verify HEAD 2>/dev/null", shell=True)
gitvars['pipe_date'] = subprocess.check_output("cd " + pld + "; git show -s --format=%ai HEAD 2>/dev/null", shell=True)
gitvars['pipe_diff'] = subprocess.check_output("cd " + pld + "; git diff --shortstat HEAD 2>/dev/null", shell=True)
gitvars['pipe_branch'] = subprocess.check_output("cd " + pld + "; git branch | grep '*' 2>/dev/null", shell=True)
except Exception:
pass
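The change above is a pure refactor: the repeated `os.path.dirname(os.path.realpath(...))` expression is computed once as `ppd` (pypiper's own directory) and `pld` (the pipeline script's directory) before the git calls. A hedged sketch of the same idea with a small helper of my own (not Pypiper's API); the branch lookup here uses `git rev-parse --abbrev-ref` instead of the `git branch | grep '*'` pipeline above:

```python
import os
import subprocess

def git_info(repo_dir):
    """Best-effort git metadata for a directory; empty strings if it is not a repo."""
    def run(args):
        try:
            return subprocess.check_output(
                args, cwd=repo_dir, stderr=subprocess.DEVNULL).decode().strip()
        except (subprocess.CalledProcessError, OSError):
            return ""
    return {
        "dir": repo_dir,
        "hash": run(["git", "rev-parse", "--verify", "HEAD"]),
        "date": run(["git", "show", "-s", "--format=%ai", "HEAD"]),
        "diff": run(["git", "diff", "--shortstat", "HEAD"]),
        "branch": run(["git", "rev-parse", "--abbrev-ref", "HEAD"]),
    }

# e.g. metadata for this module's own directory, mirroring ppd above:
info = git_info(os.path.dirname(os.path.realpath(__file__)))
```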

@@ -305,7 +309,7 @@ def start_pipeline(self, args = None, multi = False):
if (gitvars['pypiper_diff'] != ""):
print("* " + "Pypiper diff".rjust(20) + ": " + gitvars['pypiper_diff'].strip())
except KeyError:
# If any of the keys aren't set, that's OK. It just means pypiper isn't being run from a git repo.
# It is ok if keys aren't set, it means pypiper isn't in a git repo.
pass

try:
@@ -373,7 +377,8 @@ def run(self, cmd, target=None, lock_name=None, shell="guess", nofail=False, cle
:type target: str or None
:param lock_name: Name of lock file. Optional.
:type lock_name: str or None
:param shell: If command requires should be run in its own shell. Optional. Default: "guess" -- run will try to determine if the command requires a shell.
:param shell: If command requires should be run in its own shell. Optional. Default: "guess" --
run will try to determine if the command requires a shell.
:type shell: bool
:param nofail: Should the pipeline proceed past a nonzero return from a process? Default: False
Nofail can be used to implement non-essential parts of the pipeline; if these processes fail,
@@ -405,6 +410,9 @@ def run(self, cmd, target=None, lock_name=None, shell="guess", nofail=False, cle
# Prepend "lock." to make it easy to find the lock files.
self.proc_lock_name = lock_name
lock_name = "lock." + lock_name
recover_name = "lock.recover." + self.proc_lock_name
recover_file = os.path.join(self.pipeline_outfolder, recover_name)
recover_mode = False
lock_file = os.path.join(self.pipeline_outfolder, lock_name)
process_return_code = 0
local_maxmem = 0
@@ -436,22 +444,28 @@ def run(self, cmd, target=None, lock_name=None, shell="guess", nofail=False, cle
if os.path.isfile(lock_file):
if self.overwrite_locks:
print("Found lock file; overwriting this target...")
elif os.path.isfile(recover_file):
print("Found lock file; dynamic recovery set. Overwriting this target...")
# remove the lock file which will then be prompty re-created for the current run.
recover_mode = True
# the recovery flag is now spent, so remove so we don't accidently re-recover a failed job
os.remove(recover_file)
else: # don't overwite locks
self._wait_for_lock(lock_file)
# when it's done loop through again to try one more time (to see if the target exists now)
continue

# If you get to this point, the target doesn't exist, and the lock_file doesn't exist
# (or we should overwrite). create the lock (if you can)
if not self.overwrite_locks:
if self.overwrite_locks or recover_mode:
self._create_file(lock_file)
else:
try:
self._create_file_racefree(lock_file) # Create lock
except OSError as e:
if e.errno == errno.EEXIST: # File already exists
print ("Lock file created after test! Looping again.")
continue # Go back to start
else:
self._create_file(lock_file)

##### End tests block
# If you make it past these tests, we should proceed to run the process.
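The block above hinges on `_create_file_racefree`, which must fail if another process created the lock between the existence test and the create. The standard way to get that atomicity, and presumably what the helper does, is `os.open` with `O_CREAT | O_EXCL`; a minimal sketch:

```python
import errno
import os

def create_file_racefree(path):
    """Atomically create an empty file; raise OSError(EEXIST) if it already exists."""
    fd = os.open(path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
    os.close(fd)

# Usage mirrors the loop above: EEXIST means another process grabbed the lock first.
try:
    create_file_racefree("lock.demo")   # hypothetical lock file name
except OSError as e:
    if e.errno == errno.EEXIST:
        pass                            # lock already held; wait and try again
    else:
        raise
```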
@@ -486,7 +500,8 @@ def run(self, cmd, target=None, lock_name=None, shell="guess", nofail=False, cle
break

# Bad idea: don't return follow_result; it seems nice but nothing else
# in your pipeline can depend on this since it won't be run if that command # isn't required because target exists.
# in your pipeline can depend on this since it won't be run if that command
# isn't required because target exists.
return process_return_code


@@ -777,8 +792,7 @@ def _report_profile(self, command, lock_name, elapsed_time, memory):
str(lock_name) + "\t" + \
str(datetime.timedelta(seconds = round(elapsed_time, 2))) + "\t " + \
str(memory)
# messageMarkdown = "> `" + command + "`\t" + str(elapsed_time).strip() + "\t " + str(memory).strip() + "\t" + "_PROF_"
# print(messageMarkdown)

with open(self.pipeline_profile_file, "a") as myfile:
myfile.write(messageRaw + "\n")

@@ -799,7 +813,7 @@ def report_result(self, key, value, annotation=None):

# keep the value in memory:
self.stats_dict[key] = str(value).strip()
messageRaw = key + "\t " + str(value).strip() + "\t" + str(annotation)
messageRaw = key + "\t" + str(value).strip() + "\t" + str(annotation)
messageMarkdown = "> `" + key + "`\t" + str(value).strip()\
+ "\t" + str(annotation) + "\t" + "_RES_"
print(messageMarkdown)
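The whitespace tweaks here (`"\t "` becoming `"\t"`, plus the stripped values) matter because the stats file is meant to be split on tabs; a stray space otherwise ends up embedded in the value field. A small sketch of how a consumer might parse such a file, assuming a hypothetical `*_stats.tsv` path and the key/value/annotation layout shown above:

```python
def read_stats(path):
    """Parse a tab-delimited stats file into {key: (value, annotation)}."""
    stats = {}
    with open(path) as fh:
        for line in fh:
            fields = line.rstrip("\n").split("\t")
            if len(fields) < 2:
                continue                  # skip malformed lines
            key, value = fields[0], fields[1].strip()
            annotation = fields[2].strip() if len(fields) > 2 else None
            stats[key] = (value, annotation)
    return stats

# stats = read_stats("sample_stats.tsv")   # hypothetical stats file name
```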
@@ -949,15 +963,15 @@ def stop_pipeline(self):
self.set_status_flag("completed")
self._cleanup()
self.report_result("Time", str(datetime.timedelta(seconds = self.time_elapsed(self.starttime))))
self.report_result("Success", time.strftime("%m-%d %H:%M:%S"))
self.report_result("Success", time.strftime("%m-%d-%H:%M:%S"))
print("\n##### [Epilogue:]")
print("* " + "Total elapsed time".rjust(20) + ": " + str(datetime.timedelta(seconds = self.time_elapsed(self.starttime))))
# print("Peak memory used: " + str(memory_usage()["peak"]) + "kb")
print("* " + "Peak memory used".rjust(20) + ": " + str(round(self.peak_memory, 2)) + " GB")
self.timestamp("* Pipeline completed at: ".rjust(20))


def fail_pipeline(self, e):
def fail_pipeline(self, e, dynamic_recover=False):
"""
If the pipeline does not complete, this function will stop the pipeline gracefully.
It sets the status flag to failed and skips the normal success completion procedure.
@@ -980,6 +994,17 @@ def fail_pipeline(self, e):
self.set_status_flag("failed")
self.timestamp("### Pipeline failed at: ")
print("Total time: ", str(datetime.timedelta(seconds = self.time_elapsed(self.starttime))))

if dynamic_recover:
# job was terminated, not failed due to a bad process.
# flag this run as recoverable.
if self.proc_lock_name:
# if there is no process locked, then recovery will be automatic.
recover_name = "lock.recover." + self.proc_lock_name
recover_file = os.path.join(self.pipeline_outfolder, recover_name)
print("Setting dynamic recover file: " + recover_file)
self._create_file_racefree(recover_file)

raise e


@@ -995,7 +1020,7 @@ def _signal_term_handler(self, signal, frame):
message = "Got SIGTERM; Failing gracefully..."
with open(self.pipeline_log_file, "a") as myfile:
myfile.write(message + "\n")
self.fail_pipeline(Exception("SIGTERM"))
self.fail_pipeline(Exception("SIGTERM"), dynamic_recover=True)
sys.exit(1)


@@ -1006,7 +1031,7 @@ def _signal_int_handler(self, signal, frame):
message = "Got SIGINT (Ctrl +C); Failing gracefully..."
with open(self.pipeline_log_file, "a") as myfile:
myfile.write(message + "\n")
self.fail_pipeline(Exception("SIGINT"))
self.fail_pipeline(Exception("SIGINT"), dynamic_recover=True)
sys.exit(1)
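For `_signal_term_handler` and `_signal_int_handler` to fire, they have to be registered with the `signal` module, which Pypiper presumably does at pipeline startup (the registration is not part of this diff). A minimal illustration of that wiring with a stand-in fail function:

```python
import signal
import sys

def fail(reason, recover=False):
    # Stand-in for fail_pipeline(Exception(reason), dynamic_recover=recover).
    print("Got " + reason + "; failing gracefully"
          + (" (recoverable)" if recover else ""))
    sys.exit(1)

signal.signal(signal.SIGTERM, lambda signum, frame: fail("SIGTERM", recover=True))
signal.signal(signal.SIGINT, lambda signum, frame: fail("SIGINT", recover=True))
```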


@@ -1382,7 +1407,7 @@ def add_pypiper_args(parser, groups = ["pypiper"], args = [None], all_args = Fal
if arg == "genome":
parser.add_argument(
"-G", "--genome", dest="genome_assembly", type=str,
help="identifier for genome assempbly (required)",
help="identifier for genome assembly (required)",
required=False)
if arg == "single-or-paired":
parser.add_argument(
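The last hunk in pypiper.py fixes the "assempbly" typo in the help string that `add_pypiper_args` attaches when a pipeline requests the `genome` option. For context, a standalone `argparse` sketch of the option being added (the invocation is hypothetical):

```python
import argparse

parser = argparse.ArgumentParser(description="example pipeline")
# Equivalent to what add_pypiper_args registers for the "genome" option:
parser.add_argument("-G", "--genome", dest="genome_assembly", type=str,
                    help="identifier for genome assembly (required)",
                    required=False)

args = parser.parse_args(["--genome", "hg38"])   # hypothetical command line
print(args.genome_assembly)                      # -> hg38
```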
18 changes: 17 additions & 1 deletion test_pypiper.py
@@ -19,6 +19,7 @@ class PypiperTest(unittest.TestCase):
def _clean(cls):
for d in glob.glob("pipeline_output*/"):
if os.path.isdir(d):
print("Removing " + d)
shutil.rmtree(d)

def setUp(self):
@@ -71,11 +72,15 @@ def test_me(self):
self.pp.run(cmd, lock_name="sleep")
print("Elapsed: " + str(self.pp.time_elapsed(stamp)))
self.assertTrue(self.pp.time_elapsed(stamp) > 1)


print("Wait for subprocess...")
self.pp._wait_for_process(self.pp.running_subprocess)
self.pp2.wait=True
self.pp.wait=True



print("Make sure the pipeline respects files already existing...")
target = self.pp.pipeline_outfolder + "tgt"
if os.path.isfile(target): # for repeat runs.
@@ -182,15 +187,26 @@ def test_me(self):

cmd = "thiscommandisbad"

#Should not raise an error
# Should not raise an error
self.pp.run(cmd, target=None, lock_name="badcommand", nofail=True)
self.pp.callprint(cmd, nofail=True)

# Should raise an error
with self.assertRaises(OSError):
self.pp.run(cmd, target=None, lock_name="badcommand")

print("Test dynamic recovery...")
# send sigint
self.pp.proc_lock_name="sleep"
with self.assertRaises(Exception):
self.pp._signal_int_handler(None, None)


sleep_lock = self.pp.pipeline_outfolder + "lock.sleep"
#subprocess.Popen("sleep .5; rm " + sleep_lock, shell=True)
self.pp._create_file(sleep_lock)
cmd = "echo hello"
self.pp.run(cmd, lock_name="sleep")

#subprocess.Popen("sleep .5; rm " + sleep_lock, shell=True)

