Merge pull request #35 from epigen/dev
Version 0.5.0 release candidate 2
nsheff authored Jul 13, 2017
2 parents d252298 + 4f0b7c9 commit e4090e6
Showing 7 changed files with 80 additions and 30 deletions.
6 changes: 6 additions & 0 deletions doc/source/changelog.rst
@@ -10,6 +10,12 @@ Changelog

- Adds 'waiting' flag.

- Eliminates extra spaces in reported results

- Pypiper module is version aware

- Updates Success time format to eliminate space

- **v0.4** (*2017-01-23*):

- First major public release!
2 changes: 2 additions & 0 deletions doc/source/features.rst
@@ -21,6 +21,8 @@ Pypiper provides the following benefits:
Pypiper provides functions to put key-value pairs into an easy-to-parse stats file, making it easy to summarize your pipeline results.
- **Simplicity:**
It should only take you 15 minutes to run your first pipeline. The basic documentation is just a few pages long. The codebase itself is also only a few thousand lines of code, making it very lightweight.
- **Dynamic recovery:**
If a job is user-interrupted (with SIGINT or SIGTERM), for example by a cluster resource manager, it will get a dynamic recovery flag set, and the next time the run is started it will automatically pick up where it left off.


Furthermore, Pypiper includes a suite of commonly used pieces of code (toolkits) which the user may use to build pipelines.
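The dynamic-recovery feature described in features.rst above is implemented further down in this commit with a marker file: an interrupted run drops a `lock.recover.<lock_name>` file next to the step's lock file, and the next run treats that pair as permission to overwrite the target and continue. A minimal standalone sketch of that lifecycle, using hypothetical paths rather than Pypiper's actual API:

```python
import os

# Hypothetical locations used only for illustration; Pypiper derives these from
# the pipeline output folder and the lock name passed to run().
OUTFOLDER = "pipeline_output/"
LOCK_NAME = "example_step"

os.makedirs(OUTFOLDER, exist_ok=True)
lock_file = os.path.join(OUTFOLDER, "lock." + LOCK_NAME)
recover_file = os.path.join(OUTFOLDER, "lock.recover." + LOCK_NAME)

def flag_recoverable():
    """On SIGINT/SIGTERM: mark the interrupted step as safe to redo on the next run."""
    if os.path.isfile(lock_file):
        open(recover_file, "w").close()

def lock_is_recoverable():
    """On the next run: a lock plus a recovery flag means 'overwrite and continue'."""
    if os.path.isfile(lock_file) and os.path.isfile(recover_file):
        os.remove(recover_file)   # the flag is spent, so a genuine failure is not re-recovered
        return True
    return False
```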
1 change: 1 addition & 0 deletions pypiper/__init__.py
@@ -1,3 +1,4 @@
from ._version import __version__
from .pypiper import *
from .ngstk import *
from .AttributeDict import *
2 changes: 1 addition & 1 deletion pypiper/_version.py
@@ -1 +1 @@
__version__ = "0.5.0-rc1"
__version__ = "0.5.0-rc2"
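Bumping `_version.py` here, together with the new `from ._version import __version__` line in `__init__.py` above, is what the changelog entry "Pypiper module is version aware" refers to: the version string lives in one file and is re-exported by the package. A small illustrative check, assuming an installed pypiper:

```python
# Single-source version pattern: _version.py defines __version__,
# __init__.py re-exports it, so callers can inspect it at run time.
import pypiper

print(pypiper.__version__)   # e.g. "0.5.0-rc2" for this commit
```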
8 changes: 4 additions & 4 deletions pypiper/ngstk.py
@@ -1137,13 +1137,13 @@ def calculate_FRiP(self, inputBam, inputBed, output, cpus=4):
cmd += " | awk '{{sum+=$5}} END {{print sum}}' > {0}".format(output)
return cmd

def macs2CallPeaks(treatmentBams, outputDir, sampleName, genome, controlBams=None, broad=False, paired=False):
def macs2CallPeaks(self, treatmentBams, outputDir, sampleName, genome, controlBams=None, broad=False, paired=False):
"""
Use MACS2 to call peaks.
"""
sizes = {"hg38": 2.7e9, "hg19": 2.7e9, "mm10": 1.87e9, "dr7": 1.412e9}
sizes = {"hg38": 2.7e9, "hg19": 2.7e9, "mm10": 1.87e9, "dr7": 1.412e9, "mm9": 1.87e9}

cmd = "macs2 callpeak -t {0}".format(treatmentBams if type(treatmentBams) is str else " ".join(treatmentBams))
cmd = self.tools.macs2 + " callpeak -t {0}".format(treatmentBams if type(treatmentBams) is str else " ".join(treatmentBams))
if controlBams is not None:
cmd += " -c {0}".format(controlBams if type(controlBams) is str else " ".join(controlBams))
if paired:
@@ -1157,7 +1157,7 @@ def macs2CallPeaks(treatmentBams, outputDir, sampleName, genome, controlBams=Non
return cmd

def macs2CallPeaksATACSeq(self, treatmentBam, outputDir, sampleName, genome):
sizes = {"hg38": 2.7e9, "hg19": 2.7e9, "mm10": 1.87e9, "dr7": 1.412e9, "mm9": 2.7e9}
sizes = {"hg38": 2.7e9, "hg19": 2.7e9, "mm10": 1.87e9, "dr7": 1.412e9, "mm9": 1.87e9}
cmd = self.tools.macs2 + " callpeak -t {0}".format(treatmentBam)
cmd += " --nomodel --extsize 147 -g {0} -n {1} --outdir {2}".format(sizes[genome], sampleName, outputDir)
return cmd
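Note that `macs2CallPeaks` and `macs2CallPeaksATACSeq` only build a command string for the pipeline to execute later; the genome-size fix above changes the `-g` value that ends up in that string. A rough sketch of what the ATAC-seq variant assembles, assuming `self.tools.macs2` resolves to plain `macs2` and using made-up file names:

```python
# Mirrors macs2CallPeaksATACSeq from the hunk above, with illustrative inputs.
sizes = {"hg38": 2.7e9, "hg19": 2.7e9, "mm10": 1.87e9, "dr7": 1.412e9, "mm9": 1.87e9}

treatment_bam = "sample1.bam"   # hypothetical input
genome = "mm9"
sample_name = "sample1"
output_dir = "peaks/"

cmd = "macs2" + " callpeak -t {0}".format(treatment_bam)
cmd += " --nomodel --extsize 147 -g {0} -n {1} --outdir {2}".format(
    sizes[genome], sample_name, output_dir)
print(cmd)
# macs2 callpeak -t sample1.bam --nomodel --extsize 147 -g 1870000000.0 -n sample1 --outdir peaks/
```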
73 changes: 49 additions & 24 deletions pypiper/pypiper.py
@@ -267,19 +267,23 @@ def start_pipeline(self, args = None, multi = False):
# Wrapped in try blocks so that the code will not fail if the pipeline or pypiper are not git repositories
gitvars = {}
try:
gitvars['pypiper_dir'] = os.path.dirname(os.path.realpath(__file__))
gitvars['pypiper_hash'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(__file__)) + "; git rev-parse --verify HEAD 2>/dev/null", shell=True)
gitvars['pypiper_date'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(__file__)) + "; git show -s --format=%ai HEAD 2>/dev/null", shell=True)
gitvars['pypiper_diff'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(__file__)) + "; git diff --shortstat HEAD 2>/dev/null", shell=True)
gitvars['pypiper_branch'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(__file__)) + "; git branch | grep '*' 2>/dev/null", shell=True)
# pypiper dir
ppd = os.path.dirname(os.path.realpath(__file__))
gitvars['pypiper_dir'] = ppd
gitvars['pypiper_hash'] = subprocess.check_output("cd " + ppd + "; git rev-parse --verify HEAD 2>/dev/null", shell=True)
gitvars['pypiper_date'] = subprocess.check_output("cd " + ppd + "; git show -s --format=%ai HEAD 2>/dev/null", shell=True)
gitvars['pypiper_diff'] = subprocess.check_output("cd " + ppd + "; git diff --shortstat HEAD 2>/dev/null", shell=True)
gitvars['pypiper_branch'] = subprocess.check_output("cd " + ppd + "; git branch | grep '*' 2>/dev/null", shell=True)
except Exception:
pass
try:
gitvars['pipe_dir'] = os.path.dirname(os.path.realpath(sys.argv[0]))
gitvars['pipe_hash'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(sys.argv[0])) + "; git rev-parse --verify HEAD 2>/dev/null", shell=True)
gitvars['pipe_date'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(sys.argv[0])) + "; git show -s --format=%ai HEAD 2>/dev/null", shell=True)
gitvars['pipe_diff'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(sys.argv[0])) + "; git diff --shortstat HEAD 2>/dev/null", shell=True)
gitvars['pipe_branch'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(sys.argv[0])) + "; git branch | grep '*' 2>/dev/null", shell=True)
# pipeline dir
pld = os.path.dirname(os.path.realpath(sys.argv[0]))
gitvars['pipe_dir'] = pld
gitvars['pipe_hash'] = subprocess.check_output("cd " + pld + "; git rev-parse --verify HEAD 2>/dev/null", shell=True)
gitvars['pipe_date'] = subprocess.check_output("cd " + pld + "; git show -s --format=%ai HEAD 2>/dev/null", shell=True)
gitvars['pipe_diff'] = subprocess.check_output("cd " + pld + "; git diff --shortstat HEAD 2>/dev/null", shell=True)
gitvars['pipe_branch'] = subprocess.check_output("cd " + pld + "; git branch | grep '*' 2>/dev/null", shell=True)
except Exception:
pass
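The change above is a pure refactor: the repeated `os.path.dirname(os.path.realpath(...))` expression is computed once as `ppd` (pypiper's own directory) and `pld` (the pipeline script's directory) before the git calls. A hedged sketch of the same idea with a small helper of my own (not Pypiper's API); the branch lookup here uses `git rev-parse --abbrev-ref` instead of the `git branch | grep '*'` pipeline above:

```python
import os
import subprocess

def git_info(repo_dir):
    """Best-effort git metadata for a directory; empty strings if it is not a repo."""
    def run(args):
        try:
            return subprocess.check_output(
                args, cwd=repo_dir, stderr=subprocess.DEVNULL).decode().strip()
        except (subprocess.CalledProcessError, OSError):
            return ""
    return {
        "dir": repo_dir,
        "hash": run(["git", "rev-parse", "--verify", "HEAD"]),
        "date": run(["git", "show", "-s", "--format=%ai", "HEAD"]),
        "diff": run(["git", "diff", "--shortstat", "HEAD"]),
        "branch": run(["git", "rev-parse", "--abbrev-ref", "HEAD"]),
    }

# e.g. metadata for this module's own directory, mirroring ppd above:
info = git_info(os.path.dirname(os.path.realpath(__file__)))
```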

@@ -305,7 +309,7 @@ def start_pipeline(self, args = None, multi = False):
if (gitvars['pypiper_diff'] != ""):
print("* " + "Pypiper diff".rjust(20) + ": " + gitvars['pypiper_diff'].strip())
except KeyError:
# If any of the keys aren't set, that's OK. It just means pypiper isn't being run from a git repo.
# It is ok if keys aren't set, it means pypiper isn't in a git repo.
pass

try:
@@ -373,7 +377,8 @@ def run(self, cmd, target=None, lock_name=None, shell="guess", nofail=False, cle
:type target: str or None
:param lock_name: Name of lock file. Optional.
:type lock_name: str or None
:param shell: If command requires should be run in its own shell. Optional. Default: "guess" -- run will try to determine if the command requires a shell.
:param shell: If command requires should be run in its own shell. Optional. Default: "guess" --
run will try to determine if the command requires a shell.
:type shell: bool
:param nofail: Should the pipeline proceed past a nonzero return from a process? Default: False
Nofail can be used to implement non-essential parts of the pipeline; if these processes fail,
@@ -405,6 +410,9 @@ def run(self, cmd, target=None, lock_name=None, shell="guess", nofail=False, cle
# Prepend "lock." to make it easy to find the lock files.
self.proc_lock_name = lock_name
lock_name = "lock." + lock_name
recover_name = "lock.recover." + self.proc_lock_name
recover_file = os.path.join(self.pipeline_outfolder, recover_name)
recover_mode = False
lock_file = os.path.join(self.pipeline_outfolder, lock_name)
process_return_code = 0
local_maxmem = 0
@@ -436,22 +444,28 @@ def run(self, cmd, target=None, lock_name=None, shell="guess", nofail=False, cle
if os.path.isfile(lock_file):
if self.overwrite_locks:
print("Found lock file; overwriting this target...")
elif os.path.isfile(recover_file):
print("Found lock file; dynamic recovery set. Overwriting this target...")
# remove the lock file which will then be prompty re-created for the current run.
recover_mode = True
# the recovery flag is now spent, so remove so we don't accidently re-recover a failed job
os.remove(recover_file)
else: # don't overwite locks
self._wait_for_lock(lock_file)
# when it's done loop through again to try one more time (to see if the target exists now)
continue

# If you get to this point, the target doesn't exist, and the lock_file doesn't exist
# (or we should overwrite). create the lock (if you can)
if not self.overwrite_locks:
if self.overwrite_locks or recover_mode:
self._create_file(lock_file)
else:
try:
self._create_file_racefree(lock_file) # Create lock
except OSError as e:
if e.errno == errno.EEXIST: # File already exists
print ("Lock file created after test! Looping again.")
continue # Go back to start
else:
self._create_file(lock_file)

##### End tests block
# If you make it past these tests, we should proceed to run the process.
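The block above hinges on `_create_file_racefree`, which must fail if another process created the lock between the existence test and the create. The standard way to get that atomicity, and presumably what the helper does, is `os.open` with `O_CREAT | O_EXCL`; a minimal sketch:

```python
import errno
import os

def create_file_racefree(path):
    """Atomically create an empty file; raise OSError(EEXIST) if it already exists."""
    fd = os.open(path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
    os.close(fd)

# Usage mirrors the loop above: EEXIST means another process grabbed the lock first.
try:
    create_file_racefree("lock.demo")   # hypothetical lock file name
except OSError as e:
    if e.errno == errno.EEXIST:
        pass                            # lock already held; wait and try again
    else:
        raise
```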
@@ -486,7 +500,8 @@ def run(self, cmd, target=None, lock_name=None, shell="guess", nofail=False, cle
break

# Bad idea: don't return follow_result; it seems nice but nothing else
# in your pipeline can depend on this since it won't be run if that command # isn't required because target exists.
# in your pipeline can depend on this since it won't be run if that command
# isn't required because target exists.
return process_return_code


@@ -777,8 +792,7 @@ def _report_profile(self, command, lock_name, elapsed_time, memory):
str(lock_name) + "\t" + \
str(datetime.timedelta(seconds = round(elapsed_time, 2))) + "\t " + \
str(memory)
# messageMarkdown = "> `" + command + "`\t" + str(elapsed_time).strip() + "\t " + str(memory).strip() + "\t" + "_PROF_"
# print(messageMarkdown)

with open(self.pipeline_profile_file, "a") as myfile:
myfile.write(messageRaw + "\n")

@@ -799,7 +813,7 @@ def report_result(self, key, value, annotation=None):

# keep the value in memory:
self.stats_dict[key] = str(value).strip()
messageRaw = key + "\t " + str(value).strip() + "\t" + str(annotation)
messageRaw = key + "\t" + str(value).strip() + "\t" + str(annotation)
messageMarkdown = "> `" + key + "`\t" + str(value).strip()\
+ "\t" + str(annotation) + "\t" + "_RES_"
print(messageMarkdown)
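The whitespace tweaks here (`"\t "` becoming `"\t"`, plus the stripped values) matter because the stats file is meant to be split on tabs; a stray space otherwise ends up embedded in the value field. A small sketch of how a consumer might parse such a file, assuming a hypothetical `*_stats.tsv` path and the key/value/annotation layout shown above:

```python
def read_stats(path):
    """Parse a tab-delimited stats file into {key: (value, annotation)}."""
    stats = {}
    with open(path) as fh:
        for line in fh:
            fields = line.rstrip("\n").split("\t")
            if len(fields) < 2:
                continue                  # skip malformed lines
            key, value = fields[0], fields[1].strip()
            annotation = fields[2].strip() if len(fields) > 2 else None
            stats[key] = (value, annotation)
    return stats

# stats = read_stats("sample_stats.tsv")   # hypothetical stats file name
```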
@@ -949,15 +963,15 @@ def stop_pipeline(self):
self.set_status_flag("completed")
self._cleanup()
self.report_result("Time", str(datetime.timedelta(seconds = self.time_elapsed(self.starttime))))
self.report_result("Success", time.strftime("%m-%d %H:%M:%S"))
self.report_result("Success", time.strftime("%m-%d-%H:%M:%S"))
print("\n##### [Epilogue:]")
print("* " + "Total elapsed time".rjust(20) + ": " + str(datetime.timedelta(seconds = self.time_elapsed(self.starttime))))
# print("Peak memory used: " + str(memory_usage()["peak"]) + "kb")
print("* " + "Peak memory used".rjust(20) + ": " + str(round(self.peak_memory, 2)) + " GB")
self.timestamp("* Pipeline completed at: ".rjust(20))


def fail_pipeline(self, e):
def fail_pipeline(self, e, dynamic_recover=False):
"""
If the pipeline does not complete, this function will stop the pipeline gracefully.
It sets the status flag to failed and skips the normal success completion procedure.
@@ -980,6 +994,17 @@ def fail_pipeline(self, e):
self.set_status_flag("failed")
self.timestamp("### Pipeline failed at: ")
print("Total time: ", str(datetime.timedelta(seconds = self.time_elapsed(self.starttime))))

if dynamic_recover:
# job was terminated, not failed due to a bad process.
# flag this run as recoverable.
if self.proc_lock_name:
# if there is no process locked, then recovery will be automatic.
recover_name = "lock.recover." + self.proc_lock_name
recover_file = os.path.join(self.pipeline_outfolder, recover_name)
print("Setting dynamic recover file: " + recover_file)
self._create_file_racefree(recover_file)

raise e


@@ -995,7 +1020,7 @@ def _signal_term_handler(self, signal, frame):
message = "Got SIGTERM; Failing gracefully..."
with open(self.pipeline_log_file, "a") as myfile:
myfile.write(message + "\n")
self.fail_pipeline(Exception("SIGTERM"))
self.fail_pipeline(Exception("SIGTERM"), dynamic_recover=True)
sys.exit(1)


@@ -1006,7 +1031,7 @@ def _signal_int_handler(self, signal, frame):
message = "Got SIGINT (Ctrl +C); Failing gracefully..."
with open(self.pipeline_log_file, "a") as myfile:
myfile.write(message + "\n")
self.fail_pipeline(Exception("SIGINT"))
self.fail_pipeline(Exception("SIGINT"), dynamic_recover=True)
sys.exit(1)
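For `_signal_term_handler` and `_signal_int_handler` to fire, they have to be registered with the `signal` module, which Pypiper presumably does at pipeline startup (the registration is not part of this diff). A minimal illustration of that wiring with a stand-in fail function:

```python
import signal
import sys

def fail(reason, recover=False):
    # Stand-in for fail_pipeline(Exception(reason), dynamic_recover=recover).
    print("Got " + reason + "; failing gracefully"
          + (" (recoverable)" if recover else ""))
    sys.exit(1)

signal.signal(signal.SIGTERM, lambda signum, frame: fail("SIGTERM", recover=True))
signal.signal(signal.SIGINT, lambda signum, frame: fail("SIGINT", recover=True))
```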


@@ -1382,7 +1407,7 @@ def add_pypiper_args(parser, groups = ["pypiper"], args = [None], all_args = Fal
if arg == "genome":
parser.add_argument(
"-G", "--genome", dest="genome_assembly", type=str,
help="identifier for genome assempbly (required)",
help="identifier for genome assembly (required)",
required=False)
if arg == "single-or-paired":
parser.add_argument(
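The last hunk in pypiper.py fixes the "assempbly" typo in the help string that `add_pypiper_args` attaches when a pipeline requests the `genome` option. For context, a standalone `argparse` sketch of the option being added (the invocation is hypothetical):

```python
import argparse

parser = argparse.ArgumentParser(description="example pipeline")
# Equivalent to what add_pypiper_args registers for the "genome" option:
parser.add_argument("-G", "--genome", dest="genome_assembly", type=str,
                    help="identifier for genome assembly (required)",
                    required=False)

args = parser.parse_args(["--genome", "hg38"])   # hypothetical command line
print(args.genome_assembly)                      # -> hg38
```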
18 changes: 17 additions & 1 deletion test_pypiper.py
@@ -19,6 +19,7 @@ class PypiperTest(unittest.TestCase):
def _clean(cls):
for d in glob.glob("pipeline_output*/"):
if os.path.isdir(d):
print("Removing " + d)
shutil.rmtree(d)

def setUp(self):
@@ -71,11 +72,15 @@ def test_me(self):
self.pp.run(cmd, lock_name="sleep")
print("Elapsed: " + str(self.pp.time_elapsed(stamp)))
self.assertTrue(self.pp.time_elapsed(stamp) > 1)


print("Wait for subprocess...")
self.pp._wait_for_process(self.pp.running_subprocess)
self.pp2.wait=True
self.pp.wait=True



print("Make sure the pipeline respects files already existing...")
target = self.pp.pipeline_outfolder + "tgt"
if os.path.isfile(target): # for repeat runs.
@@ -182,15 +187,26 @@ def test_me(self):

cmd = "thiscommandisbad"

#Should not raise an error
# Should not raise an error
self.pp.run(cmd, target=None, lock_name="badcommand", nofail=True)
self.pp.callprint(cmd, nofail=True)

# Should raise an error
with self.assertRaises(OSError):
self.pp.run(cmd, target=None, lock_name="badcommand")

print("Test dynamic recovery...")
# send sigint
self.pp.proc_lock_name="sleep"
with self.assertRaises(Exception):
self.pp._signal_int_handler(None, None)


sleep_lock = self.pp.pipeline_outfolder + "lock.sleep"
#subprocess.Popen("sleep .5; rm " + sleep_lock, shell=True)
self.pp._create_file(sleep_lock)
cmd = "echo hello"
self.pp.run(cmd, lock_name="sleep")

#subprocess.Popen("sleep .5; rm " + sleep_lock, shell=True)

