Skip to content

Commit e4090e6

Browse files
authored
Merge pull request #35 from epigen/dev
Version 0.5.0 release candidate 2
2 parents d252298 + 4f0b7c9 commit e4090e6

File tree

7 files changed

+80
-30
lines changed

7 files changed

+80
-30
lines changed

doc/source/changelog.rst

+6
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@ Changelog
1010

1111
- Adds 'waiting' flag.
1212

13+
- Eliminates extra spaces in reported results
14+
15+
- Pypiper module is version aware
16+
17+
- Updates Success time format to eliminate space
18+
1319
- **v0.4** (*2017-01-23*):
1420

1521
- First major public release!

doc/source/features.rst

+2
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ Pypiper provides the following benefits:
2121
Pypiper provides functions to put key-value pairs into an easy-to-parse stats file, making it easy to summarize your pipeline results.
2222
- **Simplicity:**
2323
It should only take you 15 minutes to run your first pipeline. The basic documentation is just a few pages long. The codebase itself is also only a few thousand lines of code, making it very lightweight.
24+
- **Dynamic recovery:**
25+
If a job is interrupted (with SIGINT or SIGTERM), for example by the user or by a cluster resource manager, a dynamic recovery flag is set, and the next time the run is started it will automatically pick up where it left off.
2426

2527

2628
Furthermore, Pypiper includes a suite of commonly used pieces of code (toolkits) which the user may use to build pipelines.

pypiper/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from ._version import __version__
12
from .pypiper import *
23
from .ngstk import *
34
from .AttributeDict import *

pypiper/_version.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.0-rc1"
1+
__version__ = "0.5.0-rc2"

pypiper/ngstk.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -1137,13 +1137,13 @@ def calculate_FRiP(self, inputBam, inputBed, output, cpus=4):
11371137
cmd += " | awk '{{sum+=$5}} END {{print sum}}' > {0}".format(output)
11381138
return cmd
11391139

1140-
def macs2CallPeaks(treatmentBams, outputDir, sampleName, genome, controlBams=None, broad=False, paired=False):
1140+
def macs2CallPeaks(self, treatmentBams, outputDir, sampleName, genome, controlBams=None, broad=False, paired=False):
11411141
"""
11421142
Use MACS2 to call peaks.
11431143
"""
1144-
sizes = {"hg38": 2.7e9, "hg19": 2.7e9, "mm10": 1.87e9, "dr7": 1.412e9}
1144+
sizes = {"hg38": 2.7e9, "hg19": 2.7e9, "mm10": 1.87e9, "dr7": 1.412e9, "mm9": 1.87e9}
11451145

1146-
cmd = "macs2 callpeak -t {0}".format(treatmentBams if type(treatmentBams) is str else " ".join(treatmentBams))
1146+
cmd = self.tools.macs2 + " callpeak -t {0}".format(treatmentBams if type(treatmentBams) is str else " ".join(treatmentBams))
11471147
if controlBams is not None:
11481148
cmd += " -c {0}".format(controlBams if type(controlBams) is str else " ".join(controlBams))
11491149
if paired:
@@ -1157,7 +1157,7 @@ def macs2CallPeaks(treatmentBams, outputDir, sampleName, genome, controlBams=Non
11571157
return cmd
11581158

11591159
def macs2CallPeaksATACSeq(self, treatmentBam, outputDir, sampleName, genome):
1160-
sizes = {"hg38": 2.7e9, "hg19": 2.7e9, "mm10": 1.87e9, "dr7": 1.412e9, "mm9": 2.7e9}
1160+
sizes = {"hg38": 2.7e9, "hg19": 2.7e9, "mm10": 1.87e9, "dr7": 1.412e9, "mm9": 1.87e9}
11611161
cmd = self.tools.macs2 + " callpeak -t {0}".format(treatmentBam)
11621162
cmd += " --nomodel --extsize 147 -g {0} -n {1} --outdir {2}".format(sizes[genome], sampleName, outputDir)
11631163
return cmd

pypiper/pypiper.py

+49-24
Original file line numberDiff line numberDiff line change
@@ -267,19 +267,23 @@ def start_pipeline(self, args = None, multi = False):
267267
# Wrapped in try blocks so that the code will not fail if the pipeline or pypiper are not git repositories
268268
gitvars = {}
269269
try:
270-
gitvars['pypiper_dir'] = os.path.dirname(os.path.realpath(__file__))
271-
gitvars['pypiper_hash'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(__file__)) + "; git rev-parse --verify HEAD 2>/dev/null", shell=True)
272-
gitvars['pypiper_date'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(__file__)) + "; git show -s --format=%ai HEAD 2>/dev/null", shell=True)
273-
gitvars['pypiper_diff'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(__file__)) + "; git diff --shortstat HEAD 2>/dev/null", shell=True)
274-
gitvars['pypiper_branch'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(__file__)) + "; git branch | grep '*' 2>/dev/null", shell=True)
270+
# pypiper dir
271+
ppd = os.path.dirname(os.path.realpath(__file__))
272+
gitvars['pypiper_dir'] = ppd
273+
gitvars['pypiper_hash'] = subprocess.check_output("cd " + ppd + "; git rev-parse --verify HEAD 2>/dev/null", shell=True)
274+
gitvars['pypiper_date'] = subprocess.check_output("cd " + ppd + "; git show -s --format=%ai HEAD 2>/dev/null", shell=True)
275+
gitvars['pypiper_diff'] = subprocess.check_output("cd " + ppd + "; git diff --shortstat HEAD 2>/dev/null", shell=True)
276+
gitvars['pypiper_branch'] = subprocess.check_output("cd " + ppd + "; git branch | grep '*' 2>/dev/null", shell=True)
275277
except Exception:
276278
pass
277279
try:
278-
gitvars['pipe_dir'] = os.path.dirname(os.path.realpath(sys.argv[0]))
279-
gitvars['pipe_hash'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(sys.argv[0])) + "; git rev-parse --verify HEAD 2>/dev/null", shell=True)
280-
gitvars['pipe_date'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(sys.argv[0])) + "; git show -s --format=%ai HEAD 2>/dev/null", shell=True)
281-
gitvars['pipe_diff'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(sys.argv[0])) + "; git diff --shortstat HEAD 2>/dev/null", shell=True)
282-
gitvars['pipe_branch'] = subprocess.check_output("cd " + os.path.dirname(os.path.realpath(sys.argv[0])) + "; git branch | grep '*' 2>/dev/null", shell=True)
280+
# pipeline dir
281+
pld = os.path.dirname(os.path.realpath(sys.argv[0]))
282+
gitvars['pipe_dir'] = pld
283+
gitvars['pipe_hash'] = subprocess.check_output("cd " + pld + "; git rev-parse --verify HEAD 2>/dev/null", shell=True)
284+
gitvars['pipe_date'] = subprocess.check_output("cd " + pld + "; git show -s --format=%ai HEAD 2>/dev/null", shell=True)
285+
gitvars['pipe_diff'] = subprocess.check_output("cd " + pld + "; git diff --shortstat HEAD 2>/dev/null", shell=True)
286+
gitvars['pipe_branch'] = subprocess.check_output("cd " + pld + "; git branch | grep '*' 2>/dev/null", shell=True)
283287
except Exception:
284288
pass
285289

@@ -305,7 +309,7 @@ def start_pipeline(self, args = None, multi = False):
305309
if (gitvars['pypiper_diff'] != ""):
306310
print("* " + "Pypiper diff".rjust(20) + ": " + gitvars['pypiper_diff'].strip())
307311
except KeyError:
308-
# If any of the keys aren't set, that's OK. It just means pypiper isn't being run from a git repo.
312+
# It is ok if keys aren't set, it means pypiper isn't in a git repo.
309313
pass
310314

311315
try:
@@ -373,7 +377,8 @@ def run(self, cmd, target=None, lock_name=None, shell="guess", nofail=False, cle
373377
:type target: str or None
374378
:param lock_name: Name of lock file. Optional.
375379
:type lock_name: str or None
376-
:param shell: If command requires should be run in its own shell. Optional. Default: "guess" -- run will try to determine if the command requires a shell.
380+
:param shell: Whether the command should be run in its own shell. Optional. Default: "guess" --
381+
run will try to determine if the command requires a shell.
377382
:type shell: bool
378383
:param nofail: Should the pipeline proceed past a nonzero return from a process? Default: False
379384
Nofail can be used to implement non-essential parts of the pipeline; if these processes fail,
@@ -405,6 +410,9 @@ def run(self, cmd, target=None, lock_name=None, shell="guess", nofail=False, cle
405410
# Prepend "lock." to make it easy to find the lock files.
406411
self.proc_lock_name = lock_name
407412
lock_name = "lock." + lock_name
413+
recover_name = "lock.recover." + self.proc_lock_name
414+
recover_file = os.path.join(self.pipeline_outfolder, recover_name)
415+
recover_mode = False
408416
lock_file = os.path.join(self.pipeline_outfolder, lock_name)
409417
process_return_code = 0
410418
local_maxmem = 0
@@ -436,22 +444,28 @@ def run(self, cmd, target=None, lock_name=None, shell="guess", nofail=False, cle
436444
if os.path.isfile(lock_file):
437445
if self.overwrite_locks:
438446
print("Found lock file; overwriting this target...")
447+
elif os.path.isfile(recover_file):
448+
print("Found lock file; dynamic recovery set. Overwriting this target...")
449+
# remove the lock file which will then be promptly re-created for the current run.
450+
recover_mode = True
451+
# the recovery flag is now spent, so remove it so we don't accidentally re-recover a failed job
452+
os.remove(recover_file)
439453
else: # don't overwrite locks
440454
self._wait_for_lock(lock_file)
441455
# when it's done loop through again to try one more time (to see if the target exists now)
442456
continue
443457

444458
# If you get to this point, the target doesn't exist, and the lock_file doesn't exist
445459
# (or we should overwrite). create the lock (if you can)
446-
if not self.overwrite_locks:
460+
if self.overwrite_locks or recover_mode:
461+
self._create_file(lock_file)
462+
else:
447463
try:
448464
self._create_file_racefree(lock_file) # Create lock
449465
except OSError as e:
450466
if e.errno == errno.EEXIST: # File already exists
451467
print ("Lock file created after test! Looping again.")
452468
continue # Go back to start
453-
else:
454-
self._create_file(lock_file)
455469

456470
##### End tests block
457471
# If you make it past these tests, we should proceed to run the process.
@@ -486,7 +500,8 @@ def run(self, cmd, target=None, lock_name=None, shell="guess", nofail=False, cle
486500
break
487501

488502
# Bad idea: don't return follow_result; it seems nice but nothing else
489-
# in your pipeline can depend on this since it won't be run if that command # isn't required because target exists.
503+
# in your pipeline can depend on this since it won't be run if that command
504+
# isn't required because target exists.
490505
return process_return_code
491506

492507

@@ -777,8 +792,7 @@ def _report_profile(self, command, lock_name, elapsed_time, memory):
777792
str(lock_name) + "\t" + \
778793
str(datetime.timedelta(seconds = round(elapsed_time, 2))) + "\t " + \
779794
str(memory)
780-
# messageMarkdown = "> `" + command + "`\t" + str(elapsed_time).strip() + "\t " + str(memory).strip() + "\t" + "_PROF_"
781-
# print(messageMarkdown)
795+
782796
with open(self.pipeline_profile_file, "a") as myfile:
783797
myfile.write(messageRaw + "\n")
784798

@@ -799,7 +813,7 @@ def report_result(self, key, value, annotation=None):
799813

800814
# keep the value in memory:
801815
self.stats_dict[key] = str(value).strip()
802-
messageRaw = key + "\t " + str(value).strip() + "\t" + str(annotation)
816+
messageRaw = key + "\t" + str(value).strip() + "\t" + str(annotation)
803817
messageMarkdown = "> `" + key + "`\t" + str(value).strip()\
804818
+ "\t" + str(annotation) + "\t" + "_RES_"
805819
print(messageMarkdown)
@@ -949,15 +963,15 @@ def stop_pipeline(self):
949963
self.set_status_flag("completed")
950964
self._cleanup()
951965
self.report_result("Time", str(datetime.timedelta(seconds = self.time_elapsed(self.starttime))))
952-
self.report_result("Success", time.strftime("%m-%d %H:%M:%S"))
966+
self.report_result("Success", time.strftime("%m-%d-%H:%M:%S"))
953967
print("\n##### [Epilogue:]")
954968
print("* " + "Total elapsed time".rjust(20) + ": " + str(datetime.timedelta(seconds = self.time_elapsed(self.starttime))))
955969
# print("Peak memory used: " + str(memory_usage()["peak"]) + "kb")
956970
print("* " + "Peak memory used".rjust(20) + ": " + str(round(self.peak_memory, 2)) + " GB")
957971
self.timestamp("* Pipeline completed at: ".rjust(20))
958972

959973

960-
def fail_pipeline(self, e):
974+
def fail_pipeline(self, e, dynamic_recover=False):
961975
"""
962976
If the pipeline does not complete, this function will stop the pipeline gracefully.
963977
It sets the status flag to failed and skips the normal success completion procedure.
@@ -980,6 +994,17 @@ def fail_pipeline(self, e):
980994
self.set_status_flag("failed")
981995
self.timestamp("### Pipeline failed at: ")
982996
print("Total time: ", str(datetime.timedelta(seconds = self.time_elapsed(self.starttime))))
997+
998+
if dynamic_recover:
999+
# job was terminated, not failed due to a bad process.
1000+
# flag this run as recoverable.
1001+
if self.proc_lock_name:
1002+
# if there is no process locked, then recovery will be automatic.
1003+
recover_name = "lock.recover." + self.proc_lock_name
1004+
recover_file = os.path.join(self.pipeline_outfolder, recover_name)
1005+
print("Setting dynamic recover file: " + recover_file)
1006+
self._create_file_racefree(recover_file)
1007+
9831008
raise e
9841009

9851010

@@ -995,7 +1020,7 @@ def _signal_term_handler(self, signal, frame):
9951020
message = "Got SIGTERM; Failing gracefully..."
9961021
with open(self.pipeline_log_file, "a") as myfile:
9971022
myfile.write(message + "\n")
998-
self.fail_pipeline(Exception("SIGTERM"))
1023+
self.fail_pipeline(Exception("SIGTERM"), dynamic_recover=True)
9991024
sys.exit(1)
10001025

10011026

@@ -1006,7 +1031,7 @@ def _signal_int_handler(self, signal, frame):
10061031
message = "Got SIGINT (Ctrl +C); Failing gracefully..."
10071032
with open(self.pipeline_log_file, "a") as myfile:
10081033
myfile.write(message + "\n")
1009-
self.fail_pipeline(Exception("SIGINT"))
1034+
self.fail_pipeline(Exception("SIGINT"), dynamic_recover=True)
10101035
sys.exit(1)
10111036

10121037

@@ -1382,7 +1407,7 @@ def add_pypiper_args(parser, groups = ["pypiper"], args = [None], all_args = Fal
13821407
if arg == "genome":
13831408
parser.add_argument(
13841409
"-G", "--genome", dest="genome_assembly", type=str,
1385-
help="identifier for genome assempbly (required)",
1410+
help="identifier for genome assembly (required)",
13861411
required=False)
13871412
if arg == "single-or-paired":
13881413
parser.add_argument(

test_pypiper.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ class PypiperTest(unittest.TestCase):
1919
def _clean(cls):
2020
for d in glob.glob("pipeline_output*/"):
2121
if os.path.isdir(d):
22+
print("Removing " + d)
2223
shutil.rmtree(d)
2324

2425
def setUp(self):
@@ -71,11 +72,15 @@ def test_me(self):
7172
self.pp.run(cmd, lock_name="sleep")
7273
print("Elapsed: " + str(self.pp.time_elapsed(stamp)))
7374
self.assertTrue(self.pp.time_elapsed(stamp) > 1)
75+
76+
7477
print("Wait for subprocess...")
7578
self.pp._wait_for_process(self.pp.running_subprocess)
7679
self.pp2.wait=True
7780
self.pp.wait=True
7881

82+
83+
7984
print("Make sure the pipeline respects files already existing...")
8085
target = self.pp.pipeline_outfolder + "tgt"
8186
if os.path.isfile(target): # for repeat runs.
@@ -182,15 +187,26 @@ def test_me(self):
182187

183188
cmd = "thiscommandisbad"
184189

185-
#Should not raise an error
190+
# Should not raise an error
186191
self.pp.run(cmd, target=None, lock_name="badcommand", nofail=True)
187192
self.pp.callprint(cmd, nofail=True)
188193

194+
# Should raise an error
189195
with self.assertRaises(OSError):
190196
self.pp.run(cmd, target=None, lock_name="badcommand")
191197

198+
print("Test dynamic recovery...")
199+
# send sigint
200+
self.pp.proc_lock_name="sleep"
201+
with self.assertRaises(Exception):
202+
self.pp._signal_int_handler(None, None)
192203

193204

205+
sleep_lock = self.pp.pipeline_outfolder + "lock.sleep"
206+
#subprocess.Popen("sleep .5; rm " + sleep_lock, shell=True)
207+
self.pp._create_file(sleep_lock)
208+
cmd = "echo hello"
209+
self.pp.run(cmd, lock_name="sleep")
194210

195211
#subprocess.Popen("sleep .5; rm " + sleep_lock, shell=True)
196212

0 commit comments

Comments
 (0)