#############################################################################
#
# ClusterWork
#
# A framework to run experiments on a computing cluster.
#
# Based on the Python Experiment Suite by Thomas Rückstiess. (see expsuite_LICENSE)
# Licensed under the modified BSD License.
#
# Copyright 2017 - Gregor Gebhardt
#
#############################################################################
import abc
import argparse
import itertools
import collections
import collections.abc
import os
import sys
import re
import gc
import time
import zlib
from copy import deepcopy
import fnmatch
from typing import Generator, Tuple, List
from shutil import copyfile
import subprocess
from attrdict import AttrDict
import numpy as np
import pandas as pd
import yaml
import logging
class _CWFormatter(logging.Formatter):
def __init__(self):
self.std_formatter = logging.Formatter('[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s')
self.red_formatter = logging.Formatter('[%(asctime)s] %(message)s')
def format(self, record: logging.LogRecord):
if record.levelno <= logging.ERROR:
return self.std_formatter.format(record)
else:
return self.red_formatter.format(record)
_logging_formatter = _CWFormatter()
# _info_output_formatter = logging.Formatter('[%(asctime)s] %(message)s')
_info_content_output_handler = logging.StreamHandler(sys.stdout)
_info_content_output_handler.setFormatter(_logging_formatter)
_info_border_output_handler = logging.StreamHandler(sys.stdout)
_info_border_output_handler.setFormatter(_logging_formatter)
INFO_CONTNT = 200
INFO_BORDER = 150
_info_content_output_handler.setLevel(INFO_CONTNT)
_info_border_output_handler.setLevel(INFO_BORDER)
# _logging_std_handler = logging.StreamHandler(sys.stdout)
# _logging_std_handler.setFormatter(_logging_formatter)
# _logging_std_handler.setLevel(logging.DEBUG)
# _logging_std_handler.addFilter(lambda lr: lr.levelno <= logging.ERROR)
_logging_filtered_std_handler = logging.StreamHandler(sys.stdout)
_logging_filtered_std_handler.setFormatter(_logging_formatter)
_logging_filtered_std_handler.setLevel(logging.DEBUG)
_logging_filtered_std_handler.addFilter(lambda lr: lr.levelno < logging.WARNING)
_logging_err_handler = logging.StreamHandler(sys.stderr)
_logging_err_handler.setFormatter(_logging_formatter)
_logging_err_handler.setLevel(logging.WARNING)
_logging_err_handler.addFilter(lambda lr: lr.levelno <= logging.ERROR)
# default logging configuration: log everything up to WARNING to stdout and from WARNING upwards to stderr
# set log-level to INFO
logging.basicConfig(level=logging.INFO, handlers=[_logging_filtered_std_handler,
_logging_err_handler])
# get logger for cluster_work package
_logger = logging.getLogger('cluster_work')
_logger.addHandler(_logging_filtered_std_handler)
_logger.addHandler(_logging_err_handler)
# _logger.addHandler(_info_content_output_handler)
_logger.addHandler(_info_border_output_handler)
_logger.propagate = False
def deep_update(d, u):
for k, v in u.items():
if isinstance(v, collections.abc.Mapping):
r = deep_update(d.get(k, {}), v)
d[k] = r
else:
d[k] = u[k]
return d
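# Illustrative usage sketch (the dictionary contents are made up): deep_update merges the
# second mapping into the first recursively, so sibling keys survive a nested update, e.g.
#   deep_update({'params': {'lr': 0.1, 'layers': 2}}, {'params': {'lr': 0.01}})
#   -> {'params': {'lr': 0.01, 'layers': 2}}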
def flatten_dict(d, parent_key='', sep='_'):
items = []
for k, v in d.items():
new_key = parent_key + sep + k if parent_key else k
if isinstance(v, collections.abc.MutableMapping):
items.extend(flatten_dict(v, new_key, sep=sep).items())
elif isinstance(v, collections.abc.MutableSequence):
keys = map(lambda i: new_key + "_" + str(i), range(len(v)))
items.extend(zip(keys, v))
else:
items.append((new_key, v))
return dict(items)
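# Illustrative example (keys and values are made up): flatten_dict joins nested keys with
# the separator and enumerates list entries, e.g.
#   flatten_dict({'optimizer': {'lr': 0.1, 'betas': [0.9, 0.999]}})
#   -> {'optimizer_lr': 0.1, 'optimizer_betas_0': 0.9, 'optimizer_betas_1': 0.999}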
def flatten_dict_to_tuple_keys(d: collections.abc.MutableMapping):
flat_dict = {}
for k, v in d.items():
if isinstance(v, collections.abc.MutableMapping):
sub_dict = flatten_dict_to_tuple_keys(v)
flat_dict.update({(k, *sk): sv for sk, sv in sub_dict.items()})
elif isinstance(v, collections.abc.MutableSequence):
flat_dict[(k,)] = v
return flat_dict
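# Illustrative example (parameter names are made up): only list-valued leaves are kept,
# which matches the grid/list sections where every leaf is expected to be a list, e.g.
#   flatten_dict_to_tuple_keys({'optimizer': {'lr': [0.1, 0.01]}, 'seed': [1, 2]})
#   -> {('optimizer', 'lr'): [0.1, 0.01], ('seed',): [1, 2]}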
def insert_deep_dictionary(d: collections.abc.MutableMapping, t: tuple, value):
if type(t) is tuple:
if len(t) == 1: # tuple contains only one key
d[t[0]] = value
else: # tuple contains more than one key
if t[0] not in d:
d[t[0]] = dict()
insert_deep_dictionary(d[t[0]], t[1:], value)
else:
d[t] = value
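# Illustrative example (made-up key path): insert_deep_dictionary writes a value back
# under a tuple key, i.e. the reverse direction of flatten_dict_to_tuple_keys, e.g.
#   d = {}
#   insert_deep_dictionary(d, ('optimizer', 'lr'), 0.01)
#   # d == {'optimizer': {'lr': 0.01}}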
def format_time(time_in_secs: float):
_hours = int(time_in_secs) // 60 ** 2
_minutes = (int(time_in_secs) // 60) % 60
_seconds = time_in_secs % 60
time_str = ""
if _hours:
time_str += "{:d}h:".format(_hours)
if _minutes or _hours:
time_str += "{:d}m:".format(_minutes)
time_str += "{:05.2f}s".format(_seconds)
return time_str
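# Illustrative example:
#   format_time(3725.5)  ->  '1h:2m:05.50s'
#   format_time(42.0)    ->  '42.00s'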
def shorten_param(_param_name):
name_parts = _param_name.split('.')
shortened_parts = '.'.join(map(lambda s: s[:3], name_parts[:-1]))
shortened_leaf = ''.join(map(lambda s: s[0], name_parts[-1].split('_')))
if shortened_parts:
return shortened_parts + '.' + shortened_leaf
else:
return shortened_leaf
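# Illustrative example (made-up parameter name): each path component is cut to three
# characters and the leaf name is reduced to the initials of its underscore-separated parts:
#   shorten_param('params.optimizer.learning_rate')  ->  'par.opt.lr'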
class ClusterWork(object):
# change this in subclass, if you support restoring state on iteration level
_restore_supported = False
_default_params = {}
_pandas_to_csv_options = dict(na_rep='NaN', sep='\t', float_format="%+.8e")
_NO_GUI = False
_LOG_LEVEL = 'DEBUG'
_CW_LOG_LEVEL = 'INFO'
_RESTART_FULL_REPETITIONS = False
_MP_CONTEXT = 'fork'
_parser = argparse.ArgumentParser()
_parser.add_argument('config', metavar='CONFIG.yml', type=argparse.FileType('r'))
_parser.add_argument('-g', '--mpi_groups', nargs='?', type=int,
help='The number of MPI groups to create.')
_parser.add_argument('-j', '--job', nargs='+', type=int, default=None,
help='Run only the specified job. CAVEAT: Should only be used with slurm arrays! '
'Note that each repetition counts as a single job.')
_parser.add_argument('-d', '--delete', action='store_true',
help='CAUTION deletes results of previous runs.')
_parser.add_argument('-o', '--overwrite', action='store_true',
help='CAUTION overwrites results of previous runs if config has changed.')
_parser.add_argument('-e', '--experiments', nargs='+',
help='Specifies which experiments should be run.')
_parser.add_argument('-p', '--progress', action='store_true',
help='Outputs the progress of the experiment and exits.')
_parser.add_argument('-P', '--full_progress', action='store_true',
help='Outputs a more detailed progress of the experiment and exits.')
_parser.add_argument('--no_gui', action='store_true',
help='Tells the experiment to not use any feature that requires a GUI.')
_parser.add_argument('-I', '--ignore_config', action='store_true',
help='Ignores changes in the configuration file when deciding whether to skip or overwrite experiments.')
_parser.add_argument('--restart_full_repetitions', action='store_true')
_parser.add_argument('-l', '--log_level', nargs='?', default='INFO',
choices=['DEBUG', 'INFO', 'WARNING', 'WARN', 'ERROR', 'CRITICAL'],
help='Sets the log-level for the output of the experiment logger.')
_parser.add_argument('-L', '--cw_log_level', nargs='?', default='INFO',
choices=['DEBUG', 'INFO', 'WARNING', 'WARN', 'ERROR', 'CRITICAL'],
help='Sets the log-level for the output of ClusterWork.')
_parser.add_argument('-r', '--repetition', type=int,
help='Start only the given repetition; assumes that only one experiment will be started.')
_parser.add_argument('-i', '--iteration', type=int,
help='Restart repetition from iteration i, works only together with -r/--repetition.')
_parser.add_argument('--plot', nargs='?', const=True, default=False,
help='Calls the plotting function of the experiment and exits.')
_parser.add_argument('--filter', default=argparse.SUPPRESS,
help='Filters which experiments are plotted.')
_parser.add_argument('-M', '--make_dirs', action='store_true',
help='Create experiment file structure.')
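# Hypothetical command-line usage (script and experiment names are placeholders), assuming a
# subclass that implements reset() and iterate():
#   python my_experiment.py experiments.yml                 # run all experiments in the file
#   python my_experiment.py experiments.yml -e exp_a -r 0   # run only repetition 0 of exp_a
#   python my_experiment.py experiments.yml -p              # show progress and exit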
def __init__(self):
self.__log_path_rep = None
self.__log_path_rep_exists = False
self.__log_path_it = None
self.__log_path_it_exists = False
self.__results = None
self.__completed = False
self._COMM = None
@property
def _log_path_rep(self):
if not self.__log_path_rep_exists:
os.makedirs(self.__log_path_rep, exist_ok=True)
self.__log_path_rep_exists = True
return self.__log_path_rep
@_log_path_rep.setter
def _log_path_rep(self, log_path_rep: str):
if os.path.exists(log_path_rep):
if not os.path.isdir(log_path_rep):
raise NotADirectoryError("The log path {} exists but is not a directory".format(log_path_rep))
self.__log_path_rep_exists = True
else:
self.__log_path_rep_exists = False
self.__log_path_rep = log_path_rep
@property
def _log_path_it(self):
if not self.__log_path_it_exists:
os.makedirs(self.__log_path_it, exist_ok=True)
self.__log_path_it_exists = True
return self.__log_path_it
@_log_path_it.setter
def _log_path_it(self, log_path_it: str):
if os.path.exists(log_path_it):
if not os.path.isdir(log_path_it):
raise NotADirectoryError("The log path {} exists but is not a directory".format(log_path_it))
self.__log_path_it_exists = True
else:
self.__log_path_it_exists = False
self.__log_path_it = log_path_it
@staticmethod
def get_experiments(path='.'):
""" go through all subdirectories starting at path and return the experiment
identifiers (= directory names) of all existing experiments. A directory
is considered an experiment if it contains an experiment.yml file.
"""
exps = []
for dp, dn, fn in os.walk(path):
if 'experiment.yml' in fn:
subdirs = [os.path.join(dp, d) for d in os.listdir(dp) if os.path.isdir(os.path.join(dp, d))]
if all(map(lambda s: ClusterWork.get_experiments(s) == [], subdirs)):
exps.append(dp)
return exps
@staticmethod
def get_experiment_config(path, config_filename='experiment.yml'):
""" reads the parameters of the experiment (= path) given.
"""
with open(os.path.join(path, config_filename), 'r') as f:
config = yaml.load(f, Loader=yaml.FullLoader)
return config
@staticmethod
def get_experiment_directories(name, path='.'):
""" given an experiment name (used in section titles), this function
returns all subdirectories that contain an experiment with that name.
"""
experiments = []
for dir_path, dir_names, filenames in os.walk(path):
if 'experiment.yml' in filenames:
with open(os.path.join(dir_path, 'experiment.yml')) as f:
for d in yaml.load_all(f, Loader=yaml.FullLoader):
if 'name' in d and d['name'] == name:
experiments.append(dir_path)
return experiments
@classmethod
def load_experiments(cls, config_file, experiment_selectors=None, include_cluster_config=False):
"""loads all experiment configurations from the given stream, merges them with the default configuration and
expands list or grid parameters
:param config_file: file stream of the YAML configuration for the experiments that should be loaded.
:param experiment_selectors: list of experiment names; only the experiments in this list will be loaded.
:param include_cluster_config: if True, the slurm configuration document is prepended to the returned list.
:return: the list of effective experiment configurations
"""
try:
_config_documents = [*yaml.load_all(config_file, Loader=yaml.FullLoader)]
except IOError:
raise SystemExit('config file %s not found.' % config_file)
if _config_documents[0]['name'].lower() == 'slurm':
slurm_config = _config_documents[0]
if _config_documents[1]['name'].lower() == 'default': # in case there's a slurm config first
default_config = _config_documents[1]
experiments_config = _config_documents[2:]
else:
# TODO: check if AttrDict breaks things
# default_config = dict()
default_config = AttrDict()
experiments_config = _config_documents
else:
slurm_config = None
if _config_documents[0]['name'].lower() == 'default':
default_config = _config_documents[0]
experiments_config = _config_documents[1:]
else:
# default_config = dict()
default_config = AttrDict()
experiments_config = _config_documents
# TODO use namedtuple or own ParameterStore??
# iterate over experiments and compute effective configuration and parameters
# TODO add warning if yaml has parameters that do not appear in cls._default_parameters
effective_experiments = []
for _config_e in experiments_config:
if not experiment_selectors or _config_e['name'] in experiment_selectors:
# merge config with default config from yaml file
_effective_config = deepcopy(default_config)
deep_update(_effective_config, _config_e)
# merge params with default params from subclass
# _effective_params = dict()
_effective_params = AttrDict()
deep_update(_effective_params, cls._default_params)
deep_update(_effective_params, _effective_config['params'])
_effective_config['params'] = _effective_params
effective_experiments.append(_effective_config)
# check for all required param keys
required_keys = ['name', 'path', 'repetitions', 'iterations']
missing_keys = [key for key in required_keys if key not in _effective_config]
if missing_keys:
raise IncompleteConfigurationError(
'config does not contain all required keys: {}'.format(missing_keys))
_experiments = cls.__adapt_experiment_path(effective_experiments)
_experiments = cls.__expand_experiments(_experiments)
_experiments = cls.__adapt_experiment_log_path(_experiments)
if include_cluster_config:
_experiments = [slurm_config] + _experiments
return _experiments
@staticmethod
def __adapt_experiment_path(config_list):
""" adapts the path of the experiment
"""
# for one single experiment, still wrap it in list
if type(config_list) == dict:
config_list = [config_list]
expanded_config_list = []
for config in config_list:
config['_config_path'] = config['path']
config['path'] = os.path.join(config['path'], config['name'])
expanded_config_list.append(config)
return expanded_config_list
@staticmethod
def __adapt_experiment_log_path(config_list):
""" adapts the log path of the experiment and sets the log-path
"""
# for one single experiment, still wrap it in list
if type(config_list) == dict:
config_list = [config_list]
expanded_config_list = []
for config in config_list:
if 'log_path' in config:
config['log_path'] = os.path.join(config['log_path'], config['name'])
else:
config['log_path'] = os.path.join(config['path'], 'log')
expanded_config_list.append(config)
return expanded_config_list
@staticmethod
def __expand_experiments(config_list):
""" expands the parameters list according to one of these schemes:
grid: builds the Cartesian product, i.e. every value of a parameter is combined with every value of the other parameters
list: zips the parameter lists, i.e. the n-th values of all parameter lists are combined
"""
if type(config_list) == dict:
config_list = [config_list]
# get all options that are iterable and build all combinations (grid) or tuples (list)
expanded_config_list = []
for config in config_list:
if 'grid' in config or 'list' in config:
if 'grid' in config:
# if we want a grid then we choose the product of all parameters
iter_fun = itertools.product
key = 'grid'
else:
# if we want a list then we zip the parameters together
iter_fun = zip
key = 'list'
# TODO add support for both list and grid
# convert list/grid dictionary into flat dictionary, where the key is a tuple of the keys and the
# value is the list of values
tuple_dict = flatten_dict_to_tuple_keys(config[key])
_param_names = ['.'.join(t) for t in tuple_dict]
# create a new config for each parameter setting
for values in iter_fun(*tuple_dict.values()):
# create a new config for this parameter combination
_config = deepcopy(config)
del _config[key]
_converted_name = '_'.join("{}{}".format(shorten_param(k), v) for k, v in zip(_param_names, values))
# _converted_name = re.sub("[' \[\],()]", '', _converted_name)
_converted_name = re.sub("[' ]", '', _converted_name)
_converted_name = re.sub('["]', '', _converted_name)
_converted_name = re.sub("[(\[]", '_', _converted_name)
_converted_name = re.sub("[)\]]", '', _converted_name)
_converted_name = re.sub("[,]", '_', _converted_name)
_config['_experiment_path'] = config['path']
_config['path'] = os.path.join(config['path'], _converted_name)
_config['experiment_name'] = _config['name']
_config['name'] += '__' + _converted_name
# if 'log_path' in config:
# _config['log_path'] = os.path.join(config['log_path'], config['name'], _converted_name, 'log')
# else:
# _config['log_path'] = os.path.join(_config['path'], 'log')
for i, t in enumerate(tuple_dict.keys()):
insert_deep_dictionary(_config['params'], t, values[i])
expanded_config_list.append(_config)
else:
expanded_config_list.append(config)
return expanded_config_list
@classmethod
def __init_experiments(cls, config_file, experiments=None, delete_old=False, ignore_config=False,
overwrite_old=False, return_all=False, create_dirs=False):
"""initializes the experiment by loading the configuration file and creating the directory structure.
:return: the experiment configurations that still need to be run (or all expanded configurations if return_all is set)
"""
expanded_experiments = cls.load_experiments(config_file, experiments)
# check for finished experiments
skip_experiments = []
clear_experiments = []
if not delete_old:
for _config in expanded_experiments:
# check if experiment exists and has finished
# FIXME: This only checks if result.csv exists but not which trials have actually run
if cls.__experiment_has_finished(_config):
if ignore_config:
# remove experiment from list
skip_experiments.append(_config)
_logger.info('Experiment {} has finished before. Skipping...'.format(_config['name']))
else:
# check if experiment configs are identical
if cls.__experiment_exists_identically(_config):
# remove experiment from list
skip_experiments.append(_config)
_logger.info('Experiment {} has finished identically before. '
'Skipping...'.format(_config['name']))
elif overwrite_old:
# add experiment to clear list
_logger.warning('Experiment {} has finished before, but configuration has '
'changed! Overwriting...'.format(_config['name']))
clear_experiments.append(_config)
else:
# add experiment to skip list
skip_experiments.append(_config)
_logger.warning('Experiment {} has finished before, but configuration has '
'changed! Skipping...'.format(_config['name']))
_logger.warning('--> To overwrite existing results, use the option -o/--overwrite')
elif cls.__experiment_exists(_config) and not cls.__experiment_exists_identically(_config):
if ignore_config:
_logger.warning('Experiment {} has started before, but configuration has changed! '
'Starting experiment anyway due to option -I/--ignore_config'.format(_config['name']))
elif overwrite_old:
# add experiment to clear list
_logger.warning('Experiment {} has started before, but configuration has '
'changed! Overwriting...'.format(_config['name']))
clear_experiments.append(_config)
else:
# add experiment to skip list
skip_experiments.append(_config)
_logger.warning('Experiment {} has started before, but configuration has '
'changed! Skipping...'.format(_config['name']))
_logger.warning('--> To overwrite existing results, use the option -o/--overwrite')
if cls.__experiment_has_finished_repetitions(_config):
pass
else:
_logger.info('Experiment {} has started before, but configuration has '
'changed! Restarting since no results were found.'.format(_config['name']))
run_experiments = [_config for _config in expanded_experiments if _config not in skip_experiments]
if not run_experiments:
print("Before systemexit")
SystemExit('No work to do...')
print("After systemexit")
if create_dirs:
# FIXME: with job indices run_experiments will be empty after the first job
for _config in run_experiments:
cls.__create_experiment_directory(_config, delete_old or _config in clear_experiments)
try:
copyfile(os.path.abspath(config_file.name), os.path.join(_config['_config_path'], "config.yml"))
except OSError:
# copying the config file is best-effort only
pass
if return_all:
return expanded_experiments
return run_experiments
@classmethod
def init_from_config(cls, config, rep=0, it=0):
instance = cls().__init_rep_without_checks(config, rep)
instance._log_path_it = os.path.join(config['log_path'], 'rep_{:02d}'.format(rep), 'it_{:04d}'.format(it), '')
try:
instance.restore_state(config, rep, it)
except IOError:
_logger.warning('Could not restore experiment {}, rep {} at iteration {}.'.format(config['name'], rep, it))
return None
instance._it = it
def exception_stub(_c, _r, _i):
raise Exception('Experiment not properly initialized. Cannot run.')
instance.iterate = exception_stub
instance.reset = exception_stub
return instance
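# Hypothetical usage sketch (class and file names are placeholders): init_from_config can
# rebuild an experiment instance from a stored state, e.g. for post-hoc analysis:
#   config = MyExperiment.load_experiments(open('experiments.yml'))[0]
#   instance = MyExperiment.init_from_config(config, rep=0, it=9)
#   # instance is None if restore_state could not load the requested iteration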
@classmethod
def run_slurm(cls):
options = cls._parser.parse_args()
slurm_and_experiment_configs = cls.load_experiments(options.config, options.experiments,
include_cluster_config=True)
assert slurm_and_experiment_configs[0]['name'].lower() == 'slurm'
slurm_config = slurm_and_experiment_configs[0]
experiment_configs = slurm_and_experiment_configs[1:]
config_path = os.path.join(slurm_config['experiment_root'], experiment_configs[0]['_config_path'])
# create experiment root path
os.makedirs(config_path, exist_ok=True)
copyfile(os.path.abspath(options.config.name), os.path.join(config_path, "config.yml"))
slurm_config['experiment_cwd'] = config_path
total_number_of_reps = -1  # slurm array indices are 0-based, so the last job index is the total count minus one
for exp in experiment_configs:
total_number_of_reps += exp['repetitions']
slurm_config['num_jobs'] = total_number_of_reps
cls._create_slurm_file(options, slurm_config)
cmd = "sbatch " + slurm_config['experiment_cwd'] + "/jobs.slurm"
subprocess.check_output(cmd, shell=True)
@classmethod
def _create_slurm_file(cls, cw_options, cluster_options):
job_file = os.path.join(cluster_options['experiment_cwd'], 'jobs.slurm')
fid_in = open(os.path.abspath(cluster_options['path_to_template']), 'r')
fid_out = open(job_file, 'w+')
tline = fid_in.readline()
experiment_code = 'from {:s} import {:s};'.format(cls.__module__, cls.__name__)
experiment_code = experiment_code + 'obj = {:s}();'.format(cls.__name__)
experiment_code = experiment_code + "obj.run()"
while tline:
tline = tline.replace('%%project_name%%', cluster_options['project_name'])
tline = tline.replace('%%experiment_name%%', cluster_options['experiment_name'])
tline = tline.replace('%%time_limit%%', '{:d}:{:02d}:00'.format(cluster_options['time_limit'] // 60,
cluster_options['time_limit'] % 60))
tline = tline.replace('%%experiment_root%%', cluster_options['experiment_root'])
tline = tline.replace('%%experiment_cwd%%', cluster_options['experiment_cwd'])
tline = tline.replace('%%python_script%%', experiment_code)
tline = tline.replace('%%path_to_yaml_config%%', os.path.join(cluster_options['experiment_cwd'], "config.yml"))
tline = tline.replace('%%num_jobs%%', '{:d}'.format(cluster_options['num_jobs']))
tline = tline.replace('%%num_parallel_jobs%%', '{:d}'.format(cluster_options['num_parallel_jobs']))
tline = tline.replace('%%mem%%', '{:d}'.format(cluster_options['mem']))
# tline = tline.replace('§§accelerator§§', accelerator)
tline = tline.replace('%%number_of_jobs%%', '{:d}'.format(1)) # TODO: Maybe not always 1?
tline = tline.replace('%%number_of_cpu_per_job%%', '{:d}'.format(cluster_options['number_of_cpu_per_job']))
fid_out.write(tline)
tline = fid_in.readline()
fid_in.close()
fid_out.close()
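# Hypothetical sketch of a jobs.slurm template (the SBATCH directives are illustrative and
# depend on the cluster); the %%...%% placeholders are the ones substituted above:
#   #SBATCH --job-name=%%project_name%%_%%experiment_name%%
#   #SBATCH --time=%%time_limit%%
#   #SBATCH --mem=%%mem%%
#   #SBATCH --array=0-%%num_jobs%%
#   cd %%experiment_cwd%%
#   python3 -c "%%python_script%%" %%path_to_yaml_config%% -j $SLURM_ARRAY_TASK_ID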
@classmethod
def run(cls):
""" starts the experiments as given in the config file. """
options = cls._parser.parse_args()
cls._NO_GUI = options.no_gui
cls._LOG_LEVEL = options.log_level.upper()
cls._CW_LOG_LEVEL = options.cw_log_level.upper()
cls._RESTART_FULL_REPETITIONS = options.restart_full_repetitions
logging.root.setLevel(cls._LOG_LEVEL)
_logger.setLevel(cls._CW_LOG_LEVEL)
_logging_filtered_std_handler.setLevel(level=cls._LOG_LEVEL)
if options.progress:
cls.__show_progress(options.config, options.experiments)
return
if options.full_progress:
cls.__show_progress(options.config, options.experiments, full_progress=True)
return
if options.plot:
if hasattr(options, 'filter'):
cls.__plot_experiment_results(options.config, options.experiments, options.filter)
else:
cls.__plot_experiment_results(options.config, options.experiments)
return
_logger.info("starting {} with the following options:".format(cls.__name__))
for option, value in vars(options).items():
_logger.info(" - {}: {}".format(option, value))
config_exps_w_expanded_params = cls.__init_experiments(config_file=options.config,
experiments=options.experiments,
delete_old=options.delete,
ignore_config=options.ignore_config,
overwrite_old=options.overwrite,
create_dirs=options.make_dirs,
return_all=options.job is not None)
# for _config in experiment_configs:
# cls.__create_experiment_directory(_config, delete_old_files=False, root_dir=slurm_config['experiment_root'])
repetitions_list = []
for experiment in config_exps_w_expanded_params:
num_repetitions = experiment['repetitions']
# TODO make reasonable: "start repetition i of experiment j"
if isinstance(options.repetition, int):
if not 0 <= options.repetition < num_repetitions:
# _logger.error('Repetition has to be in range [0, {}]'.format(num_repetitions))
raise InvalidParameterArgument('Repetition has to be in range [0, {}]'.format(num_repetitions))
repetitions_list = [(experiment, options.repetition)]
else:
# expand config_list_w_expanded_params for all repetitions and add self and rep number
repetitions_list += list(zip([experiment] * num_repetitions, range(num_repetitions)))
job_idx = options.job
_logger.debug('job index option: {}'.format(job_idx))
if job_idx is not None:
if isinstance(job_idx, int):
idx = job_idx
elif isinstance(job_idx, list):
if len(job_idx) == 1:
idx = job_idx[0]
else:
idx = slice(*job_idx)
else:
raise NotImplementedError('could not process job_idx of type {}'.format(type(job_idx)))
repetitions_list = repetitions_list[idx]
if not isinstance(repetitions_list, list):
repetitions_list = [repetitions_list]
_logger.info('Selecting the following jobs to be executed:')
for (conf, r) in repetitions_list:
_logger.info(' - <{}> - Rep {}'.format(conf['name'], r + 1))
results = dict()
for repetition in repetitions_list:
cls.__create_experiment_directory(repetition[0], delete_old_files=False)
time_start = time.perf_counter()
# _logger.log(INFO_BORDER, '====================================================')
# _logger.log(INFO_CONTNT, '> Running Repetition {} '.format(repetition[1] + 1))
# result = cls().__init_rep(*repetition).__run_rep(*repetition)
# _elapsed_time = time.perf_counter() - time_start
# _logger.log(INFO_BORDER, '////////////////////////////////////////////////////')
# _logger.log(INFO_CONTNT, '> Finished Repetition {}'.format(repetition[1] + 1))
# _logger.log(INFO_CONTNT, '> Elapsed time: {}'.format(format_time(_elapsed_time)))
_logger.info('====================================================')
_logger.info('> Running Repetition {} '.format(repetition[1] + 1))
result = cls().__init_rep(*repetition).__run_rep(*repetition)
_elapsed_time = time.perf_counter() - time_start
_logger.info('////////////////////////////////////////////////////')
_logger.info('> Finished Repetition {}'.format(repetition[1] + 1))
_logger.info('> Elapsed time: {}'.format(format_time(_elapsed_time)))
results[repetition[1]] = result
gc.collect()
_index = pd.MultiIndex.from_product([range(repetition[0]['repetitions']),
range(repetition[0]['iterations'])],
names=['r', 'i'])
result_frame = None
for i in results:
if results[i] is None:
continue
if result_frame is None:
result_frame = pd.DataFrame(index=_index, columns=results[i].columns, dtype=float)
result_frame.update(results[i])
else:
for experiment in config_exps_w_expanded_params:
cls.__create_experiment_directory(experiment, delete_old_files=False)
num_repetitions = experiment['repetitions']
_logger.log(INFO_BORDER, "starting experiment {}".format(experiment['name']))
results = dict()
for repetition in range(num_repetitions):
time_start = time.perf_counter()
# _logger.log(INFO_BORDER, '====================================================')
# _logger.log(INFO_CONTNT, '> Running Repetition {} '.format(repetition[1] + 1))
# result = cls().__init_rep(*repetition).__run_rep(*repetition)
# _elapsed_time = time.perf_counter() - time_start
# _logger.log(INFO_BORDER, '////////////////////////////////////////////////////')
# _logger.log(INFO_CONTNT, '> Finished Repetition {}'.format(repetition[1] + 1))
# _logger.log(INFO_CONTNT, '> Elapsed time: {}'.format(format_time(_elapsed_time)))
_logger.info('====================================================')
_logger.info('> Running Repetition {} '.format(repetition + 1))
result = cls().__init_rep(experiment, repetition).__run_rep(experiment, repetition)
_elapsed_time = time.perf_counter() - time_start
_logger.info('////////////////////////////////////////////////////')
_logger.info('> Finished Repetition {}'.format(repetition + 1))
_logger.info('> Elapsed time: {}'.format(format_time(_elapsed_time)))
results[repetition] = result
gc.collect()
_index = pd.MultiIndex.from_product([range(experiment['repetitions']),
range(experiment['iterations'])],
names=['r', 'i'])
result_frame = None
for i in results:
if results[i] is None:
continue
if result_frame is None:
result_frame = pd.DataFrame(index=_index, columns=results[i].columns, dtype=float)
result_frame.update(results[i])
# if result_frame is not None:
# with open(os.path.join(experiment['path'], 'results.csv'), 'w') as results_file:
# result_frame.to_csv(results_file, **cls._pandas_to_csv_options)
sys.exit(0)
def __init_rep(self, config, rep):
""" run a single repetition including directory creation, log files, etc. """
# set configuration of this repetition
self._name = config['name']
self._repetitions = config['repetitions']
self._iterations = config['iterations']
self._path = config['path']
self._log_path = config['log_path']
self._log_path_rep = os.path.join(config['log_path'], 'rep_{:02d}'.format(rep), '')
self._plotting = config['plotting'] if 'plotting' in config else True
self._no_gui = (not config['gui'] if 'gui' in config else False) or self._NO_GUI
self._seed_base = zlib.adler32(self._name.encode()) % int(1e6)
self._seed = self._seed_base + 1000 * rep + 5
# set params of this repetition
self._params = config['params']
self._rep = rep
# check if log-file for repetition exists
repetition_has_finished, n_finished_its, results = self.__repetition_has_completed(config, rep)
# skip repetition if it has finished
if repetition_has_finished or n_finished_its == config['iterations']:
_logger.info('Repetition {} of experiment {} has finished before. '
'Skipping...'.format(rep + 1, config['name']))
self.__results = results
self.__completed = True
return self
# set logging handlers for current repetition
file_handler_mode = 'a' if n_finished_its else 'w'
file_handler = logging.FileHandler(os.path.join(self._log_path_rep, 'log.txt'), file_handler_mode)
file_handler.setLevel(self._LOG_LEVEL)
file_handler.setFormatter(_logging_formatter)
# file_handler.addFilter(lambda lr: lr.levelno <= logging.ERROR)
logging.root.handlers.clear()
logging.root.handlers = [file_handler, _logging_filtered_std_handler, _logging_err_handler]
# FIXME: put a flag if we want to write log file (which can become large with many iterations)
_logger.addHandler(file_handler)
self.reset(config, rep)
# if not completed but some iterations have finished, check for restart capabilities
if self._restore_supported and n_finished_its > 0 and not self._RESTART_FULL_REPETITIONS:
_logger.info('Repetition {} of experiment {} has started before. '
'Trying to restore state after iteration {}.'.format(rep + 1, config['name'], n_finished_its))
# set start for iterations and restore state in subclass
self.start_iteration = n_finished_its
for self.start_iteration in n_finished_its, n_finished_its - 1:
try:
self._log_path_it = os.path.join(config['log_path'], 'rep_{:02d}'.format(rep),
'it_{:04d}'.format(n_finished_its - 1), '')
if self.start_iteration and self.restore_state(config, rep, self.start_iteration - 1):
_logger.info('Restoring iteration succeeded. Restarting after iteration {}.'.format(
self.start_iteration))
self.__results = pd.DataFrame(data=results,
index=pd.MultiIndex.from_product(
[[rep], range(config['iterations'])],
names=['r', 'i']),
columns=results.columns, dtype=float)
break
except IOError:
_logger.error('Exception during restore_state of experiment {} in repetition {}.'.format(
config['name'], rep + 1), exc_info=True)
else:
_logger.warning('Restoring iteration NOT successful. Restarting from iteration 1.')
self.start_iteration = 0
self.__results = None
else:
self.start_iteration = 0
self.__results = None
return self
def __run_rep(self, config, rep) -> pd.DataFrame:
log_filename = os.path.join(self._log_path, 'rep_{}.csv'.format(rep))
if self.__completed:
return self.__results
repetition_time = .0
if self.start_iteration > 0:
repetition_time = self.__results.repetition_time.loc[(rep, self.start_iteration - 1)]
for it in range(self.start_iteration, config['iterations']):
self._it = it
self._seed = self._seed_base + 1000 * rep + it
# update iteration log directory
self._log_path_it = os.path.join(config['log_path'], 'rep_{:02d}'.format(rep), 'it_{:04d}'.format(it), '')
# run iteration and get results
iteration_time = None
mean_iteration_time = None
expected_total_time = None
time_start = time.perf_counter()
try:
# _logger.log(INFO_BORDER, '----------------------------------------------------')
# _logger.log(INFO_CONTNT, '> Starting Iteration {}/{} of Repetition {}/{}'.format(
# it + 1, self._iterations, rep + 1, self._repetitions))
# _logger.log(INFO_BORDER, '----------------------------------------------------')
_logger.info('----------------------------------------------------')
_logger.info('> Starting Iteration {}/{} of Repetition {}/{}'.format(
it + 1, self._iterations, rep + 1, self._repetitions))
_logger.info('----------------------------------------------------')
it_result = self.iterate(config, rep, it)
iteration_time = time.perf_counter() - time_start
repetition_time += iteration_time
mean_iteration_time = repetition_time / (it + 1)
expected_total_time = mean_iteration_time * self._iterations
if it_result is None:
continue
flat_it_result = flatten_dict(it_result)
if 'iteration_time' not in flat_it_result:
flat_it_result['iteration_time'] = iteration_time
if 'repetition_time' not in flat_it_result:
flat_it_result['repetition_time'] = repetition_time
if self.__results is None:
self.__results = pd.DataFrame(index=pd.MultiIndex.from_product([[rep], range(config['iterations'])],
names=['r', 'i']),
columns=flat_it_result.keys(), dtype=float)
self.__results.loc[(rep, it)] = flat_it_result
# FIXME: need to do this twice because else first iteration won't record results correctly oO
if it == 0:
self.__results.loc[(rep, it)] = flat_it_result
# save state before results, so that we know the saved state can be restored if we find the results.
self.save_state(config, rep, it)
# write first line with header
if it == 0:
self.__results.iloc[[it]].to_csv(log_filename, mode='w', header=True, **self._pandas_to_csv_options)
else:
self.__results.iloc[[it]].to_csv(log_filename, mode='a', header=False,
**self._pandas_to_csv_options)
if 'current_opt' in it_result:
if it_result['current_opt'] < 1e-5:
self.__results.loc[(rep, slice(it, None)), 'current_opt'] = 1e-5
return self.__results
except (ValueError, ArithmeticError, np.linalg.LinAlgError):
_logger.error('Experiment {} - Repetition {} - Iteration {}'.format(config['name'], rep + 1, it + 1),
exc_info=True)
self.finalize()
return self.__results
except Exception:
self.finalize()
raise
finally:
if iteration_time is None:
iteration_time = time.perf_counter() - time_start
# _logger.log(INFO_BORDER, '----------------------------------------------------')
# _logger.log(INFO_CONTNT, '> Finished Iteration {}/{} of Repetition {}/{}'.format(
# it + 1, self._iterations, rep + 1, self._repetitions))
# _logger.log(INFO_CONTNT, '> Iteration time: {} [{}]'.format(format_time(iteration_time),
# format_time(mean_iteration_time)))
# _logger.log(INFO_CONTNT, '> Repetition time: {} [{}]'.format(format_time(repetition_time),
# format_time(expected_total_time)))
_logger.info('----------------------------------------------------')
_logger.info('> Finished Iteration {}/{} of Repetition {}/{}'.format(
it + 1, self._iterations, rep + 1, self._repetitions))
_logger.info('> Iteration time: {} [{}]'.format(format_time(iteration_time),
format_time(mean_iteration_time)))
_logger.info('> Repetition time: {} [{}]'.format(format_time(repetition_time),
format_time(expected_total_time)))
self.finalize()
self.__completed = True
return self.__results
def __init_rep_without_checks(self, config, rep):
# set configuration of this repetition
self._name = config['name']
self._repetitions = config['repetitions']
self._iterations = config['iterations']
self._path = config['path']
self._log_path = config['log_path']
self._log_path_rep = os.path.join(config['log_path'], 'rep_{:02d}'.format(rep), '')
self._plotting = config['plotting'] if 'plotting' in config else True
self._no_gui = (not config['gui'] if 'gui' in config else False) or self._NO_GUI
self._seed_base = zlib.adler32(self._name.encode()) % int(1e6)
self._seed = self._seed_base + 1000 * rep
# set params of this repetition
self._params = config['params']
self._rep = rep
self.reset(config, rep)
return self
ExperimentProgress = collections.namedtuple('ExperimentProgress', ['name', 'num_iterations',
'num_repetitions',
'exp_progress', 'rep_progress',
'finished_repetitions', 'finished_iterations'])
@classmethod
def get_progress(cls, config_file, experiment_selectors=None) -> Tuple[float, List[ExperimentProgress]]:
experiments_config = cls.load_experiments(config_file, experiment_selectors)
total_progress = .0
experiment_progress = []
for config in experiments_config:
exp_progress, rep_progress, finished_repetitions, finished_iterations = cls.__experiment_progress(config)