From 92e111c63798edbe25137f32e2a9d6968e8fcf64 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Mon, 26 Nov 2018 14:45:18 +0200
Subject: [PATCH 01/12] initial WIP version of rnnlm_cleanup.py

---
 scripts/rnnlm/rnnlm_cleanup.py | 92 ++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 scripts/rnnlm/rnnlm_cleanup.py

diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py
new file mode 100644
index 00000000000..6cd75f318ec
--- /dev/null
+++ b/scripts/rnnlm/rnnlm_cleanup.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+
+import argparse
+import sys
+import os
+
+script_name = sys.argv[0]
+
+# TODO decent description
+parser = argparse.ArgumentParser(description="Removes models from past training iterations of "
+                                             "RNNLM. Several strategies for picking which iterations "
+                                             "to keep are available.",
+                                 epilog="E.g. " + script_name + " exp/rnnlm_a",
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+parser.add_argument("rnnlm_dir",
+                    help="Directory where the RNNLM has been trained")
+parser.add_argument("last_iteration",
+                    help="Number of the last iteration",
+                    type=int)
+parser.add_argument("--iters_to_keep",
+                    help="Max number of iterations to keep",
+                    type=int,
+                    default=3)
+parser.add_argument("--keep_latest",
+                    help="Keeps the training iterations that are latest by age",
+                    action="store_const",
+                    const=True,
+                    default=False)
+parser.add_argument("--keep_best",
+                    help="Keeps the training iterations that have the best objf",
+                    action="store_const",
+                    const=True,
+                    default=False)
+
+args = parser.parse_args()
+
+# validate arguments
+if args.keep_latest and args.keep_best:
+    sys.exit(script_name + ": can only use either 'keep_latest' or 'keep_best', but not both")
+elif not args.keep_latest and not args.keep_best:
+    sys.exit(script_name + ": no cleanup strategy specified: use 'keep_latest' or 'keep_best'")
+
+# TODO now for some actual logic............
+# check exp dir for model files
+# list all files there, look for word_embedding.%d.mat and %d.raw files
+# if keep_best, check compute_prob logs for best eval scores or adapt/use get_best_model.py
+# if keep_latest, find the latest iteration that is not used or rely on last_iteration arg?
+
+
+def get_iteration_files(exp_dir):
+    iterations = dict()
+    for f in os.listdir(exp_dir):
+        joined_f = os.path.join(exp_dir, f)
+        if os.path.isfile(joined_f) and (f.startswith("word_embedding") or f.endswith(".raw")):
+            split = f.split(".")
+            ext = split[-1]
+            iter = int(split[-2])
+            if iter in iterations:
+                if ext == "raw":
+                    iterations[iter] = (iterations[iter][0], joined_f)
+                else:
+                    iterations[iter] = (joined_f, iterations[iter][1])
+            else:
+                if ext == "raw":
+                    iterations[iter] = (None, joined_f)
+                else:
+                    iterations[iter] = (joined_f, None)
+    return iterations
+
+
+def keep_latest(iteration_dict):
+    max_to_keep = args.iters_to_keep
+    kept = 0
+    iterations_in_reverse_order = reversed(sorted(iteration_dict))
+    for iter in iterations_in_reverse_order:
+        if kept < max_to_keep:
+            kept += 1
+        else:
+            iter_files = iteration_dict[iter]
+            os.remove(iter_files[0])
+            os.remove(iter_files[1])
+
+
+# TODO just testing
+iterations = get_iteration_files(args.rnnlm_dir)
+print(iterations)
+keep_latest(iterations)
+print(get_iteration_files(args.rnnlm_dir))
+
+
+# TODO implement rest of the bookkeeping
\ No newline at end of file

From afe09ae0120a3be5d1526cf19d0d48ade918bb4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Tue, 27 Nov 2018 11:48:07 +0200
Subject: [PATCH 02/12] working version of both keep_latest and keep_best

---
 scripts/rnnlm/rnnlm_cleanup.py | 59 ++++++++++++++++++++++++++--------
 1 file changed, 45 insertions(+), 14 deletions(-)

diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py
index 6cd75f318ec..3f942fb9458 100644
--- a/scripts/rnnlm/rnnlm_cleanup.py
+++ b/scripts/rnnlm/rnnlm_cleanup.py
@@ -3,6 +3,7 @@
 import argparse
 import sys
 import os
+import re
 
 script_name = sys.argv[0]
 
@@ -15,9 +16,9 @@
 
 parser.add_argument("rnnlm_dir",
                     help="Directory where the RNNLM has been trained")
-parser.add_argument("last_iteration",
-                    help="Number of the last iteration",
-                    type=int)
+# parser.add_argument("last_iteration",
+#                     help="Number of the last iteration",
+#                     type=int)
 parser.add_argument("--iters_to_keep",
                     help="Max number of iterations to keep",
                     type=int,
@@ -41,12 +42,6 @@
 elif not args.keep_latest and not args.keep_best:
     sys.exit(script_name + ": no cleanup strategy specified: use 'keep_latest' or 'keep_best'")
 
-# TODO now for some actual logic............
-# check exp dir for model files
-# list all files there, look for word_embedding.%d.mat and %d.raw files
-# if keep_best, check compute_prob logs for best eval scores or adapt/use get_best_model.py
-# if keep_latest, find the latest iteration that is not used or rely on last_iteration arg?
-
 
 def get_iteration_files(exp_dir):
     iterations = dict()
@@ -82,11 +77,47 @@ def keep_latest(iteration_dict):
             os.remove(iter_files[1])
 
 
-# TODO just testing
+def keep_best(iteration_dict, exp_dir):
+    iters_to_keep = args.iters_to_keep
+    best = []
+    for iter, iter_files in iteration_dict.items():
+        # this is roughly taken from get_best_model.py
+        logfile = "{0}/log/compute_prob.{1}.log".format(exp_dir, iter)
+        try:
+            f = open(logfile, "r", encoding="latin-1")
+        except:
+            sys.exit(script_name + ": could not open log-file " + logfile)
+        objf = -2000
+        for line in f:
+            m = re.search('Overall objf .* (\S+)$', str(line))
+            if m is not None:
+                try:
+                    objf = float(m.group(1))
+                except Exception as e:
+                    sys.exit(script_name + ": line in file {0} could not be parsed: {1}, error is: {2}".format(
+                        logfile, line, str(e)))
+        if objf == -2000:
+            print(script_name + ": warning: could not parse objective function from " + logfile, file=sys.stderr)
+            continue
+        # add potential best, sort by objf, trim to iters_to_keep size
+        best.append((iter, objf))
+        best = sorted(best, key=lambda x: -x[1])
+        if len(best) > iters_to_keep:
+            throwaway = best[iters_to_keep:]
+            best = best[:iters_to_keep]
+            # remove iters that we know are not the best
+            for (iter, _) in throwaway:
+                iter_files = iteration_dict[iter]
+                os.remove(iter_files[0])
+                os.remove(iter_files[1])
+
+
+# grab all the iterations mapped to their word_embedding and .raw files
 iterations = get_iteration_files(args.rnnlm_dir)
 print(iterations)
-keep_latest(iterations)
+# apply chosen cleanup strategy
+if args.keep_latest:
+    keep_latest(iterations)
+else:
+    keep_best(iterations, args.rnnlm_dir)
 print(get_iteration_files(args.rnnlm_dir))
-
-
-# TODO implement rest of the bookkeeping
\ No newline at end of file

From 085b400081e2cb010df9878475cec667153125e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Tue, 27 Nov 2018 16:13:28 +0200
Subject: [PATCH 03/12] cleanup now only considers those iterations for which rnnlm_compute_prob has finished

---
 scripts/rnnlm/rnnlm_cleanup.py | 113 ++++++++++++++++++++++-----------
 1 file changed, 75 insertions(+), 38 deletions(-)

diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py
index 3f942fb9458..61566f0e662 100644
--- a/scripts/rnnlm/rnnlm_cleanup.py
+++ b/scripts/rnnlm/rnnlm_cleanup.py
@@ -1,7 +1,8 @@
 #!/usr/bin/env python3
 
-import argparse
 import sys
+
+import argparse
 import os
 import re
 
@@ -16,9 +17,6 @@
 
 parser.add_argument("rnnlm_dir",
                     help="Directory where the RNNLM has been trained")
-# parser.add_argument("last_iteration",
-#                     help="Number of the last iteration",
-#                     type=int)
 parser.add_argument("--iters_to_keep",
                     help="Max number of iterations to keep",
                     type=int,
@@ -43,6 +41,48 @@
     sys.exit(script_name + ": no cleanup strategy specified: use 'keep_latest' or 'keep_best'")
 
 
+class IterationInfo:
+    def __init__(self, word_embedding_file, raw_file, objf, compute_prob_done):
+        self.word_embedding_file = word_embedding_file
+        self.raw_file = raw_file
+        self.objf = objf
+        self.compute_prob_done = compute_prob_done
+
+    def __str__(self):
+        return "{word_embedding: %s, raw: %s, compute_prob: %s, objf: %2.3f}" % (self.word_embedding_file,
+                                                                                 self.raw_file,
+                                                                                 self.compute_prob_done,
+                                                                                 self.objf)
+
+    def __repr__(self):
+        return self.__str__()
+
+
+def get_compute_prob_info(exp_dir, iter):
+    # roughly based on code in get_best_model.py
+    log_file = "{0}/log/compute_prob.{1}.log".format(exp_dir, iter)
+    try:
+        f = open(log_file, "r", encoding="latin-1")
+    except:
+        sys.exit(script_name + ": could not open log-file " + log_file)
+    # we now want 2 things: objf and whether compute prob is done
+    objf = -2000
+    compute_prob_done = False
+    for line in f:
+        objf_m = re.search('Overall objf .* (\S+)$', str(line))
+        if objf_m is not None:
+            try:
+                objf = float(objf_m.group(1))
+            except Exception as e:
+                sys.exit(script_name + ": line in file {0} could not be parsed: {1}, error is: {2}".format(
+                    log_file, line, str(e)))
+        if "# Ended" in line:
+            compute_prob_done = True
+    if objf == -2000:
+        print(script_name + ": warning: could not parse objective function from " + log_file, file=sys.stderr)
+    return objf, compute_prob_done
+
+
 def get_iteration_files(exp_dir):
     iterations = dict()
     for f in os.listdir(exp_dir):
@@ -51,53 +91,52 @@ def get_iteration_files(exp_dir):
             split = f.split(".")
             ext = split[-1]
             iter = int(split[-2])
+            objf, compute_prob_done = get_compute_prob_info(exp_dir, iter)
             if iter in iterations:
+                iter_info = iterations[iter]
                 if ext == "raw":
-                    iterations[iter] = (iterations[iter][0], joined_f)
+                    iter_info.raw_file = joined_f
                 else:
-                    iterations[iter] = (joined_f, iterations[iter][1])
+                    iter_info.word_embedding_file = joined_f
+                iter_info.objf = objf
+                iter_info.compute_prob_done = compute_prob_done
             else:
                 if ext == "raw":
-                    iterations[iter] = (None, joined_f)
+                    iterations[iter] = IterationInfo(None, joined_f, objf, compute_prob_done)
                 else:
-                    iterations[iter] = (joined_f, None)
+                    iterations[iter] = IterationInfo(joined_f, None, objf, compute_prob_done)
     return iterations
 
 
+def remove_model_files_for_iter(iter_info):
+    os.remove(iter_info.word_embedding_file)
+    os.remove(iter_info.raw_file)
+
+
 def keep_latest(iteration_dict):
     max_to_keep = args.iters_to_keep
     kept = 0
     iterations_in_reverse_order = reversed(sorted(iteration_dict))
     for iter in iterations_in_reverse_order:
-        if kept < max_to_keep:
-            kept += 1
-        else:
-            iter_files = iteration_dict[iter]
-            os.remove(iter_files[0])
-            os.remove(iter_files[1])
+        # check if compute prob is done for this iteration, if not, leave it for future cleanups...
+        if iteration_dict[iter].compute_prob_done:
+            if kept < max_to_keep:
+                kept += 1
+            else:
+                remove_model_files_for_iter(iteration_dict[iter])
 
 
-def keep_best(iteration_dict, exp_dir):
+def keep_best(iteration_dict):
     iters_to_keep = args.iters_to_keep
     best = []
-    for iter, iter_files in iteration_dict.items():
-        # this is roughly taken from get_best_model.py
-        logfile = "{0}/log/compute_prob.{1}.log".format(exp_dir, iter)
-        try:
-            f = open(logfile, "r", encoding="latin-1")
-        except:
-            sys.exit(script_name + ": could not open log-file " + logfile)
-        objf = -2000
-        for line in f:
-            m = re.search('Overall objf .* (\S+)$', str(line))
-            if m is not None:
-                try:
-                    objf = float(m.group(1))
-                except Exception as e:
-                    sys.exit(script_name + ": line in file {0} could not be parsed: {1}, error is: {2}".format(
-                        logfile, line, str(e)))
+    for iter, iter_info in iteration_dict.items():
+        objf = iter_info.objf
         if objf == -2000:
-            print(script_name + ": warning: could not parse objective function from " + logfile, file=sys.stderr)
+            print(script_name + ": warning: objf unavailable for iter " + str(iter), file=sys.stderr)
+            continue
+        if not iter_info.compute_prob_done:
+            # if compute_prob is not done, yet, we leave it for future cleanups
+            print(script_name + ": warning: compute_prob not done yet for iter " + str(iter), file=sys.stderr)
             continue
         # add potential best, sort by objf, trim to iters_to_keep size
         best.append((iter, objf))
         best = sorted(best, key=lambda x: -x[1])
@@ -107,17 +146,15 @@ def keep_best(iteration_dict):
             best = best[:iters_to_keep]
             # remove iters that we know are not the best
             for (iter, _) in throwaway:
-                iter_files = iteration_dict[iter]
-                os.remove(iter_files[0])
-                os.remove(iter_files[1])
+                remove_model_files_for_iter(iteration_dict[iter])
 
 
 # grab all the iterations mapped to their word_embedding and .raw files
 iterations = get_iteration_files(args.rnnlm_dir)
-print(iterations)
+# print(iterations) # FIXME remove
 # apply chosen cleanup strategy
 if args.keep_latest:
     keep_latest(iterations)
 else:
-    keep_best(iterations, args.rnnlm_dir)
-print(get_iteration_files(args.rnnlm_dir))
+    keep_best(iterations)
+# print(get_iteration_files(args.rnnlm_dir)) # FIXME remove

From 430b91e1199883e384c1369b7350573fd399a140 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Tue, 27 Nov 2018 17:21:50 +0200
Subject: [PATCH 04/12] rnnlm_cleanup.py: added copyright/license header, some comments and minor cleanup

---
 scripts/rnnlm/rnnlm_cleanup.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py
index 61566f0e662..330378a81b2 100644
--- a/scripts/rnnlm/rnnlm_cleanup.py
+++ b/scripts/rnnlm/rnnlm_cleanup.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
 
+# Copyright 2018 Tilde
+# License: Apache 2.0
+
 import sys
 
 import argparse
@@ -8,11 +11,13 @@
 import os
 import re
 
 script_name = sys.argv[0]
 
-# TODO decent description
 parser = argparse.ArgumentParser(description="Removes models from past training iterations of "
-                                             "RNNLM. Several strategies for picking which iterations "
-                                             "to keep are available.",
-                                 epilog="E.g. " + script_name + " exp/rnnlm_a",
+                                             "RNNLM. Can use either 'keep_latest' (default) or "
+                                             "'keep_best' cleanup strategy, where the former keeps "
+                                             "the models that are freshest, while the latter keeps "
+                                             "the models with the best training objective score on "
+                                             "the dev set.",
+                                 epilog="E.g. " + script_name + " exp/rnnlm_a --keep_best",
                                  formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -36,7 +41,7 @@
 
 # validate arguments
 if args.keep_latest and args.keep_best:
-    sys.exit(script_name + ": can only use either 'keep_latest' or 'keep_best', but not both")
+    sys.exit(script_name + ": can only use one of 'keep_latest' or 'keep_best', but not both")
 elif not args.keep_latest and not args.keep_best:
     sys.exit(script_name + ": no cleanup strategy specified: use 'keep_latest' or 'keep_best'")
 
@@ -149,12 +154,10 @@ def keep_best(iteration_dict):
                 remove_model_files_for_iter(iteration_dict[iter])
 
 
-# grab all the iterations mapped to their word_embedding and .raw files
+# grab all the iterations mapped to their model files, objf score and compute_prob status
 iterations = get_iteration_files(args.rnnlm_dir)
-# print(iterations) # FIXME remove
 # apply chosen cleanup strategy
 if args.keep_latest:
     keep_latest(iterations)
 else:
     keep_best(iterations)
-# print(get_iteration_files(args.rnnlm_dir)) # FIXME remove

From c50364eae9bc9ed220940e2aa684a686ecb4c851 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Tue, 27 Nov 2018 17:23:41 +0200
Subject: [PATCH 05/12] train_rnnlm.sh: initial cleanup script integration

---
 scripts/rnnlm/train_rnnlm.sh | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/scripts/rnnlm/train_rnnlm.sh b/scripts/rnnlm/train_rnnlm.sh
index aedfc470ac9..7ac0ca2fdab 100755
--- a/scripts/rnnlm/train_rnnlm.sh
+++ b/scripts/rnnlm/train_rnnlm.sh
@@ -38,6 +38,11 @@ num_egs_threads=10 # number of threads used for sampling, if we're using
 use_gpu=true   # use GPU for training
 use_gpu_for_diagnostics=false  # set true to use GPU for compute_prob_*.log
 
+# optional cleanup options
+cleanup=false # add option --cleanup true to enable automatic cleanup of old models
+cleanup_strategy="keep_latest" # determines cleanup strategy, use either "keep_latest" or "keep_best"
+cleanup_keep_iters=3 # number of iterations that will have their models retained
+
 trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM
 
 . utils/parse_options.sh
@@ -228,6 +233,10 @@ while [ $x -lt $num_iters ]; do
     # have printed a more specific one.
     [ -f $dir/.error ] && echo "$0: error with diagnostics on iteration $x of training" && exit 1;
   fi
+  # optionally, perform cleanup
+  if [ "$cleanup" = true ] ; then
+    python3 rnnlm_cleanup.py $dir --$cleanup_strategy --iters_to_keep $cleanup_keep_iters
+  fi
   x=$[x+1]
   num_splits_processed=$[num_splits_processed+this_num_jobs]
 done

From d124106ca5e32d2813d623753aa2c2de345073b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Wed, 28 Nov 2018 09:40:41 +0200
Subject: [PATCH 06/12] rnnlm_cleanup.py: get_compute_prob_info now skips files without compute_prob log instead of exiting on them

---
 scripts/rnnlm/rnnlm_cleanup.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py
index 330378a81b2..e3b2c2073de 100644
--- a/scripts/rnnlm/rnnlm_cleanup.py
+++ b/scripts/rnnlm/rnnlm_cleanup.py
@@ -64,15 +64,17 @@ def __repr__(self):
 
 
 def get_compute_prob_info(exp_dir, iter):
+    # we want to know 2 things: objf and whether compute prob is done
+    objf = -2000
+    compute_prob_done = False
     # roughly based on code in get_best_model.py
     log_file = "{0}/log/compute_prob.{1}.log".format(exp_dir, iter)
     try:
         f = open(log_file, "r", encoding="latin-1")
     except:
-        sys.exit(script_name + ": could not open log-file " + log_file)
-    # we now want 2 things: objf and whether compute prob is done
-    objf = -2000
-    compute_prob_done = False
+        print(script_name + ": warning: compute_prob log not found for iteration " + str(iter) + ". Skipping",
+              file=sys.stderr)
+        return objf, compute_prob_done
     for line in f:
         objf_m = re.search('Overall objf .* (\S+)$', str(line))
         if objf_m is not None:
@@ -89,6 +91,7 @@ def get_compute_prob_info(exp_dir, iter):
 
 
 def get_iteration_files(exp_dir):
+    # TODO handle the case where there are several files per iteration...
     iterations = dict()
     for f in os.listdir(exp_dir):
         joined_f = os.path.join(exp_dir, f)

From 0536107720fac658aa628831e72e36c6a4a65324 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Wed, 28 Nov 2018 10:48:35 +0200
Subject: [PATCH 07/12] rnnlm_cleanup.py: iteration model files are now listed based on compute_prob log files

---
 scripts/rnnlm/rnnlm_cleanup.py | 81 +++++++++++++++------------------
 1 file changed, 35 insertions(+), 46 deletions(-)

diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py
index e3b2c2073de..39f3f4d5d42 100644
--- a/scripts/rnnlm/rnnlm_cleanup.py
+++ b/scripts/rnnlm/rnnlm_cleanup.py
@@ -8,6 +8,7 @@
 import argparse
 import os
 import re
+import glob
 
 script_name = sys.argv[0]
 
@@ -47,34 +48,33 @@
 
 
 class IterationInfo:
-    def __init__(self, word_embedding_file, raw_file, objf, compute_prob_done):
-        self.word_embedding_file = word_embedding_file
-        self.raw_file = raw_file
+    def __init__(self, model_files, objf, compute_prob_done):
+        self.model_files = model_files
         self.objf = objf
         self.compute_prob_done = compute_prob_done
 
     def __str__(self):
-        return "{word_embedding: %s, raw: %s, compute_prob: %s, objf: %2.3f}" % (self.word_embedding_file,
-                                                                                 self.raw_file,
-                                                                                 self.compute_prob_done,
-                                                                                 self.objf)
+        return "{model_files: %s, compute_prob: %s, objf: %2.3f}" % (self.model_files,
+                                                                     self.compute_prob_done,
+                                                                     self.objf)
 
     def __repr__(self):
         return self.__str__()
 
 
-def get_compute_prob_info(exp_dir, iter):
-    # we want to know 2 things: objf and whether compute prob is done
+def get_compute_prob_info(log_file):
+    # we want to know 3 things: iteration number, objf and whether compute prob is done
+    iteration = int(log_file.split(".")[-2])
     objf = -2000
     compute_prob_done = False
     # roughly based on code in get_best_model.py
-    log_file = "{0}/log/compute_prob.{1}.log".format(exp_dir, iter)
     try:
         f = open(log_file, "r", encoding="latin-1")
     except:
-        print(script_name + ": warning: compute_prob log not found for iteration " + str(iter) + ". Skipping",
+        print(script_name + ": warning: compute_prob log not found for iteration "
+              + str(iteration) + ". Skipping",
               file=sys.stderr)
-        return objf, compute_prob_done
+        return iteration, objf, compute_prob_done
     for line in f:
         objf_m = re.search('Overall objf .* (\S+)$', str(line))
         if objf_m is not None:
@@ -87,38 +87,33 @@ def get_compute_prob_info(exp_dir, iter):
             compute_prob_done = True
     if objf == -2000:
         print(script_name + ": warning: could not parse objective function from " + log_file, file=sys.stderr)
-    return objf, compute_prob_done
+    return iteration, objf, compute_prob_done
 
 
 def get_iteration_files(exp_dir):
-    # TODO handle the case where there are several files per iteration...
     iterations = dict()
-    for f in os.listdir(exp_dir):
-        joined_f = os.path.join(exp_dir, f)
-        if os.path.isfile(joined_f) and (f.startswith("word_embedding") or f.endswith(".raw")):
-            split = f.split(".")
-            ext = split[-1]
-            iter = int(split[-2])
-            objf, compute_prob_done = get_compute_prob_info(exp_dir, iter)
-            if iter in iterations:
-                iter_info = iterations[iter]
-                if ext == "raw":
-                    iter_info.raw_file = joined_f
-                else:
-                    iter_info.word_embedding_file = joined_f
-                iter_info.objf = objf
-                iter_info.compute_prob_done = compute_prob_done
-            else:
-                if ext == "raw":
-                    iterations[iter] = IterationInfo(None, joined_f, objf, compute_prob_done)
-                else:
-                    iterations[iter] = IterationInfo(joined_f, None, objf, compute_prob_done)
+    compute_prob_logs = glob.glob(exp_dir + "/log/compute_prob.[0-9]*.log")
+    for log in compute_prob_logs:
+        iteration, objf, compute_prob_done = get_compute_prob_info(log)
+        if compute_prob_done:
+            # this iteration can be safely considered for cleanup
+            # gather all model files belonging to it
+            model_files = []
+            # when there are multiple jobs per iteration, there can be several model files
+            # we need to potentially clean them all up without mixing them up
+            model_files.extend(glob.glob("{0}/word_embedding.{1}.mat".format(exp_dir, iteration)))
+            model_files.extend(glob.glob("{0}/word_embedding.{1}.[0-9]*.mat".format(exp_dir, iteration)))
+            model_files.extend(glob.glob("{0}/{1}.raw".format(exp_dir, iteration)))
+            model_files.extend(glob.glob("{0}/{1}.[0-9]*.raw".format(exp_dir, iteration)))
+            # compute_prob logs outlive model files, only consider iterations that do still have model files
+            if len(model_files) > 0:
+                iterations[iteration] = IterationInfo(model_files, objf, compute_prob_done)
     return iterations
 
 
 def remove_model_files_for_iter(iter_info):
-    os.remove(iter_info.word_embedding_file)
-    os.remove(iter_info.raw_file)
+    for f in iter_info.model_files:
+        os.remove(f)
 
 
 def keep_latest(iteration_dict):
@@ -126,12 +121,10 @@ def keep_latest(iteration_dict):
     kept = 0
     iterations_in_reverse_order = reversed(sorted(iteration_dict))
     for iter in iterations_in_reverse_order:
-        # check if compute prob is done for this iteration, if not, leave it for future cleanups...
-        if iteration_dict[iter].compute_prob_done:
-            if kept < max_to_keep:
-                kept += 1
-            else:
-                remove_model_files_for_iter(iteration_dict[iter])
+        if kept < max_to_keep:
+            kept += 1
+        else:
+            remove_model_files_for_iter(iteration_dict[iter])
 
 
 def keep_best(iteration_dict):
@@ -142,10 +135,6 @@ def keep_best(iteration_dict):
         if objf == -2000:
             print(script_name + ": warning: objf unavailable for iter " + str(iter), file=sys.stderr)
             continue
-        if not iter_info.compute_prob_done:
-            # if compute_prob is not done, yet, we leave it for future cleanups
-            print(script_name + ": warning: compute_prob not done yet for iter " + str(iter), file=sys.stderr)
-            continue
         # add potential best, sort by objf, trim to iters_to_keep size
         best.append((iter, objf))
         best = sorted(best, key=lambda x: -x[1])

From 1d692bd97f5b5a96498ce24124027967e5bbc5ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Wed, 28 Nov 2018 10:49:09 +0200
Subject: [PATCH 08/12] train_rnnlm.sh: fixed cleanup script invocation

---
 scripts/rnnlm/train_rnnlm.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/rnnlm/train_rnnlm.sh b/scripts/rnnlm/train_rnnlm.sh
index 7ac0ca2fdab..d6d38f3d734 100755
--- a/scripts/rnnlm/train_rnnlm.sh
+++ b/scripts/rnnlm/train_rnnlm.sh
@@ -227,16 +227,16 @@ while [ $x -lt $num_iters ]; do
           nnet3-average $src_models $dir/$[x+1].raw '&&' \
           matrix-sum --average=true $src_matrices $dir/${embedding_type}_embedding.$[x+1].mat
       fi
+      # optionally, perform cleanup after training
+      if [ "$cleanup" = true ] ; then
+        python3 rnnlm/rnnlm_cleanup.py $dir --$cleanup_strategy --iters_to_keep $cleanup_keep_iters
+      fi
     )
-
     # the error message below is not that informative, but $cmd will
     # have printed a more specific one.
     [ -f $dir/.error ] && echo "$0: error with diagnostics on iteration $x of training" && exit 1;
   fi
-  # optionally, perform cleanup
-  if [ "$cleanup" = true ] ; then
-    python3 rnnlm_cleanup.py $dir --$cleanup_strategy --iters_to_keep $cleanup_keep_iters
-  fi
+
   x=$[x+1]
   num_splits_processed=$[num_splits_processed+this_num_jobs]
 done

From 25291d537e99c23498755087bbe0a71c1b7b37d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Wed, 28 Nov 2018 11:53:01 +0200
Subject: [PATCH 09/12] get_best_model.py: now only considers iterations that still have model files present

---
 scripts/rnnlm/get_best_model.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/scripts/rnnlm/get_best_model.py b/scripts/rnnlm/get_best_model.py
index e8c6bd8a2f4..be0e63f9417 100755
--- a/scripts/rnnlm/get_best_model.py
+++ b/scripts/rnnlm/get_best_model.py
@@ -3,14 +3,14 @@
 # Copyright 2017 Johns Hopkins University (author: Daniel Povey)
 # License: Apache 2.0.
 
-import os
 import argparse
-import sys
+import glob
 import re
+import sys
 
 parser = argparse.ArgumentParser(description="Works out the best iteration of RNNLM training "
-                                 "based on dev-set perplexity, and prints the number corresponding "
-                                 "to that iteration",
+                                             "based on dev-set perplexity, and prints the number corresponding "
+                                             "to that iteration",
                                  epilog="E.g. " + sys.argv[0] + " exp/rnnlm_a",
                                  formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 
@@ -19,8 +19,7 @@
 
 args = parser.parse_args()
 
-
-num_iters=None
+num_iters = None
 try:
     with open(args.rnnlm_dir + "/info.txt", encoding="latin-1") as f:
         for line in f:
@@ -36,15 +35,15 @@
     sys.exit(sys.argv[0] + ": could not get num_iters from {0}/info.txt".format(
         args.rnnlm_dir))
 
-best_objf=-2000
-best_iter=-1
+best_objf = -2000
+best_iter = -1
 for i in range(1, num_iters):
     this_logfile = "{0}/log/compute_prob.{1}.log".format(args.rnnlm_dir, i)
     try:
         f = open(this_logfile, 'r', encoding='latin-1')
     except:
        sys.exit(sys.argv[0] + ": could not open log-file {0}".format(this_logfile))
-    this_objf=-1000
+    this_objf = -1000
    for line in f:
         m = re.search('Overall objf .* (\S+)$', str(line))
         if m is not None:
@@ -53,6 +52,11 @@
             try:
                 this_objf = float(m.group(1))
             except Exception as e:
                 sys.exit(sys.argv[0] + ": line in file {0} could not be parsed: {1}, error is: {2}".format(
                     this_logfile, line, str(e)))
+    # verify this iteration still has model files present
+    if len(glob.glob("{0}/word_embedding.{1}.mat".format(args.rnnlm_dir, i))) == 0:
+        print(sys.argv[0] + ": warning: no model files found for iteration {0}. Skipping.".format(i),
+              file=sys.stderr)
+        continue
     if this_objf == -1000:
         print(sys.argv[0] + ": warning: could not parse objective function from {0}".format(
             this_logfile), file=sys.stderr)
@@ -63,5 +67,4 @@
 
 if best_iter == -1:
     sys.exit(sys.argv[0] + ": error: could not get best iteration.")
-
 print(str(best_iter))

From fdf1a125fda40ea1ec35c561265a2722edea5d4b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Thu, 29 Nov 2018 16:30:50 +0200
Subject: [PATCH 10/12] rnnlm_cleanup.py: never touch files belonging to iteration 0

---
 scripts/rnnlm/rnnlm_cleanup.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py
index 39f3f4d5d42..892e12689ad 100644
--- a/scripts/rnnlm/rnnlm_cleanup.py
+++ b/scripts/rnnlm/rnnlm_cleanup.py
@@ -95,6 +95,9 @@ def get_iteration_files(exp_dir):
     compute_prob_logs = glob.glob(exp_dir + "/log/compute_prob.[0-9]*.log")
     for log in compute_prob_logs:
         iteration, objf, compute_prob_done = get_compute_prob_info(log)
+        if iteration == 0:
+            # iteration 0 is special, never consider it for cleanup
+            continue
         if compute_prob_done:
             # this iteration can be safely considered for cleanup
             # gather all model files belonging to it

From 36a74815f97043be44de1e4cb2be9afa288e3fa6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Fri, 30 Nov 2018 08:57:06 +0200
Subject: [PATCH 11/12] get_best_model.py: model-less iterations will no longer trigger confusing warnings as, given cleanup, it's normal for model files to be absent for most iterations

---
 scripts/rnnlm/get_best_model.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/scripts/rnnlm/get_best_model.py b/scripts/rnnlm/get_best_model.py
index be0e63f9417..ba3c6ccb744 100755
--- a/scripts/rnnlm/get_best_model.py
+++ b/scripts/rnnlm/get_best_model.py
@@ -54,8 +54,7 @@
                     this_logfile, line, str(e)))
     # verify this iteration still has model files present
     if len(glob.glob("{0}/word_embedding.{1}.mat".format(args.rnnlm_dir, i))) == 0:
-        print(sys.argv[0] + ": warning: no model files found for iteration {0}. Skipping.".format(i),
-              file=sys.stderr)
+        # this iteration has log files, but model files have been cleaned up, skip it
         continue
     if this_objf == -1000:
         print(sys.argv[0] + ": warning: could not parse objective function from {0}".format(

From 23ea8ad3fd1d4c5f05a4fb7176cc873a7a750530 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Wed, 5 Dec 2018 12:01:24 +0200
Subject: [PATCH 12/12] fixed "feat_embedding" files not being considered by cleanup and get_best_model.py

---
 scripts/rnnlm/get_best_model.py | 2 +-
 scripts/rnnlm/rnnlm_cleanup.py  | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/rnnlm/get_best_model.py b/scripts/rnnlm/get_best_model.py
index ba3c6ccb744..333ed8dbfc7 100755
--- a/scripts/rnnlm/get_best_model.py
+++ b/scripts/rnnlm/get_best_model.py
@@ -53,7 +53,7 @@
                 sys.exit(sys.argv[0] + ": line in file {0} could not be parsed: {1}, error is: {2}".format(
                     this_logfile, line, str(e)))
     # verify this iteration still has model files present
-    if len(glob.glob("{0}/word_embedding.{1}.mat".format(args.rnnlm_dir, i))) == 0:
+    if len(glob.glob("{0}/{1}.raw".format(args.rnnlm_dir, i))) == 0:
         # this iteration has log files, but model files have been cleaned up, skip it
         continue
diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py
index 892e12689ad..40cbee7a496 100644
--- a/scripts/rnnlm/rnnlm_cleanup.py
+++ b/scripts/rnnlm/rnnlm_cleanup.py
@@ -106,6 +106,8 @@ def get_iteration_files(exp_dir):
             # we need to potentially clean them all up without mixing them up
             model_files.extend(glob.glob("{0}/word_embedding.{1}.mat".format(exp_dir, iteration)))
             model_files.extend(glob.glob("{0}/word_embedding.{1}.[0-9]*.mat".format(exp_dir, iteration)))
+            model_files.extend(glob.glob("{0}/feat_embedding.{1}.mat".format(exp_dir, iteration)))
+            model_files.extend(glob.glob("{0}/feat_embedding.{1}.[0-9]*.mat".format(exp_dir, iteration)))
             model_files.extend(glob.glob("{0}/{1}.raw".format(exp_dir, iteration)))
             model_files.extend(glob.glob("{0}/{1}.[0-9]*.raw".format(exp_dir, iteration)))
             # compute_prob logs outlive model files, only consider iterations that do still have model files
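
---

Usage sketch (illustrative, not part of the patches; "exp/rnnlm_a" is a placeholder experiment directory, and the dashed option spellings assume the usual utils/parse_options.sh mapping of dashes to underscores):

  # standalone: keep only the 3 iterations with the best compute_prob dev-set objf
  python3 rnnlm/rnnlm_cleanup.py exp/rnnlm_a --keep_best --iters_to_keep 3

  # during training: the options added by PATCH 05 and fixed by PATCH 08
  rnnlm/train_rnnlm.sh --cleanup true --cleanup-strategy keep_best ... exp/rnnlm_a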