From 92e111c63798edbe25137f32e2a9d6968e8fcf64 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Mon, 26 Nov 2018 14:45:18 +0200
Subject: [PATCH 01/12] initial WIP version of rnnlm_cleanup.py

---
 scripts/rnnlm/rnnlm_cleanup.py | 92 ++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 scripts/rnnlm/rnnlm_cleanup.py

diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py
new file mode 100644
index 00000000000..6cd75f318ec
--- /dev/null
+++ b/scripts/rnnlm/rnnlm_cleanup.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+
+import argparse
+import sys
+import os
+
+script_name = sys.argv[0]
+
+# TODO decent description
+parser = argparse.ArgumentParser(description="Removes models from past training iterations of "
+                                             "RNNLM. Several strategies for picking which iterations "
+                                             "to keep are available.",
+                                 epilog="E.g. " + script_name + " exp/rnnlm_a",
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+parser.add_argument("rnnlm_dir",
+                    help="Directory where the RNNLM has been trained")
+parser.add_argument("last_iteration",
+                    help="Number of the last iteration",
+                    type=int)
+parser.add_argument("--iters_to_keep",
+                    help="Max number of iterations to keep",
+                    type=int,
+                    default=3)
+parser.add_argument("--keep_latest",
+                    help="Keeps the training iterations that are latest by age",
+                    action="store_const",
+                    const=True,
+                    default=False)
+parser.add_argument("--keep_best",
+                    help="Keeps the training iterations that have the best objf",
+                    action="store_const",
+                    const=True,
+                    default=False)
+
+args = parser.parse_args()
+
+# validate arguments
+if args.keep_latest and args.keep_best:
+    sys.exit(script_name + ": can only use either 'keep_latest' or 'keep_best', but not both")
+elif not args.keep_latest and not args.keep_best:
+    sys.exit(script_name + ": no cleanup strategy specified: use 'keep_latest' or 'keep_best'")
+
+# TODO now for some actual logic............
+# check exp dir for model files
+# list all files there, look for word_embedding.%d.mat and %d.raw files
+# if keep_best, check compute_prob logs for best eval scores or adapt/use get_best_model.py
+# if keep_latest, find the latest iteration that is not used or rely on last_iteration arg?
+
+
+def get_iteration_files(exp_dir):
+    iterations = dict()
+    for f in os.listdir(exp_dir):
+        joined_f = os.path.join(exp_dir, f)
+        if os.path.isfile(joined_f) and (f.startswith("word_embedding") or f.endswith(".raw")):
+            split = f.split(".")
+            ext = split[-1]
+            iter = int(split[-2])
+            if iter in iterations:
+                if ext == "raw":
+                    iterations[iter] = (iterations[iter][0], joined_f)
+                else:
+                    iterations[iter] = (joined_f, iterations[iter][1])
+            else:
+                if ext == "raw":
+                    iterations[iter] = (None, joined_f)
+                else:
+                    iterations[iter] = (joined_f, None)
+    return iterations
+
+
+def keep_latest(iteration_dict):
+    max_to_keep = args.iters_to_keep
+    kept = 0
+    iterations_in_reverse_order = reversed(sorted(iteration_dict))
+    for iter in iterations_in_reverse_order:
+        if kept < max_to_keep:
+            kept += 1
+        else:
+            iter_files = iteration_dict[iter]
+            os.remove(iter_files[0])
+            os.remove(iter_files[1])
+
+
+# TODO just testing
+iterations = get_iteration_files(args.rnnlm_dir)
+print(iterations)
+keep_latest(iterations)
+print(get_iteration_files(args.rnnlm_dir))
+
+
+# TODO implement rest of the bookkeeping
\ No newline at end of file

From afe09ae0120a3be5d1526cf19d0d48ade918bb4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Tue, 27 Nov 2018 11:48:07 +0200
Subject: [PATCH 02/12] working version of both keep_latest and keep_best

---
 scripts/rnnlm/rnnlm_cleanup.py | 59 ++++++++++++++++++++++++++--------
 1 file changed, 45 insertions(+), 14 deletions(-)

diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py
index 6cd75f318ec..3f942fb9458 100644
--- a/scripts/rnnlm/rnnlm_cleanup.py
+++ b/scripts/rnnlm/rnnlm_cleanup.py
@@ -3,6 +3,7 @@
 import argparse
 import sys
 import os
+import re
 
 script_name = sys.argv[0]
 
@@ -15,9 +16,9 @@
 
 parser.add_argument("rnnlm_dir",
                     help="Directory where the RNNLM has been trained")
-parser.add_argument("last_iteration",
-                    help="Number of the last iteration",
-                    type=int)
+# parser.add_argument("last_iteration",
+#                     help="Number of the last iteration",
+#                     type=int)
 parser.add_argument("--iters_to_keep",
                     help="Max number of iterations to keep",
                     type=int,
@@ -41,12 +42,6 @@
 elif not args.keep_latest and not args.keep_best:
     sys.exit(script_name + ": no cleanup strategy specified: use 'keep_latest' or 'keep_best'")
 
-# TODO now for some actual logic............
-# check exp dir for model files
-# list all files there, look for word_embedding.%d.mat and %d.raw files
-# if keep_best, check compute_prob logs for best eval scores or adapt/use get_best_model.py
-# if keep_latest, find the latest iteration that is not used or rely on last_iteration arg?
-
 
 def get_iteration_files(exp_dir):
     iterations = dict()
@@ -82,11 +77,47 @@ def keep_latest(iteration_dict):
             os.remove(iter_files[1])
 
 
-# TODO just testing
+def keep_best(iteration_dict, exp_dir):
+    iters_to_keep = args.iters_to_keep
+    best = []
+    for iter, iter_files in iteration_dict.items():
+        # this is roughly taken from get_best_model.py
+        logfile = "{0}/log/compute_prob.{1}.log".format(exp_dir, iter)
+        try:
+            f = open(logfile, "r", encoding="latin-1")
+        except:
+            sys.exit(script_name + ": could not open log-file " + logfile)
+        objf = -2000
+        for line in f:
+            m = re.search('Overall objf .* (\S+)$', str(line))
+            if m is not None:
+                try:
+                    objf = float(m.group(1))
+                except Exception as e:
+                    sys.exit(script_name + ": line in file {0} could not be parsed: {1}, error is: {2}".format(
+                        logfile, line, str(e)))
+        if objf == -2000:
+            print(script_name + ": warning: could not parse objective function from " + logfile, file=sys.stderr)
+            continue
+        # add potential best, sort by objf, trim to iters_to_keep size
+        best.append((iter, objf))
+        best = sorted(best, key=lambda x: -x[1])
+        if len(best) > iters_to_keep:
+            throwaway = best[iters_to_keep:]
+            best = best[:iters_to_keep]
+            # remove iters that we know are not the best
+            for (iter, _) in throwaway:
+                iter_files = iteration_dict[iter]
+                os.remove(iter_files[0])
+                os.remove(iter_files[1])
+
+
+# grab all the iterations mapped to their word_embedding and .raw files
 iterations = get_iteration_files(args.rnnlm_dir)
 print(iterations)
-keep_latest(iterations)
+# apply chosen cleanup strategy
+if args.keep_latest:
+    keep_latest(iterations)
+else:
+    keep_best(iterations, args.rnnlm_dir)
 print(get_iteration_files(args.rnnlm_dir))
-
-
-# TODO implement rest of the bookkeeping
\ No newline at end of file

From 085b400081e2cb010df9878475cec667153125e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Tue, 27 Nov 2018 16:13:28 +0200
Subject: [PATCH 03/12] cleanup now only considers those iterations for which rnnlm_compute_prob has finished

---
 scripts/rnnlm/rnnlm_cleanup.py | 113 ++++++++++++++++++++++-----------
 1 file changed, 75 insertions(+), 38 deletions(-)

diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py
index 3f942fb9458..61566f0e662 100644
--- a/scripts/rnnlm/rnnlm_cleanup.py
+++ b/scripts/rnnlm/rnnlm_cleanup.py
@@ -1,7 +1,8 @@
 #!/usr/bin/env python3
 
-import argparse
 import sys
+
+import argparse
 import os
 import re
 
@@ -16,9 +17,6 @@
 
 parser.add_argument("rnnlm_dir",
                     help="Directory where the RNNLM has been trained")
-# parser.add_argument("last_iteration",
-#                     help="Number of the last iteration",
-#                     type=int)
 parser.add_argument("--iters_to_keep",
                     help="Max number of iterations to keep",
                     type=int,
@@ -43,6 +41,48 @@
     sys.exit(script_name + ": no cleanup strategy specified: use 'keep_latest' or 'keep_best'")
 
 
+class IterationInfo:
+    def __init__(self, word_embedding_file, raw_file, objf, compute_prob_done):
+        self.word_embedding_file = word_embedding_file
+        self.raw_file = raw_file
+        self.objf = objf
+        self.compute_prob_done = compute_prob_done
+
+    def __str__(self):
+        return "{word_embedding: %s, raw: %s, compute_prob: %s, objf: %2.3f}" % (self.word_embedding_file,
+                                                                                 self.raw_file,
+                                                                                 self.compute_prob_done,
+                                                                                 self.objf)
+
+    def __repr__(self):
+        return self.__str__()
+
+
+def get_compute_prob_info(exp_dir, iter):
+    # roughly based on code in get_best_model.py
+    log_file = "{0}/log/compute_prob.{1}.log".format(exp_dir, iter)
+    try:
+        f = open(log_file, "r", encoding="latin-1")
+    except:
+        sys.exit(script_name + ": could not open log-file " + log_file)
+    # we now want 2 things: objf and whether compute prob is done
+    objf = -2000
+    compute_prob_done = False
+    for line in f:
+        objf_m = re.search('Overall objf .* (\S+)$', str(line))
+        if objf_m is not None:
+            try:
+                objf = float(objf_m.group(1))
+            except Exception as e:
+                sys.exit(script_name + ": line in file {0} could not be parsed: {1}, error is: {2}".format(
+                    log_file, line, str(e)))
+        if "# Ended" in line:
+            compute_prob_done = True
+    if objf == -2000:
+        print(script_name + ": warning: could not parse objective function from " + log_file, file=sys.stderr)
+    return objf, compute_prob_done
+
+
 def get_iteration_files(exp_dir):
     iterations = dict()
     for f in os.listdir(exp_dir):
@@ -51,53 +91,52 @@ def get_iteration_files(exp_dir):
             split = f.split(".")
             ext = split[-1]
             iter = int(split[-2])
+            objf, compute_prob_done = get_compute_prob_info(exp_dir, iter)
             if iter in iterations:
+                iter_info = iterations[iter]
                 if ext == "raw":
-                    iterations[iter] = (iterations[iter][0], joined_f)
+                    iter_info.raw_file = joined_f
                 else:
-                    iterations[iter] = (joined_f, iterations[iter][1])
+                    iter_info.word_embedding_file = joined_f
+                iter_info.objf = objf
+                iter_info.compute_prob_done = compute_prob_done
             else:
                 if ext == "raw":
-                    iterations[iter] = (None, joined_f)
+                    iterations[iter] = IterationInfo(None, joined_f, objf, compute_prob_done)
                 else:
-                    iterations[iter] = (joined_f, None)
+                    iterations[iter] = IterationInfo(joined_f, None, objf, compute_prob_done)
     return iterations
 
 
+def remove_model_files_for_iter(iter_info):
+    os.remove(iter_info.word_embedding_file)
+    os.remove(iter_info.raw_file)
+
+
 def keep_latest(iteration_dict):
     max_to_keep = args.iters_to_keep
     kept = 0
     iterations_in_reverse_order = reversed(sorted(iteration_dict))
     for iter in iterations_in_reverse_order:
-        if kept < max_to_keep:
-            kept += 1
-        else:
-            iter_files = iteration_dict[iter]
-            os.remove(iter_files[0])
-            os.remove(iter_files[1])
+        # check if compute prob is done for this iteration, if not, leave it for future cleanups...
+        if iteration_dict[iter].compute_prob_done:
+            if kept < max_to_keep:
+                kept += 1
+            else:
+                remove_model_files_for_iter(iteration_dict[iter])
 
 
-def keep_best(iteration_dict, exp_dir):
+def keep_best(iteration_dict):
     iters_to_keep = args.iters_to_keep
     best = []
-    for iter, iter_files in iteration_dict.items():
-        # this is roughly taken from get_best_model.py
-        logfile = "{0}/log/compute_prob.{1}.log".format(exp_dir, iter)
-        try:
-            f = open(logfile, "r", encoding="latin-1")
-        except:
-            sys.exit(script_name + ": could not open log-file " + logfile)
-        objf = -2000
-        for line in f:
-            m = re.search('Overall objf .* (\S+)$', str(line))
-            if m is not None:
-                try:
-                    objf = float(m.group(1))
-                except Exception as e:
-                    sys.exit(script_name + ": line in file {0} could not be parsed: {1}, error is: {2}".format(
-                        logfile, line, str(e)))
+    for iter, iter_info in iteration_dict.items():
+        objf = iter_info.objf
         if objf == -2000:
-            print(script_name + ": warning: could not parse objective function from " + logfile, file=sys.stderr)
+            print(script_name + ": warning: objf unavailable for iter " + str(iter), file=sys.stderr)
+            continue
+        if not iter_info.compute_prob_done:
+            # if compute_prob is not done, yet, we leave it for future cleanups
+            print(script_name + ": warning: compute_prob not done yet for iter " + str(iter), file=sys.stderr)
             continue
         # add potential best, sort by objf, trim to iters_to_keep size
         best.append((iter, objf))
         best = sorted(best, key=lambda x: -x[1])
@@ -107,17 +146,15 @@ def keep_best(iteration_dict):
             best = best[:iters_to_keep]
             # remove iters that we know are not the best
             for (iter, _) in throwaway:
-                iter_files = iteration_dict[iter]
-                os.remove(iter_files[0])
-                os.remove(iter_files[1])
+                remove_model_files_for_iter(iteration_dict[iter])
 
 
 # grab all the iterations mapped to their word_embedding and .raw files
 iterations = get_iteration_files(args.rnnlm_dir)
-print(iterations)
+# print(iterations) # FIXME remove
 # apply chosen cleanup strategy
 if args.keep_latest:
     keep_latest(iterations)
 else:
-    keep_best(iterations, args.rnnlm_dir)
-print(get_iteration_files(args.rnnlm_dir))
+    keep_best(iterations)
+# print(get_iteration_files(args.rnnlm_dir)) # FIXME remove

From 430b91e1199883e384c1369b7350573fd399a140 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Tue, 27 Nov 2018 17:21:50 +0200
Subject: [PATCH 04/12] rnnlm_cleanup.py: added copyright/license header, some comments and minor cleanup

---
 scripts/rnnlm/rnnlm_cleanup.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py
index 61566f0e662..330378a81b2 100644
--- a/scripts/rnnlm/rnnlm_cleanup.py
+++ b/scripts/rnnlm/rnnlm_cleanup.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
 
+# Copyright 2018 Tilde
+# License: Apache 2.0
+
 import sys
 
 import argparse
@@ -8,11 +11,13 @@
 import os
 import re
 
 script_name = sys.argv[0]
 
-# TODO decent description
 parser = argparse.ArgumentParser(description="Removes models from past training iterations of "
-                                             "RNNLM. Several strategies for picking which iterations "
-                                             "to keep are available.",
-                                 epilog="E.g. " + script_name + " exp/rnnlm_a",
+                                             "RNNLM. Can use either 'keep_latest' (default) or "
+                                             "'keep_best' cleanup strategy, where the former keeps "
+                                             "the models that are freshest, while the latter keeps "
+                                             "the models with the best training objective score on "
+                                             "the dev set.",
+                                 epilog="E.g. " + script_name + " exp/rnnlm_a --keep_best",
                                  formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -36,7 +41,7 @@
 
 # validate arguments
 if args.keep_latest and args.keep_best:
-    sys.exit(script_name + ": can only use either 'keep_latest' or 'keep_best', but not both")
+    sys.exit(script_name + ": can only use one of 'keep_latest' or 'keep_best', but not both")
 elif not args.keep_latest and not args.keep_best:
     sys.exit(script_name + ": no cleanup strategy specified: use 'keep_latest' or 'keep_best'")
 
@@ -149,12 +154,10 @@ def keep_best(iteration_dict):
                 remove_model_files_for_iter(iteration_dict[iter])
 
 
-# grab all the iterations mapped to their word_embedding and .raw files
+# grab all the iterations mapped to their model files, objf score and compute_prob status
 iterations = get_iteration_files(args.rnnlm_dir)
-# print(iterations) # FIXME remove
 # apply chosen cleanup strategy
 if args.keep_latest:
     keep_latest(iterations)
 else:
     keep_best(iterations)
-# print(get_iteration_files(args.rnnlm_dir)) # FIXME remove

From c50364eae9bc9ed220940e2aa684a686ecb4c851 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Tue, 27 Nov 2018 17:23:41 +0200
Subject: [PATCH 05/12] train_rnnlm.sh: initial cleanup script integration

---
 scripts/rnnlm/train_rnnlm.sh | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/scripts/rnnlm/train_rnnlm.sh b/scripts/rnnlm/train_rnnlm.sh
index aedfc470ac9..7ac0ca2fdab 100755
--- a/scripts/rnnlm/train_rnnlm.sh
+++ b/scripts/rnnlm/train_rnnlm.sh
@@ -38,6 +38,11 @@ num_egs_threads=10 # number of threads used for sampling, if we're using
 use_gpu=true   # use GPU for training
 use_gpu_for_diagnostics=false  # set true to use GPU for compute_prob_*.log
 
+# optional cleanup options
+cleanup=false # add option --cleanup true to enable automatic cleanup of old models
+cleanup_strategy="keep_latest" # determines cleanup strategy, use either "keep_latest" or "keep_best"
+cleanup_keep_iters=3 # number of iterations that will have their models retained
+
 trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM
 
 . utils/parse_options.sh
@@ -228,6 +233,10 @@ while [ $x -lt $num_iters ]; do
     # have printed a more specific one.
     [ -f $dir/.error ] && echo "$0: error with diagnostics on iteration $x of training" && exit 1;
   fi
+  # optionally, perform cleanup
+  if [ "$cleanup" = true ] ; then
+    python3 rnnlm_cleanup.py $dir --$cleanup_strategy --iters_to_keep $cleanup_keep_iters
+  fi
   x=$[x+1]
   num_splits_processed=$[num_splits_processed+this_num_jobs]
 done

From d124106ca5e32d2813d623753aa2c2de345073b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Wed, 28 Nov 2018 09:40:41 +0200
Subject: [PATCH 06/12] rnnlm_cleanup.py: get_compute_prob_info now skips files without compute_prob log instead of exiting on them

---
 scripts/rnnlm/rnnlm_cleanup.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py
index 330378a81b2..e3b2c2073de 100644
--- a/scripts/rnnlm/rnnlm_cleanup.py
+++ b/scripts/rnnlm/rnnlm_cleanup.py
@@ -64,15 +64,17 @@ def __repr__(self):
 
 
 def get_compute_prob_info(exp_dir, iter):
+    # we want to know 2 things: objf and whether compute prob is done
+    objf = -2000
+    compute_prob_done = False
     # roughly based on code in get_best_model.py
     log_file = "{0}/log/compute_prob.{1}.log".format(exp_dir, iter)
     try:
         f = open(log_file, "r", encoding="latin-1")
     except:
-        sys.exit(script_name + ": could not open log-file " + log_file)
-    # we now want 2 things: objf and whether compute prob is done
-    objf = -2000
-    compute_prob_done = False
+        print(script_name + ": warning: compute_prob log not found for iteration " + str(iter) + ". Skipping",
+              file=sys.stderr)
+        return objf, compute_prob_done
     for line in f:
         objf_m = re.search('Overall objf .* (\S+)$', str(line))
         if objf_m is not None:
@@ -89,6 +91,7 @@ def get_compute_prob_info(exp_dir, iter):
 
 
 def get_iteration_files(exp_dir):
+    # TODO handle the case where there are several files per iteration...
     iterations = dict()
     for f in os.listdir(exp_dir):
         joined_f = os.path.join(exp_dir, f)

From 0536107720fac658aa628831e72e36c6a4a65324 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Wed, 28 Nov 2018 10:48:35 +0200
Subject: [PATCH 07/12] rnnlm_cleanup.py: iteration model files are now listed based on compute_prob log files

---
 scripts/rnnlm/rnnlm_cleanup.py | 81 +++++++++++++++------------------
 1 file changed, 35 insertions(+), 46 deletions(-)

diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py
index e3b2c2073de..39f3f4d5d42 100644
--- a/scripts/rnnlm/rnnlm_cleanup.py
+++ b/scripts/rnnlm/rnnlm_cleanup.py
@@ -8,6 +8,7 @@
 import argparse
 import os
 import re
+import glob
 
 script_name = sys.argv[0]
 
@@ -47,34 +48,33 @@
 
 
 class IterationInfo:
-    def __init__(self, word_embedding_file, raw_file, objf, compute_prob_done):
-        self.word_embedding_file = word_embedding_file
-        self.raw_file = raw_file
+    def __init__(self, model_files, objf, compute_prob_done):
+        self.model_files = model_files
         self.objf = objf
         self.compute_prob_done = compute_prob_done
 
     def __str__(self):
-        return "{word_embedding: %s, raw: %s, compute_prob: %s, objf: %2.3f}" % (self.word_embedding_file,
-                                                                                 self.raw_file,
-                                                                                 self.compute_prob_done,
-                                                                                 self.objf)
+        return "{model_files: %s, compute_prob: %s, objf: %2.3f}" % (self.model_files,
+                                                                     self.compute_prob_done,
+                                                                     self.objf)
 
     def __repr__(self):
         return self.__str__()
 
 
-def get_compute_prob_info(exp_dir, iter):
-    # we want to know 2 things: objf and whether compute prob is done
+def get_compute_prob_info(log_file):
+    # we want to know 3 things: iteration number, objf and whether compute prob is done
+    iteration = int(log_file.split(".")[-2])
     objf = -2000
     compute_prob_done = False
     # roughly based on code in get_best_model.py
-    log_file = "{0}/log/compute_prob.{1}.log".format(exp_dir, iter)
     try:
         f = open(log_file, "r", encoding="latin-1")
     except:
-        print(script_name + ": warning: compute_prob log not found for iteration " + str(iter) + ". Skipping",
+        print(script_name + ": warning: compute_prob log not found for iteration "
+              + str(iteration) + ". Skipping",
               file=sys.stderr)
-        return objf, compute_prob_done
+        return iteration, objf, compute_prob_done
     for line in f:
         objf_m = re.search('Overall objf .* (\S+)$', str(line))
         if objf_m is not None:
@@ -87,38 +87,33 @@ def get_compute_prob_info(exp_dir, iter):
             compute_prob_done = True
     if objf == -2000:
         print(script_name + ": warning: could not parse objective function from " + log_file, file=sys.stderr)
-    return objf, compute_prob_done
+    return iteration, objf, compute_prob_done
 
 
 def get_iteration_files(exp_dir):
-    # TODO handle the case where there are several files per iteration...
     iterations = dict()
-    for f in os.listdir(exp_dir):
-        joined_f = os.path.join(exp_dir, f)
-        if os.path.isfile(joined_f) and (f.startswith("word_embedding") or f.endswith(".raw")):
-            split = f.split(".")
-            ext = split[-1]
-            iter = int(split[-2])
-            objf, compute_prob_done = get_compute_prob_info(exp_dir, iter)
-            if iter in iterations:
-                iter_info = iterations[iter]
-                if ext == "raw":
-                    iter_info.raw_file = joined_f
-                else:
-                    iter_info.word_embedding_file = joined_f
-                iter_info.objf = objf
-                iter_info.compute_prob_done = compute_prob_done
-            else:
-                if ext == "raw":
-                    iterations[iter] = IterationInfo(None, joined_f, objf, compute_prob_done)
-                else:
-                    iterations[iter] = IterationInfo(joined_f, None, objf, compute_prob_done)
+    compute_prob_logs = glob.glob(exp_dir + "/log/compute_prob.[0-9]*.log")
+    for log in compute_prob_logs:
+        iteration, objf, compute_prob_done = get_compute_prob_info(log)
+        if compute_prob_done:
+            # this iteration can be safely considered for cleanup
+            # gather all model files belonging to it
+            model_files = []
+            # when there are multiple jobs per iteration, there can be several model files
+            # we need to potentially clean them all up without mixing them up
+            model_files.extend(glob.glob("{0}/word_embedding.{1}.mat".format(exp_dir, iteration)))
+            model_files.extend(glob.glob("{0}/word_embedding.{1}.[0-9]*.mat".format(exp_dir, iteration)))
+            model_files.extend(glob.glob("{0}/{1}.raw".format(exp_dir, iteration)))
+            model_files.extend(glob.glob("{0}/{1}.[0-9]*.raw".format(exp_dir, iteration)))
+            # compute_prob logs outlive model files, only consider iterations that do still have model files
+            if len(model_files) > 0:
+                iterations[iteration] = IterationInfo(model_files, objf, compute_prob_done)
     return iterations
 
 
 def remove_model_files_for_iter(iter_info):
-    os.remove(iter_info.word_embedding_file)
-    os.remove(iter_info.raw_file)
+    for f in iter_info.model_files:
+        os.remove(f)
 
 
 def keep_latest(iteration_dict):
@@ -126,12 +121,10 @@ def keep_latest(iteration_dict):
     kept = 0
     iterations_in_reverse_order = reversed(sorted(iteration_dict))
     for iter in iterations_in_reverse_order:
-        # check if compute prob is done for this iteration, if not, leave it for future cleanups...
-        if iteration_dict[iter].compute_prob_done:
-            if kept < max_to_keep:
-                kept += 1
-            else:
-                remove_model_files_for_iter(iteration_dict[iter])
+        if kept < max_to_keep:
+            kept += 1
+        else:
+            remove_model_files_for_iter(iteration_dict[iter])
 
 
 def keep_best(iteration_dict):
@@ -142,10 +135,6 @@ def keep_best(iteration_dict):
         if objf == -2000:
             print(script_name + ": warning: objf unavailable for iter " + str(iter), file=sys.stderr)
             continue
-        if not iter_info.compute_prob_done:
-            # if compute_prob is not done, yet, we leave it for future cleanups
-            print(script_name + ": warning: compute_prob not done yet for iter " + str(iter), file=sys.stderr)
-            continue
         # add potential best, sort by objf, trim to iters_to_keep size
         best.append((iter, objf))
         best = sorted(best, key=lambda x: -x[1])

From 1d692bd97f5b5a96498ce24124027967e5bbc5ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Wed, 28 Nov 2018 10:49:09 +0200
Subject: [PATCH 08/12] train_rnnlm.sh: fixed cleanup script invocation

---
 scripts/rnnlm/train_rnnlm.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/rnnlm/train_rnnlm.sh b/scripts/rnnlm/train_rnnlm.sh
index 7ac0ca2fdab..d6d38f3d734 100755
--- a/scripts/rnnlm/train_rnnlm.sh
+++ b/scripts/rnnlm/train_rnnlm.sh
@@ -227,16 +227,16 @@ while [ $x -lt $num_iters ]; do
           nnet3-average $src_models $dir/$[x+1].raw '&&' \
           matrix-sum --average=true $src_matrices $dir/${embedding_type}_embedding.$[x+1].mat
       fi
+      # optionally, perform cleanup after training
+      if [ "$cleanup" = true ] ; then
+        python3 rnnlm/rnnlm_cleanup.py $dir --$cleanup_strategy --iters_to_keep $cleanup_keep_iters
+      fi
     )
-
     # the error message below is not that informative, but $cmd will
     # have printed a more specific one.
     [ -f $dir/.error ] && echo "$0: error with diagnostics on iteration $x of training" && exit 1;
   fi
-  # optionally, perform cleanup
-  if [ "$cleanup" = true ] ; then
-    python3 rnnlm_cleanup.py $dir --$cleanup_strategy --iters_to_keep $cleanup_keep_iters
-  fi
+
   x=$[x+1]
   num_splits_processed=$[num_splits_processed+this_num_jobs]
 done

From 25291d537e99c23498755087bbe0a71c1b7b37d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Wed, 28 Nov 2018 11:53:01 +0200
Subject: [PATCH 09/12] get_best_model.py: now only considers iterations that still have model files present

---
 scripts/rnnlm/get_best_model.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/scripts/rnnlm/get_best_model.py b/scripts/rnnlm/get_best_model.py
index e8c6bd8a2f4..be0e63f9417 100755
--- a/scripts/rnnlm/get_best_model.py
+++ b/scripts/rnnlm/get_best_model.py
@@ -3,14 +3,14 @@
 # Copyright 2017 Johns Hopkins University (author: Daniel Povey)
 # License: Apache 2.0.
 
-import os
 import argparse
-import sys
+import glob
 import re
+import sys
 
 parser = argparse.ArgumentParser(description="Works out the best iteration of RNNLM training "
-                                 "based on dev-set perplexity, and prints the number corresponding "
-                                 "to that iteration",
+                                             "based on dev-set perplexity, and prints the number corresponding "
+                                             "to that iteration",
                                  epilog="E.g. " + sys.argv[0] + " exp/rnnlm_a",
                                  formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 
@@ -19,8 +19,7 @@
 
 args = parser.parse_args()
 
-
-num_iters=None
+num_iters = None
 try:
     with open(args.rnnlm_dir + "/info.txt", encoding="latin-1") as f:
         for line in f:
@@ -36,15 +35,15 @@
     sys.exit(sys.argv[0] + ": could not get num_iters from {0}/info.txt".format(
         args.rnnlm_dir))
 
-best_objf=-2000
-best_iter=-1
+best_objf = -2000
+best_iter = -1
 for i in range(1, num_iters):
     this_logfile = "{0}/log/compute_prob.{1}.log".format(args.rnnlm_dir, i)
     try:
         f = open(this_logfile, 'r', encoding='latin-1')
     except:
        sys.exit(sys.argv[0] + ": could not open log-file {0}".format(this_logfile))
-    this_objf=-1000
+    this_objf = -1000
    for line in f:
         m = re.search('Overall objf .* (\S+)$', str(line))
         if m is not None:
@@ -53,6 +52,11 @@
             try:
                 this_objf = float(m.group(1))
             except Exception as e:
                 sys.exit(sys.argv[0] + ": line in file {0} could not be parsed: {1}, error is: {2}".format(
                     this_logfile, line, str(e)))
+    # verify this iteration still has model files present
+    if len(glob.glob("{0}/word_embedding.{1}.mat".format(args.rnnlm_dir, i))) == 0:
+        print(sys.argv[0] + ": warning: no model files found for iteration {0}. Skipping.".format(i),
+              file=sys.stderr)
+        continue
     if this_objf == -1000:
         print(sys.argv[0] + ": warning: could not parse objective function from {0}".format(
             this_logfile), file=sys.stderr)
@@ -63,5 +67,4 @@
 
 if best_iter == -1:
     sys.exit(sys.argv[0] + ": error: could not get best iteration.")
-
 print(str(best_iter))

From fdf1a125fda40ea1ec35c561265a2722edea5d4b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Thu, 29 Nov 2018 16:30:50 +0200
Subject: [PATCH 10/12] rnnlm_cleanup.py: never touch files belonging to iteration 0

---
 scripts/rnnlm/rnnlm_cleanup.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py
index 39f3f4d5d42..892e12689ad 100644
--- a/scripts/rnnlm/rnnlm_cleanup.py
+++ b/scripts/rnnlm/rnnlm_cleanup.py
@@ -95,6 +95,9 @@ def get_iteration_files(exp_dir):
     compute_prob_logs = glob.glob(exp_dir + "/log/compute_prob.[0-9]*.log")
     for log in compute_prob_logs:
         iteration, objf, compute_prob_done = get_compute_prob_info(log)
+        if iteration == 0:
+            # iteration 0 is special, never consider it for cleanup
+            continue
         if compute_prob_done:
             # this iteration can be safely considered for cleanup
             # gather all model files belonging to it

From 36a74815f97043be44de1e4cb2be9afa288e3fa6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Fri, 30 Nov 2018 08:57:06 +0200
Subject: [PATCH 11/12] get_best_model.py: model-less iterations will no longer trigger confusing warnings as, given cleanup, it's normal for model files to be absent for most iterations

---
 scripts/rnnlm/get_best_model.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/scripts/rnnlm/get_best_model.py b/scripts/rnnlm/get_best_model.py
index be0e63f9417..ba3c6ccb744 100755
--- a/scripts/rnnlm/get_best_model.py
+++ b/scripts/rnnlm/get_best_model.py
@@ -54,8 +54,7 @@
                     this_logfile, line, str(e)))
     # verify this iteration still has model files present
     if len(glob.glob("{0}/word_embedding.{1}.mat".format(args.rnnlm_dir, i))) == 0:
-        print(sys.argv[0] + ": warning: no model files found for iteration {0}. Skipping.".format(i),
-              file=sys.stderr)
+        # this iteration has log files, but model files have been cleaned up, skip it
         continue
     if this_objf == -1000:
         print(sys.argv[0] + ": warning: could not parse objective function from {0}".format(

From 23ea8ad3fd1d4c5f05a4fb7176cc873a7a750530 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andris=20V=C4=81ravs?=
Date: Wed, 5 Dec 2018 12:01:24 +0200
Subject: [PATCH 12/12] fixed "feat_embedding" files not being considered by cleanup and get_best_model.py

---
 scripts/rnnlm/get_best_model.py | 2 +-
 scripts/rnnlm/rnnlm_cleanup.py  | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/rnnlm/get_best_model.py b/scripts/rnnlm/get_best_model.py
index ba3c6ccb744..333ed8dbfc7 100755
--- a/scripts/rnnlm/get_best_model.py
+++ b/scripts/rnnlm/get_best_model.py
@@ -53,7 +53,7 @@
                 sys.exit(sys.argv[0] + ": line in file {0} could not be parsed: {1}, error is: {2}".format(
                     this_logfile, line, str(e)))
     # verify this iteration still has model files present
-    if len(glob.glob("{0}/word_embedding.{1}.mat".format(args.rnnlm_dir, i))) == 0:
+    if len(glob.glob("{0}/{1}.raw".format(args.rnnlm_dir, i))) == 0:
         # this iteration has log files, but model files have been cleaned up, skip it
         continue
diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py
index 892e12689ad..40cbee7a496 100644
--- a/scripts/rnnlm/rnnlm_cleanup.py
+++ b/scripts/rnnlm/rnnlm_cleanup.py
@@ -106,6 +106,8 @@ def get_iteration_files(exp_dir):
             # we need to potentially clean them all up without mixing them up
             model_files.extend(glob.glob("{0}/word_embedding.{1}.mat".format(exp_dir, iteration)))
             model_files.extend(glob.glob("{0}/word_embedding.{1}.[0-9]*.mat".format(exp_dir, iteration)))
+            model_files.extend(glob.glob("{0}/feat_embedding.{1}.mat".format(exp_dir, iteration)))
+            model_files.extend(glob.glob("{0}/feat_embedding.{1}.[0-9]*.mat".format(exp_dir, iteration)))
             model_files.extend(glob.glob("{0}/{1}.raw".format(exp_dir, iteration)))
             model_files.extend(glob.glob("{0}/{1}.[0-9]*.raw".format(exp_dir, iteration)))
             # compute_prob logs outlive model files, only consider iterations that do still have model files
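
---

Usage sketch (illustrative, not part of the patches; "exp/rnnlm_a" is a placeholder experiment directory, and the dashed option spellings assume the usual utils/parse_options.sh mapping of dashes to underscores):

  # standalone: keep only the 3 iterations with the best compute_prob dev-set objf
  python3 rnnlm/rnnlm_cleanup.py exp/rnnlm_a --keep_best --iters_to_keep 3

  # during training: the options added by PATCH 05 and fixed by PATCH 08
  rnnlm/train_rnnlm.sh --cleanup true --cleanup-strategy keep_best ... exp/rnnlm_a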