Skip to content

Commit 6b5e5e0

Browse files
committed
use loky to support nested parallel loading
1 parent 005eece commit 6b5e5e0

File tree

3 files changed

+56
-15
lines changed

3 files changed

+56
-15
lines changed

corl/wc_data/input_fn.py

+33-12
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from base import connect, getSeries, getBatch, ftQueryTpl, k_cols
44
from time import strftime
55
from joblib import Parallel, delayed
6+
from loky import get_reusable_executor
67
import tensorflow as tf
78
import sys
89
import multiprocessing
@@ -13,6 +14,7 @@
1314
feat_cols = []
1415
time_shift = None
1516
max_step = None
17+
_prefetch = None
1618

1719

1820
maxbno_query = (
@@ -27,6 +29,22 @@
2729
" flag LIKE %s) t "
2830
)
2931

32+
_executor = None
33+
34+
35+
def _getExecutor():
    """Return the module-level reusable loky executor, creating it lazily.

    Sized to ``parallel * _prefetch`` workers so each prefetched batch can
    itself load series in parallel; idle workers are reclaimed after 20s.
    Both ``parallel`` and ``_prefetch`` are module globals assigned by
    ``getInputs()`` before the first call — TODO confirm no call path
    reaches here before ``getInputs()`` runs.
    """
    # Only _executor is assigned here; reading the other module globals
    # needs no `global` declaration.
    global _executor
    if _executor is None:
        _executor = get_reusable_executor(
            max_workers=parallel * _prefetch, timeout=20)
    return _executor
42+
43+
44+
def _getSeries(p):
    """Adapter for executor.map: unpack a 9-element argument tuple and
    delegate to getSeries. Defined at module top level so it is picklable
    by the loky worker processes."""
    (uuid, code, klid, rcode, val,
     max_step, time_shift, ftQueryK, ftQueryD) = p
    return getSeries(uuid, code, klid, rcode, val,
                     max_step, time_shift, ftQueryK, ftQueryD)
47+
3048

3149
def _loadTestSet(max_step, ntest):
3250
global parallel, time_shift
@@ -59,18 +77,18 @@ def _loadTestSet(max_step, ntest):
5977
# data = [batch, max_step, feature*time_shift]
6078
# vals = [batch]
6179
# seqlen = [batch]
62-
return np.array(uuids, 'U'), np.array(data,'f'), np.array(vals,'f'), np.array(seqlen, 'i')
80+
return np.array(uuids, 'U'), np.array(data, 'f'), np.array(vals, 'f'), np.array(seqlen, 'i')
6381
except:
6482
print(sys.exc_info()[0])
6583
raise
6684
finally:
6785
cnx.close()
6886

6987

70-
def _loadTrainingData(batch_no):
88+
def _loadTrainingData(flag):
7189
global max_step, parallel, time_shift
7290
print("{} loading training set {}...".format(
73-
strftime("%H:%M:%S"), batch_no))
91+
strftime("%H:%M:%S"), flag))
7492
cnx = connect()
7593
try:
7694
cursor = cnx.cursor(buffered=True)
@@ -82,22 +100,23 @@ def _loadTrainingData(batch_no):
82100
'WHERE '
83101
" flag = %s"
84102
)
85-
flag = 'TRAIN_{}'.format(batch_no)
86103
cursor.execute(query, (flag,))
87104
train_set = cursor.fetchall()
88105
total = cursor.rowcount
89106
cursor.close()
90107
uuids, data, vals, seqlen = [], [], [], []
91108
if total > 0:
92109
qk, qd = _getFtQuery()
93-
r = Parallel(n_jobs=parallel)(delayed(getSeries)(
94-
uuid, code, klid, rcode, val, max_step, time_shift, qk, qd
95-
) for uuid, code, klid, rcode, val in train_set)
110+
#joblib doesn't support nested threading
111+
exc = _getExecutor()
112+
params = [(uuid, code, klid, rcode, val, max_step, time_shift, qk, qd)
113+
for uuid, code, klid, rcode, val in train_set]
114+
r = list(exc.map(_getSeries, params))
96115
uuids, data, vals, seqlen = zip(*r)
97116
# data = [batch, max_step, feature*time_shift]
98117
# vals = [batch]
99118
# seqlen = [batch]
100-
return np.array(uuids,'U'), np.array(data,'f'), np.array(vals,'f'), np.array(seqlen, 'i')
119+
return np.array(uuids, 'U'), np.array(data, 'f'), np.array(vals, 'f'), np.array(seqlen, 'i')
101120
except:
102121
print(sys.exc_info()[0])
103122
raise
@@ -176,19 +195,20 @@ def _getDataSetMeta(flag, start=0):
176195
return max_bno, batch_size
177196

178197

179-
def getInputs(start=0, shift=0, cols=None, step=30, cores=multiprocessing.cpu_count()):
198+
def getInputs(start=0, shift=0, cols=None, step=30, cores=multiprocessing.cpu_count(), prefetch=2):
180199
"""Input function for the wcc training dataset.
181200
182201
Returns:
183202
A dictionary containing:
184203
uuids,features,labels,seqlens,train_iter,test_iter
185204
"""
186205
# Create dataset for training
187-
global feat_cols, max_step, time_shift, parallel
206+
global feat_cols, max_step, time_shift, parallel, _prefetch
188207
time_shift = shift
189208
feat_cols = cols
190209
max_step = step
191210
parallel = cores
211+
_prefetch = prefetch
192212
feat_size = len(cols)*2*(shift+1)
193213
print("{} Using parallel level:{}".format(strftime("%H:%M:%S"), parallel))
194214
with tf.variable_scope("build_inputs"):
@@ -202,7 +222,7 @@ def getInputs(start=0, shift=0, cols=None, step=30, cores=multiprocessing.cpu_co
202222
tf.py_func(_loadTrainingData, [f], [
203223
tf.string, tf.float32, tf.float32, tf.int32])
204224
)
205-
).batch(1).prefetch(2)
225+
).batch(1).prefetch(prefetch)
206226
# Create dataset for testing
207227
max_bno, batch_size = _getDataSetMeta("TEST", 1)
208228
test_dataset = tf.data.Dataset.from_tensor_slices(
@@ -215,7 +235,8 @@ def getInputs(start=0, shift=0, cols=None, step=30, cores=multiprocessing.cpu_co
215235
types = (tf.string, tf.float32, tf.float32, tf.int32)
216236
shapes = (tf.TensorShape([None]), tf.TensorShape(
217237
[None, step, feat_size]), tf.TensorShape([None]), tf.TensorShape([None]))
218-
iter = tf.data.Iterator.from_string_handle(handle, types, train_dataset.output_shapes)
238+
iter = tf.data.Iterator.from_string_handle(
239+
handle, types, train_dataset.output_shapes)
219240

220241
next_el = iter.get_next()
221242
uuids = tf.squeeze(next_el[0])

corl/wc_test/test4.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,10 @@
3939
]
4040

4141
parser = argparse.ArgumentParser()
42-
parser.add_argument('parallel', type=int, nargs='?', help='database operation parallel level',
42+
parser.add_argument('--parallel', type=int, help='database operation parallel level',
4343
default=multiprocessing.cpu_count())
44+
parser.add_argument('--prefetch', type=int, help='dataset prefetch batches',
45+
default=2)
4446
parser.add_argument(
4547
'--restart', help='restart training', action='store_true')
4648
args = parser.parse_args()
@@ -77,7 +79,7 @@ def run():
7779
bno = int(os.path.basename(
7880
ckpt.model_checkpoint_path).split('-')[1])
7981
d = input_fn.getInputs(
80-
bno+1, TIME_SHIFT, k_cols, MAX_STEP, args.parallel)
82+
bno+1, TIME_SHIFT, k_cols, MAX_STEP, args.parallel, args.prefetch)
8183
model.setNodes(d['uuids'], d['features'],
8284
d['labels'], d['seqlens'])
8385
saver = tf.train.Saver()
@@ -91,7 +93,7 @@ def run():
9193

9294
if not restored:
9395
d = input_fn.getInputs(
94-
bno+1, TIME_SHIFT, k_cols, MAX_STEP, args.parallel)
96+
bno+1, TIME_SHIFT, k_cols, MAX_STEP, args.parallel, args.prefetch)
9597
model.setNodes(d['uuids'], d['features'],
9698
d['labels'], d['seqlens'])
9799
saver = tf.train.Saver()

executor/test.py

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from loky import get_reusable_executor
2+
3+
# Shared reusable process pool for the smoke test below.
_executor = get_reusable_executor(max_workers=2, timeout=20)
5+
6+
7+
def fn(p):
    """Worker for the executor smoke test.

    Unpacks the 3-tuple *p* and returns a derived 3-tuple
    ``(c+1, [[a+1, a+2], [a+3, a+4]], b+1)`` to exercise pickling of
    nested return values across processes.
    """
    a, b, c = p
    print("received:{} {} {}".format(a, b, c))
    nested = [[a + 1, a + 2], [a + 3, a + 4]]
    return c + 1, nested, b + 1
11+
12+
13+
# Fan the parameter tuples out to the pool, then regroup the three
# positions of each returned tuple column-wise for display.
params = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]
results = list(_executor.map(fn, params))
ra, rb, rc = zip(*results)
print("a:{}".format(ra))
print("b:{}".format(rb))
print("c:{}".format(rc))

0 commit comments

Comments
 (0)