
Commit 43b902d

adds separate files to import datasets and prepare package for gcloud job:
1 parent 38b0a06 commit 43b902d

19 files changed, +344 -136 lines

Pipfile
+2 -1

@@ -4,7 +4,6 @@ verify_ssl = true
 name = "pypi"
 
 [packages]
-tensorflow = "*"
 matplotlib = "*"
 pillow = "*"
 docker = "*"
@@ -14,6 +13,8 @@ tqdm = "*"
 comet-ml = "*"
 pandas = "*"
 spacy = "*"
+tensorflow = {version="*", sys_platform = "== 'darwin'"}
+tensorflow-gpu = {version=">=1.11.0", sys_platform = "== 'linux'"}
 
 [dev-packages]
 tox-pipenv = "*"
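
These two entries split TensorFlow by platform using PEP 508 environment markers: the CPU build installs on macOS for local development, the GPU build on the Linux cloud workers. A minimal sketch of how an installer evaluates such markers (the print strings are illustrative only):

    import sys

    # Roughly how pip decides between the two pinned packages:
    #   tensorflow ; sys_platform == 'darwin'
    #   tensorflow-gpu>=1.11.0 ; sys_platform == 'linux'
    if sys.platform == "darwin":
        print("install tensorflow (CPU build)")
    elif sys.platform == "linux":
        print("install tensorflow-gpu>=1.11.0 (GPU build)")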

Pipfile.lock

+209 -70
Some generated files are not rendered by default.

gcp/_cmd.py
+2 -2

@@ -1,8 +1,8 @@
 from os import system
 
 
-system("""gcloud ml-engine jobs submit training testing_job_script_3 \
-    --job-dir=gs://tsaplay-bucket/testing_job_script \
+system("""gcloud ml-engine jobs submit training testing_large_embeddings_2 \
+    --job-dir=gs://tsaplay-bucket/testing_large_embeddings_2 \
     --module-name=tsaplay.task \
     --staging-bucket=gs://tsaplay-bucket/ \
     --packages=/Users/seanbugeja/Code/Msc/dist/tsaplay-0.1.dev0.tar.gz \

gcp/_config.json
+10 -5

@@ -5,16 +5,21 @@
         "owner": "sean"
     },
     "trainingInput": {
-        "scaleTier": "BASIC",
+        "scaleTier": "CUSTOM",
+        "masterType": "standard_gpu",
+        "workerType": "standard_gpu",
+        "parameterServerType": "standard_gpu",
+        "workerCount": 4,
+        "parameterServerCount": 3,
         "pythonVersion": "3.5",
         "runtimeVersion": "1.10",
         "region": "europe-west1",
         "args": [
-            "--embedding=wiki-50",
-            "--datasets=debug",
+            "--embedding=commoncrawl-840",
+            "--datasets=dong",
             "--model=lcrrot",
-            "--batch-size=5",
-            "--steps=10",
+            "--batch-size=25",
+            "--steps=1000",
             "--verbosity=INFO"
         ]
     }
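
Moving from BASIC to CUSTOM requests an explicit cluster: one standard_gpu master, four standard_gpu workers and three standard_gpu parameter servers. Cloud ML Engine describes that topology to each TensorFlow replica through the TF_CONFIG environment variable; a sketch of what worker 0 would see (hostnames and ports are placeholders, not real cluster addresses):

    import json

    # Illustrative TF_CONFIG for worker 0 under the config above.
    tf_config = {
        "cluster": {
            "master": ["master-0:2222"],
            "worker": ["worker-%d:2222" % i for i in range(4)],
            "ps": ["ps-%d:2222" % i for i in range(3)],
        },
        "task": {"type": "worker", "index": 0},
    }
    print(json.dumps(tf_config, indent=2))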

logs/.gitignore
+4

@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore

requirements.txt
+17 -12

@@ -1,21 +1,23 @@
 -i https://pypi.org/simple
 absl-py==0.5.0
 astor==0.7.1
-boto3==1.9.22
+backports.weakref==1.0.post1
+boto3==1.9.24
 boto==2.49.0
-botocore==1.12.22
+botocore==1.12.24
 bz2file==0.98
-certifi==2018.8.24
+certifi==2018.10.15
 chardet==3.0.4
 comet-git-pure==0.19.6
 comet-ml==1.0.29
 cycler==0.10.0
-cymem==1.31.2
+cymem==2.0.2
 cytoolz==0.9.0.1
 dill==0.2.8.2
 docker-pycreds==0.3.0
 docker==3.5.0
 docutils==0.14
+enum34==1.1.6
 gast==0.2.0
 gensim==3.6.0
 grpcio==1.15.0
@@ -27,34 +29,37 @@ keras-preprocessing==1.0.5
 kiwisolver==1.0.1
 markdown==3.0.1
 matplotlib==3.0.0
-msgpack-numpy==0.4.4.1
+mock==2.0.0
+msgpack-numpy==0.4.3.2
 msgpack==0.5.6
-murmurhash==0.28.0
+murmurhash==1.0.1
 netifaces==0.10.7
 numpy==1.15.2
 nvidia-ml-py3==7.352.0
 pandas==0.23.4
+pbr==4.3.0
 pillow==5.3.0
 plac==0.9.6
-preshed==1.0.1
+preshed==2.0.1
 protobuf==3.6.1
 pyparsing==2.2.2
 python-dateutil==2.7.3 ; python_version >= '2.7'
 pytz==2018.5
-regex==2017.4.5
+regex==2018.1.10
 requests==2.19.1
 s3transfer==0.1.13
 scipy==1.1.0
 six==1.11.0
 smart-open==1.7.1
-spacy==2.0.12
+spacy==2.0.16
 tensorboard==1.11.0
+tensorflow-gpu==1.11.0 ; sys_platform == 'linux'
 tensorflow-serving-api==1.11.0
-tensorflow==1.11.0
+tensorflow==1.11.0 ; sys_platform == 'darwin'
 termcolor==1.1.0
-thinc==6.10.3
+thinc==6.12.0
 toolz==0.9.0
-tqdm==4.26.0
+tqdm==4.27.0
 ujson==1.35
 urllib3==1.23
 websocket-client==0.53.0

submit_job.py
+21 -8

@@ -89,6 +89,10 @@ def get_arguments():
         default="lcrrot",
     )
 
+    parser.add_argument(
+        "--job-id", "-jid", type=str, help="ID of the job being submitted"
+    )
+
     parser.add_argument(
         "--job-dir",
         help="GCS location to write checkpoints to and export models",
@@ -195,7 +199,7 @@ def write_gcloud_config(args):
     args_dict = vars(args)
     args_list = []
     for (key, value) in args_dict.items():
-        if not value:
+        if not value or key == "job_id":
             continue
         if isinstance(value, list):
             value = " ".join(map(str, value))
@@ -206,7 +210,12 @@ def write_gcloud_config(args):
         "jobId": "my_job",
         "labels": {"type": "dev", "owner": "sean"},
         "trainingInput": {
-            "scaleTier": "BASIC",
+            "scaleTier": "CUSTOM",
+            "masterType": "standard_gpu",
+            "workerType": "standard_gpu",
+            "parameterServerType": "standard_gpu",
+            "workerCount": 4,
+            "parameterServerCount": 3,
             "pythonVersion": "3.5",
             "runtimeVersion": "1.10",
             "region": "europe-west1",
@@ -218,16 +227,16 @@ def write_gcloud_config(args):
         dump(gcloud_config, config_file, indent=4)
 
 
-def write_gcloud_cmd_script():
+def write_gcloud_cmd_script(args):
     gcloud_cmd = """gcloud ml-engine jobs submit training {job_name} \\
     --job-dir={job_dir} \\
     --module-name={module_name} \\
     --staging-bucket={staging_bucket} \\
     --packages={package_name} \\
     --config={config_path} \\
     --stream-logs""".format(
-        job_name="testing_job_script_3",
-        job_dir="gs://tsaplay-bucket/testing_job_script",
+        job_name=args.job_id,
+        job_dir="gs://tsaplay-bucket/{}".format(args.job_id),
         module_name="tsaplay.task",
         staging_bucket="gs://tsaplay-bucket/",
         package_name=abspath(
@@ -257,7 +266,11 @@ def write_gcloud_cmd_script():
     cprnt(bow="Copied to clipboard!")
 
 
-if __name__ == "__main__":
-    prepare_assets(get_arguments())
+def main(args):
+    prepare_assets(args)
     sandbox.run_setup("setup.py", ["sdist"])
-    write_gcloud_cmd_script()
+    write_gcloud_cmd_script(args)
+
+
+if __name__ == "__main__":
+    main(get_arguments())
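
With the new flag, a single argument now drives both the ML Engine job name and its GCS job directory, e.g. a hypothetical run: python submit_job.py --job-id testing_large_embeddings_2 (remaining training flags as before). The key == "job_id" guard keeps the flag out of the serialized training args, since it names the job rather than configuring the experiment.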

tsaplay/datasets.py
+3 -3

@@ -108,12 +108,12 @@ def get_stats_dict(cls, classes=None, **data_dicts):
     @classmethod
     @timeit("Generating corpus for dataset", "Corpus generated")
     def generate_corpus(cls, docs, path):
-        corpus_file = join(path, "_corpus.csv")
+        corpus_file = join(path, "_corpus.pkl")
         if exists(corpus_file):
-            corpus = corpus_from_csv(path=corpus_file)
+            corpus = _unpickle(corpus_file)
         else:
             corpus = cls.corpus_from_docs(docs)
-            corpus_to_csv(corpus_file, corpus)
+            _pickle(data=corpus, path=corpus_file)
         return corpus
 
     @classmethod
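
The corpus cache moves from CSV to pickle. The _pickle and _unpickle helpers are not shown in this diff; judging from the call sites they are presumably thin wrappers over the standard library, along these lines (a sketch, not necessarily the repository's exact helpers):

    import pickle

    def _pickle(data, path):
        # Serialize the corpus to disk.
        with open(path, "wb") as pkl_file:
            pickle.dump(data, pkl_file)

    def _unpickle(path):
        # Restore a previously pickled corpus.
        with open(path, "rb") as pkl_file:
            return pickle.load(pkl_file)

Pickle round-trips native Python objects (e.g. a dict of term counts) directly, avoiding the string parsing the CSV path required.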

tsaplay/embeddings.py
+2 -1

@@ -80,7 +80,8 @@ def initializer(self):
         def _init(shape=shape, dtype=tf.float32, partition_info=None):
             return self.vectors
 
-        return _init
+        # return _init
+        return lambda: self.vectors
 
     @property
     def partitioned_initializer(self):

tsaplay/experiments.py
+8 -7

@@ -18,15 +18,15 @@ def __init__(
         feature_provider,
         model,
         contd_tag=None,
-        config=None,
+        run_config=None,
         job_dir=None,
     ):
         self.feature_provider = feature_provider
         self.model = model
         self.contd_tag = contd_tag
         self.job_dir = job_dir
         self._initialize_experiment_dir()
-        self._initialize_model_run_config(config or {})
+        self._initialize_model_run_config(run_config or {})
         if self.contd_tag is not None:
             self._setup_comet_ml_experiment()
 
@@ -118,13 +118,14 @@ def _update_export_models_config(self):
 
     def _initialize_model_run_config(self, config_dict):
         default_config = {
-            "model_dir": join(self._experiment_dir, "tb_summary"),
-            "save_checkpoints_steps": 100,
-            "save_summary_steps": 25,
-            "log_step_count_steps": 25,
+            "model_dir": join(self._experiment_dir),
+            # "model_dir": join(self._experiment_dir, "tb_summary"),
+            # "save_checkpoints_steps": 100,
+            # "save_summary_steps": 25,
+            # "log_step_count_steps": 25,
         }
         default_config.update(config_dict)
-        self.model.run_config = tf.estimator.RunConfig(**default_config)
+        self.model.run_config = self.model.run_config.replace(**default_config)
 
     def _setup_comet_ml_experiment(self):
         api_key = environ.get("COMET_ML_API_KEY")
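
Two related changes here: the constructor argument is renamed from config to run_config, and _initialize_model_run_config now calls replace() on the model's existing RunConfig rather than building a fresh one, so settings attached when the model was constructed survive and only the named keys are overridden. A minimal sketch of the difference (the model_dir value is illustrative):

    import tensorflow as tf

    base = tf.estimator.RunConfig(tf_random_seed=1234, keep_checkpoint_max=5000)
    # replace() copies base, overriding only the named properties;
    # tf_random_seed and keep_checkpoint_max carry over.
    updated = base.replace(model_dir="experiments/lcrrot")
    assert updated.tf_random_seed == 1234
    assert updated.model_dir == "experiments/lcrrot"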

tsaplay/hooks.py
+1 -1

@@ -15,7 +15,7 @@
 from tsaplay.utils.tf import image_to_summary
 from tsaplay.utils.io import cprnt, temp_pngs, get_image_from_plt
 
-matplotlib.use("TkAgg")
+# matplotlib.use("TkAgg")
 import matplotlib.pyplot as plt  # noqa pylint: disable=C0411,C0412,C0413
 

tsaplay/models/tsa_model.py
+3 -1

@@ -14,12 +14,13 @@
 )
 from tsaplay.features import FeatureProvider
 from tsaplay.utils.draw import plot_distributions
-from tsaplay.utils.io import temp_pngs
+from tsaplay.utils.io import temp_pngs, cprnt
 from tsaplay.utils.decorators import (
     make_input_fn,
     addon,
     cometml,
     embed_sequences,
+    shard_saver,
 )
 from tsaplay.utils.data import make_dense_features
 from tsaplay.utils.addons import (
@@ -175,6 +176,7 @@ def _serving_input_receiver_fn(self):
         return ServingInputReceiver(input_features, inputs)
 
     @cometml
+    @shard_saver
     @addon([scalars, logging, histograms, conf_matrix])
     @addon([prediction_outputs])
     @embed_sequences
File renamed without changes.

tsaplay/scripts/import_dataset.py
+1 -1

@@ -39,7 +39,7 @@ def get_dataset_dicts(train_file, test_file, parsing_fn):
 
 def get_raw_file_paths(path):
     train_file = search_dir(path, "train", first=True, kind="files")
-    test_file = search_dir(path, "train", first=True, kind="files")
+    test_file = search_dir(path, "test", first=True, kind="files")
     return train_file, test_file
 

tsaplay/task.py
+19 -1

@@ -1,7 +1,10 @@
 import argparse
+from os import environ, getcwd
+from os.path import join
 import comet_ml
 import tensorflow as tf
 import pkg_resources as pkg
+from tsaplay.utils.io import cprnt
 from tsaplay.datasets import Dataset
 from tsaplay.embeddings import Embedding
 from tsaplay.features import FeatureProvider
@@ -60,9 +63,24 @@ def run_experiment(args):
     feature_provider = FeatureProvider(datasets, embedding)
 
     model = MODELS.get(args.model)(params={"batch-size": args.batch_size})
+    # model = MODELS.get(args.model)(
+    #     params={"batch-size": args.batch_size, "hidden_units": 100},
+    #     run_config={"keep_checkpoint_max": 50, "tf_random_seed": 1234},
+    # )
 
+    distribution = tf.contrib.distribute.DistributeConfig(
+        train_distribute=tf.contrib.distribute.OneDeviceStrategy("/gpu:0")
+    )
     experiment = Experiment(
-        feature_provider, model, contd_tag=args.contd_tag, job_dir=args.job_dir
+        feature_provider,
+        model,
+        contd_tag=args.contd_tag,
+        job_dir=args.job_dir,
+        run_config={
+            "tf_random_seed": 1234,
+            "keep_checkpoint_max": 5000,
+            "experimental_distribute": distribution,
+        },
     )
 
     experiment.run(job="train+eval", steps=args.steps)
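
OneDeviceStrategy pins the entire training loop onto one device, here the first GPU of each machine. The commit wires it in through the experimental_distribute key of the run_config overrides, which _initialize_model_run_config passes to RunConfig.replace(). A condensed sketch of the same wiring (TF 1.x contrib API, later stabilized under tf.distribute):

    import tensorflow as tf

    # Pin training onto the first GPU (sketch of the commit's wiring).
    distribution = tf.contrib.distribute.DistributeConfig(
        train_distribute=tf.contrib.distribute.OneDeviceStrategy("/gpu:0")
    )
    run_config = tf.estimator.RunConfig(experimental_distribute=distribution)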

tsaplay/utils/data.py
+12 -8

@@ -1,5 +1,5 @@
-import tensorflow as tf
 import numpy as np
+import tensorflow as tf
 from tsaplay.utils.tf import sparse_sequences_to_dense, get_seq_lengths
 
 
@@ -50,24 +50,28 @@ def make_dense_features(features):
 def prep_dataset(tfrecords, params, processing_fn, mode):
     shuffle_buffer = params.get("shuffle-bufer", 50)
     dataset = tf.data.Dataset.list_files(file_pattern=tfrecords)
+    dataset = dataset.interleave(
+        tf.data.TFRecordDataset, cycle_length=5, block_length=1
+    )
     if mode == "EVAL":
         dataset = dataset.shuffle(buffer_size=shuffle_buffer)
     elif mode == "TRAIN":
         dataset = dataset.apply(
             tf.contrib.data.shuffle_and_repeat(buffer_size=shuffle_buffer)
         )
 
-    dataset = dataset.interleave(
-        tf.data.TFRecordDataset, cycle_length=5, block_length=1
-    )
-    dataset = dataset.map(parse_tf_example, num_parallel_calls=5)
-    if processing_fn is not None:
-        dataset = dataset.map(processing_fn)
+    def parse_and_process(example):
+        return processing_fn(*parse_tf_example(example))
 
-    dataset = dataset.batch(params["batch-size"])
+    dataset = dataset.apply(
+        tf.contrib.data.map_and_batch(
+            parse_and_process, params["batch-size"], num_parallel_batches=5
+        )
+    )
     dataset = dataset.map(
         lambda features, labels: (make_dense_features(features), labels)
     )
+    dataset = dataset.prefetch(buffer_size=None)
 
     return dataset
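
Two things change in this input pipeline: interleave now runs before shuffling, so the shuffle buffer mixes individual records drawn from five TFRecord shards at once rather than whole filenames, and the separate parse/process/batch steps are fused into one map_and_batch stage with a trailing prefetch. A stripped-down sketch of the resulting stage order (parse_fn is a placeholder; TF 1.x API):

    import tensorflow as tf

    def input_fn(file_pattern, batch_size, parse_fn, shuffle_buffer=50):
        files = tf.data.Dataset.list_files(file_pattern)
        # Read 5 shards concurrently so downstream shuffling sees
        # a mixture of files, not one file at a time.
        records = files.interleave(
            tf.data.TFRecordDataset, cycle_length=5, block_length=1
        )
        records = records.apply(
            tf.contrib.data.shuffle_and_repeat(buffer_size=shuffle_buffer)
        )
        # Fused parse+batch avoids materializing an intermediate
        # per-record dataset between the two stages.
        batches = records.apply(
            tf.contrib.data.map_and_batch(
                parse_fn, batch_size, num_parallel_batches=5
            )
        )
        # Overlap input preprocessing with the training step.
        return batches.prefetch(buffer_size=1)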
