
Commit 43b902d

adds separate files to import datasets and prepare package for gcloud job:
1 parent 38b0a06 commit 43b902d

19 files changed, +344 -136 lines

Pipfile
+2 -1

@@ -4,7 +4,6 @@ verify_ssl = true
 name = "pypi"
 
 [packages]
-tensorflow = "*"
 matplotlib = "*"
 pillow = "*"
 docker = "*"
@@ -14,6 +13,8 @@ tqdm = "*"
 comet-ml = "*"
 pandas = "*"
 spacy = "*"
+tensorflow = {version="*", sys_platform = "== 'darwin'"}
+tensorflow-gpu = {version=">=1.11.0", sys_platform = "== 'linux'"}
 
 [dev-packages]
 tox-pipenv = "*"
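
These two entries split TensorFlow by platform using PEP 508 environment markers: the CPU build installs on macOS for local development, the GPU build on the Linux cloud workers. A minimal sketch of how an installer evaluates such markers (the print strings are illustrative only):

    import sys

    # Roughly how pip decides between the two pinned packages:
    #   tensorflow ; sys_platform == 'darwin'
    #   tensorflow-gpu>=1.11.0 ; sys_platform == 'linux'
    if sys.platform == "darwin":
        print("install tensorflow (CPU build)")
    elif sys.platform == "linux":
        print("install tensorflow-gpu>=1.11.0 (GPU build)")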

Pipfile.lock

+209 -70
Some generated files are not rendered by default.

gcp/_cmd.py
+2 -2

@@ -1,8 +1,8 @@
 from os import system
 
 
-system("""gcloud ml-engine jobs submit training testing_job_script_3 \
-    --job-dir=gs://tsaplay-bucket/testing_job_script \
+system("""gcloud ml-engine jobs submit training testing_large_embeddings_2 \
+    --job-dir=gs://tsaplay-bucket/testing_large_embeddings_2 \
     --module-name=tsaplay.task \
     --staging-bucket=gs://tsaplay-bucket/ \
     --packages=/Users/seanbugeja/Code/Msc/dist/tsaplay-0.1.dev0.tar.gz \

gcp/_config.json
+10 -5

@@ -5,16 +5,21 @@
         "owner": "sean"
     },
     "trainingInput": {
-        "scaleTier": "BASIC",
+        "scaleTier": "CUSTOM",
+        "masterType": "standard_gpu",
+        "workerType": "standard_gpu",
+        "parameterServerType": "standard_gpu",
+        "workerCount": 4,
+        "parameterServerCount": 3,
         "pythonVersion": "3.5",
         "runtimeVersion": "1.10",
         "region": "europe-west1",
         "args": [
-            "--embedding=wiki-50",
-            "--datasets=debug",
+            "--embedding=commoncrawl-840",
+            "--datasets=dong",
             "--model=lcrrot",
-            "--batch-size=5",
-            "--steps=10",
+            "--batch-size=25",
+            "--steps=1000",
             "--verbosity=INFO"
         ]
     }
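
Moving from BASIC to CUSTOM requests an explicit cluster: one standard_gpu master, four standard_gpu workers and three standard_gpu parameter servers. Cloud ML Engine describes that topology to each TensorFlow replica through the TF_CONFIG environment variable; a sketch of what worker 0 would see (hostnames and ports are placeholders, not real cluster addresses):

    import json

    # Illustrative TF_CONFIG for worker 0 under the config above.
    tf_config = {
        "cluster": {
            "master": ["master-0:2222"],
            "worker": ["worker-%d:2222" % i for i in range(4)],
            "ps": ["ps-%d:2222" % i for i in range(3)],
        },
        "task": {"type": "worker", "index": 0},
    }
    print(json.dumps(tf_config, indent=2))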

logs/.gitignore
+4

@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore

requirements.txt
+17 -12

@@ -1,21 +1,23 @@
 -i https://pypi.org/simple
 absl-py==0.5.0
 astor==0.7.1
-boto3==1.9.22
+backports.weakref==1.0.post1
+boto3==1.9.24
 boto==2.49.0
-botocore==1.12.22
+botocore==1.12.24
 bz2file==0.98
-certifi==2018.8.24
+certifi==2018.10.15
 chardet==3.0.4
 comet-git-pure==0.19.6
 comet-ml==1.0.29
 cycler==0.10.0
-cymem==1.31.2
+cymem==2.0.2
 cytoolz==0.9.0.1
 dill==0.2.8.2
 docker-pycreds==0.3.0
 docker==3.5.0
 docutils==0.14
+enum34==1.1.6
 gast==0.2.0
 gensim==3.6.0
 grpcio==1.15.0
@@ -27,34 +29,37 @@ keras-preprocessing==1.0.5
 kiwisolver==1.0.1
 markdown==3.0.1
 matplotlib==3.0.0
-msgpack-numpy==0.4.4.1
+mock==2.0.0
+msgpack-numpy==0.4.3.2
 msgpack==0.5.6
-murmurhash==0.28.0
+murmurhash==1.0.1
 netifaces==0.10.7
 numpy==1.15.2
 nvidia-ml-py3==7.352.0
 pandas==0.23.4
+pbr==4.3.0
 pillow==5.3.0
 plac==0.9.6
-preshed==1.0.1
+preshed==2.0.1
 protobuf==3.6.1
 pyparsing==2.2.2
 python-dateutil==2.7.3 ; python_version >= '2.7'
 pytz==2018.5
-regex==2017.4.5
+regex==2018.1.10
 requests==2.19.1
 s3transfer==0.1.13
 scipy==1.1.0
 six==1.11.0
 smart-open==1.7.1
-spacy==2.0.12
+spacy==2.0.16
 tensorboard==1.11.0
+tensorflow-gpu==1.11.0 ; sys_platform == 'linux'
 tensorflow-serving-api==1.11.0
-tensorflow==1.11.0
+tensorflow==1.11.0 ; sys_platform == 'darwin'
 termcolor==1.1.0
-thinc==6.10.3
+thinc==6.12.0
 toolz==0.9.0
-tqdm==4.26.0
+tqdm==4.27.0
 ujson==1.35
 urllib3==1.23
 websocket-client==0.53.0

submit_job.py
+21 -8

@@ -89,6 +89,10 @@ def get_arguments():
         default="lcrrot",
     )
 
+    parser.add_argument(
+        "--job-id", "-jid", type=str, help="ID of the job being submitted"
+    )
+
     parser.add_argument(
         "--job-dir",
         help="GCS location to write checkpoints to and export models",
@@ -195,7 +199,7 @@ def write_gcloud_config(args):
     args_dict = vars(args)
     args_list = []
     for (key, value) in args_dict.items():
-        if not value:
+        if not value or key == "job_id":
             continue
         if isinstance(value, list):
             value = " ".join(map(str, value))
@@ -206,7 +210,12 @@ def write_gcloud_config(args):
         "jobId": "my_job",
         "labels": {"type": "dev", "owner": "sean"},
         "trainingInput": {
-            "scaleTier": "BASIC",
+            "scaleTier": "CUSTOM",
+            "masterType": "standard_gpu",
+            "workerType": "standard_gpu",
+            "parameterServerType": "standard_gpu",
+            "workerCount": 4,
+            "parameterServerCount": 3,
             "pythonVersion": "3.5",
             "runtimeVersion": "1.10",
             "region": "europe-west1",
@@ -218,16 +227,16 @@ def write_gcloud_config(args):
         dump(gcloud_config, config_file, indent=4)
 
 
-def write_gcloud_cmd_script():
+def write_gcloud_cmd_script(args):
     gcloud_cmd = """gcloud ml-engine jobs submit training {job_name} \\
     --job-dir={job_dir} \\
     --module-name={module_name} \\
     --staging-bucket={staging_bucket} \\
     --packages={package_name} \\
     --config={config_path} \\
     --stream-logs""".format(
-        job_name="testing_job_script_3",
-        job_dir="gs://tsaplay-bucket/testing_job_script",
+        job_name=args.job_id,
+        job_dir="gs://tsaplay-bucket/{}".format(args.job_id),
         module_name="tsaplay.task",
         staging_bucket="gs://tsaplay-bucket/",
         package_name=abspath(
@@ -257,7 +266,11 @@ def write_gcloud_cmd_script():
     cprnt(bow="Copied to clipboard!")
 
 
-if __name__ == "__main__":
-    prepare_assets(get_arguments())
+def main(args):
+    prepare_assets(args)
     sandbox.run_setup("setup.py", ["sdist"])
-    write_gcloud_cmd_script()
+    write_gcloud_cmd_script(args)
+
+
+if __name__ == "__main__":
+    main(get_arguments())
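
With the new flag, a single argument now drives both the ML Engine job name and its GCS job directory, e.g. a hypothetical run: python submit_job.py --job-id testing_large_embeddings_2 (remaining training flags as before). The key == "job_id" guard keeps the flag out of the serialized training args, since it names the job rather than configuring the experiment.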

tsaplay/datasets.py
+3 -3

@@ -108,12 +108,12 @@ def get_stats_dict(cls, classes=None, **data_dicts):
     @classmethod
     @timeit("Generating corpus for dataset", "Corpus generated")
     def generate_corpus(cls, docs, path):
-        corpus_file = join(path, "_corpus.csv")
+        corpus_file = join(path, "_corpus.pkl")
         if exists(corpus_file):
-            corpus = corpus_from_csv(path=corpus_file)
+            corpus = _unpickle(corpus_file)
         else:
             corpus = cls.corpus_from_docs(docs)
-            corpus_to_csv(corpus_file, corpus)
+            _pickle(data=corpus, path=corpus_file)
         return corpus
 
     @classmethod
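
The corpus cache moves from CSV to pickle. The _pickle and _unpickle helpers are not shown in this diff; judging from the call sites they are presumably thin wrappers over the standard library, along these lines (a sketch, not necessarily the repository's exact helpers):

    import pickle

    def _pickle(data, path):
        # Serialize the corpus to disk.
        with open(path, "wb") as pkl_file:
            pickle.dump(data, pkl_file)

    def _unpickle(path):
        # Restore a previously pickled corpus.
        with open(path, "rb") as pkl_file:
            return pickle.load(pkl_file)

Pickle round-trips native Python objects (e.g. a dict of term counts) directly, avoiding the string parsing the CSV path required.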

tsaplay/embeddings.py
+2 -1

@@ -80,7 +80,8 @@ def initializer(self):
         def _init(shape=shape, dtype=tf.float32, partition_info=None):
             return self.vectors
 
-        return _init
+        # return _init
+        return lambda: self.vectors
 
     @property
     def partitioned_initializer(self):

tsaplay/experiments.py
+8 -7

@@ -18,15 +18,15 @@ def __init__(
         feature_provider,
         model,
         contd_tag=None,
-        config=None,
+        run_config=None,
         job_dir=None,
     ):
         self.feature_provider = feature_provider
         self.model = model
         self.contd_tag = contd_tag
         self.job_dir = job_dir
         self._initialize_experiment_dir()
-        self._initialize_model_run_config(config or {})
+        self._initialize_model_run_config(run_config or {})
         if self.contd_tag is not None:
             self._setup_comet_ml_experiment()
 
@@ -118,13 +118,14 @@ def _update_export_models_config(self):
 
     def _initialize_model_run_config(self, config_dict):
         default_config = {
-            "model_dir": join(self._experiment_dir, "tb_summary"),
-            "save_checkpoints_steps": 100,
-            "save_summary_steps": 25,
-            "log_step_count_steps": 25,
+            "model_dir": join(self._experiment_dir),
+            # "model_dir": join(self._experiment_dir, "tb_summary"),
+            # "save_checkpoints_steps": 100,
+            # "save_summary_steps": 25,
+            # "log_step_count_steps": 25,
         }
         default_config.update(config_dict)
-        self.model.run_config = tf.estimator.RunConfig(**default_config)
+        self.model.run_config = self.model.run_config.replace(**default_config)
 
     def _setup_comet_ml_experiment(self):
         api_key = environ.get("COMET_ML_API_KEY")
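
Two related changes here: the constructor argument is renamed from config to run_config, and _initialize_model_run_config now calls replace() on the model's existing RunConfig rather than building a fresh one, so settings attached when the model was constructed survive and only the named keys are overridden. A minimal sketch of the difference (the model_dir value is illustrative):

    import tensorflow as tf

    base = tf.estimator.RunConfig(tf_random_seed=1234, keep_checkpoint_max=5000)
    # replace() copies base, overriding only the named properties;
    # tf_random_seed and keep_checkpoint_max carry over.
    updated = base.replace(model_dir="experiments/lcrrot")
    assert updated.tf_random_seed == 1234
    assert updated.model_dir == "experiments/lcrrot"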

tsaplay/hooks.py
+1 -1

@@ -15,7 +15,7 @@
 from tsaplay.utils.tf import image_to_summary
 from tsaplay.utils.io import cprnt, temp_pngs, get_image_from_plt
 
-matplotlib.use("TkAgg")
+# matplotlib.use("TkAgg")
 import matplotlib.pyplot as plt  # noqa pylint: disable=C0411,C0412,C0413
 

tsaplay/models/tsa_model.py
+3 -1

@@ -14,12 +14,13 @@
 )
 from tsaplay.features import FeatureProvider
 from tsaplay.utils.draw import plot_distributions
-from tsaplay.utils.io import temp_pngs
+from tsaplay.utils.io import temp_pngs, cprnt
 from tsaplay.utils.decorators import (
     make_input_fn,
     addon,
     cometml,
     embed_sequences,
+    shard_saver,
 )
 from tsaplay.utils.data import make_dense_features
 from tsaplay.utils.addons import (
@@ -175,6 +176,7 @@ def _serving_input_receiver_fn(self):
         return ServingInputReceiver(input_features, inputs)
 
     @cometml
+    @shard_saver
     @addon([scalars, logging, histograms, conf_matrix])
     @addon([prediction_outputs])
     @embed_sequences
File renamed without changes.

tsaplay/scripts/import_dataset.py
+1 -1

@@ -39,7 +39,7 @@ def get_dataset_dicts(train_file, test_file, parsing_fn):
 
 def get_raw_file_paths(path):
     train_file = search_dir(path, "train", first=True, kind="files")
-    test_file = search_dir(path, "train", first=True, kind="files")
+    test_file = search_dir(path, "test", first=True, kind="files")
     return train_file, test_file
 

tsaplay/task.py
+19 -1

@@ -1,7 +1,10 @@
 import argparse
+from os import environ, getcwd
+from os.path import join
 import comet_ml
 import tensorflow as tf
 import pkg_resources as pkg
+from tsaplay.utils.io import cprnt
 from tsaplay.datasets import Dataset
 from tsaplay.embeddings import Embedding
 from tsaplay.features import FeatureProvider
@@ -60,9 +63,24 @@ def run_experiment(args):
     feature_provider = FeatureProvider(datasets, embedding)
 
     model = MODELS.get(args.model)(params={"batch-size": args.batch_size})
+    # model = MODELS.get(args.model)(
+    #     params={"batch-size": args.batch_size, "hidden_units": 100},
+    #     run_config={"keep_checkpoint_max": 50, "tf_random_seed": 1234},
+    # )
 
+    distribution = tf.contrib.distribute.DistributeConfig(
+        train_distribute=tf.contrib.distribute.OneDeviceStrategy("/gpu:0")
+    )
     experiment = Experiment(
-        feature_provider, model, contd_tag=args.contd_tag, job_dir=args.job_dir
+        feature_provider,
+        model,
+        contd_tag=args.contd_tag,
+        job_dir=args.job_dir,
+        run_config={
+            "tf_random_seed": 1234,
+            "keep_checkpoint_max": 5000,
+            "experimental_distribute": distribution,
+        },
     )
 
     experiment.run(job="train+eval", steps=args.steps)
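
OneDeviceStrategy pins the entire training loop onto one device, here the first GPU of each machine. The commit wires it in through the experimental_distribute key of the run_config overrides, which _initialize_model_run_config passes to RunConfig.replace(). A condensed sketch of the same wiring (TF 1.x contrib API, later stabilized under tf.distribute):

    import tensorflow as tf

    # Pin training onto the first GPU (sketch of the commit's wiring).
    distribution = tf.contrib.distribute.DistributeConfig(
        train_distribute=tf.contrib.distribute.OneDeviceStrategy("/gpu:0")
    )
    run_config = tf.estimator.RunConfig(experimental_distribute=distribution)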

tsaplay/utils/data.py
+12 -8

@@ -1,5 +1,5 @@
-import tensorflow as tf
 import numpy as np
+import tensorflow as tf
 from tsaplay.utils.tf import sparse_sequences_to_dense, get_seq_lengths
 
 
@@ -50,24 +50,28 @@ def make_dense_features(features):
 def prep_dataset(tfrecords, params, processing_fn, mode):
     shuffle_buffer = params.get("shuffle-bufer", 50)
     dataset = tf.data.Dataset.list_files(file_pattern=tfrecords)
+    dataset = dataset.interleave(
+        tf.data.TFRecordDataset, cycle_length=5, block_length=1
+    )
     if mode == "EVAL":
         dataset = dataset.shuffle(buffer_size=shuffle_buffer)
     elif mode == "TRAIN":
         dataset = dataset.apply(
             tf.contrib.data.shuffle_and_repeat(buffer_size=shuffle_buffer)
         )
 
-    dataset = dataset.interleave(
-        tf.data.TFRecordDataset, cycle_length=5, block_length=1
-    )
-    dataset = dataset.map(parse_tf_example, num_parallel_calls=5)
-    if processing_fn is not None:
-        dataset = dataset.map(processing_fn)
+    def parse_and_process(example):
+        return processing_fn(*parse_tf_example(example))
 
-    dataset = dataset.batch(params["batch-size"])
+    dataset = dataset.apply(
+        tf.contrib.data.map_and_batch(
+            parse_and_process, params["batch-size"], num_parallel_batches=5
+        )
+    )
     dataset = dataset.map(
         lambda features, labels: (make_dense_features(features), labels)
     )
+    dataset = dataset.prefetch(buffer_size=None)
 
     return dataset
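
Two things change in this input pipeline: interleave now runs before shuffling, so the shuffle buffer mixes individual records drawn from five TFRecord shards at once rather than whole filenames, and the separate parse/process/batch steps are fused into one map_and_batch stage with a trailing prefetch. A stripped-down sketch of the resulting stage order (parse_fn is a placeholder; TF 1.x API):

    import tensorflow as tf

    def input_fn(file_pattern, batch_size, parse_fn, shuffle_buffer=50):
        files = tf.data.Dataset.list_files(file_pattern)
        # Read 5 shards concurrently so downstream shuffling sees
        # a mixture of files, not one file at a time.
        records = files.interleave(
            tf.data.TFRecordDataset, cycle_length=5, block_length=1
        )
        records = records.apply(
            tf.contrib.data.shuffle_and_repeat(buffer_size=shuffle_buffer)
        )
        # Fused parse+batch avoids materializing an intermediate
        # per-record dataset between the two stages.
        batches = records.apply(
            tf.contrib.data.map_and_batch(
                parse_fn, batch_size, num_parallel_batches=5
            )
        )
        # Overlap input preprocessing with the training step.
        return batches.prefetch(buffer_size=1)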
