From 4ee8b21432d5fc20ca1694528facbabc9335f07e Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Thu, 5 Jul 2018 17:07:17 -0700 Subject: [PATCH 01/59] Added MNIST-MLP-Module-API models to check model save and load_checkpoint methods --- .../mnist_mlp_module_api_inference.py | 134 ++++++++++++++++ .../mnist_mlp_module_api_train.py | 148 ++++++++++++++++++ 2 files changed, 282 insertions(+) create mode 100644 tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py create mode 100644 tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_train.py diff --git a/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py b/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py new file mode 100644 index 000000000000..956682081d08 --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py @@ -0,0 +1,134 @@ +import boto3 +import mxnet as mx +import numpy as np +import json +import logging +import os +logging.getLogger().setLevel(logging.DEBUG) +mx.random.seed(7) +np.random.seed(7) + +bucket_name = 'mxnet-model-backwards-compatibility' +backslash = '/' +model_name = 'mnist_mlp_module_api' +s3 = boto3.resource('s3') +num_epoch = 2 +ctx = mx.cpu() + +def prepare_mnist_data(mnist_raw_data): + + #shuffle the indices + indices = np.random.permutation(mnist_raw_data['train_label'].shape[0]) + + #print indices[0:10] + train_idx , val_idx = indices[:50000], indices[50000:] + + train_data = mnist_raw_data['train_data'][train_idx,:] + train_label = mnist_raw_data['train_label'][train_idx] + + val_data = mnist_raw_data['train_data'][val_idx,:] + val_label = mnist_raw_data['train_label'][val_idx] + + test_data = mnist_raw_data['test_data'] + test_label = mnist_raw_data['test_label'] + + #print len(train_data) + #print len(val_data) + + train = {'train_X' : train_data, 'train_Y' : train_label} + test = {'test_X' : test_data, 'test_Y' : test_label} + val = {'val_X' : val_data, 'val_Y' : val_label} + + data = dict() + data['train'] = train + data['test'] = test + data['val'] = val + + return data + +def get_val_test_iter(): + data = prepare_mnist_data(mx.test_utils.get_mnist()) + val = data['val'] + test = data['test'] + batch_size = 100 + val_iter = mx.io.NDArrayIter(val['val_X'], val['val_Y'], batch_size, shuffle=True) + test_iter = mx.io.NDArrayIter(test['test_X'], test['test_Y']) + return val_iter, test_iter + +val_iter, test_iter = get_val_test_iter() + +def get_top_level_folders_in_bucket(s3client, bucket_name): + '''This function returns the top level folders in the S3Bucket. These folders help us to navigate to the trained model files stored for different MXNet versions. 
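The folder names are the MXNet version strings that the corresponding training scripts use as the S3 prefix when uploading their model files.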
''' + bucket = s3client.Bucket(bucket_name) + result = bucket.meta.client.list_objects(Bucket=bucket.name, + Delimiter=backslash) + folder_list = list() + for obj in result['CommonPrefixes']: + folder_list.append(obj['Prefix'].strip(backslash)) + + return folder_list + +def get_model(): + ##### Old Model ##### : + input = mx.symbol.Variable('data') + input = mx.symbol.Flatten(data=input) + + fc1 = mx.symbol.FullyConnected(data=input, name='fc1', num_hidden=128) + act1 = mx.symbol.Activation(data=fc1, name='relu1', act_type='relu') + + fc2 = mx.symbol.FullyConnected(data=act1, name='fc2', num_hidden=64) + output = mx.symbol.SoftmaxOutput(data=fc2, name='softmax') + + ### this is needed since the model is loaded from a checkpoint ### + sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, num_epoch) + loaded_model = mx.mod.Module(symbol=output, context=ctx, data_names=['data'], label_names=['softmax_label']) + loaded_model.bind(data_shapes=test_iter.provide_data, label_shapes=test_iter.provide_label) + loaded_model.set_params(arg_params, aux_params) + return loaded_model + +def perform_inference(test_iter, val_iter, model, inference_file): + test_inference_score = model.score(test_iter, ['acc']) + val_inference_score = model.score(val_iter, ['acc']) + + with open(inference_file, 'r') as file: + results = json.load(file) + + print ('Validation accuracy on inference is %f while that on the original training file is %f' % (val_inference_score[0][1], results['val_acc'])) + print ('Test accuracy on inference is %f while that on the original training file is %f' % (test_inference_score[0][1], results['test_acc'])) + assert(results['val_acc'] == val_inference_score[0][1]) + assert(results['test_acc'] == test_inference_score[0][1]) + print ('Inference results passed for %s' % model_name) + +def clean_mnist_data(): + if os.path.isfile('train-images-idx3-ubyte.gz'): + os.remove('train-images-idx3-ubyte.gz') + if os.path.isfile('t10k-labels-idx1-ubyte.gz'): + os.remove('t10k-labels-idx1-ubyte.gz') + if os.path.isfile('train-labels-idx1-ubyte.gz'): + os.remove('train-labels-idx1-ubyte.gz') + if os.path.isfile('t10k-images-idx3-ubyte.gz'): + os.remove('t10k-images-idx3-ubyte.gz') + +def clean_model_files(model_files): + for file in model_files: + if os.path.isfile(file): + os.remove(file) + +if __name__=='__main__': + for folder in get_top_level_folders_in_bucket(s3, bucket_name): + bucket = s3.Bucket(bucket_name) + prefix = folder + backslash + model_name + model_files_meta = list(bucket.objects.filter(Prefix = prefix)) + if len(model_files_meta) == 0: + continue + model_files = list() + for obj in model_files_meta: + file_name = obj.key.split('/')[2] + model_files.append(file_name) + ## Download this file--- + bucket.download_file(obj.key, file_name) + + model = get_model() + perform_inference(test_iter, val_iter, model, model_name + '_inference.json') + clean_model_files(model_files) + clean_mnist_data() diff --git a/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_train.py b/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_train.py new file mode 100644 index 000000000000..8254ffe9b3c3 --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_train.py @@ -0,0 +1,148 @@ +import boto3 +import mxnet as mx +import numpy as np +import json +import os +import logging +logging.getLogger().setLevel(logging.DEBUG) + + +# Set fixed random seeds. 
These would be the same for inference files as well +mx.random.seed(7) +np.random.seed(7) + +# get the current mxnet version we are running on +mxnet_version = mx.__version__ +bucket_name = 'mxnet-model-backwards-compatibility' +ctx = mx.cpu() +batch_size = 100 +num_epoch = 2 +backslash = '/' +model_name = 'mnist_mlp_module_api' + + +def prepare_mnist_data(mnist_raw_data): + #shuffle the indices + indices = np.random.permutation(mnist_raw_data['train_label'].shape[0]) + + #print indices[0:10] + train_idx , val_idx = indices[:50000], indices[50000:] + + train_data = mnist_raw_data['train_data'][train_idx,:] + train_label = mnist_raw_data['train_label'][train_idx] + + val_data = mnist_raw_data['train_data'][val_idx,:] + val_label = mnist_raw_data['train_label'][val_idx] + + test_data = mnist_raw_data['test_data'] + test_label = mnist_raw_data['test_label'] + + #print len(train_data) + #print len(val_data) + + train = {'train_X' : train_data, 'train_Y' : train_label} + test = {'test_X' : test_data, 'test_Y' : test_label} + val = {'val_X' : val_data, 'val_Y' : val_label} + + data = dict() + data['train'] = train + data['test'] = test + data['val'] = val + + return data + +def upload_model_files_to_s3(bucket_name, files, folder_name): + s3 = boto3.client('s3') + for file in files: + s3.upload_file(file, bucket_name, folder_name + file) + print ('model successfully uploaded to s3') + +def clean_up_files (): + clean_mnist_data() + clean_model_files() + print ('Model files deleted') + +def clean_mnist_data(): + if os.path.isfile('train-images-idx3-ubyte.gz'): + os.remove('train-images-idx3-ubyte.gz') + if os.path.isfile('t10k-labels-idx1-ubyte.gz'): + os.remove('t10k-labels-idx1-ubyte.gz') + if os.path.isfile('train-labels-idx1-ubyte.gz'): + os.remove('train-labels-idx1-ubyte.gz') + if os.path.isfile('t10k-images-idx3-ubyte.gz'): + os.remove('t10k-images-idx3-ubyte.gz') + +def clean_model_files(): + for i in range(1, num_epoch+1): + if os.path.isfile(model_name + '-000' + str(i) + '.params'): + os.remove(model_name + '-000' + str(i) + '.params') + + if os.path.isfile(model_name + '-symbol.json'): + os.remove(model_name + '-symbol.json') + if os.path.isfile(inference_results_file): + os.remove(inference_results_file) + +def get_model_definition(): + input = mx.symbol.Variable('data') + input = mx.symbol.Flatten(data=input) + + fc1 = mx.symbol.FullyConnected(data=input, name='fc1', num_hidden=128) + act1 = mx.symbol.Activation(data=fc1, name='relu1', act_type='relu') + + fc2 = mx.symbol.FullyConnected(data=act1, name='fc2', num_hidden=64) + output = mx.symbol.SoftmaxOutput(data=fc2, name='softmax') + + model = mx.mod.Module(symbol=output, context=ctx, data_names=['data'], label_names=['softmax_label']) + + return model + +if __name__=='__main__': + data = prepare_mnist_data(mx.test_utils.get_mnist()) + + train = data['train'] + val = data['val'] + test = data['test'] + + train_iter = mx.io.NDArrayIter(train['train_X'], train['train_Y'], batch_size, shuffle=True) + val_iter = mx.io.NDArrayIter(val['val_X'], val['val_Y'], batch_size, shuffle=True) + test_iter = mx.io.NDArrayIter(test['test_X'], test['test_Y']) + + model = get_model_definition() + + train_iter.reset() + checkpoint_callback = mx.callback.do_checkpoint(model_name) + model.fit(train_iter, epoch_end_callback=checkpoint_callback, eval_data=val_iter, optimizer='sgd', optimizer_params={'learning_rate' : 0.1}, eval_metric='acc', num_epoch=num_epoch) + + score_val = model.score(val_iter,['acc']) + val_acc = score_val[0][1] + print ('Validation 
Accuracy is : %f' % val_acc) + score_test = model.score(test_iter, ['acc']) + test_acc = score_test[0][1] + print ('Test Accuracy is : %f' % test_acc) + + inference_results = dict() + inference_results['val_acc'] = val_acc + inference_results['test_acc'] = test_acc + + inference_results_file = model_name + '_inference' + '.json' + + # Write the inference results to local json file. This will be cleaned up later + with open(inference_results_file, 'w') as file: + json.dump(inference_results, file) + + + model_params_file = model_name + '-000' + str(num_epoch) + '.params' + model_symbol_file = model_name + '-symbol.json' + model_inference_file = inference_results_file + files = list() + files.append(model_params_file) + files.append(model_symbol_file) + files.append(model_inference_file) + + + mxnet_folder = str(mxnet_version) + backslash + model_name + backslash + + # Upload the model files to S3 + upload_model_files_to_s3(bucket_name, files, mxnet_folder) + # Clean up the local files + clean_up_files() \ No newline at end of file From 118850ffa478f34d76e78fa9ac0d0dc931bda592 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Fri, 6 Jul 2018 13:15:01 -0700 Subject: [PATCH 02/59] Added LENET with Conv2D operator training file --- .../lenet_cnn_gluon_train.py | 208 ++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_train.py diff --git a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_train.py b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_train.py new file mode 100644 index 000000000000..fc71940208e3 --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_train.py @@ -0,0 +1,208 @@ +import boto3 +import mxnet as mx +import mxnet.ndarray as nd +from mxnet import nd, autograd, gluon +from mxnet.gluon.data.vision import transforms, datasets +import numpy as np +from mxnet import autograd as ag +import logging +import mxnet.ndarray as F +from mxnet.gluon import nn +import json +import os + +logging.getLogger().setLevel(logging.DEBUG) +mx.random.seed(7) +np.random.seed(7) + +batch_size=100 +num_epoch = 2 +bucket_name = 'mxnet-model-backwards-compatibility' +backslash = '/' +model_name = 'lenet_cnn_gluon_api' + +ctx = [mx.cpu(0)] +mxnet_version = mx.__version__ + +class Net(gluon.Block): + def __init__(self, **kwargs): + super(Net, self).__init__(**kwargs) + with self.name_scope(): + # layers created in name_scope will inherit name space + # from parent layer. + self.conv1 = nn.Conv2D(20, kernel_size=(5,5)) + self.pool1 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) + self.conv2 = nn.Conv2D(50, kernel_size=(5,5)) + self.pool2 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) + self.fc1 = nn.Dense(500) + self.fc2 = nn.Dense(10) + + def forward(self, x): + x = self.pool1(F.tanh(self.conv1(x))) + x = self.pool2(F.tanh(self.conv2(x))) + # 0 means copy over size from corresponding dimension. + # -1 means infer size from the rest of dimensions. 
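+        # With 1x28x28 MNIST inputs, the two conv+pool stages leave 50 channels of 4x4,
+        # so this flattens each example to a 50*4*4 = 800-element vector before the dense layers.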
+ x = x.reshape((0, -1)) + x = F.tanh(self.fc1(x)) + x = F.tanh(self.fc2(x)) + return x + + +def prepare_mnist_data(mnist_raw_data): + #shuffle the indices + indices = np.random.permutation(mnist_raw_data['train_label'].shape[0]) + + #print indices[0:10] + train_idx , val_idx = indices[:50000], indices[50000:] + + train_data = mnist_raw_data['train_data'][train_idx,:] + train_label = mnist_raw_data['train_label'][train_idx] + + val_data = mnist_raw_data['train_data'][val_idx,:] + val_label = mnist_raw_data['train_label'][val_idx] + + test_data = mnist_raw_data['test_data'] + test_label = mnist_raw_data['test_label'] + + #print len(train_data) + #print len(val_data) + + train = {'train_X' : train_data, 'train_Y' : train_label} + test = {'test_X' : test_data, 'test_Y' : test_label} + val = {'val_X' : val_data, 'val_Y' : val_label} + + data = dict() + data['train'] = train + data['test'] = test + data['val'] = val + + return data + +def clean_up_files (model_files): + clean_mnist_data() + clean_model_files(model_files) + print ('Model files deleted') + +def clean_mnist_data(): + if os.path.isfile('train-images-idx3-ubyte.gz'): + os.remove('train-images-idx3-ubyte.gz') + if os.path.isfile('t10k-labels-idx1-ubyte.gz'): + os.remove('t10k-labels-idx1-ubyte.gz') + if os.path.isfile('train-labels-idx1-ubyte.gz'): + os.remove('train-labels-idx1-ubyte.gz') + if os.path.isfile('t10k-images-idx3-ubyte.gz'): + os.remove('t10k-images-idx3-ubyte.gz') + +def clean_model_files(model_files): + for file in model_files: + if os.path.isfile(file): + os.remove(file) + +def save_model_files(network): + model_file_name = model_name + '.params' + network.save_params(model_file_name) + +def save_inference_results(test_acc, val_acc): + inference_results = dict() + inference_results['val_acc'] = val_acc + inference_results['test_acc'] = test_acc + + inference_results_file = model_name + '_inference' + '.json' + + # Write the inference results to local json file. This will be cleaned up later + with open(inference_results_file, 'w') as file: + json.dump(inference_results, file) + +def upload_model_files_to_s3(bucket_name, files, folder_name): + s3 = boto3.client('s3') + for file in files: + s3.upload_file(file, bucket_name, folder_name + file) + print ('model successfully uploaded to s3') + +def get_inference_score(iter, model): + # Use Accuracy as the evaluation metric. + metric = mx.metric.Accuracy() + # Reset the validation data iterator. + iter.reset() + # Loop over the validation data iterator. + for batch in iter: + # Splits validation data into multiple slices along batch_axis + # and copy each slice into a context. + data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) + # Splits validation label into multiple slices along batch_axis + # and copy each slice into a context. 
+ label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) + outputs = [] + for x in data: + outputs.append(model(x)) + # Updates internal evaluation + metric.update(label, outputs) + acc = metric.get() + return acc[1] + +if __name__=='__main__': + data = prepare_mnist_data(mx.test_utils.get_mnist()) + + train = data['train'] + val = data['val'] + test = data['test'] + + train_iter = mx.io.NDArrayIter(train['train_X'], train['train_Y'], batch_size, shuffle=True) + val_iter = mx.io.NDArrayIter(val['val_X'], val['val_Y'], batch_size, shuffle=True) + test_iter = mx.io.NDArrayIter(test['test_X'], test['test_Y']) + + + net = Net() + net.initialize(mx.init.Xavier(), ctx=ctx) + + metric = mx.metric.Accuracy() + softmax_cross_entropy_loss = gluon.loss.SoftmaxCrossEntropyLoss() + trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.02}) + + for i in range(num_epoch): + train_iter.reset() + for batch in train_iter: + data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) + label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) + outputs = [] + # Inside training scope + with ag.record(): + for x, y in zip(data, label): + z = net(x) + # Computes softmax cross entropy loss. + loss = softmax_cross_entropy_loss(z, y) + # Backpropagate the error for one iteration. + loss.backward() + outputs.append(z) + + metric.update(label, outputs) + # Make one step of parameter update. Trainer needs to know the + # batch size of data to normalize the gradient by 1/batch_size. + trainer.step(batch.data[0].shape[0]) + + name, acc = metric.get() + # Reset evaluation result to initial state. + metric.reset() + print('training acc at epoch %d: %s=%f'%(i, name, acc)) + + save_model_files(net) + + + # In[6]: + val_acc = get_inference_score(val_iter, net) + print('validation acc: =%f'%val_acc) + + test_acc = get_inference_score(test_iter, net) + print('test acc: =%f'%test_acc) + + save_inference_results(test_acc, val_acc) + + mxnet_folder = str(mxnet_version) + backslash + model_name + backslash + + files = list() + files.append(model_name + '.params') + files.append(model_name + '_inference' + '.json') + + upload_model_files_to_s3(bucket_name, files, mxnet_folder) + + clean_up_files(files) \ No newline at end of file From 27863fd0f5ba93a833d79b8ff476532564b87fab Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Fri, 6 Jul 2018 13:47:01 -0700 Subject: [PATCH 03/59] Added LENET with Conv2d operator inference file --- .../lenet_cnn_gluon_inference.py | 180 ++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_inference.py diff --git a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_inference.py b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_inference.py new file mode 100644 index 000000000000..8a2d72036714 --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_inference.py @@ -0,0 +1,180 @@ +import boto3 +import mxnet as mx +import mxnet.ndarray as nd +from mxnet import nd, autograd, gluon +from mxnet.gluon.data.vision import transforms, datasets +import numpy as np +from mxnet import autograd as ag +import logging +import mxnet.ndarray as F +from mxnet.gluon import nn +import json +import os + +logging.getLogger().setLevel(logging.DEBUG) +mx.random.seed(7) +np.random.seed(7) + +bucket_name = 'mxnet-model-backwards-compatibility' +backslash = '/' +model_name = 'lenet_cnn_gluon_api' +s3 = 
boto3.resource('s3') +num_epoch = 2 +ctx = [mx.cpu(0)] +batch_size = 100 + +def prepare_mnist_data(mnist_raw_data): + + #shuffle the indices + indices = np.random.permutation(mnist_raw_data['train_label'].shape[0]) + + #print indices[0:10] + train_idx , val_idx = indices[:50000], indices[50000:] + + train_data = mnist_raw_data['train_data'][train_idx,:] + train_label = mnist_raw_data['train_label'][train_idx] + + val_data = mnist_raw_data['train_data'][val_idx,:] + val_label = mnist_raw_data['train_label'][val_idx] + + test_data = mnist_raw_data['test_data'] + test_label = mnist_raw_data['test_label'] + + #print len(train_data) + #print len(val_data) + + train = {'train_X' : train_data, 'train_Y' : train_label} + test = {'test_X' : test_data, 'test_Y' : test_label} + val = {'val_X' : val_data, 'val_Y' : val_label} + + data = dict() + data['train'] = train + data['test'] = test + data['val'] = val + + return data + +def get_val_test_iter(): + data = prepare_mnist_data(mx.test_utils.get_mnist()) + val = data['val'] + test = data['test'] + val_iter = mx.io.NDArrayIter(val['val_X'], val['val_Y'], batch_size, shuffle=True) + test_iter = mx.io.NDArrayIter(test['test_X'], test['test_Y']) + return val_iter, test_iter + +val_iter, test_iter = get_val_test_iter() + +def get_top_level_folders_in_bucket(s3client, bucket_name): + '''This function returns the top level folders in the S3Bucket. These folders help us to navigate to the trained model files stored for different MXNet versions. ''' + bucket = s3client.Bucket(bucket_name) + result = bucket.meta.client.list_objects(Bucket=bucket.name, + Delimiter=backslash) + folder_list = list() + for obj in result['CommonPrefixes']: + folder_list.append(obj['Prefix'].strip(backslash)) + + return folder_list + +class Net(gluon.Block): + def __init__(self, **kwargs): + super(Net, self).__init__(**kwargs) + with self.name_scope(): + # layers created in name_scope will inherit name space + # from parent layer. + self.conv1 = nn.Conv2D(20, kernel_size=(5,5)) + self.pool1 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) + self.conv2 = nn.Conv2D(50, kernel_size=(5,5)) + self.pool2 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) + self.fc1 = nn.Dense(500) + self.fc2 = nn.Dense(10) + + def forward(self, x): + x = self.pool1(F.tanh(self.conv1(x))) + x = self.pool2(F.tanh(self.conv2(x))) + # 0 means copy over size from corresponding dimension. + # -1 means infer size from the rest of dimensions. + x = x.reshape((0, -1)) + x = F.tanh(self.fc1(x)) + x = F.tanh(self.fc2(x)) + return x + + +def get_model(model_file): + net = Net() + net.load_params(model_file, ctx) + + return net + +def get_inference_score(iter, model): + # Use Accuracy as the evaluation metric. + metric = mx.metric.Accuracy() + # Reset the validation data iterator. + iter.reset() + # Loop over the validation data iterator. + for batch in iter: + # Splits validation data into multiple slices along batch_axis + # and copy each slice into a context. + data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) + # Splits validation label into multiple slices along batch_axis + # and copy each slice into a context. 
+ label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) + outputs = [] + for x in data: + outputs.append(model(x)) + # Updates internal evaluation + metric.update(label, outputs) + acc = metric.get() + return acc[1] + +def perform_inference(test_iter, val_iter, model, inference_file): + test_inference_score = get_inference_score(test_iter, model) + val_inference_score = get_inference_score(val_iter, model) + + with open(inference_file, 'r') as file: + results = json.load(file) + + print (test_inference_score, val_inference_score) + print results['val_acc'] + print ('Validation accuracy on inference is %f while that on the original training file is %f' % (val_inference_score, results['val_acc'])) + print ('Test accuracy on inference is %f while that on the original training file is %f' % (test_inference_score, results['test_acc'])) + assert(results['val_acc'] == val_inference_score) + assert(results['test_acc'] == test_inference_score) + print ('Inference results passed for %s' % model_name) + +def clean_up_files (model_files): + clean_mnist_data() + clean_model_files(model_files) + print ('Model files deleted') + +def clean_mnist_data(): + if os.path.isfile('train-images-idx3-ubyte.gz'): + os.remove('train-images-idx3-ubyte.gz') + if os.path.isfile('t10k-labels-idx1-ubyte.gz'): + os.remove('t10k-labels-idx1-ubyte.gz') + if os.path.isfile('train-labels-idx1-ubyte.gz'): + os.remove('train-labels-idx1-ubyte.gz') + if os.path.isfile('t10k-images-idx3-ubyte.gz'): + os.remove('t10k-images-idx3-ubyte.gz') + +def clean_model_files(model_files): + for file in model_files: + if os.path.isfile(file): + os.remove(file) + +if __name__=='__main__': + for folder in get_top_level_folders_in_bucket(s3, bucket_name): + bucket = s3.Bucket(bucket_name) + prefix = folder + backslash + model_name + model_files_meta = list(bucket.objects.filter(Prefix = prefix)) + if len(model_files_meta) == 0: + continue + model_files = list() + for obj in model_files_meta: + file_name = obj.key.split('/')[2] + model_files.append(file_name) + ## Download this file--- + bucket.download_file(obj.key, file_name) + + model = get_model(model_name + '.params') + perform_inference(test_iter, val_iter, model, model_name + '_inference.json') + clean_up_files(model_files) From b3e97749b34288543a0b85bfa6d6a444588c4c5d Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Fri, 6 Jul 2018 17:12:08 -0700 Subject: [PATCH 04/59] Added LanguageModelling with RNN training file --- .../lm_rnn_gluon_train.py | 283 ++++++++++++++++++ 1 file changed, 283 insertions(+) create mode 100644 tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py diff --git a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py new file mode 100644 index 000000000000..cb812f69ab62 --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py @@ -0,0 +1,283 @@ +import math +import os +import time +import numpy as np +import mxnet as mx +from mxnet import gluon, autograd +from mxnet.gluon import nn, rnn +import logging +import boto3 +import json +logging.getLogger().setLevel(logging.DEBUG) +mx.random.seed(7) +np.random.seed(7) + +mxnet_version = mx.__version__ +bucket_name = 'mxnet-model-backwards-compatibility' +ctx = mx.cpu() +num_epoch = 2 +backslash = '/' +model_name = 'lm_rnn_gluon_api' +s3 = boto3.resource('s3') + + +args_data = 'ptb.' 
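+# The 'ptb.' prefix above names the local Penn Treebank split files
+# (ptb.train.txt, ptb.valid.txt, ptb.test.txt) that download_data_from_s3 fetches from the bucket.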
+args_model = 'rnn_relu' +args_emsize = 100 +args_nhid = 100 +args_nlayers = 2 +args_lr = 1.0 +args_clip = 0.2 +args_epochs = 2 +args_batch_size = 32 +args_bptt = 5 +args_dropout = 0.2 +args_tied = True +args_cuda = 'store_true' +args_log_interval = 500 +args_save = model_name + '.params' + +class Dictionary(object): + def __init__(self): + self.word2idx = {} + self.idx2word = [] + + def add_word(self, word): + if word not in self.word2idx: + self.idx2word.append(word) + self.word2idx[word] = len(self.idx2word) - 1 + return self.word2idx[word] + + def __len__(self): + return len(self.idx2word) + +class Corpus(object): + def __init__(self, path): + self.dictionary = Dictionary() + self.download_data_from_s3() + self.train = self.tokenize(path + 'train.txt') + self.valid = self.tokenize(path + 'valid.txt') + self.test = self.tokenize(path + 'test.txt') + + def download_data_from_s3(self): + print ('Downloading files from bucket : %s' %bucket_name) + bucket = s3.Bucket(bucket_name) + files = ['test.txt', 'train.txt', 'valid.txt'] + for file in files: + if os.path.exists(args_data + file) : + print ('File %s'%(args_data + file), 'already exists. Skipping download') + continue + file_path = str(mxnet_version) + backslash + model_name + backslash + args_data + file + bucket.download_file(file_path, args_data + file) + + def tokenize(self, path): + """Tokenizes a text file.""" + assert os.path.exists(path) + # Add words to the dictionary + with open(path, 'r') as f: + tokens = 0 + for line in f: + words = line.split() + [''] + tokens += len(words) + for word in words: + self.dictionary.add_word(word) + + # Tokenize file content + with open(path, 'r') as f: + ids = np.zeros((tokens,), dtype='int32') + token = 0 + for line in f: + words = line.split() + [''] + for word in words: + ids[token] = self.dictionary.word2idx[word] + token += 1 + + return mx.nd.array(ids, dtype='int32') + +class RNNModel(gluon.Block): + """A model with an encoder, recurrent layer, and a decoder.""" + + def __init__(self, mode, vocab_size, num_embed, num_hidden, + num_layers, dropout=0.5, tie_weights=False, **kwargs): + super(RNNModel, self).__init__(**kwargs) + with self.name_scope(): + self.drop = nn.Dropout(dropout) + self.encoder = nn.Embedding(vocab_size, num_embed, + weight_initializer = mx.init.Uniform(0.1)) + if mode == 'rnn_relu': + self.rnn = rnn.RNN(num_hidden, num_layers, activation='relu', dropout=dropout, + input_size=num_embed) + elif mode == 'rnn_tanh': + self.rnn = rnn.RNN(num_hidden, num_layers, dropout=dropout, + input_size=num_embed) + elif mode == 'lstm': + self.rnn = rnn.LSTM(num_hidden, num_layers, dropout=dropout, + input_size=num_embed) + elif mode == 'gru': + self.rnn = rnn.GRU(num_hidden, num_layers, dropout=dropout, + input_size=num_embed) + else: + raise ValueError("Invalid mode %s. 
Options are rnn_relu, " + "rnn_tanh, lstm, and gru"%mode) + if tie_weights: + self.decoder = nn.Dense(vocab_size, in_units = num_hidden, + params = self.encoder.params) + else: + self.decoder = nn.Dense(vocab_size, in_units = num_hidden) + self.num_hidden = num_hidden + + def forward(self, inputs, hidden): + emb = self.drop(self.encoder(inputs)) + output, hidden = self.rnn(emb, hidden) + output = self.drop(output) + decoded = self.decoder(output.reshape((-1, self.num_hidden))) + return decoded, hidden + + def begin_state(self, *args, **kwargs): + return self.rnn.begin_state(*args, **kwargs) + +context = mx.cpu(0) + +def batchify(data, batch_size): + """Reshape data into (num_example, batch_size)""" + nbatch = data.shape[0] // batch_size + data = data[:nbatch * batch_size] + data = data.reshape((batch_size, nbatch)).T + return data + +def get_batch(source, i): + seq_len = min(args_bptt, source.shape[0] - 1 - i) + data = source[i : i + seq_len] + target = source[i + 1 : i + 1 + seq_len] + return data, target.reshape((-1,)) + +def detach(hidden): + if isinstance(hidden, (tuple, list)): + hidden = [i.detach() for i in hidden] + else: + hidden = hidden.detach() + return hidden + +def eval(data_source, model): + total_L = 0.0 + ntotal = 0 + hidden = model.begin_state(func = mx.nd.zeros, batch_size = args_batch_size, ctx=context) + for i in range(0, data_source.shape[0] - 1, args_bptt): + data, target = get_batch(data_source, i) + output, hidden = model(data, hidden) + L = loss(output, target) + total_L += mx.nd.sum(L).asscalar() + ntotal += L.size + return total_L / ntotal + +def train(model, train_data): + best_val = float("Inf") + for epoch in range(args_epochs): + total_L = 0.0 + start_time = time.time() + hidden = model.begin_state(func = mx.nd.zeros, batch_size = args_batch_size, ctx = context) + for ibatch, i in enumerate(range(0, train_data.shape[0] - 1, args_bptt)): + data, target = get_batch(train_data, i) + hidden = detach(hidden) + with autograd.record(): + output, hidden = model(data, hidden) + L = loss(output, target) + L.backward() + + grads = [i.grad(context) for i in model.collect_params().values()] + # Here gradient is for the whole batch. + # So we multiply max_norm by batch_size and bptt size to balance it. + gluon.utils.clip_global_norm(grads, args_clip * args_bptt * args_batch_size) + + trainer.step(args_batch_size) + total_L += mx.nd.sum(L).asscalar() + + if ibatch % args_log_interval == 0 and ibatch > 0: + cur_L = total_L / args_bptt / args_batch_size / args_log_interval + print('[Epoch %d Batch %d] loss %.2f, perplexity %.2f' % ( + epoch + 1, ibatch, cur_L, math.exp(cur_L))) + total_L = 0.0 + + val_L = eval(val_data, model) + + print('[Epoch %d] time cost %.2fs, validation loss %.2f, validation perplexity %.2f' % ( + epoch + 1, time.time() - start_time, val_L, math.exp(val_L))) + + if val_L < best_val: + best_val = val_L + model.save_parameters(args_save) + +def test(test_data, model): + test_L = eval(test_data, model) + return test_L, math.exp(test_L) + +def save_inference_results(test, val): + inference_results = dict() + inference_results['val'] = val + inference_results['test'] = test + + inference_results_file = model_name + '_inference' + '.json' + + # Write the inference results to local json file. 
This will be cleaned up later + with open(inference_results_file, 'w') as file: + json.dump(inference_results, file) + +def upload_model_files_to_s3(bucket_name, files, folder_name): + s3 = boto3.client('s3') + for file in files: + s3.upload_file(file, bucket_name, folder_name + file) + print ('model successfully uploaded to s3') + +def clean_up_files (model_files): + clean_ptb_data() + clean_model_files(model_files) + print ('Model files deleted') + +def clean_ptb_data(): + files = ['test.txt', 'train.txt', 'valid.txt'] + for file in files: + if os.path.isfile(args_data + file): + os.remove(args_data + file) + +def clean_model_files(model_files): + for file in model_files: + if os.path.isfile(file): + os.remove(file) + +if __name__=='__main__': + corpus = Corpus(args_data) + train_data = batchify(corpus.train, args_batch_size).as_in_context(context) + val_data = batchify(corpus.valid, args_batch_size).as_in_context(context) + test_data = batchify(corpus.test, args_batch_size).as_in_context(context) + + ntokens = len(corpus.dictionary) + + model = RNNModel(args_model, ntokens, args_emsize, args_nhid, + args_nlayers, args_dropout, args_tied) + model.collect_params().initialize(mx.init.Xavier(), ctx=context) + trainer = gluon.Trainer(model.collect_params(), 'sgd', {'learning_rate': args_lr, 'momentum': 0, 'wd': 0}) + loss = gluon.loss.SoftmaxCrossEntropyLoss() + + train(model, train_data) + val_loss, val_ppl = test(val_data, model) + print('Validation loss %f, Validation perplexity %f'%(val_loss, val_ppl)) + test_loss, test_ppl = test(test_data, model) + print('test loss %f, test perplexity %f'%(test_loss, test_ppl)) + + val_results = dict() + val_results['loss'] = val_loss + val_results['ppl'] = val_ppl + + test_results = dict() + test_results['loss'] = test_loss + test_results['ppl'] = test_ppl + + save_inference_results(test_results, val_results) + + mxnet_folder = str(mxnet_version) + backslash + model_name + backslash + + files = list() + files.append(model_name + '.params') + files.append(model_name + '_inference' + '.json') + upload_model_files_to_s3(bucket_name, files, mxnet_folder) + clean_up_files(files) \ No newline at end of file From c141701f97fb0bdee79b78e032ccbc88eea8c29f Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Fri, 6 Jul 2018 17:39:23 -0700 Subject: [PATCH 05/59] Added LamguageModelling with RNN inference file --- .../lm_rnn_gluon_inference.py | 256 ++++++++++++++++++ 1 file changed, 256 insertions(+) create mode 100644 tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py diff --git a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py new file mode 100644 index 000000000000..d7d787088ef8 --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py @@ -0,0 +1,256 @@ +import math +import os +import time +import numpy as np +import mxnet as mx +from mxnet import gluon, autograd +from mxnet.gluon import nn, rnn +import logging +import boto3 +import json +logging.getLogger().setLevel(logging.DEBUG) +mx.random.seed(7) +np.random.seed(7) + +mxnet_version = mx.__version__ +bucket_name = 'mxnet-model-backwards-compatibility' +ctx = mx.cpu() +backslash = '/' +model_name = 'lm_rnn_gluon_api' +s3 = boto3.resource('s3') + + +args_data = 'ptb.' 
+args_model = 'rnn_relu' +args_emsize = 100 +args_nhid = 100 +args_nlayers = 2 +args_lr = 1.0 +args_clip = 0.2 +args_epochs = 2 +args_batch_size = 32 +args_bptt = 5 +args_dropout = 0.2 +args_tied = True +args_cuda = 'store_true' +args_log_interval = 500 +args_save = model_name + '.params' + +class Dictionary(object): + def __init__(self): + self.word2idx = {} + self.idx2word = [] + + def add_word(self, word): + if word not in self.word2idx: + self.idx2word.append(word) + self.word2idx[word] = len(self.idx2word) - 1 + return self.word2idx[word] + + def __len__(self): + return len(self.idx2word) + +class Corpus(object): + def __init__(self, path): + self.dictionary = Dictionary() + self.download_data_from_s3() + self.train = self.tokenize(path + 'train.txt') + self.valid = self.tokenize(path + 'valid.txt') + self.test = self.tokenize(path + 'test.txt') + + def download_data_from_s3(self): + print ('Downloading files from bucket : %s' %bucket_name) + bucket = s3.Bucket(bucket_name) + files = ['test.txt', 'train.txt', 'valid.txt'] + for file in files: + if os.path.exists(args_data + file) : + print ('File %s'%(args_data + file), 'already exists. Skipping download') + continue + file_path = str(mxnet_version) + backslash + model_name + backslash + args_data + file + bucket.download_file(file_path, args_data + file) + + def tokenize(self, path): + """Tokenizes a text file.""" + assert os.path.exists(path) + # Add words to the dictionary + with open(path, 'r') as f: + tokens = 0 + for line in f: + words = line.split() + [''] + tokens += len(words) + for word in words: + self.dictionary.add_word(word) + + # Tokenize file content + with open(path, 'r') as f: + ids = np.zeros((tokens,), dtype='int32') + token = 0 + for line in f: + words = line.split() + [''] + for word in words: + ids[token] = self.dictionary.word2idx[word] + token += 1 + + return mx.nd.array(ids, dtype='int32') + +class RNNModel(gluon.Block): + """A model with an encoder, recurrent layer, and a decoder.""" + + def __init__(self, mode, vocab_size, num_embed, num_hidden, + num_layers, dropout=0.5, tie_weights=False, **kwargs): + super(RNNModel, self).__init__(**kwargs) + with self.name_scope(): + self.drop = nn.Dropout(dropout) + self.encoder = nn.Embedding(vocab_size, num_embed, + weight_initializer = mx.init.Uniform(0.1)) + if mode == 'rnn_relu': + self.rnn = rnn.RNN(num_hidden, num_layers, activation='relu', dropout=dropout, + input_size=num_embed) + elif mode == 'rnn_tanh': + self.rnn = rnn.RNN(num_hidden, num_layers, dropout=dropout, + input_size=num_embed) + elif mode == 'lstm': + self.rnn = rnn.LSTM(num_hidden, num_layers, dropout=dropout, + input_size=num_embed) + elif mode == 'gru': + self.rnn = rnn.GRU(num_hidden, num_layers, dropout=dropout, + input_size=num_embed) + else: + raise ValueError("Invalid mode %s. 
Options are rnn_relu, " + "rnn_tanh, lstm, and gru"%mode) + if tie_weights: + self.decoder = nn.Dense(vocab_size, in_units = num_hidden, + params = self.encoder.params) + else: + self.decoder = nn.Dense(vocab_size, in_units = num_hidden) + self.num_hidden = num_hidden + + def forward(self, inputs, hidden): + emb = self.drop(self.encoder(inputs)) + output, hidden = self.rnn(emb, hidden) + output = self.drop(output) + decoded = self.decoder(output.reshape((-1, self.num_hidden))) + return decoded, hidden + + def begin_state(self, *args, **kwargs): + return self.rnn.begin_state(*args, **kwargs) + +context = mx.cpu(0) + +def batchify(data, batch_size): + """Reshape data into (num_example, batch_size)""" + nbatch = data.shape[0] // batch_size + data = data[:nbatch * batch_size] + data = data.reshape((batch_size, nbatch)).T + return data + +def get_batch(source, i): + seq_len = min(args_bptt, source.shape[0] - 1 - i) + data = source[i : i + seq_len] + target = source[i + 1 : i + 1 + seq_len] + return data, target.reshape((-1,)) + +def detach(hidden): + if isinstance(hidden, (tuple, list)): + hidden = [i.detach() for i in hidden] + else: + hidden = hidden.detach() + return hidden + +def eval(data_source, model): + total_L = 0.0 + ntotal = 0 + loss = gluon.loss.SoftmaxCrossEntropyLoss() + hidden = model.begin_state(func = mx.nd.zeros, batch_size = args_batch_size, ctx=context) + for i in range(0, data_source.shape[0] - 1, args_bptt): + data, target = get_batch(data_source, i) + output, hidden = model(data, hidden) + L = loss(output, target) + total_L += mx.nd.sum(L).asscalar() + ntotal += L.size + return total_L / ntotal + +def test(test_data, model): + test_L = eval(test_data, model) + return test_L, math.exp(test_L) + +def get_top_level_folders_in_bucket(s3client, bucket_name): + '''This function returns the top level folders in the S3Bucket. These folders help us to navigate to the trained model files stored for different MXNet versions. 
''' + bucket = s3client.Bucket(bucket_name) + result = bucket.meta.client.list_objects(Bucket=bucket.name, + Delimiter=backslash) + folder_list = list() + for obj in result['CommonPrefixes']: + folder_list.append(obj['Prefix'].strip(backslash)) + + return folder_list + +def get_model(model_file): + model_2 = RNNModel(args_model, ntokens, args_emsize, args_nhid, + args_nlayers, args_dropout, args_tied) + model_2.load_parameters(args_save, context) + + return model_2 + +def perform_inference(test_data, val_data, model, inference_file): + test_loss, test_ppl = test(test_data, model) + val_loss, val_ppl = test(val_data, model) + + with open(inference_file, 'r') as file: + results = json.load(file) + val_results = results['val'] + test_results = results['test'] + + print ('Validation loss on inference is %f while that on the original training file is %f' % (val_loss, val_results['loss'])) + print ('Test loss on inference is %f while that on the original training file is %f' % (test_loss, test_results['loss'])) + + assert(test_loss == test_results['loss']) + assert(test_ppl == test_results['ppl']) + + assert(val_loss == val_results['loss']) + assert(val_ppl == val_results['ppl']) + + print ('Inference results passed for %s' % model_name) + +def clean_up_files (model_files): + clean_ptb_data() + clean_model_files(model_files) + print ('Model files deleted') + +def clean_ptb_data(): + files = ['test.txt', 'train.txt', 'valid.txt'] + for file in files: + if os.path.isfile(args_data + file): + os.remove(args_data + file) + +def clean_model_files(model_files): + for file in model_files: + if os.path.isfile(file): + os.remove(file) + +if __name__=='__main__': + corpus = Corpus(args_data) + train_data = batchify(corpus.train, args_batch_size).as_in_context(context) + val_data = batchify(corpus.valid, args_batch_size).as_in_context(context) + test_data = batchify(corpus.test, args_batch_size).as_in_context(context) + + ntokens = len(corpus.dictionary) + for folder in get_top_level_folders_in_bucket(s3, bucket_name): + bucket = s3.Bucket(bucket_name) + prefix = folder + backslash + model_name + model_files_meta = list(bucket.objects.filter(Prefix = prefix)) + if len(model_files_meta) == 0: + continue + model_files = list() + for obj in model_files_meta: + # print + file_name = obj.key.split('/')[2] + if file_name is None or len(file_name) == 0: + continue + model_files.append(file_name) + ## Download this file--- + bucket.download_file(obj.key, file_name) + + model = get_model(model_name + '.params') + perform_inference(test_data, val_data, model, model_name + '_inference.json') + clean_up_files(model_files) From 35cbefb00722cf154e11bc46a5fe05e589c9d6c4 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Sun, 8 Jul 2018 23:58:08 -0700 Subject: [PATCH 06/59] Added hybridized LENET Gluon Model training file --- .../lenet_cnn_gluon_hybrid_train.py | 209 ++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100644 tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_train.py diff --git a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_train.py b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_train.py new file mode 100644 index 000000000000..e356744ce921 --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_train.py @@ -0,0 +1,209 @@ +import boto3 +import mxnet as mx +import mxnet.ndarray as nd +from mxnet import nd, autograd, gluon +from mxnet.gluon.data.vision import transforms, datasets 
+import numpy as np +from mxnet import autograd as ag +import logging +import mxnet.ndarray as F +from mxnet.gluon import nn +import json +import os + +logging.getLogger().setLevel(logging.DEBUG) +mx.random.seed(7) +np.random.seed(7) + +batch_size=100 +num_epoch = 2 +bucket_name = 'mxnet-model-backwards-compatibility' +backslash = '/' +model_name = 'lenet_cnn_gluon_hybrid_api' + +ctx = [mx.cpu(0)] +mxnet_version = mx.__version__ + +class Net(gluon.HybridBlock): + def __init__(self, **kwargs): + super(Net, self).__init__(**kwargs) + with self.name_scope(): + # layers created in name_scope will inherit name space + # from parent layer. + self.conv1 = nn.Conv2D(20, kernel_size=(5,5)) + self.pool1 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) + self.conv2 = nn.Conv2D(50, kernel_size=(5,5)) + self.pool2 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) + self.fc1 = nn.Dense(500) + self.fc2 = nn.Dense(10) + + def hybrid_forward(self, F, x): + x = self.pool1(F.tanh(self.conv1(x))) + x = self.pool2(F.tanh(self.conv2(x))) + # 0 means copy over size from corresponding dimension. + # -1 means infer size from the rest of dimensions. + x = x.reshape((0, -1)) + x = F.tanh(self.fc1(x)) + x = F.tanh(self.fc2(x)) + return x + + +def prepare_mnist_data(mnist_raw_data): + #shuffle the indices + indices = np.random.permutation(mnist_raw_data['train_label'].shape[0]) + + #print indices[0:10] + train_idx , val_idx = indices[:50000], indices[50000:] + + train_data = mnist_raw_data['train_data'][train_idx,:] + train_label = mnist_raw_data['train_label'][train_idx] + + val_data = mnist_raw_data['train_data'][val_idx,:] + val_label = mnist_raw_data['train_label'][val_idx] + + test_data = mnist_raw_data['test_data'] + test_label = mnist_raw_data['test_label'] + + #print len(train_data) + #print len(val_data) + + train = {'train_X' : train_data, 'train_Y' : train_label} + test = {'test_X' : test_data, 'test_Y' : test_label} + val = {'val_X' : val_data, 'val_Y' : val_label} + + data = dict() + data['train'] = train + data['test'] = test + data['val'] = val + + return data + +def clean_up_files (model_files): + clean_mnist_data() + clean_model_files(model_files) + print ('Model files deleted') + +def clean_mnist_data(): + if os.path.isfile('train-images-idx3-ubyte.gz'): + os.remove('train-images-idx3-ubyte.gz') + if os.path.isfile('t10k-labels-idx1-ubyte.gz'): + os.remove('t10k-labels-idx1-ubyte.gz') + if os.path.isfile('train-labels-idx1-ubyte.gz'): + os.remove('train-labels-idx1-ubyte.gz') + if os.path.isfile('t10k-images-idx3-ubyte.gz'): + os.remove('t10k-images-idx3-ubyte.gz') + +def clean_model_files(model_files): + for file in model_files: + if os.path.isfile(file): + os.remove(file) + +def save_model_files(network): + network.export(model_name, epoch=num_epoch) + +def save_inference_results(test_acc, val_acc): + inference_results = dict() + inference_results['val_acc'] = val_acc + inference_results['test_acc'] = test_acc + + inference_results_file = model_name + '_inference' + '.json' + + # Write the inference results to local json file. This will be cleaned up later + with open(inference_results_file, 'w') as file: + json.dump(inference_results, file) + +def upload_model_files_to_s3(bucket_name, files, folder_name): + s3 = boto3.client('s3') + for file in files: + s3.upload_file(file, bucket_name, folder_name + file) + print ('model successfully uploaded to s3') + +def get_inference_score(iter, model): + # Use Accuracy as the evaluation metric. 
+ metric = mx.metric.Accuracy() + # Reset the validation data iterator. + iter.reset() + # Loop over the validation data iterator. + for batch in iter: + # Splits validation data into multiple slices along batch_axis + # and copy each slice into a context. + data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) + # Splits validation label into multiple slices along batch_axis + # and copy each slice into a context. + label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) + outputs = [] + for x in data: + outputs.append(model(x)) + # Updates internal evaluation + metric.update(label, outputs) + acc = metric.get() + return acc[1] + +if __name__=='__main__': + data = prepare_mnist_data(mx.test_utils.get_mnist()) + + train = data['train'] + val = data['val'] + test = data['test'] + + train_iter = mx.io.NDArrayIter(train['train_X'], train['train_Y'], batch_size, shuffle=True) + val_iter = mx.io.NDArrayIter(val['val_X'], val['val_Y'], batch_size, shuffle=True) + test_iter = mx.io.NDArrayIter(test['test_X'], test['test_Y']) + + + net = Net() + net.initialize(mx.init.Xavier(), ctx=ctx) + net.hybridize() + + metric = mx.metric.Accuracy() + softmax_cross_entropy_loss = gluon.loss.SoftmaxCrossEntropyLoss() + trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.02}) + + for i in range(num_epoch): + train_iter.reset() + for batch in train_iter: + data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) + label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) + outputs = [] + # Inside training scope + with ag.record(): + for x, y in zip(data, label): + z = net(x) + # Computes softmax cross entropy loss. + loss = softmax_cross_entropy_loss(z, y) + # Backpropagate the error for one iteration. + loss.backward() + outputs.append(z) + + metric.update(label, outputs) + # Make one step of parameter update. Trainer needs to know the + # batch size of data to normalize the gradient by 1/batch_size. + trainer.step(batch.data[0].shape[0]) + + name, acc = metric.get() + # Reset evaluation result to initial state. 
+ metric.reset() + print('training acc at epoch %d: %s=%f'%(i, name, acc)) + + save_model_files(net) + + + # In[6]: + val_acc = get_inference_score(val_iter, net) + print('validation acc: =%f'%val_acc) + + test_acc = get_inference_score(test_iter, net) + print('test acc: =%f'%test_acc) + + save_inference_results(test_acc, val_acc) + + mxnet_folder = str(mxnet_version) + backslash + model_name + backslash + + files = list() + files.append(model_name + '-000' + str(num_epoch) + '.params') + files.append(model_name + '-symbol.json') + files.append(model_name + '_inference' + '.json') + + upload_model_files_to_s3(bucket_name, files, mxnet_folder) + + clean_up_files(files) \ No newline at end of file From 418f8051b0645af195803b5087937c0b289579a9 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Sun, 8 Jul 2018 23:58:28 -0700 Subject: [PATCH 07/59] Added hybridized LENET gluon model inference file --- .../lenet_cnn_gluon_hybrid_inference.py | 178 ++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_inference.py diff --git a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_inference.py b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_inference.py new file mode 100644 index 000000000000..e6b52697fa40 --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_inference.py @@ -0,0 +1,178 @@ +import boto3 +import mxnet as mx +import mxnet.ndarray as nd +from mxnet import nd, autograd, gluon +from mxnet.gluon.data.vision import transforms, datasets +import numpy as np +from mxnet import autograd as ag +import logging +import mxnet.ndarray as F +from mxnet.gluon import nn +import json +import os + +logging.getLogger().setLevel(logging.DEBUG) +mx.random.seed(7) +np.random.seed(7) + +bucket_name = 'mxnet-model-backwards-compatibility' +backslash = '/' +model_name = 'lenet_cnn_gluon_hybrid_api' +s3 = boto3.resource('s3') +num_epoch = 2 +ctx = [mx.cpu(0)] +batch_size = 100 + +def prepare_mnist_data(mnist_raw_data): + + #shuffle the indices + indices = np.random.permutation(mnist_raw_data['train_label'].shape[0]) + + #print indices[0:10] + train_idx , val_idx = indices[:50000], indices[50000:] + + train_data = mnist_raw_data['train_data'][train_idx,:] + train_label = mnist_raw_data['train_label'][train_idx] + + val_data = mnist_raw_data['train_data'][val_idx,:] + val_label = mnist_raw_data['train_label'][val_idx] + + test_data = mnist_raw_data['test_data'] + test_label = mnist_raw_data['test_label'] + + #print len(train_data) + #print len(val_data) + + train = {'train_X' : train_data, 'train_Y' : train_label} + test = {'test_X' : test_data, 'test_Y' : test_label} + val = {'val_X' : val_data, 'val_Y' : val_label} + + data = dict() + data['train'] = train + data['test'] = test + data['val'] = val + + return data + +def get_val_test_iter(): + data = prepare_mnist_data(mx.test_utils.get_mnist()) + val = data['val'] + test = data['test'] + val_iter = mx.io.NDArrayIter(val['val_X'], val['val_Y'], batch_size, shuffle=True) + test_iter = mx.io.NDArrayIter(test['test_X'], test['test_Y']) + return val_iter, test_iter + +val_iter, test_iter = get_val_test_iter() + +def get_top_level_folders_in_bucket(s3client, bucket_name): + '''This function returns the top level folders in the S3Bucket. These folders help us to navigate to the trained model files stored for different MXNet versions. 
''' + bucket = s3client.Bucket(bucket_name) + result = bucket.meta.client.list_objects(Bucket=bucket.name, + Delimiter=backslash) + folder_list = list() + for obj in result['CommonPrefixes']: + folder_list.append(obj['Prefix'].strip(backslash)) + + return folder_list + +class Net(gluon.HybridBlock): + def __init__(self, **kwargs): + super(Net, self).__init__(**kwargs) + with self.name_scope(): + # layers created in name_scope will inherit name space + # from parent layer. + self.conv1 = nn.Conv2D(20, kernel_size=(5,5)) + self.pool1 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) + self.conv2 = nn.Conv2D(50, kernel_size=(5,5)) + self.pool2 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) + self.fc1 = nn.Dense(500) + self.fc2 = nn.Dense(10) + + def hybrid_forward(self, F, x): + x = self.pool1(F.tanh(self.conv1(x))) + x = self.pool2(F.tanh(self.conv2(x))) + # 0 means copy over size from corresponding dimension. + # -1 means infer size from the rest of dimensions. + x = x.reshape((0, -1)) + x = F.tanh(self.fc1(x)) + x = F.tanh(self.fc2(x)) + return x + + +def get_model(model_name): + net = gluon.SymbolBlock.imports(model_name + '-symbol.json', ['data'], model_name + '-000' + str(num_epoch) + '.params') + return net + +def get_inference_score(iter, model): + # Use Accuracy as the evaluation metric. + metric = mx.metric.Accuracy() + # Reset the validation data iterator. + iter.reset() + # Loop over the validation data iterator. + for batch in iter: + # Splits validation data into multiple slices along batch_axis + # and copy each slice into a context. + data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) + # Splits validation label into multiple slices along batch_axis + # and copy each slice into a context. + label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) + outputs = [] + for x in data: + outputs.append(model(x)) + # Updates internal evaluation + metric.update(label, outputs) + acc = metric.get() + return acc[1] + +def perform_inference(test_iter, val_iter, model, inference_file): + test_inference_score = get_inference_score(test_iter, model) + val_inference_score = get_inference_score(val_iter, model) + + with open(inference_file, 'r') as file: + results = json.load(file) + + print (test_inference_score, val_inference_score) + print results['val_acc'] + print ('Validation accuracy on inference is %f while that on the original training file is %f' % (val_inference_score, results['val_acc'])) + print ('Test accuracy on inference is %f while that on the original training file is %f' % (test_inference_score, results['test_acc'])) + assert(results['val_acc'] == val_inference_score) + assert(results['test_acc'] == test_inference_score) + print ('Inference results passed for %s' % model_name) + +def clean_up_files (model_files): + clean_mnist_data() + clean_model_files(model_files) + print ('Model files deleted') + +def clean_mnist_data(): + if os.path.isfile('train-images-idx3-ubyte.gz'): + os.remove('train-images-idx3-ubyte.gz') + if os.path.isfile('t10k-labels-idx1-ubyte.gz'): + os.remove('t10k-labels-idx1-ubyte.gz') + if os.path.isfile('train-labels-idx1-ubyte.gz'): + os.remove('train-labels-idx1-ubyte.gz') + if os.path.isfile('t10k-images-idx3-ubyte.gz'): + os.remove('t10k-images-idx3-ubyte.gz') + +def clean_model_files(model_files): + for file in model_files: + if os.path.isfile(file): + os.remove(file) + +if __name__=='__main__': + for folder in get_top_level_folders_in_bucket(s3, bucket_name): + bucket = s3.Bucket(bucket_name) + prefix 
= folder + backslash + model_name + model_files_meta = list(bucket.objects.filter(Prefix = prefix)) + if len(model_files_meta) == 0: + continue + model_files = list() + for obj in model_files_meta: + file_name = obj.key.split('/')[2] + model_files.append(file_name) + ## Download this file--- + bucket.download_file(obj.key, file_name) + + model = get_model(model_name) + perform_inference(test_iter, val_iter, model, model_name + '_inference.json') + clean_up_files(model_files) From 600efaff9a1db731ec7e0553a3e971493bcfb61b Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 9 Jul 2018 13:09:35 -0700 Subject: [PATCH 08/59] Added license headers --- .../lenet_cnn_gluon_hybrid_inference.py | 20 +++++++++++++++++++ .../lenet_cnn_gluon_hybrid_train.py | 20 +++++++++++++++++++ .../lenet_cnn_gluon_inference.py | 20 +++++++++++++++++++ .../lenet_cnn_gluon_train.py | 20 +++++++++++++++++++ .../lm_rnn_gluon_inference.py | 19 ++++++++++++++++++ .../lm_rnn_gluon_train.py | 20 +++++++++++++++++++ .../mnist_mlp_module_api_inference.py | 20 +++++++++++++++++++ .../mnist_mlp_module_api_train.py | 19 ++++++++++++++++++ 8 files changed, 158 insertions(+) diff --git a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_inference.py b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_inference.py index e6b52697fa40..dbd5c85a22bc 100644 --- a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_inference.py @@ -1,3 +1,23 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + import boto3 import mxnet as mx import mxnet.ndarray as nd diff --git a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_train.py b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_train.py index e356744ce921..00c4b50d0f0f 100644 --- a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_train.py +++ b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_train.py @@ -1,3 +1,23 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + import boto3 import mxnet as mx import mxnet.ndarray as nd diff --git a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_inference.py b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_inference.py index 8a2d72036714..960a7a986d6a 100644 --- a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_inference.py @@ -1,3 +1,23 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + import boto3 import mxnet as mx import mxnet.ndarray as nd diff --git a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_train.py b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_train.py index fc71940208e3..5222e68871ac 100644 --- a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_train.py +++ b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_train.py @@ -1,3 +1,23 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + import boto3 import mxnet as mx import mxnet.ndarray as nd diff --git a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py index d7d787088ef8..23aad2b9f4a6 100644 --- a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py @@ -1,3 +1,22 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import math import os import time diff --git a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py index cb812f69ab62..c6d9932d2a21 100644 --- a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py +++ b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py @@ -1,3 +1,23 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + import math import os import time diff --git a/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py b/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py index 956682081d08..77e59be381c7 100644 --- a/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py @@ -1,3 +1,23 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + import boto3 import mxnet as mx import numpy as np diff --git a/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_train.py b/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_train.py index 8254ffe9b3c3..e9834677d196 100644 --- a/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_train.py +++ b/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_train.py @@ -1,3 +1,22 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import boto3 import mxnet as mx import numpy as np From d73b9e23fc8d860733a7c7c871bc323a4c0a34dc Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 9 Jul 2018 13:40:57 -0700 Subject: [PATCH 09/59] Refactored the model and inference files and extracted out duplicate code in a common file --- .../common.py | 147 ++++++++++++++++++ .../lenet_cnn_gluon_hybrid_inference.py | 109 +------------ .../lenet_cnn_gluon_hybrid_train.py | 117 ++------------ .../lenet_cnn_gluon_inference.py | 110 +------------ .../lenet_cnn_gluon_train.py | 115 +------------- .../mnist_mlp_module_api_inference.py | 84 +--------- .../mnist_mlp_module_api_train.py | 88 ++--------- 7 files changed, 179 insertions(+), 591 deletions(-) create mode 100644 tests/nightly/model_backwards_compatibility_check/common.py diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py new file mode 100644 index 000000000000..4204dc0b3e61 --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -0,0 +1,147 @@ +import boto3 +import mxnet as mx +import json +import os +import numpy as np +import logging +from mxnet import nd, autograd, gluon +import mxnet.ndarray as nd +from mxnet.gluon.data.vision import transforms, datasets +from mxnet import autograd as ag +import mxnet.ndarray as F +from mxnet.gluon import nn + +# Set fixed random seeds. 
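+# The same seeds are shared by the training and inference scripts that import
+# this module, so the accuracies recorded at training time stay reproducible
+# when the saved models are checked at inference time.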
+mx.random.seed(7) +np.random.seed(7) +logging.getLogger().setLevel(logging.DEBUG) + +# get the current mxnet version we are running on +mxnet_version = mx.__version__ +bucket_name = 'mxnet-model-backwards-compatibility' +backslash = '/' +s3 = boto3.resource('s3') +num_epoch = 2 + +def prepare_mnist_data(mnist_raw_data): + + #shuffle the indices + indices = np.random.permutation(mnist_raw_data['train_label'].shape[0]) + + #print indices[0:10] + train_idx , val_idx = indices[:50000], indices[50000:] + + train_data = mnist_raw_data['train_data'][train_idx,:] + train_label = mnist_raw_data['train_label'][train_idx] + + val_data = mnist_raw_data['train_data'][val_idx,:] + val_label = mnist_raw_data['train_label'][val_idx] + + test_data = mnist_raw_data['test_data'] + test_label = mnist_raw_data['test_label'] + + #print len(train_data) + #print len(val_data) + + train = {'train_X' : train_data, 'train_Y' : train_label} + test = {'test_X' : test_data, 'test_Y' : test_label} + val = {'val_X' : val_data, 'val_Y' : val_label} + + data = dict() + data['train'] = train + data['test'] = test + data['val'] = val + + return data + +def get_top_level_folders_in_bucket(s3client, bucket_name): + '''This function returns the top level folders in the S3Bucket. These folders help us to navigate to the trained model files stored for different MXNet versions. ''' + bucket = s3client.Bucket(bucket_name) + result = bucket.meta.client.list_objects(Bucket=bucket.name, + Delimiter=backslash) + folder_list = list() + for obj in result['CommonPrefixes']: + folder_list.append(obj['Prefix'].strip(backslash)) + + return folder_list + +def clean_mnist_data(): + if os.path.isfile('train-images-idx3-ubyte.gz'): + os.remove('train-images-idx3-ubyte.gz') + if os.path.isfile('t10k-labels-idx1-ubyte.gz'): + os.remove('t10k-labels-idx1-ubyte.gz') + if os.path.isfile('train-labels-idx1-ubyte.gz'): + os.remove('train-labels-idx1-ubyte.gz') + if os.path.isfile('t10k-images-idx3-ubyte.gz'): + os.remove('t10k-images-idx3-ubyte.gz') + +def clean_model_files(model_files): + for file in model_files: + if os.path.isfile(file): + os.remove(file) + +def upload_model_files_to_s3(bucket_name, files, folder_name): + s3 = boto3.client('s3') + for file in files: + s3.upload_file(file, bucket_name, folder_name + file) + print ('model successfully uploaded to s3') + +def save_inference_results(inference_results_file, inference_results): + # Write the inference results to local json file. This will be cleaned up later + with open(inference_results_file, 'w') as file: + json.dump(inference_results, file) + +def get_val_test_iter(): + data = prepare_mnist_data(mx.test_utils.get_mnist()) + val = data['val'] + test = data['test'] + batch_size = 100 + val_iter = mx.io.NDArrayIter(val['val_X'], val['val_Y'], batch_size, shuffle=True) + test_iter = mx.io.NDArrayIter(test['test_X'], test['test_Y']) + return val_iter, test_iter + +class HybridNet(gluon.HybridBlock): + def __init__(self, **kwargs): + super(HybridNet, self).__init__(**kwargs) + with self.name_scope(): + # layers created in name_scope will inherit name space + # from parent layer. 
+ self.conv1 = nn.Conv2D(20, kernel_size=(5,5)) + self.pool1 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) + self.conv2 = nn.Conv2D(50, kernel_size=(5,5)) + self.pool2 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) + self.fc1 = nn.Dense(500) + self.fc2 = nn.Dense(10) + + def hybrid_forward(self, F, x): + x = self.pool1(F.tanh(self.conv1(x))) + x = self.pool2(F.tanh(self.conv2(x))) + # 0 means copy over size from corresponding dimension. + # -1 means infer size from the rest of dimensions. + x = x.reshape((0, -1)) + x = F.tanh(self.fc1(x)) + x = F.tanh(self.fc2(x)) + return x + +class Net(gluon.Block): + def __init__(self, **kwargs): + super(Net, self).__init__(**kwargs) + with self.name_scope(): + # layers created in name_scope will inherit name space + # from parent layer. + self.conv1 = nn.Conv2D(20, kernel_size=(5,5)) + self.pool1 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) + self.conv2 = nn.Conv2D(50, kernel_size=(5,5)) + self.pool2 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) + self.fc1 = nn.Dense(500) + self.fc2 = nn.Dense(10) + + def forward(self, x): + x = self.pool1(F.tanh(self.conv1(x))) + x = self.pool2(F.tanh(self.conv2(x))) + # 0 means copy over size from corresponding dimension. + # -1 means infer size from the rest of dimensions. + x = x.reshape((0, -1)) + x = F.tanh(self.fc1(x)) + x = F.tanh(self.fc2(x)) + return x diff --git a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_inference.py b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_inference.py index dbd5c85a22bc..756f73e9bf28 100644 --- a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_inference.py @@ -18,107 +18,15 @@ # under the License. 
-import boto3 -import mxnet as mx -import mxnet.ndarray as nd -from mxnet import nd, autograd, gluon -from mxnet.gluon.data.vision import transforms, datasets -import numpy as np -from mxnet import autograd as ag -import logging -import mxnet.ndarray as F -from mxnet.gluon import nn -import json -import os +from common import * -logging.getLogger().setLevel(logging.DEBUG) -mx.random.seed(7) -np.random.seed(7) - -bucket_name = 'mxnet-model-backwards-compatibility' -backslash = '/' model_name = 'lenet_cnn_gluon_hybrid_api' -s3 = boto3.resource('s3') num_epoch = 2 ctx = [mx.cpu(0)] batch_size = 100 -def prepare_mnist_data(mnist_raw_data): - - #shuffle the indices - indices = np.random.permutation(mnist_raw_data['train_label'].shape[0]) - - #print indices[0:10] - train_idx , val_idx = indices[:50000], indices[50000:] - - train_data = mnist_raw_data['train_data'][train_idx,:] - train_label = mnist_raw_data['train_label'][train_idx] - - val_data = mnist_raw_data['train_data'][val_idx,:] - val_label = mnist_raw_data['train_label'][val_idx] - - test_data = mnist_raw_data['test_data'] - test_label = mnist_raw_data['test_label'] - - #print len(train_data) - #print len(val_data) - - train = {'train_X' : train_data, 'train_Y' : train_label} - test = {'test_X' : test_data, 'test_Y' : test_label} - val = {'val_X' : val_data, 'val_Y' : val_label} - - data = dict() - data['train'] = train - data['test'] = test - data['val'] = val - - return data - -def get_val_test_iter(): - data = prepare_mnist_data(mx.test_utils.get_mnist()) - val = data['val'] - test = data['test'] - val_iter = mx.io.NDArrayIter(val['val_X'], val['val_Y'], batch_size, shuffle=True) - test_iter = mx.io.NDArrayIter(test['test_X'], test['test_Y']) - return val_iter, test_iter - val_iter, test_iter = get_val_test_iter() -def get_top_level_folders_in_bucket(s3client, bucket_name): - '''This function returns the top level folders in the S3Bucket. These folders help us to navigate to the trained model files stored for different MXNet versions. ''' - bucket = s3client.Bucket(bucket_name) - result = bucket.meta.client.list_objects(Bucket=bucket.name, - Delimiter=backslash) - folder_list = list() - for obj in result['CommonPrefixes']: - folder_list.append(obj['Prefix'].strip(backslash)) - - return folder_list - -class Net(gluon.HybridBlock): - def __init__(self, **kwargs): - super(Net, self).__init__(**kwargs) - with self.name_scope(): - # layers created in name_scope will inherit name space - # from parent layer. - self.conv1 = nn.Conv2D(20, kernel_size=(5,5)) - self.pool1 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) - self.conv2 = nn.Conv2D(50, kernel_size=(5,5)) - self.pool2 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) - self.fc1 = nn.Dense(500) - self.fc2 = nn.Dense(10) - - def hybrid_forward(self, F, x): - x = self.pool1(F.tanh(self.conv1(x))) - x = self.pool2(F.tanh(self.conv2(x))) - # 0 means copy over size from corresponding dimension. - # -1 means infer size from the rest of dimensions. 
- x = x.reshape((0, -1)) - x = F.tanh(self.fc1(x)) - x = F.tanh(self.fc2(x)) - return x - - def get_model(model_name): net = gluon.SymbolBlock.imports(model_name + '-symbol.json', ['data'], model_name + '-000' + str(num_epoch) + '.params') return net @@ -164,21 +72,6 @@ def clean_up_files (model_files): clean_model_files(model_files) print ('Model files deleted') -def clean_mnist_data(): - if os.path.isfile('train-images-idx3-ubyte.gz'): - os.remove('train-images-idx3-ubyte.gz') - if os.path.isfile('t10k-labels-idx1-ubyte.gz'): - os.remove('t10k-labels-idx1-ubyte.gz') - if os.path.isfile('train-labels-idx1-ubyte.gz'): - os.remove('train-labels-idx1-ubyte.gz') - if os.path.isfile('t10k-images-idx3-ubyte.gz'): - os.remove('t10k-images-idx3-ubyte.gz') - -def clean_model_files(model_files): - for file in model_files: - if os.path.isfile(file): - os.remove(file) - if __name__=='__main__': for folder in get_top_level_folders_in_bucket(s3, bucket_name): bucket = s3.Bucket(bucket_name) diff --git a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_train.py b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_train.py index 00c4b50d0f0f..b7965ce7c2a5 100644 --- a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_train.py +++ b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_train.py @@ -17,127 +17,22 @@ # specific language governing permissions and limitations # under the License. - -import boto3 -import mxnet as mx -import mxnet.ndarray as nd -from mxnet import nd, autograd, gluon -from mxnet.gluon.data.vision import transforms, datasets -import numpy as np -from mxnet import autograd as ag -import logging -import mxnet.ndarray as F -from mxnet.gluon import nn -import json -import os - -logging.getLogger().setLevel(logging.DEBUG) -mx.random.seed(7) -np.random.seed(7) +from common import * batch_size=100 num_epoch = 2 -bucket_name = 'mxnet-model-backwards-compatibility' -backslash = '/' model_name = 'lenet_cnn_gluon_hybrid_api' ctx = [mx.cpu(0)] -mxnet_version = mx.__version__ - -class Net(gluon.HybridBlock): - def __init__(self, **kwargs): - super(Net, self).__init__(**kwargs) - with self.name_scope(): - # layers created in name_scope will inherit name space - # from parent layer. - self.conv1 = nn.Conv2D(20, kernel_size=(5,5)) - self.pool1 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) - self.conv2 = nn.Conv2D(50, kernel_size=(5,5)) - self.pool2 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) - self.fc1 = nn.Dense(500) - self.fc2 = nn.Dense(10) - - def hybrid_forward(self, F, x): - x = self.pool1(F.tanh(self.conv1(x))) - x = self.pool2(F.tanh(self.conv2(x))) - # 0 means copy over size from corresponding dimension. - # -1 means infer size from the rest of dimensions. 
- x = x.reshape((0, -1)) - x = F.tanh(self.fc1(x)) - x = F.tanh(self.fc2(x)) - return x - - -def prepare_mnist_data(mnist_raw_data): - #shuffle the indices - indices = np.random.permutation(mnist_raw_data['train_label'].shape[0]) - - #print indices[0:10] - train_idx , val_idx = indices[:50000], indices[50000:] - - train_data = mnist_raw_data['train_data'][train_idx,:] - train_label = mnist_raw_data['train_label'][train_idx] - - val_data = mnist_raw_data['train_data'][val_idx,:] - val_label = mnist_raw_data['train_label'][val_idx] - - test_data = mnist_raw_data['test_data'] - test_label = mnist_raw_data['test_label'] - - #print len(train_data) - #print len(val_data) - - train = {'train_X' : train_data, 'train_Y' : train_label} - test = {'test_X' : test_data, 'test_Y' : test_label} - val = {'val_X' : val_data, 'val_Y' : val_label} - - data = dict() - data['train'] = train - data['test'] = test - data['val'] = val - - return data def clean_up_files (model_files): clean_mnist_data() clean_model_files(model_files) print ('Model files deleted') -def clean_mnist_data(): - if os.path.isfile('train-images-idx3-ubyte.gz'): - os.remove('train-images-idx3-ubyte.gz') - if os.path.isfile('t10k-labels-idx1-ubyte.gz'): - os.remove('t10k-labels-idx1-ubyte.gz') - if os.path.isfile('train-labels-idx1-ubyte.gz'): - os.remove('train-labels-idx1-ubyte.gz') - if os.path.isfile('t10k-images-idx3-ubyte.gz'): - os.remove('t10k-images-idx3-ubyte.gz') - -def clean_model_files(model_files): - for file in model_files: - if os.path.isfile(file): - os.remove(file) - def save_model_files(network): network.export(model_name, epoch=num_epoch) -def save_inference_results(test_acc, val_acc): - inference_results = dict() - inference_results['val_acc'] = val_acc - inference_results['test_acc'] = test_acc - - inference_results_file = model_name + '_inference' + '.json' - - # Write the inference results to local json file. This will be cleaned up later - with open(inference_results_file, 'w') as file: - json.dump(inference_results, file) - -def upload_model_files_to_s3(bucket_name, files, folder_name): - s3 = boto3.client('s3') - for file in files: - s3.upload_file(file, bucket_name, folder_name + file) - print ('model successfully uploaded to s3') - def get_inference_score(iter, model): # Use Accuracy as the evaluation metric. metric = mx.metric.Accuracy() @@ -171,7 +66,7 @@ def get_inference_score(iter, model): test_iter = mx.io.NDArrayIter(test['test_X'], test['test_Y']) - net = Net() + net = HybridNet() net.initialize(mx.init.Xavier(), ctx=ctx) net.hybridize() @@ -215,7 +110,13 @@ def get_inference_score(iter, model): test_acc = get_inference_score(test_iter, net) print('test acc: =%f'%test_acc) - save_inference_results(test_acc, val_acc) + inference_results = dict() + inference_results['val_acc'] = val_acc + inference_results['test_acc'] = test_acc + + inference_results_file = model_name + '_inference' + '.json' + + save_inference_results(inference_results_file, inference_results) mxnet_folder = str(mxnet_version) + backslash + model_name + backslash diff --git a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_inference.py b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_inference.py index 960a7a986d6a..47ef040fd126 100644 --- a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_inference.py @@ -17,108 +17,15 @@ # specific language governing permissions and limitations # under the License. 
+from common import * -import boto3 -import mxnet as mx -import mxnet.ndarray as nd -from mxnet import nd, autograd, gluon -from mxnet.gluon.data.vision import transforms, datasets -import numpy as np -from mxnet import autograd as ag -import logging -import mxnet.ndarray as F -from mxnet.gluon import nn -import json -import os - -logging.getLogger().setLevel(logging.DEBUG) -mx.random.seed(7) -np.random.seed(7) - -bucket_name = 'mxnet-model-backwards-compatibility' -backslash = '/' model_name = 'lenet_cnn_gluon_api' -s3 = boto3.resource('s3') num_epoch = 2 ctx = [mx.cpu(0)] batch_size = 100 -def prepare_mnist_data(mnist_raw_data): - - #shuffle the indices - indices = np.random.permutation(mnist_raw_data['train_label'].shape[0]) - - #print indices[0:10] - train_idx , val_idx = indices[:50000], indices[50000:] - - train_data = mnist_raw_data['train_data'][train_idx,:] - train_label = mnist_raw_data['train_label'][train_idx] - - val_data = mnist_raw_data['train_data'][val_idx,:] - val_label = mnist_raw_data['train_label'][val_idx] - - test_data = mnist_raw_data['test_data'] - test_label = mnist_raw_data['test_label'] - - #print len(train_data) - #print len(val_data) - - train = {'train_X' : train_data, 'train_Y' : train_label} - test = {'test_X' : test_data, 'test_Y' : test_label} - val = {'val_X' : val_data, 'val_Y' : val_label} - - data = dict() - data['train'] = train - data['test'] = test - data['val'] = val - - return data - -def get_val_test_iter(): - data = prepare_mnist_data(mx.test_utils.get_mnist()) - val = data['val'] - test = data['test'] - val_iter = mx.io.NDArrayIter(val['val_X'], val['val_Y'], batch_size, shuffle=True) - test_iter = mx.io.NDArrayIter(test['test_X'], test['test_Y']) - return val_iter, test_iter - val_iter, test_iter = get_val_test_iter() -def get_top_level_folders_in_bucket(s3client, bucket_name): - '''This function returns the top level folders in the S3Bucket. These folders help us to navigate to the trained model files stored for different MXNet versions. ''' - bucket = s3client.Bucket(bucket_name) - result = bucket.meta.client.list_objects(Bucket=bucket.name, - Delimiter=backslash) - folder_list = list() - for obj in result['CommonPrefixes']: - folder_list.append(obj['Prefix'].strip(backslash)) - - return folder_list - -class Net(gluon.Block): - def __init__(self, **kwargs): - super(Net, self).__init__(**kwargs) - with self.name_scope(): - # layers created in name_scope will inherit name space - # from parent layer. - self.conv1 = nn.Conv2D(20, kernel_size=(5,5)) - self.pool1 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) - self.conv2 = nn.Conv2D(50, kernel_size=(5,5)) - self.pool2 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) - self.fc1 = nn.Dense(500) - self.fc2 = nn.Dense(10) - - def forward(self, x): - x = self.pool1(F.tanh(self.conv1(x))) - x = self.pool2(F.tanh(self.conv2(x))) - # 0 means copy over size from corresponding dimension. - # -1 means infer size from the rest of dimensions. 
- x = x.reshape((0, -1)) - x = F.tanh(self.fc1(x)) - x = F.tanh(self.fc2(x)) - return x - - def get_model(model_file): net = Net() net.load_params(model_file, ctx) @@ -166,21 +73,6 @@ def clean_up_files (model_files): clean_model_files(model_files) print ('Model files deleted') -def clean_mnist_data(): - if os.path.isfile('train-images-idx3-ubyte.gz'): - os.remove('train-images-idx3-ubyte.gz') - if os.path.isfile('t10k-labels-idx1-ubyte.gz'): - os.remove('t10k-labels-idx1-ubyte.gz') - if os.path.isfile('train-labels-idx1-ubyte.gz'): - os.remove('train-labels-idx1-ubyte.gz') - if os.path.isfile('t10k-images-idx3-ubyte.gz'): - os.remove('t10k-images-idx3-ubyte.gz') - -def clean_model_files(model_files): - for file in model_files: - if os.path.isfile(file): - os.remove(file) - if __name__=='__main__': for folder in get_top_level_folders_in_bucket(s3, bucket_name): bucket = s3.Bucket(bucket_name) diff --git a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_train.py b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_train.py index 5222e68871ac..7f1fcad076ca 100644 --- a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_train.py +++ b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_train.py @@ -16,129 +16,24 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - - -import boto3 -import mxnet as mx -import mxnet.ndarray as nd -from mxnet import nd, autograd, gluon -from mxnet.gluon.data.vision import transforms, datasets -import numpy as np -from mxnet import autograd as ag -import logging -import mxnet.ndarray as F -from mxnet.gluon import nn -import json -import os - -logging.getLogger().setLevel(logging.DEBUG) -mx.random.seed(7) -np.random.seed(7) +from common import * batch_size=100 num_epoch = 2 -bucket_name = 'mxnet-model-backwards-compatibility' -backslash = '/' model_name = 'lenet_cnn_gluon_api' ctx = [mx.cpu(0)] mxnet_version = mx.__version__ -class Net(gluon.Block): - def __init__(self, **kwargs): - super(Net, self).__init__(**kwargs) - with self.name_scope(): - # layers created in name_scope will inherit name space - # from parent layer. - self.conv1 = nn.Conv2D(20, kernel_size=(5,5)) - self.pool1 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) - self.conv2 = nn.Conv2D(50, kernel_size=(5,5)) - self.pool2 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) - self.fc1 = nn.Dense(500) - self.fc2 = nn.Dense(10) - - def forward(self, x): - x = self.pool1(F.tanh(self.conv1(x))) - x = self.pool2(F.tanh(self.conv2(x))) - # 0 means copy over size from corresponding dimension. - # -1 means infer size from the rest of dimensions. 
- x = x.reshape((0, -1)) - x = F.tanh(self.fc1(x)) - x = F.tanh(self.fc2(x)) - return x - - -def prepare_mnist_data(mnist_raw_data): - #shuffle the indices - indices = np.random.permutation(mnist_raw_data['train_label'].shape[0]) - - #print indices[0:10] - train_idx , val_idx = indices[:50000], indices[50000:] - - train_data = mnist_raw_data['train_data'][train_idx,:] - train_label = mnist_raw_data['train_label'][train_idx] - - val_data = mnist_raw_data['train_data'][val_idx,:] - val_label = mnist_raw_data['train_label'][val_idx] - - test_data = mnist_raw_data['test_data'] - test_label = mnist_raw_data['test_label'] - - #print len(train_data) - #print len(val_data) - - train = {'train_X' : train_data, 'train_Y' : train_label} - test = {'test_X' : test_data, 'test_Y' : test_label} - val = {'val_X' : val_data, 'val_Y' : val_label} - - data = dict() - data['train'] = train - data['test'] = test - data['val'] = val - - return data - def clean_up_files (model_files): clean_mnist_data() clean_model_files(model_files) print ('Model files deleted') -def clean_mnist_data(): - if os.path.isfile('train-images-idx3-ubyte.gz'): - os.remove('train-images-idx3-ubyte.gz') - if os.path.isfile('t10k-labels-idx1-ubyte.gz'): - os.remove('t10k-labels-idx1-ubyte.gz') - if os.path.isfile('train-labels-idx1-ubyte.gz'): - os.remove('train-labels-idx1-ubyte.gz') - if os.path.isfile('t10k-images-idx3-ubyte.gz'): - os.remove('t10k-images-idx3-ubyte.gz') - -def clean_model_files(model_files): - for file in model_files: - if os.path.isfile(file): - os.remove(file) - def save_model_files(network): model_file_name = model_name + '.params' network.save_params(model_file_name) -def save_inference_results(test_acc, val_acc): - inference_results = dict() - inference_results['val_acc'] = val_acc - inference_results['test_acc'] = test_acc - - inference_results_file = model_name + '_inference' + '.json' - - # Write the inference results to local json file. This will be cleaned up later - with open(inference_results_file, 'w') as file: - json.dump(inference_results, file) - -def upload_model_files_to_s3(bucket_name, files, folder_name): - s3 = boto3.client('s3') - for file in files: - s3.upload_file(file, bucket_name, folder_name + file) - print ('model successfully uploaded to s3') - def get_inference_score(iter, model): # Use Accuracy as the evaluation metric. metric = mx.metric.Accuracy() @@ -215,7 +110,13 @@ def get_inference_score(iter, model): test_acc = get_inference_score(test_iter, net) print('test acc: =%f'%test_acc) - save_inference_results(test_acc, val_acc) + inference_results = dict() + inference_results['val_acc'] = val_acc + inference_results['test_acc'] = test_acc + + inference_results_file = model_name + '_inference' + '.json' + + save_inference_results(inference_results_file, inference_results) mxnet_folder = str(mxnet_version) + backslash + model_name + backslash diff --git a/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py b/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py index 77e59be381c7..b96a1e8c9c8d 100644 --- a/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py @@ -18,77 +18,14 @@ # under the License. 
-import boto3 -import mxnet as mx -import numpy as np -import json -import logging -import os -logging.getLogger().setLevel(logging.DEBUG) -mx.random.seed(7) -np.random.seed(7) +from common import * -bucket_name = 'mxnet-model-backwards-compatibility' -backslash = '/' model_name = 'mnist_mlp_module_api' -s3 = boto3.resource('s3') -num_epoch = 2 ctx = mx.cpu() -def prepare_mnist_data(mnist_raw_data): - - #shuffle the indices - indices = np.random.permutation(mnist_raw_data['train_label'].shape[0]) - - #print indices[0:10] - train_idx , val_idx = indices[:50000], indices[50000:] - - train_data = mnist_raw_data['train_data'][train_idx,:] - train_label = mnist_raw_data['train_label'][train_idx] - - val_data = mnist_raw_data['train_data'][val_idx,:] - val_label = mnist_raw_data['train_label'][val_idx] - - test_data = mnist_raw_data['test_data'] - test_label = mnist_raw_data['test_label'] - - #print len(train_data) - #print len(val_data) - - train = {'train_X' : train_data, 'train_Y' : train_label} - test = {'test_X' : test_data, 'test_Y' : test_label} - val = {'val_X' : val_data, 'val_Y' : val_label} - - data = dict() - data['train'] = train - data['test'] = test - data['val'] = val - - return data - -def get_val_test_iter(): - data = prepare_mnist_data(mx.test_utils.get_mnist()) - val = data['val'] - test = data['test'] - batch_size = 100 - val_iter = mx.io.NDArrayIter(val['val_X'], val['val_Y'], batch_size, shuffle=True) - test_iter = mx.io.NDArrayIter(test['test_X'], test['test_Y']) - return val_iter, test_iter - val_iter, test_iter = get_val_test_iter() -def get_top_level_folders_in_bucket(s3client, bucket_name): - '''This function returns the top level folders in the S3Bucket. These folders help us to navigate to the trained model files stored for different MXNet versions. 
''' - bucket = s3client.Bucket(bucket_name) - result = bucket.meta.client.list_objects(Bucket=bucket.name, - Delimiter=backslash) - folder_list = list() - for obj in result['CommonPrefixes']: - folder_list.append(obj['Prefix'].strip(backslash)) - - return folder_list - -def get_model(): +def get_model_definition(): ##### Old Model ##### : input = mx.symbol.Variable('data') input = mx.symbol.Flatten(data=input) @@ -119,21 +56,6 @@ def perform_inference(test_iter, val_iter, model, inference_file): assert(results['test_acc'] == test_inference_score[0][1]) print ('Inference results passed for %s' % model_name) -def clean_mnist_data(): - if os.path.isfile('train-images-idx3-ubyte.gz'): - os.remove('train-images-idx3-ubyte.gz') - if os.path.isfile('t10k-labels-idx1-ubyte.gz'): - os.remove('t10k-labels-idx1-ubyte.gz') - if os.path.isfile('train-labels-idx1-ubyte.gz'): - os.remove('train-labels-idx1-ubyte.gz') - if os.path.isfile('t10k-images-idx3-ubyte.gz'): - os.remove('t10k-images-idx3-ubyte.gz') - -def clean_model_files(model_files): - for file in model_files: - if os.path.isfile(file): - os.remove(file) - if __name__=='__main__': for folder in get_top_level_folders_in_bucket(s3, bucket_name): bucket = s3.Bucket(bucket_name) @@ -148,7 +70,7 @@ def clean_model_files(model_files): ## Download this file--- bucket.download_file(obj.key, file_name) - model = get_model() + model = get_model_definition() perform_inference(test_iter, val_iter, model, model_name + '_inference.json') clean_model_files(model_files) clean_mnist_data() diff --git a/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_train.py b/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_train.py index e9834677d196..b89480e608f4 100644 --- a/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_train.py +++ b/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_train.py @@ -17,89 +17,24 @@ # specific language governing permissions and limitations # under the License. -import boto3 -import mxnet as mx -import numpy as np -import json -import os -import logging -logging.getLogger().setLevel(logging.DEBUG) - - -# Set fixed random seeds. 
These would be the same for inference files as well -mx.random.seed(7) -np.random.seed(7) - -# get the current mxnet version we are running on -mxnet_version = mx.__version__ -bucket_name = 'mxnet-model-backwards-compatibility' +from common import * + ctx = mx.cpu() batch_size = 100 num_epoch = 2 backslash = '/' model_name = 'mnist_mlp_module_api' - -def prepare_mnist_data(mnist_raw_data): - #shuffle the indices - indices = np.random.permutation(mnist_raw_data['train_label'].shape[0]) - - #print indices[0:10] - train_idx , val_idx = indices[:50000], indices[50000:] - - train_data = mnist_raw_data['train_data'][train_idx,:] - train_label = mnist_raw_data['train_label'][train_idx] - - val_data = mnist_raw_data['train_data'][val_idx,:] - val_label = mnist_raw_data['train_label'][val_idx] - - test_data = mnist_raw_data['test_data'] - test_label = mnist_raw_data['test_label'] - - #print len(train_data) - #print len(val_data) - - train = {'train_X' : train_data, 'train_Y' : train_label} - test = {'test_X' : test_data, 'test_Y' : test_label} - val = {'val_X' : val_data, 'val_Y' : val_label} - - data = dict() - data['train'] = train - data['test'] = test - data['val'] = val - - return data - -def upload_model_files_to_s3(bucket_name, files, folder_name): - s3 = boto3.client('s3') - for file in files: - s3.upload_file(file, bucket_name, folder_name + file) - print ('model successfully uploaded to s3') - def clean_up_files (): clean_mnist_data() - clean_model_files() - print ('Model files deleted') - -def clean_mnist_data(): - if os.path.isfile('train-images-idx3-ubyte.gz'): - os.remove('train-images-idx3-ubyte.gz') - if os.path.isfile('t10k-labels-idx1-ubyte.gz'): - os.remove('t10k-labels-idx1-ubyte.gz') - if os.path.isfile('train-labels-idx1-ubyte.gz'): - os.remove('train-labels-idx1-ubyte.gz') - if os.path.isfile('t10k-images-idx3-ubyte.gz'): - os.remove('t10k-images-idx3-ubyte.gz') - -def clean_model_files(): + files = list() for i in range(1, num_epoch+1): - if os.path.isfile(model_name + '-000' + str(i) + '.params'): - os.remove(model_name + '-000' + str(i) + '.params') - - if os.path.isfile(model_name + '-symbol.json'): - os.remove(model_name + '-symbol.json') - if os.path.isfile(inference_results_file): - os.remove(inference_results_file) + files.append(model_name + '-000' + str(i) + '.params') + + files.append(model_name + '-symbol.json') + files.append(inference_results_file) + clean_model_files(files) + print ('Model files deleted') def get_model_definition(): input = mx.symbol.Variable('data') @@ -145,10 +80,7 @@ def get_model_definition(): inference_results_file = model_name + '_inference' + '.json' - # Write the inference results to local json file. 
This will be cleaned up later - with open(inference_results_file, 'w') as file: - json.dump(inference_results, file) - + save_inference_results(inference_results_file, inference_results) model_params_file = model_name + '-000' + str(num_epoch) + '.params' model_symbol_file = model_name + '-symbol.json' From 3eeba08e4115269630aa57366cfe08bb9d9402d3 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 9 Jul 2018 17:06:35 -0700 Subject: [PATCH 10/59] Added runtime function for executing the MBCC files --- ci/docker/runtime_functions.sh | 7 ++++ .../model_backward_compat_checker.sh | 42 +++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100755 tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 93e839933c7a..f4d44b69bb6d 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -854,6 +854,13 @@ nightly_test_javascript() { make -C /work/mxnet/amalgamation libmxnet_predict.js MIN=1 EMCC=/work/deps/emscripten/emcc } +#Tests Model backwards compatibility on MXNet +nightly_model_backwards_compat_test() { + set -ex + export PYTHONPATH=./python/ + ./tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh +} + # Deploy deploy_docs() { diff --git a/tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh b/tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh new file mode 100755 index 000000000000..69a031cafe23 --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
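+
+# Note: this script is invoked by the nightly_model_backwards_compat_test runtime
+# function (see ci/docker/runtime_functions.sh), which is expected to run from the
+# MXNet repository root with PYTHONPATH=./python/ set; the relative cd below
+# assumes that working directory.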
+ +#Author: Piyush Ghai + +echo "Invoking model_backwards_compat_test.sh script" +echo `pwd` +cd tests/nightly/model_backwards_compatibility_check +echo `pwd` + +echo '==========================' +echo 'running mlp with module api' +python mnist_mlp_module_api_inference.py + +echo '==========================' +echo 'running lenet with gluon api (non - hybridized)' +python lenet_cnn_gluon_inference.py + +echo '==========================' +echo 'running lenet with gluon api (hybridized)' +python lenet_cnn_gluon_hybrid_inference.py + +echo '==========================' +echo 'running rnn with gluon - save and load parameters' +python lm_rnn_gluon_inference.py + From 9c0157c22b61c9d35955957cd2866ce9354bec7a Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 9 Jul 2018 21:44:25 -0700 Subject: [PATCH 11/59] Added JenkinsFile for MBCC to be run as a nightly job --- .../JenkinsfileForMBCC | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC diff --git a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC new file mode 100644 index 000000000000..2d312fa4ea96 --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC @@ -0,0 +1,79 @@ +// -*- mode: groovy -*- +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +//This is a Jenkinsfile for the model backwards compatibility checker. The format and some functions have been picked up from the top-level Jenkinsfile. + +err = null + +def init_git() { + deleteDir() + retry(5) { + try { + timeout(time: 15, unit: 'MINUTES') { + checkout scm + sh 'git submodule update --init --recursive' + sh 'git clean -d -f' + } + } catch (exc) { + deleteDir() + error "Failed to fetch source codes with ${exc}" + sleep 2 + } + } +} + +def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') { + def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" + command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? 
'--nvidiadocker' : '') + command = command.replaceAll('%PLATFORM%', platform) + command = command.replaceAll('%FUNCTION_NAME%', function_name) + command = command.replaceAll('%SHARED_MEM%', shared_mem) + + sh command +} + +try { + stage('MBCC'){ + parallel 'ModelBackwardsCompat: CPU': { + node('mxnetlinux-cpu') { + ws('workspace/modelBackwardsCompat') { + init_git() + docker_run('ubuntu_nightly_cpu', 'nightly_model_backwards_compat_test', false) + } + } + } + } +} catch (caughtError) { + node("mxnetlinux-cpu") { + sh "echo caught ${caughtError}" + err = caughtError + currentBuild.result = "FAILURE" + } +} finally { + node("mxnetlinux-cpu") { + // Only send email if model backwards compat test failed + if (currentBuild.result == "FAILURE") { + emailext body: 'Nightly tests for model backwards compatibity on MXNet branch : ${BRANCH_NAME} failed. Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[MODEL BACKWARDS COMPATIBILITY TEST FAILED] build ${BUILD_NUMBER}', to: '${EMAIL}' + } + // Remember to rethrow so the build is marked as failing + if (err) { + throw err + } + } +} \ No newline at end of file From 3d43bcd24a6cb617b798f0dc97a0746339554a22 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 9 Jul 2018 21:49:49 -0700 Subject: [PATCH 12/59] Added boto3 install for s3 uploads --- ci/docker/runtime_functions.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index f4d44b69bb6d..f79c87360569 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -858,6 +858,7 @@ nightly_test_javascript() { nightly_model_backwards_compat_test() { set -ex export PYTHONPATH=./python/ + pip install boto3 ./tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh } From 4b70e4a69dc27c8400687c5cd6c95330a8b398cd Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 9 Jul 2018 22:11:36 -0700 Subject: [PATCH 13/59] Added README for MBCC --- .../README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 tests/nightly/model_backwards_compatibility_check/README.md diff --git a/tests/nightly/model_backwards_compatibility_check/README.md b/tests/nightly/model_backwards_compatibility_check/README.md new file mode 100644 index 000000000000..5a4e81f7b2f2 --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/README.md @@ -0,0 +1,19 @@ +# Model Backwards Compatibility Tests + +This folder contains the scripts that are required to run the nightly job of verifying the compatibility and inference results of models (trained on earlier versions of MXNet) when loaded on the latest release candidate. The tests flag if: +- The models fail to load on the latest version of MXNet. +- The inference results are different. + + +## JenkinsfileForMBCC +This is configuration file for jenkins job. + +## Details +- The `model_backward_compat_checker.sh` is a top level script that invokes the inference files in python. +- Currently the APIs that covered for model saving/loading are : do_checkpoint/load_checkpoint, save_params/load_params, save_parameters/load_parameters(added v1.2.1 onwards), export/gluon.SymbolBlock.imports. +- These APIs are covered over models with architectures such as : MLP, RNNs, LeNet covering the four scenarios described above. +- More operators/models will be added in the future to extend the operator coverage. +- The model train files suffixed by `_train.py` and the trained models are hosted in AWS S3. 
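+- As an illustrative sketch (not copied verbatim from any one script), a hybridized
+  Gluon model saved by a `_train.py` script with `export` is loaded back in the
+  matching `_inference.py` script with `gluon.SymbolBlock.imports`, following the
+  `<model_name>-symbol.json` / `<model_name>-000<epoch>.params` naming convention:
+
+        net.hybridize()
+        # ... training loop runs here ...
+        net.export('lenet_cnn_gluon_hybrid_api', epoch=2)
+        loaded_net = gluon.SymbolBlock.imports('lenet_cnn_gluon_hybrid_api-symbol.json',
+                                               ['data'],
+                                               'lenet_cnn_gluon_hybrid_api-0002.params')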
+- The trained models for now are backfilled into S3 starting from every MXNet release version v1.0.0 +- The inference files are suffixed by `_inference.py`. + From 08ad3420e95f1e19c143f91a6e2761bbc88cf9e7 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 9 Jul 2018 22:37:41 -0700 Subject: [PATCH 14/59] Added license header --- .../common.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index 4204dc0b3e61..0098cc1dabbe 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -1,3 +1,23 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + import boto3 import mxnet as mx import json From 5d1c3fc606e188ee932362e9a8eecb65f486c825 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Tue, 10 Jul 2018 16:51:14 -0700 Subject: [PATCH 15/59] Added more common functions from lm_rnn_gluon_train and inference files into common.py to clean up code --- .../common.py | 174 ++++++++++++- .../lm_rnn_gluon_inference.py | 219 ++-------------- .../lm_rnn_gluon_train.py | 246 +++--------------- 3 files changed, 234 insertions(+), 405 deletions(-) diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index 0098cc1dabbe..463ee2239692 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -29,7 +29,10 @@ from mxnet.gluon.data.vision import transforms, datasets from mxnet import autograd as ag import mxnet.ndarray as F -from mxnet.gluon import nn +from mxnet.gluon import nn, rnn +import re +import time +import sys # Set fixed random seeds. 
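+# Note: the compare_versions helper added further down in this file relies on the
+# Python 2 builtin cmp(), which no longer exists in Python 3. A rough Python 3
+# compatible sketch of the same comparison would be:
+#
+#     def compare_versions(version1, version2):
+#         def normalize(v):
+#             return [int(x) for x in re.sub(r'(\.0+)*$', '', v).split('.')]
+#         a, b = normalize(version1), normalize(version2)
+#         return (a > b) - (a < b)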
mx.random.seed(7) @@ -111,6 +114,15 @@ def save_inference_results(inference_results_file, inference_results): with open(inference_results_file, 'w') as file: json.dump(inference_results, file) + +def compare_versions(version1, version2): + ''' + https://stackoverflow.com/questions/1714027/version-number-comparison-in-python + ''' + def normalize(v): + return [int(x) for x in re.sub(r'(\.0+)*$','', v).split(".")] + return cmp(normalize(version1), normalize(version2)) + def get_val_test_iter(): data = prepare_mnist_data(mx.test_utils.get_mnist()) val = data['val'] @@ -165,3 +177,163 @@ def forward(self, x): x = F.tanh(self.fc1(x)) x = F.tanh(self.fc2(x)) return x + +class Dictionary(object): + def __init__(self): + self.word2idx = {} + self.idx2word = [] + + def add_word(self, word): + if word not in self.word2idx: + self.idx2word.append(word) + self.word2idx[word] = len(self.idx2word) - 1 + return self.word2idx[word] + + def __len__(self): + return len(self.idx2word) + +class Corpus(object): + def __init__(self, path): + self.dictionary = Dictionary() + self.download_data_from_s3() + self.train = self.tokenize(path + 'train.txt') + self.valid = self.tokenize(path + 'valid.txt') + self.test = self.tokenize(path + 'test.txt') + + def download_data_from_s3(self, ): + print ('Downloading files from bucket : ptb-small-dataset' ) + bucket = s3.Bucket('ptb-small-dataset') + files = ['test.txt', 'train.txt', 'valid.txt'] + for file in files: + if os.path.exists(args_data + file) : + print ('File %s'%(args_data + file), 'already exists. Skipping download') + continue + file_path = args_data + file + bucket.download_file(file_path, args_data + file) + + def tokenize(self, path): + """Tokenizes a text file.""" + assert os.path.exists(path) + # Add words to the dictionary + with open(path, 'r') as f: + tokens = 0 + for line in f: + words = line.split() + [''] + tokens += len(words) + for word in words: + self.dictionary.add_word(word) + + # Tokenize file content + with open(path, 'r') as f: + ids = np.zeros((tokens,), dtype='int32') + token = 0 + for line in f: + words = line.split() + [''] + for word in words: + ids[token] = self.dictionary.word2idx[word] + token += 1 + + return mx.nd.array(ids, dtype='int32') + + + +#### Common utilies for lm_rnn_gluon_train & inference files +args_data = 'ptb.' +args_model = 'rnn_relu' +args_emsize = 100 +args_nhid = 100 +args_nlayers = 2 +args_lr = 1.0 +args_clip = 0.2 +args_epochs = 2 +args_batch_size = 32 +args_bptt = 5 +args_dropout = 0.2 +args_tied = True +args_cuda = 'store_true' +args_log_interval = 500 + +class RNNModel(gluon.Block): + """A model with an encoder, recurrent layer, and a decoder.""" + + def __init__(self, mode, vocab_size, num_embed, num_hidden, + num_layers, dropout=0.5, tie_weights=False, **kwargs): + super(RNNModel, self).__init__(**kwargs) + with self.name_scope(): + self.drop = nn.Dropout(dropout) + self.encoder = nn.Embedding(vocab_size, num_embed, + weight_initializer = mx.init.Uniform(0.1)) + if mode == 'rnn_relu': + self.rnn = rnn.RNN(num_hidden, num_layers, activation='relu', dropout=dropout, + input_size=num_embed) + elif mode == 'rnn_tanh': + self.rnn = rnn.RNN(num_hidden, num_layers, dropout=dropout, + input_size=num_embed) + elif mode == 'lstm': + self.rnn = rnn.LSTM(num_hidden, num_layers, dropout=dropout, + input_size=num_embed) + elif mode == 'gru': + self.rnn = rnn.GRU(num_hidden, num_layers, dropout=dropout, + input_size=num_embed) + else: + raise ValueError("Invalid mode %s. 
Options are rnn_relu, " + "rnn_tanh, lstm, and gru"%mode) + if tie_weights: + self.decoder = nn.Dense(vocab_size, in_units = num_hidden, + params = self.encoder.params) + else: + self.decoder = nn.Dense(vocab_size, in_units = num_hidden) + self.num_hidden = num_hidden + + def forward(self, inputs, hidden): + emb = self.drop(self.encoder(inputs)) + output, hidden = self.rnn(emb, hidden) + output = self.drop(output) + decoded = self.decoder(output.reshape((-1, self.num_hidden))) + return decoded, hidden + + def begin_state(self, *args, **kwargs): + return self.rnn.begin_state(*args, **kwargs) + +def batchify(data, batch_size): + """Reshape data into (num_example, batch_size)""" + nbatch = data.shape[0] // batch_size + data = data[:nbatch * batch_size] + data = data.reshape((batch_size, nbatch)).T + return data + +def get_batch(source, i): + seq_len = min(args_bptt, source.shape[0] - 1 - i) + data = source[i : i + seq_len] + target = source[i + 1 : i + 1 + seq_len] + return data, target.reshape((-1,)) + +def detach(hidden): + if isinstance(hidden, (tuple, list)): + hidden = [i.detach() for i in hidden] + else: + hidden = hidden.detach() + return hidden + +def eval(data_source, model): + total_L = 0.0 + ntotal = 0 + loss = gluon.loss.SoftmaxCrossEntropyLoss() + hidden = model.begin_state(func = mx.nd.zeros, batch_size = args_batch_size, ctx=mx.cpu(0)) + for i in range(0, data_source.shape[0] - 1, args_bptt): + data, target = get_batch(data_source, i) + output, hidden = model(data, hidden) + L = loss(output, target) + total_L += mx.nd.sum(L).asscalar() + ntotal += L.size + return total_L / ntotal + +def clean_ptb_data(): + files = ['test.txt', 'train.txt', 'valid.txt'] + for file in files: + if os.path.isfile(args_data + file): + os.remove(args_data + file) + +# This function is added so that if a download gets interrupted in between, one can clean the corrupted files +clean_mnist_data() +clean_ptb_data() diff --git a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py index 23aad2b9f4a6..04f6b41cc273 100644 --- a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py @@ -17,197 +17,31 @@ # specific language governing permissions and limitations # under the License. -import math -import os -import time -import numpy as np -import mxnet as mx -from mxnet import gluon, autograd -from mxnet.gluon import nn, rnn -import logging -import boto3 -import json -logging.getLogger().setLevel(logging.DEBUG) -mx.random.seed(7) -np.random.seed(7) +from common import * -mxnet_version = mx.__version__ -bucket_name = 'mxnet-model-backwards-compatibility' -ctx = mx.cpu() -backslash = '/' model_name = 'lm_rnn_gluon_api' -s3 = boto3.resource('s3') - - -args_data = 'ptb.' 
-args_model = 'rnn_relu' -args_emsize = 100 -args_nhid = 100 -args_nlayers = 2 -args_lr = 1.0 -args_clip = 0.2 -args_epochs = 2 -args_batch_size = 32 -args_bptt = 5 -args_dropout = 0.2 -args_tied = True -args_cuda = 'store_true' -args_log_interval = 500 -args_save = model_name + '.params' - -class Dictionary(object): - def __init__(self): - self.word2idx = {} - self.idx2word = [] - - def add_word(self, word): - if word not in self.word2idx: - self.idx2word.append(word) - self.word2idx[word] = len(self.idx2word) - 1 - return self.word2idx[word] - - def __len__(self): - return len(self.idx2word) - -class Corpus(object): - def __init__(self, path): - self.dictionary = Dictionary() - self.download_data_from_s3() - self.train = self.tokenize(path + 'train.txt') - self.valid = self.tokenize(path + 'valid.txt') - self.test = self.tokenize(path + 'test.txt') - - def download_data_from_s3(self): - print ('Downloading files from bucket : %s' %bucket_name) - bucket = s3.Bucket(bucket_name) - files = ['test.txt', 'train.txt', 'valid.txt'] - for file in files: - if os.path.exists(args_data + file) : - print ('File %s'%(args_data + file), 'already exists. Skipping download') - continue - file_path = str(mxnet_version) + backslash + model_name + backslash + args_data + file - bucket.download_file(file_path, args_data + file) - - def tokenize(self, path): - """Tokenizes a text file.""" - assert os.path.exists(path) - # Add words to the dictionary - with open(path, 'r') as f: - tokens = 0 - for line in f: - words = line.split() + [''] - tokens += len(words) - for word in words: - self.dictionary.add_word(word) - - # Tokenize file content - with open(path, 'r') as f: - ids = np.zeros((tokens,), dtype='int32') - token = 0 - for line in f: - words = line.split() + [''] - for word in words: - ids[token] = self.dictionary.word2idx[word] - token += 1 - - return mx.nd.array(ids, dtype='int32') - -class RNNModel(gluon.Block): - """A model with an encoder, recurrent layer, and a decoder.""" - - def __init__(self, mode, vocab_size, num_embed, num_hidden, - num_layers, dropout=0.5, tie_weights=False, **kwargs): - super(RNNModel, self).__init__(**kwargs) - with self.name_scope(): - self.drop = nn.Dropout(dropout) - self.encoder = nn.Embedding(vocab_size, num_embed, - weight_initializer = mx.init.Uniform(0.1)) - if mode == 'rnn_relu': - self.rnn = rnn.RNN(num_hidden, num_layers, activation='relu', dropout=dropout, - input_size=num_embed) - elif mode == 'rnn_tanh': - self.rnn = rnn.RNN(num_hidden, num_layers, dropout=dropout, - input_size=num_embed) - elif mode == 'lstm': - self.rnn = rnn.LSTM(num_hidden, num_layers, dropout=dropout, - input_size=num_embed) - elif mode == 'gru': - self.rnn = rnn.GRU(num_hidden, num_layers, dropout=dropout, - input_size=num_embed) - else: - raise ValueError("Invalid mode %s. 
Options are rnn_relu, " - "rnn_tanh, lstm, and gru"%mode) - if tie_weights: - self.decoder = nn.Dense(vocab_size, in_units = num_hidden, - params = self.encoder.params) - else: - self.decoder = nn.Dense(vocab_size, in_units = num_hidden) - self.num_hidden = num_hidden - - def forward(self, inputs, hidden): - emb = self.drop(self.encoder(inputs)) - output, hidden = self.rnn(emb, hidden) - output = self.drop(output) - decoded = self.decoder(output.reshape((-1, self.num_hidden))) - return decoded, hidden - - def begin_state(self, *args, **kwargs): - return self.rnn.begin_state(*args, **kwargs) context = mx.cpu(0) -def batchify(data, batch_size): - """Reshape data into (num_example, batch_size)""" - nbatch = data.shape[0] // batch_size - data = data[:nbatch * batch_size] - data = data.reshape((batch_size, nbatch)).T - return data - -def get_batch(source, i): - seq_len = min(args_bptt, source.shape[0] - 1 - i) - data = source[i : i + seq_len] - target = source[i + 1 : i + 1 + seq_len] - return data, target.reshape((-1,)) - -def detach(hidden): - if isinstance(hidden, (tuple, list)): - hidden = [i.detach() for i in hidden] - else: - hidden = hidden.detach() - return hidden - -def eval(data_source, model): - total_L = 0.0 - ntotal = 0 - loss = gluon.loss.SoftmaxCrossEntropyLoss() - hidden = model.begin_state(func = mx.nd.zeros, batch_size = args_batch_size, ctx=context) - for i in range(0, data_source.shape[0] - 1, args_bptt): - data, target = get_batch(data_source, i) - output, hidden = model(data, hidden) - L = loss(output, target) - total_L += mx.nd.sum(L).asscalar() - ntotal += L.size - return total_L / ntotal - def test(test_data, model): - test_L = eval(test_data, model) - return test_L, math.exp(test_L) + test_L = eval(test_data, model) + return test_L, np.exp(test_L) def get_top_level_folders_in_bucket(s3client, bucket_name): - '''This function returns the top level folders in the S3Bucket. These folders help us to navigate to the trained model files stored for different MXNet versions. ''' - bucket = s3client.Bucket(bucket_name) - result = bucket.meta.client.list_objects(Bucket=bucket.name, - Delimiter=backslash) - folder_list = list() - for obj in result['CommonPrefixes']: - folder_list.append(obj['Prefix'].strip(backslash)) + '''This function returns the top level folders in the S3Bucket. These folders help us to navigate to the trained model files stored for different MXNet versions. 
''' + bucket = s3client.Bucket(bucket_name) + result = bucket.meta.client.list_objects(Bucket=bucket.name, + Delimiter=backslash) + folder_list = list() + for obj in result['CommonPrefixes']: + folder_list.append(obj['Prefix'].strip(backslash)) - return folder_list + return folder_list def get_model(model_file): model_2 = RNNModel(args_model, ntokens, args_emsize, args_nhid, - args_nlayers, args_dropout, args_tied) - model_2.load_parameters(args_save, context) + args_nlayers, args_dropout, args_tied) + model_2.load_parameters(model_name + '.params', context) return model_2 @@ -232,28 +66,23 @@ def perform_inference(test_data, val_data, model, inference_file): print ('Inference results passed for %s' % model_name) def clean_up_files (model_files): - clean_ptb_data() - clean_model_files(model_files) - print ('Model files deleted') - -def clean_ptb_data(): - files = ['test.txt', 'train.txt', 'valid.txt'] - for file in files: - if os.path.isfile(args_data + file): - os.remove(args_data + file) - + clean_ptb_data() + clean_model_files(model_files) + print ('Model files deleted') + def clean_model_files(model_files): - for file in model_files: - if os.path.isfile(file): - os.remove(file) - + for file in model_files: + if os.path.isfile(file): + os.remove(file) + if __name__=='__main__': + corpus = Corpus(args_data) train_data = batchify(corpus.train, args_batch_size).as_in_context(context) val_data = batchify(corpus.valid, args_batch_size).as_in_context(context) test_data = batchify(corpus.test, args_batch_size).as_in_context(context) - ntokens = len(corpus.dictionary) + for folder in get_top_level_folders_in_bucket(s3, bucket_name): bucket = s3.Bucket(bucket_name) prefix = folder + backslash + model_name @@ -272,4 +101,4 @@ def clean_model_files(model_files): model = get_model(model_name + '.params') perform_inference(test_data, val_data, model, model_name + '_inference.json') - clean_up_files(model_files) + clean_up_files(model_files) diff --git a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py index c6d9932d2a21..26a4e50fd25d 100644 --- a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py +++ b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py @@ -18,178 +18,13 @@ # under the License. -import math -import os -import time -import numpy as np -import mxnet as mx -from mxnet import gluon, autograd -from mxnet.gluon import nn, rnn -import logging -import boto3 -import json -logging.getLogger().setLevel(logging.DEBUG) -mx.random.seed(7) -np.random.seed(7) +from common import * -mxnet_version = mx.__version__ -bucket_name = 'mxnet-model-backwards-compatibility' -ctx = mx.cpu() num_epoch = 2 -backslash = '/' model_name = 'lm_rnn_gluon_api' -s3 = boto3.resource('s3') - - -args_data = 'ptb.' 
-args_model = 'rnn_relu' -args_emsize = 100 -args_nhid = 100 -args_nlayers = 2 -args_lr = 1.0 -args_clip = 0.2 -args_epochs = 2 -args_batch_size = 32 -args_bptt = 5 -args_dropout = 0.2 -args_tied = True -args_cuda = 'store_true' -args_log_interval = 500 -args_save = model_name + '.params' - -class Dictionary(object): - def __init__(self): - self.word2idx = {} - self.idx2word = [] - - def add_word(self, word): - if word not in self.word2idx: - self.idx2word.append(word) - self.word2idx[word] = len(self.idx2word) - 1 - return self.word2idx[word] - - def __len__(self): - return len(self.idx2word) - -class Corpus(object): - def __init__(self, path): - self.dictionary = Dictionary() - self.download_data_from_s3() - self.train = self.tokenize(path + 'train.txt') - self.valid = self.tokenize(path + 'valid.txt') - self.test = self.tokenize(path + 'test.txt') - - def download_data_from_s3(self): - print ('Downloading files from bucket : %s' %bucket_name) - bucket = s3.Bucket(bucket_name) - files = ['test.txt', 'train.txt', 'valid.txt'] - for file in files: - if os.path.exists(args_data + file) : - print ('File %s'%(args_data + file), 'already exists. Skipping download') - continue - file_path = str(mxnet_version) + backslash + model_name + backslash + args_data + file - bucket.download_file(file_path, args_data + file) - - def tokenize(self, path): - """Tokenizes a text file.""" - assert os.path.exists(path) - # Add words to the dictionary - with open(path, 'r') as f: - tokens = 0 - for line in f: - words = line.split() + [''] - tokens += len(words) - for word in words: - self.dictionary.add_word(word) - - # Tokenize file content - with open(path, 'r') as f: - ids = np.zeros((tokens,), dtype='int32') - token = 0 - for line in f: - words = line.split() + [''] - for word in words: - ids[token] = self.dictionary.word2idx[word] - token += 1 - - return mx.nd.array(ids, dtype='int32') - -class RNNModel(gluon.Block): - """A model with an encoder, recurrent layer, and a decoder.""" - - def __init__(self, mode, vocab_size, num_embed, num_hidden, - num_layers, dropout=0.5, tie_weights=False, **kwargs): - super(RNNModel, self).__init__(**kwargs) - with self.name_scope(): - self.drop = nn.Dropout(dropout) - self.encoder = nn.Embedding(vocab_size, num_embed, - weight_initializer = mx.init.Uniform(0.1)) - if mode == 'rnn_relu': - self.rnn = rnn.RNN(num_hidden, num_layers, activation='relu', dropout=dropout, - input_size=num_embed) - elif mode == 'rnn_tanh': - self.rnn = rnn.RNN(num_hidden, num_layers, dropout=dropout, - input_size=num_embed) - elif mode == 'lstm': - self.rnn = rnn.LSTM(num_hidden, num_layers, dropout=dropout, - input_size=num_embed) - elif mode == 'gru': - self.rnn = rnn.GRU(num_hidden, num_layers, dropout=dropout, - input_size=num_embed) - else: - raise ValueError("Invalid mode %s. 
Options are rnn_relu, " - "rnn_tanh, lstm, and gru"%mode) - if tie_weights: - self.decoder = nn.Dense(vocab_size, in_units = num_hidden, - params = self.encoder.params) - else: - self.decoder = nn.Dense(vocab_size, in_units = num_hidden) - self.num_hidden = num_hidden - - def forward(self, inputs, hidden): - emb = self.drop(self.encoder(inputs)) - output, hidden = self.rnn(emb, hidden) - output = self.drop(output) - decoded = self.decoder(output.reshape((-1, self.num_hidden))) - return decoded, hidden - - def begin_state(self, *args, **kwargs): - return self.rnn.begin_state(*args, **kwargs) context = mx.cpu(0) -def batchify(data, batch_size): - """Reshape data into (num_example, batch_size)""" - nbatch = data.shape[0] // batch_size - data = data[:nbatch * batch_size] - data = data.reshape((batch_size, nbatch)).T - return data - -def get_batch(source, i): - seq_len = min(args_bptt, source.shape[0] - 1 - i) - data = source[i : i + seq_len] - target = source[i + 1 : i + 1 + seq_len] - return data, target.reshape((-1,)) - -def detach(hidden): - if isinstance(hidden, (tuple, list)): - hidden = [i.detach() for i in hidden] - else: - hidden = hidden.detach() - return hidden - -def eval(data_source, model): - total_L = 0.0 - ntotal = 0 - hidden = model.begin_state(func = mx.nd.zeros, batch_size = args_batch_size, ctx=context) - for i in range(0, data_source.shape[0] - 1, args_bptt): - data, target = get_batch(data_source, i) - output, hidden = model(data, hidden) - L = loss(output, target) - total_L += mx.nd.sum(L).asscalar() - ntotal += L.size - return total_L / ntotal - def train(model, train_data): best_val = float("Inf") for epoch in range(args_epochs): @@ -215,21 +50,21 @@ def train(model, train_data): if ibatch % args_log_interval == 0 and ibatch > 0: cur_L = total_L / args_bptt / args_batch_size / args_log_interval print('[Epoch %d Batch %d] loss %.2f, perplexity %.2f' % ( - epoch + 1, ibatch, cur_L, math.exp(cur_L))) + epoch + 1, ibatch, cur_L, np.exp(cur_L))) total_L = 0.0 val_L = eval(val_data, model) print('[Epoch %d] time cost %.2fs, validation loss %.2f, validation perplexity %.2f' % ( - epoch + 1, time.time() - start_time, val_L, math.exp(val_L))) + epoch + 1, time.time() - start_time, val_L, np.exp(val_L))) if val_L < best_val: best_val = val_L - model.save_parameters(args_save) + model.save_parameters(model_name + '.params') def test(test_data, model): test_L = eval(test_data, model) - return test_L, math.exp(test_L) + return test_L, np.exp(test_L) def save_inference_results(test, val): inference_results = dict() @@ -242,22 +77,10 @@ def save_inference_results(test, val): with open(inference_results_file, 'w') as file: json.dump(inference_results, file) -def upload_model_files_to_s3(bucket_name, files, folder_name): - s3 = boto3.client('s3') - for file in files: - s3.upload_file(file, bucket_name, folder_name + file) - print ('model successfully uploaded to s3') - def clean_up_files (model_files): clean_ptb_data() clean_model_files(model_files) print ('Model files deleted') - -def clean_ptb_data(): - files = ['test.txt', 'train.txt', 'valid.txt'] - for file in files: - if os.path.isfile(args_data + file): - os.remove(args_data + file) def clean_model_files(model_files): for file in model_files: @@ -265,39 +88,44 @@ def clean_model_files(model_files): os.remove(file) if __name__=='__main__': - corpus = Corpus(args_data) - train_data = batchify(corpus.train, args_batch_size).as_in_context(context) - val_data = batchify(corpus.valid, args_batch_size).as_in_context(context) - test_data 
= batchify(corpus.test, args_batch_size).as_in_context(context) + ## If this code is being run on version >= 1.2.0 only then execute it, since it uses save_parameters and load_parameters API + if compare_versions(str(mxnet_version), '1.2.1') < 0: + print ('Found MXNet version %s and exiting because this version does not contain save_parameters and load_parameters functions' %str(mxnet_version)) + sys.exit(1) + + corpus = Corpus(args_data) + train_data = batchify(corpus.train, args_batch_size).as_in_context(context) + val_data = batchify(corpus.valid, args_batch_size).as_in_context(context) + test_data = batchify(corpus.test, args_batch_size).as_in_context(context) - ntokens = len(corpus.dictionary) + ntokens = len(corpus.dictionary) - model = RNNModel(args_model, ntokens, args_emsize, args_nhid, + model = RNNModel(args_model, ntokens, args_emsize, args_nhid, args_nlayers, args_dropout, args_tied) - model.collect_params().initialize(mx.init.Xavier(), ctx=context) - trainer = gluon.Trainer(model.collect_params(), 'sgd', {'learning_rate': args_lr, 'momentum': 0, 'wd': 0}) - loss = gluon.loss.SoftmaxCrossEntropyLoss() + model.collect_params().initialize(mx.init.Xavier(), ctx=context) + trainer = gluon.Trainer(model.collect_params(), 'sgd', {'learning_rate': args_lr, 'momentum': 0, 'wd': 0}) + loss = gluon.loss.SoftmaxCrossEntropyLoss() - train(model, train_data) - val_loss, val_ppl = test(val_data, model) - print('Validation loss %f, Validation perplexity %f'%(val_loss, val_ppl)) - test_loss, test_ppl = test(test_data, model) - print('test loss %f, test perplexity %f'%(test_loss, test_ppl)) + train(model, train_data) + val_loss, val_ppl = test(val_data, model) + print('Validation loss %f, Validation perplexity %f'%(val_loss, val_ppl)) + test_loss, test_ppl = test(test_data, model) + print('test loss %f, test perplexity %f'%(test_loss, test_ppl)) - val_results = dict() - val_results['loss'] = val_loss - val_results['ppl'] = val_ppl + val_results = dict() + val_results['loss'] = val_loss + val_results['ppl'] = val_ppl - test_results = dict() - test_results['loss'] = test_loss - test_results['ppl'] = test_ppl + test_results = dict() + test_results['loss'] = test_loss + test_results['ppl'] = test_ppl - save_inference_results(test_results, val_results) + save_inference_results(test_results, val_results) - mxnet_folder = str(mxnet_version) + backslash + model_name + backslash + mxnet_folder = str(mxnet_version) + backslash + model_name + backslash - files = list() - files.append(model_name + '.params') - files.append(model_name + '_inference' + '.json') - upload_model_files_to_s3(bucket_name, files, mxnet_folder) - clean_up_files(files) \ No newline at end of file + files = list() + files.append(model_name + '.params') + files.append(model_name + '_inference' + '.json') + upload_model_files_to_s3(bucket_name, files, mxnet_folder) + clean_up_files(files) \ No newline at end of file From cfe8dfc0c827151b486a024ff24372eb47f2e9ba Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Tue, 10 Jul 2018 17:06:27 -0700 Subject: [PATCH 16/59] Added scripts for training models on older versions of MXNet --- ci/docker/runtime_functions.sh | 13 +++++ .../README.md | 3 +- .../train_mxnet_legacy_models.sh | 57 +++++++++++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) create mode 100755 tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index f79c87360569..71545c55f98a 100755 --- 
a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -862,6 +862,19 @@ nightly_model_backwards_compat_test() { ./tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh } +#Backfills S3 bucket with models trained on earlier versions of mxnet +nightly_model_backwards_compat_train() { + set -ex + export PYTHONPATH=./python/ + VENV=mbcc_py2_venv + virtualenv -p `which python2` $VENV + source $VENV/bin/activate + pip install boto3 + ./tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh + #Deactivate the virtual env once we are done with it + deactivate +} + # Deploy deploy_docs() { diff --git a/tests/nightly/model_backwards_compatibility_check/README.md b/tests/nightly/model_backwards_compatibility_check/README.md index 5a4e81f7b2f2..c24b26151108 100644 --- a/tests/nightly/model_backwards_compatibility_check/README.md +++ b/tests/nightly/model_backwards_compatibility_check/README.md @@ -14,6 +14,7 @@ This is configuration file for jenkins job. - These APIs are covered over models with architectures such as : MLP, RNNs, LeNet covering the four scenarios described above. - More operators/models will be added in the future to extend the operator coverage. - The model train files suffixed by `_train.py` and the trained models are hosted in AWS S3. -- The trained models for now are backfilled into S3 starting from every MXNet release version v1.0.0 +- The trained models for now are backfilled into S3 starting from every MXNet release version v1.1.0. +- The script for training the models on older versions of MXNet is : `train_mxnet_legacy_models.sh`. - The inference files are suffixed by `_inference.py`. diff --git a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh new file mode 100755 index 000000000000..4055895a2e68 --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +#Author: Piyush Ghai + +run_models() { + echo '==========================' + echo "Running training files and preparing models" + echo '==========================' + python mnist_mlp_module_api_train.py + echo '==========================' + python lenet_cnn_gluon_hybrid_train.py + echo '==========================' + python lm_rnn_gluon_train.py + echo '==========================' + python lenet_cnn_gluon_train.py + echo '==========================' +} + +install_mxnet() { + version=$1 + echo "Installing MXNet "$version + pip install mxnet==$version +} + +install_boto3(){ + echo "Intalling boto3" + pip install boto3 +} + +echo `pwd` +cd tests/nightly/model_backwards_compatibility_check +echo `pwd` + +install_boto3 + +install_mxnet 1.1.0 +run_models + +install_mxnet 1.2.0 +run_models \ No newline at end of file From 7c41488fe3d8679ec12a5d0694a596cc6a72017d Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Tue, 10 Jul 2018 23:15:06 -0700 Subject: [PATCH 17/59] Added check for preventing inference script from crashing in case no trained models are found --- tests/nightly/model_backwards_compatibility_check/common.py | 3 +++ .../mnist_mlp_module_api_inference.py | 1 + 2 files changed, 4 insertions(+) diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index 463ee2239692..64163a7120c3 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -83,6 +83,9 @@ def get_top_level_folders_in_bucket(s3client, bucket_name): result = bucket.meta.client.list_objects(Bucket=bucket.name, Delimiter=backslash) folder_list = list() + if 'CommonPrefixes' not in result: + print ('No trained models found in S3 bucket : %s for this file. 
Please train the models and run inference again' %bucket_name) + sys.exit(1) for obj in result['CommonPrefixes']: folder_list.append(obj['Prefix'].strip(backslash)) diff --git a/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py b/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py index b96a1e8c9c8d..63302430cd02 100644 --- a/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py @@ -62,6 +62,7 @@ def perform_inference(test_iter, val_iter, model, inference_file): prefix = folder + backslash + model_name model_files_meta = list(bucket.objects.filter(Prefix = prefix)) if len(model_files_meta) == 0: + print ('No trained models found under path : %s' %prefix) continue model_files = list() for obj in model_files_meta: From 50be5d8c728237a1d74526c850a8a78b7118418b Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Tue, 10 Jul 2018 23:32:18 -0700 Subject: [PATCH 18/59] Fixed indentation issue --- .../lm_rnn_gluon_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py index 04f6b41cc273..6050ba5da24d 100644 --- a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py @@ -101,4 +101,4 @@ def clean_model_files(model_files): model = get_model(model_name + '.params') perform_inference(test_data, val_data, model, model_name + '_inference.json') - clean_up_files(model_files) + clean_up_files(model_files) From c3c91295b7c4e11241b5a32d4885c3d775f052b6 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Wed, 11 Jul 2018 11:09:00 -0700 Subject: [PATCH 19/59] Replaced Penn Tree Bank Dataset with Sherlock Holmes Dataset --- .../model_backwards_compatibility_check/common.py | 10 +++++----- .../lm_rnn_gluon_inference.py | 2 +- .../lm_rnn_gluon_train.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index 64163a7120c3..3b4a106ef382 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -204,8 +204,8 @@ def __init__(self, path): self.test = self.tokenize(path + 'test.txt') def download_data_from_s3(self, ): - print ('Downloading files from bucket : ptb-small-dataset' ) - bucket = s3.Bucket('ptb-small-dataset') + print ('Downloading files from bucket : sherlock-dataset' ) + bucket = s3.Bucket('sherlock-dataset') files = ['test.txt', 'train.txt', 'valid.txt'] for file in files: if os.path.exists(args_data + file) : @@ -241,7 +241,7 @@ def tokenize(self, path): #### Common utilies for lm_rnn_gluon_train & inference files -args_data = 'ptb.' +args_data = 'sherlockholmes.' 
args_model = 'rnn_relu' args_emsize = 100 args_nhid = 100 @@ -331,7 +331,7 @@ def eval(data_source, model): ntotal += L.size return total_L / ntotal -def clean_ptb_data(): +def clean_sherlock_data(): files = ['test.txt', 'train.txt', 'valid.txt'] for file in files: if os.path.isfile(args_data + file): @@ -339,4 +339,4 @@ def clean_ptb_data(): # This function is added so that if a download gets interrupted in between, one can clean the corrupted files clean_mnist_data() -clean_ptb_data() +clean_sherlock_data() diff --git a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py index 6050ba5da24d..cdbc8daed68d 100644 --- a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py @@ -66,7 +66,7 @@ def perform_inference(test_data, val_data, model, inference_file): print ('Inference results passed for %s' % model_name) def clean_up_files (model_files): - clean_ptb_data() + clean_sherlock_data() clean_model_files(model_files) print ('Model files deleted') diff --git a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py index 26a4e50fd25d..d6ac8abe0365 100644 --- a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py +++ b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py @@ -78,7 +78,7 @@ def save_inference_results(test, val): json.dump(inference_results, file) def clean_up_files (model_files): - clean_ptb_data() + clean_sherlock_data() clean_model_files(model_files) print ('Model files deleted') From 34853525c99a72c114c2f754403de421b0854409 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Wed, 11 Jul 2018 11:11:45 -0700 Subject: [PATCH 20/59] Fixed indentation issue --- .../lm_rnn_gluon_inference.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py index cdbc8daed68d..9f8b2a55bc0e 100644 --- a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py @@ -30,8 +30,7 @@ def test(test_data, model): def get_top_level_folders_in_bucket(s3client, bucket_name): '''This function returns the top level folders in the S3Bucket. These folders help us to navigate to the trained model files stored for different MXNet versions. ''' bucket = s3client.Bucket(bucket_name) - result = bucket.meta.client.list_objects(Bucket=bucket.name, - Delimiter=backslash) + result = bucket.meta.client.list_objects(Bucket=bucket.name, Delimiter=backslash) folder_list = list() for obj in result['CommonPrefixes']: folder_list.append(obj['Prefix'].strip(backslash)) From af9b86d36b9e153926982e01b1ed92c6102a1261 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Thu, 12 Jul 2018 12:21:20 -0700 Subject: [PATCH 21/59] Removed training in models and added smaller models. Now we are simply checking a forward pass in the model with dummy data. 
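For context, a minimal sketch of what the forward-pass check amounts to. The stand-in net, the dummy batch shape, and the file name below are illustrative only; the real model definitions and the save/load helpers live in common.py in this patch, which persists results with mx.nd.save under the 'inference' key and compares them with assert_almost_equal:

    import mxnet as mx
    from mxnet import gluon
    from mxnet.test_utils import assert_almost_equal

    # Stand-in two-layer net; the actual models are defined in common.py.
    net = gluon.nn.Sequential()
    net.add(gluon.nn.Dense(128, activation='relu'))
    net.add(gluon.nn.Dense(2))
    net.initialize(mx.init.Xavier(), ctx=mx.cpu())

    # Dummy batch: no training loop, just one forward pass.
    data = mx.nd.random.uniform(shape=(10, 32))
    out = net(data)

    # Training-side script (older MXNet) saves the outputs; the inference-side
    # script (newer MXNet) reloads them and checks the forward pass still matches.
    mx.nd.save('model-inference', {'inference': out})
    saved = mx.nd.load('model-inference')['inference']
    assert_almost_equal(out.asnumpy(), saved.asnumpy())
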
--- .../common.py | 327 +++++------------- .../lenet_cnn_gluon_hybrid_inference.py | 91 ----- .../lenet_cnn_gluon_hybrid_train.py | 130 ------- .../lenet_cnn_gluon_inference.py | 92 ----- .../lenet_cnn_gluon_train.py | 129 ------- .../lm_rnn_gluon_inference.py | 103 ------ .../lm_rnn_gluon_train.py | 131 ------- .../mnist_mlp_module_api_inference.py | 77 ----- .../mnist_mlp_module_api_train.py | 99 ------ .../model_backward_compat_checker.sh | 18 +- .../model_backwards_compat_inference.py | 168 +++++++++ .../model_backwards_compat_train.py | 159 +++++++++ .../train_mxnet_legacy_models.sh | 8 +- 13 files changed, 422 insertions(+), 1110 deletions(-) delete mode 100644 tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_inference.py delete mode 100644 tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_train.py delete mode 100644 tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_inference.py delete mode 100644 tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_train.py delete mode 100644 tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py delete mode 100644 tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py delete mode 100644 tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py delete mode 100644 tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_train.py create mode 100644 tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py create mode 100644 tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index 3b4a106ef382..52542cfee5b2 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -33,6 +33,7 @@ import re import time import sys +from mxnet.test_utils import assert_almost_equal # Set fixed random seeds. 
mx.random.seed(7) @@ -41,42 +42,78 @@ # get the current mxnet version we are running on mxnet_version = mx.__version__ -bucket_name = 'mxnet-model-backwards-compatibility' +model_bucket_name = 'mxnet-model-backwards-compatibility-models' +data_bucket_name = 'mxnet-model-backwards-compatibility-data' backslash = '/' s3 = boto3.resource('s3') -num_epoch = 2 +ctx = mx.cpu(0) -def prepare_mnist_data(mnist_raw_data): - - #shuffle the indices - indices = np.random.permutation(mnist_raw_data['train_label'].shape[0]) +def get_module_api_model_definition(): + input = mx.symbol.Variable('data') + input = mx.symbol.Flatten(data=input) - #print indices[0:10] - train_idx , val_idx = indices[:50000], indices[50000:] + fc1 = mx.symbol.FullyConnected(data=input, name='fc1', num_hidden=128) + act1 = mx.sym.Activation(data=fc1, name='relu1', act_type="relu") + fc2 = mx.symbol.FullyConnected(data=fc1, name='fc2', num_hidden=2) + op = mx.symbol.SoftmaxOutput(data=fc2, name='softmax') + model = mx.mod.Module(symbol=op, context=ctx, data_names=['data'], label_names=['softmax_label']) + return model - train_data = mnist_raw_data['train_data'][train_idx,:] - train_label = mnist_raw_data['train_label'][train_idx] - - val_data = mnist_raw_data['train_data'][val_idx,:] - val_label = mnist_raw_data['train_label'][val_idx] - - test_data = mnist_raw_data['test_data'] - test_label = mnist_raw_data['test_label'] +def save_inference_results(inference_results, model_name): + assert (isinstance(inference_results, mx.ndarray.ndarray.NDArray)) + mx.nd.save(model_name + '-inference', {'inference' : inference_results}) - #print len(train_data) - #print len(val_data) - - train = {'train_X' : train_data, 'train_Y' : train_label} - test = {'test_X' : test_data, 'test_Y' : test_label} - val = {'val_X' : val_data, 'val_Y' : val_label} - - data = dict() - data['train'] = train - data['test'] = test - data['val'] = val - +def load_inference_results(model_name): + inf_dict = mx.nd.load(model_name+'-inference') + return inf_dict['inference'] + +def save_data_and_labels(test_data, test_labels, model_name): + assert (isinstance(test_data, mx.ndarray.ndarray.NDArray)) + assert (isinstance(test_labels, mx.ndarray.ndarray.NDArray)) + mx.nd.save(model_name + '-data', {'data' : test_data, 'labels' : test_labels}) + +def upload_data_and_labels_to_s3(model_name): + s3 = boto3.client('s3') + file = model_name + '-data' + s3.upload_file(file, data_bucket_name, file) + print ('data files successfully uploaded to s3') + +def upload_model_files_to_s3(files, folder_name): + s3 = boto3.client('s3') + for file in files: + s3.upload_file(file, model_bucket_name, folder_name + file) + +def clean_model_files(files, model_name): + files.append(model_name + '-inference') + files.append(model_name + '-data') + + for file in files: + if os.path.isfile(file): + os.remove(file) + +def download_data_from_s3(model_name): + print ('Downloading data files for %s from bucket %s'%(model_name, data_bucket_name)) + bucket = s3.Bucket(data_bucket_name) + bucket.download_file(model_name+'-data', model_name+'-data') + + data = mx.nd.load(model_name+'-data') return data +def download_model_files_from_s3(model_name): + for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): + bucket = s3.Bucket(model_bucket_name) + prefix = folder + backslash + model_name + model_files_meta = list(bucket.objects.filter(Prefix = prefix)) + if len(model_files_meta) == 0: + print ('No trained models found under path : %s' %prefix) + continue + model_files = list() + for obj in 
model_files_meta: + file_name = obj.key.split('/')[2] + model_files.append(file_name) + ## Download this file--- + bucket.download_file(obj.key, file_name) + def get_top_level_folders_in_bucket(s3client, bucket_name): '''This function returns the top level folders in the S3Bucket. These folders help us to navigate to the trained model files stored for different MXNet versions. ''' bucket = s3client.Bucket(bucket_name) @@ -91,53 +128,9 @@ def get_top_level_folders_in_bucket(s3client, bucket_name): return folder_list -def clean_mnist_data(): - if os.path.isfile('train-images-idx3-ubyte.gz'): - os.remove('train-images-idx3-ubyte.gz') - if os.path.isfile('t10k-labels-idx1-ubyte.gz'): - os.remove('t10k-labels-idx1-ubyte.gz') - if os.path.isfile('train-labels-idx1-ubyte.gz'): - os.remove('train-labels-idx1-ubyte.gz') - if os.path.isfile('t10k-images-idx3-ubyte.gz'): - os.remove('t10k-images-idx3-ubyte.gz') - -def clean_model_files(model_files): - for file in model_files: - if os.path.isfile(file): - os.remove(file) - -def upload_model_files_to_s3(bucket_name, files, folder_name): - s3 = boto3.client('s3') - for file in files: - s3.upload_file(file, bucket_name, folder_name + file) - print ('model successfully uploaded to s3') - -def save_inference_results(inference_results_file, inference_results): - # Write the inference results to local json file. This will be cleaned up later - with open(inference_results_file, 'w') as file: - json.dump(inference_results, file) - - -def compare_versions(version1, version2): - ''' - https://stackoverflow.com/questions/1714027/version-number-comparison-in-python - ''' - def normalize(v): - return [int(x) for x in re.sub(r'(\.0+)*$','', v).split(".")] - return cmp(normalize(version1), normalize(version2)) - -def get_val_test_iter(): - data = prepare_mnist_data(mx.test_utils.get_mnist()) - val = data['val'] - test = data['test'] - batch_size = 100 - val_iter = mx.io.NDArrayIter(val['val_X'], val['val_Y'], batch_size, shuffle=True) - test_iter = mx.io.NDArrayIter(test['test_X'], test['test_Y']) - return val_iter, test_iter - -class HybridNet(gluon.HybridBlock): +class Net(gluon.Block): def __init__(self, **kwargs): - super(HybridNet, self).__init__(**kwargs) + super(Net, self).__init__(**kwargs) with self.name_scope(): # layers created in name_scope will inherit name space # from parent layer. @@ -146,9 +139,9 @@ def __init__(self, **kwargs): self.conv2 = nn.Conv2D(50, kernel_size=(5,5)) self.pool2 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) self.fc1 = nn.Dense(500) - self.fc2 = nn.Dense(10) + self.fc2 = nn.Dense(2) - def hybrid_forward(self, F, x): + def forward(self, x): x = self.pool1(F.tanh(self.conv1(x))) x = self.pool2(F.tanh(self.conv2(x))) # 0 means copy over size from corresponding dimension. @@ -158,9 +151,9 @@ def hybrid_forward(self, F, x): x = F.tanh(self.fc2(x)) return x -class Net(gluon.Block): +class HybridNet(gluon.HybridBlock): def __init__(self, **kwargs): - super(Net, self).__init__(**kwargs) + super(HybridNet, self).__init__(**kwargs) with self.name_scope(): # layers created in name_scope will inherit name space # from parent layer. 
@@ -169,9 +162,9 @@ def __init__(self, **kwargs): self.conv2 = nn.Conv2D(50, kernel_size=(5,5)) self.pool2 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) self.fc1 = nn.Dense(500) - self.fc2 = nn.Dense(10) + self.fc2 = nn.Dense(2) - def forward(self, x): + def hybrid_forward(self, F, x): x = self.pool1(F.tanh(self.conv1(x))) x = self.pool2(F.tanh(self.conv2(x))) # 0 means copy over size from corresponding dimension. @@ -181,162 +174,26 @@ def forward(self, x): x = F.tanh(self.fc2(x)) return x -class Dictionary(object): - def __init__(self): - self.word2idx = {} - self.idx2word = [] - - def add_word(self, word): - if word not in self.word2idx: - self.idx2word.append(word) - self.word2idx[word] = len(self.idx2word) - 1 - return self.word2idx[word] - - def __len__(self): - return len(self.idx2word) - -class Corpus(object): - def __init__(self, path): - self.dictionary = Dictionary() - self.download_data_from_s3() - self.train = self.tokenize(path + 'train.txt') - self.valid = self.tokenize(path + 'valid.txt') - self.test = self.tokenize(path + 'test.txt') - - def download_data_from_s3(self, ): - print ('Downloading files from bucket : sherlock-dataset' ) - bucket = s3.Bucket('sherlock-dataset') - files = ['test.txt', 'train.txt', 'valid.txt'] - for file in files: - if os.path.exists(args_data + file) : - print ('File %s'%(args_data + file), 'already exists. Skipping download') - continue - file_path = args_data + file - bucket.download_file(file_path, args_data + file) - - def tokenize(self, path): - """Tokenizes a text file.""" - assert os.path.exists(path) - # Add words to the dictionary - with open(path, 'r') as f: - tokens = 0 - for line in f: - words = line.split() + [''] - tokens += len(words) - for word in words: - self.dictionary.add_word(word) - - # Tokenize file content - with open(path, 'r') as f: - ids = np.zeros((tokens,), dtype='int32') - token = 0 - for line in f: - words = line.split() + [''] - for word in words: - ids[token] = self.dictionary.word2idx[word] - token += 1 - - return mx.nd.array(ids, dtype='int32') - - - -#### Common utilies for lm_rnn_gluon_train & inference files -args_data = 'sherlockholmes.' -args_model = 'rnn_relu' -args_emsize = 100 -args_nhid = 100 -args_nlayers = 2 -args_lr = 1.0 -args_clip = 0.2 -args_epochs = 2 -args_batch_size = 32 -args_bptt = 5 -args_dropout = 0.2 -args_tied = True -args_cuda = 'store_true' -args_log_interval = 500 - -class RNNModel(gluon.Block): - """A model with an encoder, recurrent layer, and a decoder.""" - - def __init__(self, mode, vocab_size, num_embed, num_hidden, - num_layers, dropout=0.5, tie_weights=False, **kwargs): - super(RNNModel, self).__init__(**kwargs) +class SimpleLSTMModel(gluon.Block): + def __init__(self, **kwargs): + super(SimpleLSTMModel, self).__init__(**kwargs) with self.name_scope(): - self.drop = nn.Dropout(dropout) - self.encoder = nn.Embedding(vocab_size, num_embed, - weight_initializer = mx.init.Uniform(0.1)) - if mode == 'rnn_relu': - self.rnn = rnn.RNN(num_hidden, num_layers, activation='relu', dropout=dropout, - input_size=num_embed) - elif mode == 'rnn_tanh': - self.rnn = rnn.RNN(num_hidden, num_layers, dropout=dropout, - input_size=num_embed) - elif mode == 'lstm': - self.rnn = rnn.LSTM(num_hidden, num_layers, dropout=dropout, - input_size=num_embed) - elif mode == 'gru': - self.rnn = rnn.GRU(num_hidden, num_layers, dropout=dropout, - input_size=num_embed) - else: - raise ValueError("Invalid mode %s. 
Options are rnn_relu, " - "rnn_tanh, lstm, and gru"%mode) - if tie_weights: - self.decoder = nn.Dense(vocab_size, in_units = num_hidden, - params = self.encoder.params) - else: - self.decoder = nn.Dense(vocab_size, in_units = num_hidden) - self.num_hidden = num_hidden - - def forward(self, inputs, hidden): - emb = self.drop(self.encoder(inputs)) - output, hidden = self.rnn(emb, hidden) - output = self.drop(output) - decoded = self.decoder(output.reshape((-1, self.num_hidden))) - return decoded, hidden - - def begin_state(self, *args, **kwargs): - return self.rnn.begin_state(*args, **kwargs) - -def batchify(data, batch_size): - """Reshape data into (num_example, batch_size)""" - nbatch = data.shape[0] // batch_size - data = data[:nbatch * batch_size] - data = data.reshape((batch_size, nbatch)).T - return data - -def get_batch(source, i): - seq_len = min(args_bptt, source.shape[0] - 1 - i) - data = source[i : i + seq_len] - target = source[i + 1 : i + 1 + seq_len] - return data, target.reshape((-1,)) + self.model = mx.gluon.nn.Sequential(prefix='') + with self.model.name_scope(): + self.model.add(mx.gluon.nn.Embedding(30, 10)) + self.model.add(mx.gluon.rnn.LSTM(20)) + self.model.add(mx.gluon.nn.Dense(100)) + self.model.add(mx.gluon.nn.Dropout(0.5)) + self.model.add(mx.gluon.nn.Dense(2, flatten=True, activation='tanh')) -def detach(hidden): - if isinstance(hidden, (tuple, list)): - hidden = [i.detach() for i in hidden] - else: - hidden = hidden.detach() - return hidden -def eval(data_source, model): - total_L = 0.0 - ntotal = 0 - loss = gluon.loss.SoftmaxCrossEntropyLoss() - hidden = model.begin_state(func = mx.nd.zeros, batch_size = args_batch_size, ctx=mx.cpu(0)) - for i in range(0, data_source.shape[0] - 1, args_bptt): - data, target = get_batch(data_source, i) - output, hidden = model(data, hidden) - L = loss(output, target) - total_L += mx.nd.sum(L).asscalar() - ntotal += L.size - return total_L / ntotal - -def clean_sherlock_data(): - files = ['test.txt', 'train.txt', 'valid.txt'] - for file in files: - if os.path.isfile(args_data + file): - os.remove(args_data + file) + def forward(self, x): + return self.model(x) -# This function is added so that if a download gets interrupted in between, one can clean the corrupted files -clean_mnist_data() -clean_sherlock_data() +def compare_versions(version1, version2): + ''' + https://stackoverflow.com/questions/1714027/version-number-comparison-in-python + ''' + def normalize(v): + return [int(x) for x in re.sub(r'(\.0+)*$','', v).split(".")] + return cmp(normalize(version1), normalize(version2)) \ No newline at end of file diff --git a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_inference.py b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_inference.py deleted file mode 100644 index 756f73e9bf28..000000000000 --- a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_inference.py +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env python - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -from common import * - -model_name = 'lenet_cnn_gluon_hybrid_api' -num_epoch = 2 -ctx = [mx.cpu(0)] -batch_size = 100 - -val_iter, test_iter = get_val_test_iter() - -def get_model(model_name): - net = gluon.SymbolBlock.imports(model_name + '-symbol.json', ['data'], model_name + '-000' + str(num_epoch) + '.params') - return net - -def get_inference_score(iter, model): - # Use Accuracy as the evaluation metric. - metric = mx.metric.Accuracy() - # Reset the validation data iterator. - iter.reset() - # Loop over the validation data iterator. - for batch in iter: - # Splits validation data into multiple slices along batch_axis - # and copy each slice into a context. - data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) - # Splits validation label into multiple slices along batch_axis - # and copy each slice into a context. - label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) - outputs = [] - for x in data: - outputs.append(model(x)) - # Updates internal evaluation - metric.update(label, outputs) - acc = metric.get() - return acc[1] - -def perform_inference(test_iter, val_iter, model, inference_file): - test_inference_score = get_inference_score(test_iter, model) - val_inference_score = get_inference_score(val_iter, model) - - with open(inference_file, 'r') as file: - results = json.load(file) - - print (test_inference_score, val_inference_score) - print results['val_acc'] - print ('Validation accuracy on inference is %f while that on the original training file is %f' % (val_inference_score, results['val_acc'])) - print ('Test accuracy on inference is %f while that on the original training file is %f' % (test_inference_score, results['test_acc'])) - assert(results['val_acc'] == val_inference_score) - assert(results['test_acc'] == test_inference_score) - print ('Inference results passed for %s' % model_name) - -def clean_up_files (model_files): - clean_mnist_data() - clean_model_files(model_files) - print ('Model files deleted') - -if __name__=='__main__': - for folder in get_top_level_folders_in_bucket(s3, bucket_name): - bucket = s3.Bucket(bucket_name) - prefix = folder + backslash + model_name - model_files_meta = list(bucket.objects.filter(Prefix = prefix)) - if len(model_files_meta) == 0: - continue - model_files = list() - for obj in model_files_meta: - file_name = obj.key.split('/')[2] - model_files.append(file_name) - ## Download this file--- - bucket.download_file(obj.key, file_name) - - model = get_model(model_name) - perform_inference(test_iter, val_iter, model, model_name + '_inference.json') - clean_up_files(model_files) diff --git a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_train.py b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_train.py deleted file mode 100644 index b7965ce7c2a5..000000000000 --- a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_hybrid_train.py +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env python - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from common import * - -batch_size=100 -num_epoch = 2 -model_name = 'lenet_cnn_gluon_hybrid_api' - -ctx = [mx.cpu(0)] - -def clean_up_files (model_files): - clean_mnist_data() - clean_model_files(model_files) - print ('Model files deleted') - -def save_model_files(network): - network.export(model_name, epoch=num_epoch) - -def get_inference_score(iter, model): - # Use Accuracy as the evaluation metric. - metric = mx.metric.Accuracy() - # Reset the validation data iterator. - iter.reset() - # Loop over the validation data iterator. - for batch in iter: - # Splits validation data into multiple slices along batch_axis - # and copy each slice into a context. - data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) - # Splits validation label into multiple slices along batch_axis - # and copy each slice into a context. - label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) - outputs = [] - for x in data: - outputs.append(model(x)) - # Updates internal evaluation - metric.update(label, outputs) - acc = metric.get() - return acc[1] - -if __name__=='__main__': - data = prepare_mnist_data(mx.test_utils.get_mnist()) - - train = data['train'] - val = data['val'] - test = data['test'] - - train_iter = mx.io.NDArrayIter(train['train_X'], train['train_Y'], batch_size, shuffle=True) - val_iter = mx.io.NDArrayIter(val['val_X'], val['val_Y'], batch_size, shuffle=True) - test_iter = mx.io.NDArrayIter(test['test_X'], test['test_Y']) - - - net = HybridNet() - net.initialize(mx.init.Xavier(), ctx=ctx) - net.hybridize() - - metric = mx.metric.Accuracy() - softmax_cross_entropy_loss = gluon.loss.SoftmaxCrossEntropyLoss() - trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.02}) - - for i in range(num_epoch): - train_iter.reset() - for batch in train_iter: - data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) - label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) - outputs = [] - # Inside training scope - with ag.record(): - for x, y in zip(data, label): - z = net(x) - # Computes softmax cross entropy loss. - loss = softmax_cross_entropy_loss(z, y) - # Backpropagate the error for one iteration. - loss.backward() - outputs.append(z) - - metric.update(label, outputs) - # Make one step of parameter update. Trainer needs to know the - # batch size of data to normalize the gradient by 1/batch_size. - trainer.step(batch.data[0].shape[0]) - - name, acc = metric.get() - # Reset evaluation result to initial state. 
- metric.reset() - print('training acc at epoch %d: %s=%f'%(i, name, acc)) - - save_model_files(net) - - - # In[6]: - val_acc = get_inference_score(val_iter, net) - print('validation acc: =%f'%val_acc) - - test_acc = get_inference_score(test_iter, net) - print('test acc: =%f'%test_acc) - - inference_results = dict() - inference_results['val_acc'] = val_acc - inference_results['test_acc'] = test_acc - - inference_results_file = model_name + '_inference' + '.json' - - save_inference_results(inference_results_file, inference_results) - - mxnet_folder = str(mxnet_version) + backslash + model_name + backslash - - files = list() - files.append(model_name + '-000' + str(num_epoch) + '.params') - files.append(model_name + '-symbol.json') - files.append(model_name + '_inference' + '.json') - - upload_model_files_to_s3(bucket_name, files, mxnet_folder) - - clean_up_files(files) \ No newline at end of file diff --git a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_inference.py b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_inference.py deleted file mode 100644 index 47ef040fd126..000000000000 --- a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_inference.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from common import * - -model_name = 'lenet_cnn_gluon_api' -num_epoch = 2 -ctx = [mx.cpu(0)] -batch_size = 100 - -val_iter, test_iter = get_val_test_iter() - -def get_model(model_file): - net = Net() - net.load_params(model_file, ctx) - - return net - -def get_inference_score(iter, model): - # Use Accuracy as the evaluation metric. - metric = mx.metric.Accuracy() - # Reset the validation data iterator. - iter.reset() - # Loop over the validation data iterator. - for batch in iter: - # Splits validation data into multiple slices along batch_axis - # and copy each slice into a context. - data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) - # Splits validation label into multiple slices along batch_axis - # and copy each slice into a context. 
- label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) - outputs = [] - for x in data: - outputs.append(model(x)) - # Updates internal evaluation - metric.update(label, outputs) - acc = metric.get() - return acc[1] - -def perform_inference(test_iter, val_iter, model, inference_file): - test_inference_score = get_inference_score(test_iter, model) - val_inference_score = get_inference_score(val_iter, model) - - with open(inference_file, 'r') as file: - results = json.load(file) - - print (test_inference_score, val_inference_score) - print results['val_acc'] - print ('Validation accuracy on inference is %f while that on the original training file is %f' % (val_inference_score, results['val_acc'])) - print ('Test accuracy on inference is %f while that on the original training file is %f' % (test_inference_score, results['test_acc'])) - assert(results['val_acc'] == val_inference_score) - assert(results['test_acc'] == test_inference_score) - print ('Inference results passed for %s' % model_name) - -def clean_up_files (model_files): - clean_mnist_data() - clean_model_files(model_files) - print ('Model files deleted') - -if __name__=='__main__': - for folder in get_top_level_folders_in_bucket(s3, bucket_name): - bucket = s3.Bucket(bucket_name) - prefix = folder + backslash + model_name - model_files_meta = list(bucket.objects.filter(Prefix = prefix)) - if len(model_files_meta) == 0: - continue - model_files = list() - for obj in model_files_meta: - file_name = obj.key.split('/')[2] - model_files.append(file_name) - ## Download this file--- - bucket.download_file(obj.key, file_name) - - model = get_model(model_name + '.params') - perform_inference(test_iter, val_iter, model, model_name + '_inference.json') - clean_up_files(model_files) diff --git a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_train.py b/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_train.py deleted file mode 100644 index 7f1fcad076ca..000000000000 --- a/tests/nightly/model_backwards_compatibility_check/lenet_cnn_gluon_train.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -from common import * - -batch_size=100 -num_epoch = 2 -model_name = 'lenet_cnn_gluon_api' - -ctx = [mx.cpu(0)] -mxnet_version = mx.__version__ - -def clean_up_files (model_files): - clean_mnist_data() - clean_model_files(model_files) - print ('Model files deleted') - -def save_model_files(network): - model_file_name = model_name + '.params' - network.save_params(model_file_name) - -def get_inference_score(iter, model): - # Use Accuracy as the evaluation metric. - metric = mx.metric.Accuracy() - # Reset the validation data iterator. - iter.reset() - # Loop over the validation data iterator. 
- for batch in iter: - # Splits validation data into multiple slices along batch_axis - # and copy each slice into a context. - data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) - # Splits validation label into multiple slices along batch_axis - # and copy each slice into a context. - label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) - outputs = [] - for x in data: - outputs.append(model(x)) - # Updates internal evaluation - metric.update(label, outputs) - acc = metric.get() - return acc[1] - -if __name__=='__main__': - data = prepare_mnist_data(mx.test_utils.get_mnist()) - - train = data['train'] - val = data['val'] - test = data['test'] - - train_iter = mx.io.NDArrayIter(train['train_X'], train['train_Y'], batch_size, shuffle=True) - val_iter = mx.io.NDArrayIter(val['val_X'], val['val_Y'], batch_size, shuffle=True) - test_iter = mx.io.NDArrayIter(test['test_X'], test['test_Y']) - - - net = Net() - net.initialize(mx.init.Xavier(), ctx=ctx) - - metric = mx.metric.Accuracy() - softmax_cross_entropy_loss = gluon.loss.SoftmaxCrossEntropyLoss() - trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.02}) - - for i in range(num_epoch): - train_iter.reset() - for batch in train_iter: - data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) - label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) - outputs = [] - # Inside training scope - with ag.record(): - for x, y in zip(data, label): - z = net(x) - # Computes softmax cross entropy loss. - loss = softmax_cross_entropy_loss(z, y) - # Backpropagate the error for one iteration. - loss.backward() - outputs.append(z) - - metric.update(label, outputs) - # Make one step of parameter update. Trainer needs to know the - # batch size of data to normalize the gradient by 1/batch_size. - trainer.step(batch.data[0].shape[0]) - - name, acc = metric.get() - # Reset evaluation result to initial state. - metric.reset() - print('training acc at epoch %d: %s=%f'%(i, name, acc)) - - save_model_files(net) - - - # In[6]: - val_acc = get_inference_score(val_iter, net) - print('validation acc: =%f'%val_acc) - - test_acc = get_inference_score(test_iter, net) - print('test acc: =%f'%test_acc) - - inference_results = dict() - inference_results['val_acc'] = val_acc - inference_results['test_acc'] = test_acc - - inference_results_file = model_name + '_inference' + '.json' - - save_inference_results(inference_results_file, inference_results) - - mxnet_folder = str(mxnet_version) + backslash + model_name + backslash - - files = list() - files.append(model_name + '.params') - files.append(model_name + '_inference' + '.json') - - upload_model_files_to_s3(bucket_name, files, mxnet_folder) - - clean_up_files(files) \ No newline at end of file diff --git a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py deleted file mode 100644 index 9f8b2a55bc0e..000000000000 --- a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_inference.py +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env python - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from common import * - -model_name = 'lm_rnn_gluon_api' - -context = mx.cpu(0) - -def test(test_data, model): - test_L = eval(test_data, model) - return test_L, np.exp(test_L) - -def get_top_level_folders_in_bucket(s3client, bucket_name): - '''This function returns the top level folders in the S3Bucket. These folders help us to navigate to the trained model files stored for different MXNet versions. ''' - bucket = s3client.Bucket(bucket_name) - result = bucket.meta.client.list_objects(Bucket=bucket.name, Delimiter=backslash) - folder_list = list() - for obj in result['CommonPrefixes']: - folder_list.append(obj['Prefix'].strip(backslash)) - - return folder_list - -def get_model(model_file): - model_2 = RNNModel(args_model, ntokens, args_emsize, args_nhid, - args_nlayers, args_dropout, args_tied) - model_2.load_parameters(model_name + '.params', context) - - return model_2 - -def perform_inference(test_data, val_data, model, inference_file): - test_loss, test_ppl = test(test_data, model) - val_loss, val_ppl = test(val_data, model) - - with open(inference_file, 'r') as file: - results = json.load(file) - val_results = results['val'] - test_results = results['test'] - - print ('Validation loss on inference is %f while that on the original training file is %f' % (val_loss, val_results['loss'])) - print ('Test loss on inference is %f while that on the original training file is %f' % (test_loss, test_results['loss'])) - - assert(test_loss == test_results['loss']) - assert(test_ppl == test_results['ppl']) - - assert(val_loss == val_results['loss']) - assert(val_ppl == val_results['ppl']) - - print ('Inference results passed for %s' % model_name) - -def clean_up_files (model_files): - clean_sherlock_data() - clean_model_files(model_files) - print ('Model files deleted') - -def clean_model_files(model_files): - for file in model_files: - if os.path.isfile(file): - os.remove(file) - -if __name__=='__main__': - - corpus = Corpus(args_data) - train_data = batchify(corpus.train, args_batch_size).as_in_context(context) - val_data = batchify(corpus.valid, args_batch_size).as_in_context(context) - test_data = batchify(corpus.test, args_batch_size).as_in_context(context) - ntokens = len(corpus.dictionary) - - for folder in get_top_level_folders_in_bucket(s3, bucket_name): - bucket = s3.Bucket(bucket_name) - prefix = folder + backslash + model_name - model_files_meta = list(bucket.objects.filter(Prefix = prefix)) - if len(model_files_meta) == 0: - continue - model_files = list() - for obj in model_files_meta: - # print - file_name = obj.key.split('/')[2] - if file_name is None or len(file_name) == 0: - continue - model_files.append(file_name) - ## Download this file--- - bucket.download_file(obj.key, file_name) - - model = get_model(model_name + '.params') - perform_inference(test_data, val_data, model, model_name + '_inference.json') - clean_up_files(model_files) diff --git 
a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py b/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py deleted file mode 100644 index d6ac8abe0365..000000000000 --- a/tests/nightly/model_backwards_compatibility_check/lm_rnn_gluon_train.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -from common import * - -num_epoch = 2 -model_name = 'lm_rnn_gluon_api' - -context = mx.cpu(0) - -def train(model, train_data): - best_val = float("Inf") - for epoch in range(args_epochs): - total_L = 0.0 - start_time = time.time() - hidden = model.begin_state(func = mx.nd.zeros, batch_size = args_batch_size, ctx = context) - for ibatch, i in enumerate(range(0, train_data.shape[0] - 1, args_bptt)): - data, target = get_batch(train_data, i) - hidden = detach(hidden) - with autograd.record(): - output, hidden = model(data, hidden) - L = loss(output, target) - L.backward() - - grads = [i.grad(context) for i in model.collect_params().values()] - # Here gradient is for the whole batch. - # So we multiply max_norm by batch_size and bptt size to balance it. - gluon.utils.clip_global_norm(grads, args_clip * args_bptt * args_batch_size) - - trainer.step(args_batch_size) - total_L += mx.nd.sum(L).asscalar() - - if ibatch % args_log_interval == 0 and ibatch > 0: - cur_L = total_L / args_bptt / args_batch_size / args_log_interval - print('[Epoch %d Batch %d] loss %.2f, perplexity %.2f' % ( - epoch + 1, ibatch, cur_L, np.exp(cur_L))) - total_L = 0.0 - - val_L = eval(val_data, model) - - print('[Epoch %d] time cost %.2fs, validation loss %.2f, validation perplexity %.2f' % ( - epoch + 1, time.time() - start_time, val_L, np.exp(val_L))) - - if val_L < best_val: - best_val = val_L - model.save_parameters(model_name + '.params') - -def test(test_data, model): - test_L = eval(test_data, model) - return test_L, np.exp(test_L) - -def save_inference_results(test, val): - inference_results = dict() - inference_results['val'] = val - inference_results['test'] = test - - inference_results_file = model_name + '_inference' + '.json' - - # Write the inference results to local json file. 
This will be cleaned up later - with open(inference_results_file, 'w') as file: - json.dump(inference_results, file) - -def clean_up_files (model_files): - clean_sherlock_data() - clean_model_files(model_files) - print ('Model files deleted') - -def clean_model_files(model_files): - for file in model_files: - if os.path.isfile(file): - os.remove(file) - -if __name__=='__main__': - ## If this code is being run on version >= 1.2.0 only then execute it, since it uses save_parameters and load_parameters API - if compare_versions(str(mxnet_version), '1.2.1') < 0: - print ('Found MXNet version %s and exiting because this version does not contain save_parameters and load_parameters functions' %str(mxnet_version)) - sys.exit(1) - - corpus = Corpus(args_data) - train_data = batchify(corpus.train, args_batch_size).as_in_context(context) - val_data = batchify(corpus.valid, args_batch_size).as_in_context(context) - test_data = batchify(corpus.test, args_batch_size).as_in_context(context) - - ntokens = len(corpus.dictionary) - - model = RNNModel(args_model, ntokens, args_emsize, args_nhid, - args_nlayers, args_dropout, args_tied) - model.collect_params().initialize(mx.init.Xavier(), ctx=context) - trainer = gluon.Trainer(model.collect_params(), 'sgd', {'learning_rate': args_lr, 'momentum': 0, 'wd': 0}) - loss = gluon.loss.SoftmaxCrossEntropyLoss() - - train(model, train_data) - val_loss, val_ppl = test(val_data, model) - print('Validation loss %f, Validation perplexity %f'%(val_loss, val_ppl)) - test_loss, test_ppl = test(test_data, model) - print('test loss %f, test perplexity %f'%(test_loss, test_ppl)) - - val_results = dict() - val_results['loss'] = val_loss - val_results['ppl'] = val_ppl - - test_results = dict() - test_results['loss'] = test_loss - test_results['ppl'] = test_ppl - - save_inference_results(test_results, val_results) - - mxnet_folder = str(mxnet_version) + backslash + model_name + backslash - - files = list() - files.append(model_name + '.params') - files.append(model_name + '_inference' + '.json') - upload_model_files_to_s3(bucket_name, files, mxnet_folder) - clean_up_files(files) \ No newline at end of file diff --git a/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py b/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py deleted file mode 100644 index 63302430cd02..000000000000 --- a/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_inference.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- - -from common import * - -model_name = 'mnist_mlp_module_api' -ctx = mx.cpu() - -val_iter, test_iter = get_val_test_iter() - -def get_model_definition(): - ##### Old Model ##### : - input = mx.symbol.Variable('data') - input = mx.symbol.Flatten(data=input) - - fc1 = mx.symbol.FullyConnected(data=input, name='fc1', num_hidden=128) - act1 = mx.symbol.Activation(data=fc1, name='relu1', act_type='relu') - - fc2 = mx.symbol.FullyConnected(data=act1, name='fc2', num_hidden=64) - output = mx.symbol.SoftmaxOutput(data=fc2, name='softmax') - - ### this is needed since the model is loaded from a checkpoint ### - sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, num_epoch) - loaded_model = mx.mod.Module(symbol=output, context=ctx, data_names=['data'], label_names=['softmax_label']) - loaded_model.bind(data_shapes=test_iter.provide_data, label_shapes=test_iter.provide_label) - loaded_model.set_params(arg_params, aux_params) - return loaded_model - -def perform_inference(test_iter, val_iter, model, inference_file): - test_inference_score = model.score(test_iter, ['acc']) - val_inference_score = model.score(val_iter, ['acc']) - - with open(inference_file, 'r') as file: - results = json.load(file) - - print ('Validation accuracy on inference is %f while that on the original training file is %f' % (val_inference_score[0][1], results['val_acc'])) - print ('Test accuracy on inference is %f while that on the original training file is %f' % (test_inference_score[0][1], results['test_acc'])) - assert(results['val_acc'] == val_inference_score[0][1]) - assert(results['test_acc'] == test_inference_score[0][1]) - print ('Inference results passed for %s' % model_name) - -if __name__=='__main__': - for folder in get_top_level_folders_in_bucket(s3, bucket_name): - bucket = s3.Bucket(bucket_name) - prefix = folder + backslash + model_name - model_files_meta = list(bucket.objects.filter(Prefix = prefix)) - if len(model_files_meta) == 0: - print ('No trained models found under path : %s' %prefix) - continue - model_files = list() - for obj in model_files_meta: - file_name = obj.key.split('/')[2] - model_files.append(file_name) - ## Download this file--- - bucket.download_file(obj.key, file_name) - - model = get_model_definition() - perform_inference(test_iter, val_iter, model, model_name + '_inference.json') - clean_model_files(model_files) - clean_mnist_data() diff --git a/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_train.py b/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_train.py deleted file mode 100644 index b89480e608f4..000000000000 --- a/tests/nightly/model_backwards_compatibility_check/mnist_mlp_module_api_train.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -from common import * - -ctx = mx.cpu() -batch_size = 100 -num_epoch = 2 -backslash = '/' -model_name = 'mnist_mlp_module_api' - -def clean_up_files (): - clean_mnist_data() - files = list() - for i in range(1, num_epoch+1): - files.append(model_name + '-000' + str(i) + '.params') - - files.append(model_name + '-symbol.json') - files.append(inference_results_file) - clean_model_files(files) - print ('Model files deleted') - -def get_model_definition(): - input = mx.symbol.Variable('data') - input = mx.symbol.Flatten(data=input) - - fc1 = mx.symbol.FullyConnected(data=input, name='fc1', num_hidden=128) - act1 = mx.symbol.Activation(data=fc1, name='relu1', act_type='relu') - - fc2 = mx.symbol.FullyConnected(data=act1, name='fc2', num_hidden=64) - output = mx.symbol.SoftmaxOutput(data=fc2, name='softmax') - - model = mx.mod.Module(symbol=output, context=ctx, data_names=['data'], label_names=['softmax_label']) - - return model - -if __name__=='__main__': - data = prepare_mnist_data(mx.test_utils.get_mnist()) - - train = data['train'] - val = data['val'] - test = data['test'] - - train_iter = mx.io.NDArrayIter(train['train_X'], train['train_Y'], batch_size, shuffle=True) - val_iter = mx.io.NDArrayIter(val['val_X'], val['val_Y'], batch_size, shuffle=True) - test_iter = mx.io.NDArrayIter(test['test_X'], test['test_Y']) - - model = get_model_definition() - - train_iter.reset() - checkpoint_callback = mx.callback.do_checkpoint(model_name) - model.fit(train_iter, epoch_end_callback=checkpoint_callback, eval_data=val_iter, optimizer='sgd', optimizer_params={'learning_rate' : 0.1}, eval_metric='acc', num_epoch=num_epoch) - - score_val = model.score(val_iter,['acc']) - val_acc = score_val[0][1] - print ('Validation Accuracy is : %f' % val_acc) - score_test = model.score(test_iter, ['acc']) - test_acc = score_test[0][1] - print ('Test Accuracy is : %f' % test_acc) - - inference_results = dict() - inference_results['val_acc'] = val_acc - inference_results['test_acc'] = test_acc - - inference_results_file = model_name + '_inference' + '.json' - - save_inference_results(inference_results_file, inference_results) - - model_params_file = model_name + '-000' + str(num_epoch) + '.params' - model_symbol_file = model_name + '-symbol.json' - model_inference_file = inference_results_file - files = list() - files.append(model_params_file) - files.append(model_symbol_file) - files.append(model_inference_file) - - - mxnet_folder = str(mxnet_version) + backslash + model_name + backslash - - # Upload the model files to S3 - upload_model_files_to_s3(bucket_name, files, mxnet_folder) - # Clean up the local files - clean_up_files() \ No newline at end of file diff --git a/tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh b/tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh index 69a031cafe23..66bd93102663 100755 --- a/tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh +++ b/tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh @@ -19,24 +19,10 @@ #Author: Piyush Ghai -echo "Invoking model_backwards_compat_test.sh script" +echo "Invoking model_backwards_compat_checker.sh script" echo `pwd` cd tests/nightly/model_backwards_compatibility_check echo `pwd` echo '==========================' -echo 'running mlp with module api' -python mnist_mlp_module_api_inference.py - -echo 
'==========================' -echo 'running lenet with gluon api (non - hybridized)' -python lenet_cnn_gluon_inference.py - -echo '==========================' -echo 'running lenet with gluon api (hybridized)' -python lenet_cnn_gluon_hybrid_inference.py - -echo '==========================' -echo 'running rnn with gluon - save and load parameters' -python lm_rnn_gluon_inference.py - +python model_backwards_compat_inference.py \ No newline at end of file diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py new file mode 100644 index 000000000000..3323c6fe4ba7 --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from common import * + +def test_module_checkpoint_api(): + model_name = 'module_checkpoint_api' + print ('Performing inference for model/API %s' %model_name) + data = download_data_from_s3(model_name) + test_data = data['data'] + test_label = data['labels'] + + data_iter = mx.io.NDArrayIter(test_data, test_label, batch_size=10) + + ## For each MXNet version that has the saved models + for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): + bucket = s3.Bucket(model_bucket_name) + prefix = folder + backslash + model_name + model_files_meta = list(bucket.objects.filter(Prefix = prefix)) + if len(model_files_meta) == 0: + print ('No trained models found under path : %s' %prefix) + continue + model_files = list() + ## For each file found under a model folder : + for obj in model_files_meta: + file_name = obj.key.split('/')[2] + model_files.append(file_name) + ## Download this file--- + bucket.download_file(obj.key, file_name) + + ## Load the model and perform inference + loaded_model = get_module_api_model_definition() + + sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, 1) + loaded_model.bind(data_shapes=data_iter.provide_data, label_shapes=data_iter.provide_label) + loaded_model.set_params(arg_params, aux_params) + + old_inference_results = load_inference_results(model_name) + inference_results = loaded_model.predict(data_iter) + ## Check whether they are equal or not ? 
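    ## For reference: assert_almost_equal comes from mxnet.test_utils (pulled in through
    ## common.py) and compares the two prediction arrays element-wise within a small
    ## relative/absolute tolerance, so the cross-version check tolerates floating point
    ## noise rather than requiring bit-exact output.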
+ assert_almost_equal(inference_results.asnumpy(), old_inference_results.asnumpy()) + clean_model_files(model_files, model_name) + + print ('Assertion passed for model : %s' %model_name) + + +def test_lenet_gluon_load_params_api(): + model_name = 'lenet_gluon_save_params_api' + print ('Performing inference for model/API %s' %model_name) + ## Get data from S3 + data = download_data_from_s3(model_name) + + test_data = data['data'] + + for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): + bucket = s3.Bucket(model_bucket_name) + prefix = folder + backslash + model_name + model_files_meta = list(bucket.objects.filter(Prefix = prefix)) + if len(model_files_meta) == 0: + print ('No trained models found under path : %s' %prefix) + continue + model_files = list() + ## For each file found under a model folder : + for obj in model_files_meta: + file_name = obj.key.split('/')[2] + model_files.append(file_name) + ## Download this file--- + bucket.download_file(obj.key, file_name) + + ## Load the model and perform inference + loaded_model = Net() + loaded_model.load_params(model_name+'-params') + output = loaded_model(test_data) + old_inference_results = mx.nd.load(model_name + '-inference')['inference'] + assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) + clean_model_files(model_files, model_name) + print ('Assertion passed for model : %s' %model_name) + +def test_lenet_gluon_hybrid_imports_api(): + model_name = 'lenet_gluon_hybrid_export_api' + print ('Performing inference for model/API %s' %model_name) + ## Get data from S3 + data = download_data_from_s3(model_name) + + test_data = data['data'] + + for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): + bucket = s3.Bucket(model_bucket_name) + prefix = folder + backslash + model_name + model_files_meta = list(bucket.objects.filter(Prefix = prefix)) + if len(model_files_meta) == 0: + print ('No trained models found under path : %s' %prefix) + continue + model_files = list() + ## For each file found under a model folder : + for obj in model_files_meta: + file_name = obj.key.split('/')[2] + model_files.append(file_name) + ## Download this file--- + bucket.download_file(obj.key, file_name) + + ## Load the model and perform inference + loaded_model = HybridNet() + loaded_model = gluon.SymbolBlock.imports(model_name + '-symbol.json', ['data'], model_name + '-0001.params') + output = loaded_model(test_data) + old_inference_results = mx.nd.load(model_name + '-inference')['inference'] + assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) + clean_model_files(model_files, model_name) + print ('Assertion passed for model : %s' %model_name) + +def test_lstm_gluon_load_parameters_api(): + ## If this code is being run on version >= 1.2.0 only then execute it, since it uses save_parameters and load_parameters API + if compare_versions(str(mxnet_version), '1.2.1') < 0: + print ('Found MXNet version %s and exiting because this version does not contain save_parameters and load_parameters functions' %str(mxnet_version)) + sys.exit(1) + + model_name = 'lstm_gluon_save_parameters_api' + print ('Performing inference for model/API %s' %model_name) + ## Get data from S3 + data = download_data_from_s3(model_name) + + test_data = data['data'] + + for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): + bucket = s3.Bucket(model_bucket_name) + prefix = folder + backslash + model_name + model_files_meta = list(bucket.objects.filter(Prefix = prefix)) + if len(model_files_meta) == 0: + print 
('No trained models found under path : %s' %prefix) + continue + model_files = list() + ## For each file found under a model folder : + for obj in model_files_meta: + file_name = obj.key.split('/')[2] + model_files.append(file_name) + ## Download this file--- + bucket.download_file(obj.key, file_name) + + ## Load the model and perform inference + loaded_model = SimpleLSTMModel() + loaded_model.load_params(model_name+'-params') + output = loaded_model(test_data) + old_inference_results = mx.nd.load(model_name + '-inference')['inference'] + assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) + clean_model_files(model_files, model_name) + print ('Assertion passed for model : %s' %model_name) + +if __name__=='__main__': + test_module_checkpoint_api() + test_lenet_gluon_load_params_api() + test_lenet_gluon_hybrid_imports_api() + test_lstm_gluon_load_parameters_api() diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py new file mode 100644 index 000000000000..b79607582647 --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
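The checkpoint-based test above and its training counterpart below rely on the Module save_checkpoint/load_checkpoint pairing. A minimal, self-contained sketch of that round trip follows; the tiny symbolic network and the 'toy_module' file prefix are illustrative placeholders, not part of the actual test suite.

import mxnet as mx
import numpy as np
from mxnet.test_utils import assert_almost_equal

# Illustrative input data; the real scripts generate theirs and upload them to S3.
data = mx.nd.array(np.random.uniform(-1, 1, size=(20, 1)))
label = mx.nd.array(np.random.randint(0, 2, size=(20,)), dtype='float32')
data_iter = mx.io.NDArrayIter(data, label, batch_size=10)

sym = mx.symbol.Variable('data')
sym = mx.symbol.FullyConnected(data=sym, name='fc1', num_hidden=2)
sym = mx.symbol.SoftmaxOutput(data=sym, name='softmax')

# Save a checkpoint at epoch 1, as train_module_checkpoint_api does.
mod = mx.mod.Module(symbol=sym, data_names=['data'], label_names=['softmax_label'], context=mx.cpu())
mod.bind(data_shapes=data_iter.provide_data, label_shapes=data_iter.provide_label)
mod.init_params(mx.initializer.Xavier(magnitude=2.57))
expected = mod.predict(data_iter)
mod.save_checkpoint('toy_module', 1)  # writes toy_module-symbol.json and toy_module-0001.params

# Reload the checkpoint and confirm the predictions are unchanged,
# mirroring the assertion in test_module_checkpoint_api.
loaded_sym, arg_params, aux_params = mx.model.load_checkpoint('toy_module', 1)
loaded = mx.mod.Module(symbol=loaded_sym, data_names=['data'], label_names=['softmax_label'], context=mx.cpu())
loaded.bind(data_shapes=data_iter.provide_data, label_shapes=data_iter.provide_label)
loaded.set_params(arg_params, aux_params)
assert_almost_equal(loaded.predict(data_iter).asnumpy(), expected.asnumpy())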
+ +from common import * + +def train_module_checkpoint_api(): + model_name = 'module_checkpoint_api' + print ('Saving files for model %s' %model_name) + ### Prepare data + test_data = mx.nd.array(np.random.uniform(-1, 1, size=(20, 1))) + test_label = mx.nd.array(np.random.randint(0, 2, size=(20,)), dtype='float32') + data_iter = mx.io.NDArrayIter(test_data, test_label, batch_size=10) + + + mod = get_module_api_model_definition() + mod.bind(data_shapes=data_iter.provide_data, label_shapes=data_iter.provide_label) + weights = mx.initializer.Xavier(magnitude = 2.57) + mod.init_params(weights) + + mod.save_checkpoint(model_name, 1) + ### Save the data, labels + save_data_and_labels(test_data, test_label, model_name) + upload_data_and_labels_to_s3(model_name) + + inference_results = mod.predict(data_iter) + ### Save inference_results + save_inference_results(inference_results, model_name) + ### upload model and inference files to S3 + files = list() + files.append(model_name + '-0001.params') + files.append(model_name + '-symbol.json') + files.append(model_name + '-inference') + + mxnet_folder = str(mxnet_version) + backslash + model_name + backslash + + upload_model_files_to_s3(files, mxnet_folder) + + clean_model_files(files, model_name) + +def train_lenet_gluon_save_params_api(): + model_name = 'lenet_gluon_save_params_api' + print ('Saving files for model %s' %model_name) + net = Net() + weights = mx.initializer.Xavier(magnitude = 2.57) + net.initialize(weights, ctx = [mx.cpu(0)]) + ### Prepare data + + test_data = mx.nd.array(np.random.uniform(-1, 1, size=(20, 1, 30, 30))) + output = net(test_data) + # print (y) + # ### Save the test data as well. + # ### Save the inference output ys + # ### Save the model params + + mx.nd.save(model_name + '-data', {'data' : test_data}) + save_inference_results(output, model_name) + net.save_params(model_name + '-params') + + mxnet_folder = str(mxnet_version) + backslash + model_name + backslash + + files = list() + files.append(model_name + '-data') + files.append(model_name + '-inference') + files.append(model_name + '-params') + + upload_data_and_labels_to_s3(model_name) + + upload_model_files_to_s3(files, mxnet_folder) + + clean_model_files(files, model_name) + +def train_lenet_gluon_hybrid_export_api(): + model_name = 'lenet_gluon_hybrid_export_api' + print ('Saving files for model %s' %model_name) + net = HybridNet() + weights = mx.initializer.Xavier(magnitude = 2.57) + net.initialize(weights, ctx = [mx.cpu(0)]) + net.hybridize() + ### Prepare data + test_data = mx.nd.array(np.random.uniform(-1, 1, size=(20, 1, 30, 30))) + output = net(test_data) + # print (y) + ### Save the test data as well. 
+ ### Save the inference output ys + ### Save the model params + + mx.nd.save(model_name + '-data', {'data' : test_data}) + save_inference_results(output, model_name) + net.export(model_name, epoch=1) + + mxnet_folder = str(mxnet_version) + backslash + model_name + backslash + + files = list() + files.append(model_name + '-data') + files.append(model_name + '-inference') + files.append(model_name + '-0001.params') + files.append(model_name + '-symbol.json') + + + upload_data_and_labels_to_s3(model_name) + + upload_model_files_to_s3(files, mxnet_folder) + + clean_model_files(files, model_name) + +def train_lstm_gluon_save_parameters_api(): + ## If this code is being run on version >= 1.2.0 only then execute it, since it uses save_parameters and load_parameters API + if compare_versions(str(mxnet_version), '1.2.1') < 0: + print ('Found MXNet version %s and exiting because this version does not contain save_parameters and load_parameters functions' %str(mxnet_version)) + sys.exit(1) + + model_name = 'lstm_gluon_save_parameters_api' + print ('Saving files for model %s' %model_name) + net = SimpleLSTMModel() + weights = mx.initializer.Xavier(magnitude = 2.57) + net.initialize(weights, ctx = [mx.cpu(0)]) + + test_data = mx.nd.array(np.random.uniform(-1, 1, size=(10, 30))) + output = net(test_data) + # print output + mx.nd.save(model_name + '-data', {'data' : test_data}) + save_inference_results(output, model_name) + net.save_parameters(model_name + '-params') + + mxnet_folder = str(mxnet_version) + backslash + model_name + backslash + + files = list() + files.append(model_name + '-data') + files.append(model_name + '-inference') + files.append(model_name + '-params') + + upload_data_and_labels_to_s3(model_name) + + upload_model_files_to_s3(files, mxnet_folder) + + clean_model_files(files, model_name) + + +if __name__=='__main__': + train_module_checkpoint_api() + train_lenet_gluon_save_params_api() + train_lenet_gluon_hybrid_export_api() + train_lstm_gluon_save_parameters_api() diff --git a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh index 4055895a2e68..8d57f7b7d724 100755 --- a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh +++ b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh @@ -23,13 +23,7 @@ run_models() { echo '==========================' echo "Running training files and preparing models" echo '==========================' - python mnist_mlp_module_api_train.py - echo '==========================' - python lenet_cnn_gluon_hybrid_train.py - echo '==========================' - python lm_rnn_gluon_train.py - echo '==========================' - python lenet_cnn_gluon_train.py + python model_backwards_compat_train.py echo '==========================' } From 79cfa467fd36e3722aeaf7dda0bb3e880594ca76 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Thu, 12 Jul 2018 12:23:18 -0700 Subject: [PATCH 22/59] Updated README --- tests/nightly/model_backwards_compatibility_check/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/nightly/model_backwards_compatibility_check/README.md b/tests/nightly/model_backwards_compatibility_check/README.md index c24b26151108..f894eafc322f 100644 --- a/tests/nightly/model_backwards_compatibility_check/README.md +++ b/tests/nightly/model_backwards_compatibility_check/README.md @@ -13,8 +13,8 @@ This is configuration file for jenkins job. 
- Currently the APIs that covered for model saving/loading are : do_checkpoint/load_checkpoint, save_params/load_params, save_parameters/load_parameters(added v1.2.1 onwards), export/gluon.SymbolBlock.imports. - These APIs are covered over models with architectures such as : MLP, RNNs, LeNet covering the four scenarios described above. - More operators/models will be added in the future to extend the operator coverage. -- The model train files suffixed by `_train.py` and the trained models are hosted in AWS S3. +- The model train file is suffixed by `_train.py` and the trained models are hosted in AWS S3. - The trained models for now are backfilled into S3 starting from every MXNet release version v1.1.0. - The script for training the models on older versions of MXNet is : `train_mxnet_legacy_models.sh`. -- The inference files are suffixed by `_inference.py`. +- The inference file is suffixed by `_inference.py`. From 4df779b217c99aa8a9eaebbfd7568461d62b0dc1 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Thu, 12 Jul 2018 15:38:16 -0700 Subject: [PATCH 23/59] Fixed indentation error --- .../model_backwards_compat_train.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py index b79607582647..eeda865dd32f 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py @@ -121,10 +121,10 @@ def train_lenet_gluon_hybrid_export_api(): def train_lstm_gluon_save_parameters_api(): ## If this code is being run on version >= 1.2.0 only then execute it, since it uses save_parameters and load_parameters API - if compare_versions(str(mxnet_version), '1.2.1') < 0: - print ('Found MXNet version %s and exiting because this version does not contain save_parameters and load_parameters functions' %str(mxnet_version)) - sys.exit(1) - + if compare_versions(str(mxnet_version), '1.2.1') < 0: + print ('Found MXNet version %s and exiting because this version does not contain save_parameters and load_parameters functions' %str(mxnet_version)) + sys.exit(1) + model_name = 'lstm_gluon_save_parameters_api' print ('Saving files for model %s' %model_name) net = SimpleLSTMModel() From 04465b03ce3f50dc0a2e3af65626d5c05be0e692 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Thu, 12 Jul 2018 15:49:27 -0700 Subject: [PATCH 24/59] Fixed indentation error --- .../model_backwards_compatibility_check/common.py | 2 +- .../model_backwards_compat_inference.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index 52542cfee5b2..1f69d666ee48 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -36,7 +36,7 @@ from mxnet.test_utils import assert_almost_equal # Set fixed random seeds. 
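Of the pairings listed in the README above, export together with gluon.SymbolBlock.imports is the one exercised by train_lenet_gluon_hybrid_export_api and test_lenet_gluon_hybrid_imports_api. A rough, self-contained sketch of that round trip is shown below; the small dense network and the 'toy_hybrid' prefix are placeholders rather than the HybridNet the tests actually use.

import mxnet as mx
from mxnet import gluon, nd
from mxnet.test_utils import assert_almost_equal

net = gluon.nn.HybridSequential()
with net.name_scope():
    net.add(gluon.nn.Dense(16, activation='relu'))
    net.add(gluon.nn.Dense(2))
net.initialize(mx.init.Xavier(magnitude=2.57), ctx=mx.cpu())
net.hybridize()

data = nd.random.uniform(-1, 1, shape=(4, 10))
expected = net(data)               # a forward pass is required before export()
net.export('toy_hybrid', epoch=1)  # writes toy_hybrid-symbol.json and toy_hybrid-0001.params

# Import the serialized symbol and parameters back as a SymbolBlock and compare outputs.
loaded = gluon.SymbolBlock.imports('toy_hybrid-symbol.json', ['data'], 'toy_hybrid-0001.params')
assert_almost_equal(loaded(data).asnumpy(), expected.asnumpy())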
-mx.random.seed(7) +mx.random.seed(7, ctx = mx.cpu()) np.random.seed(7) logging.getLogger().setLevel(logging.DEBUG) diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py index 3323c6fe4ba7..9eb01360ae74 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py @@ -126,10 +126,10 @@ def test_lenet_gluon_hybrid_imports_api(): def test_lstm_gluon_load_parameters_api(): ## If this code is being run on version >= 1.2.0 only then execute it, since it uses save_parameters and load_parameters API - if compare_versions(str(mxnet_version), '1.2.1') < 0: - print ('Found MXNet version %s and exiting because this version does not contain save_parameters and load_parameters functions' %str(mxnet_version)) - sys.exit(1) - + if compare_versions(str(mxnet_version), '1.2.1') < 0: + print ('Found MXNet version %s and exiting because this version does not contain save_parameters and load_parameters functions' %str(mxnet_version)) + sys.exit(1) + model_name = 'lstm_gluon_save_parameters_api' print ('Performing inference for model/API %s' %model_name) ## Get data from S3 @@ -154,7 +154,7 @@ def test_lstm_gluon_load_parameters_api(): ## Load the model and perform inference loaded_model = SimpleLSTMModel() - loaded_model.load_params(model_name+'-params') + loaded_model.load_parameters(model_name+'-params') output = loaded_model(test_data) old_inference_results = mx.nd.load(model_name + '-inference')['inference'] assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) From 2d5cf097ce4d340a89c329df15411e34ca807ed9 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Fri, 13 Jul 2018 15:23:46 -0700 Subject: [PATCH 25/59] Removed code duplication in the training file --- .../common.py | 4 +- .../model_backwards_compat_inference.py | 62 ++----------------- 2 files changed, 8 insertions(+), 58 deletions(-) diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index 1f69d666ee48..a988e2211f9c 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -114,6 +114,8 @@ def download_model_files_from_s3(model_name): ## Download this file--- bucket.download_file(obj.key, file_name) + return model_files + def get_top_level_folders_in_bucket(s3client, bucket_name): '''This function returns the top level folders in the S3Bucket. These folders help us to navigate to the trained model files stored for different MXNet versions. ''' bucket = s3client.Bucket(bucket_name) @@ -122,7 +124,7 @@ def get_top_level_folders_in_bucket(s3client, bucket_name): folder_list = list() if 'CommonPrefixes' not in result: print ('No trained models found in S3 bucket : %s for this file. 
Please train the models and run inference again' %bucket_name) - sys.exit(1) + return for obj in result['CommonPrefixes']: folder_list.append(obj['Prefix'].strip(backslash)) diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py index 9eb01360ae74..0e8201da44c9 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py @@ -30,20 +30,7 @@ def test_module_checkpoint_api(): ## For each MXNet version that has the saved models for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): - bucket = s3.Bucket(model_bucket_name) - prefix = folder + backslash + model_name - model_files_meta = list(bucket.objects.filter(Prefix = prefix)) - if len(model_files_meta) == 0: - print ('No trained models found under path : %s' %prefix) - continue - model_files = list() - ## For each file found under a model folder : - for obj in model_files_meta: - file_name = obj.key.split('/')[2] - model_files.append(file_name) - ## Download this file--- - bucket.download_file(obj.key, file_name) - + model_files = download_model_files_from_s3(model_name) ## Load the model and perform inference loaded_model = get_module_api_model_definition() @@ -69,20 +56,7 @@ def test_lenet_gluon_load_params_api(): test_data = data['data'] for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): - bucket = s3.Bucket(model_bucket_name) - prefix = folder + backslash + model_name - model_files_meta = list(bucket.objects.filter(Prefix = prefix)) - if len(model_files_meta) == 0: - print ('No trained models found under path : %s' %prefix) - continue - model_files = list() - ## For each file found under a model folder : - for obj in model_files_meta: - file_name = obj.key.split('/')[2] - model_files.append(file_name) - ## Download this file--- - bucket.download_file(obj.key, file_name) - + model_files = download_model_files_from_s3(model_name) ## Load the model and perform inference loaded_model = Net() loaded_model.load_params(model_name+'-params') @@ -101,20 +75,7 @@ def test_lenet_gluon_hybrid_imports_api(): test_data = data['data'] for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): - bucket = s3.Bucket(model_bucket_name) - prefix = folder + backslash + model_name - model_files_meta = list(bucket.objects.filter(Prefix = prefix)) - if len(model_files_meta) == 0: - print ('No trained models found under path : %s' %prefix) - continue - model_files = list() - ## For each file found under a model folder : - for obj in model_files_meta: - file_name = obj.key.split('/')[2] - model_files.append(file_name) - ## Download this file--- - bucket.download_file(obj.key, file_name) - + model_files = download_model_files_from_s3(model_name) ## Load the model and perform inference loaded_model = HybridNet() loaded_model = gluon.SymbolBlock.imports(model_name + '-symbol.json', ['data'], model_name + '-0001.params') @@ -128,7 +89,7 @@ def test_lstm_gluon_load_parameters_api(): ## If this code is being run on version >= 1.2.0 only then execute it, since it uses save_parameters and load_parameters API if compare_versions(str(mxnet_version), '1.2.1') < 0: print ('Found MXNet version %s and exiting because this version does not contain save_parameters and load_parameters functions' %str(mxnet_version)) - sys.exit(1) + return model_name = 
'lstm_gluon_save_parameters_api' print ('Performing inference for model/API %s' %model_name) @@ -138,20 +99,7 @@ def test_lstm_gluon_load_parameters_api(): test_data = data['data'] for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): - bucket = s3.Bucket(model_bucket_name) - prefix = folder + backslash + model_name - model_files_meta = list(bucket.objects.filter(Prefix = prefix)) - if len(model_files_meta) == 0: - print ('No trained models found under path : %s' %prefix) - continue - model_files = list() - ## For each file found under a model folder : - for obj in model_files_meta: - file_name = obj.key.split('/')[2] - model_files.append(file_name) - ## Download this file--- - bucket.download_file(obj.key, file_name) - + model_files = download_model_files_from_s3(model_name) ## Load the model and perform inference loaded_model = SimpleLSTMModel() loaded_model.load_parameters(model_name+'-params') From 7bfdf874d4ae25cf464a0556121db72d6e25bac3 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 16 Jul 2018 09:49:39 -0700 Subject: [PATCH 26/59] Added comments for runtime_functions script for training files --- ci/docker/runtime_functions.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 71545c55f98a..01ea6ec391b9 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -863,6 +863,8 @@ nightly_model_backwards_compat_test() { } #Backfills S3 bucket with models trained on earlier versions of mxnet +# Note : This script should not be called from the regular docker environment because the IAM roles required for S3 uploads +# do not get propagated into the container as of now. nightly_model_backwards_compat_train() { set -ex export PYTHONPATH=./python/ From c80ee31fd252f45e057a07afa392f27185ef036c Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 16 Jul 2018 13:03:47 -0700 Subject: [PATCH 27/59] Merged S3 Buckets for storing data and models into one --- .../model_backwards_compatibility_check/common.py | 14 +++++++++----- .../model_backwards_compat_train.py | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index a988e2211f9c..195a4ada95dc 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -43,7 +43,7 @@ # get the current mxnet version we are running on mxnet_version = mx.__version__ model_bucket_name = 'mxnet-model-backwards-compatibility-models' -data_bucket_name = 'mxnet-model-backwards-compatibility-data' +data_folder = 'mxnet-model-backwards-compatibility-data' backslash = '/' s3 = boto3.resource('s3') ctx = mx.cpu(0) @@ -75,7 +75,7 @@ def save_data_and_labels(test_data, test_labels, model_name): def upload_data_and_labels_to_s3(model_name): s3 = boto3.client('s3') file = model_name + '-data' - s3.upload_file(file, data_bucket_name, file) + s3.upload_file(file, model_bucket_name, data_folder + backslash + file) print ('data files successfully uploaded to s3') def upload_model_files_to_s3(files, folder_name): @@ -92,9 +92,9 @@ def clean_model_files(files, model_name): os.remove(file) def download_data_from_s3(model_name): - print ('Downloading data files for %s from bucket %s'%(model_name, data_bucket_name)) - bucket = s3.Bucket(data_bucket_name) - bucket.download_file(model_name+'-data', model_name+'-data') + print ('Downloading data files 
for %s from bucket %s'%(model_name, model_bucket_name + backslash + data_folder)) + bucket = s3.Bucket(model_bucket_name) + bucket.download_file(data_folder + backslash + model_name+'-data', model_name+'-data') data = mx.nd.load(model_name+'-data') return data @@ -126,6 +126,10 @@ def get_top_level_folders_in_bucket(s3client, bucket_name): print ('No trained models found in S3 bucket : %s for this file. Please train the models and run inference again' %bucket_name) return for obj in result['CommonPrefixes']: + folder_name = obj['Prefix'].strip(backslash) + # The top level folders contain MXNet Version # for trained models. Skipping the data folder here + if folder_name == data_folder: + continue folder_list.append(obj['Prefix'].strip(backslash)) return folder_list diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py index eeda865dd32f..92d377eaa381 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py @@ -120,10 +120,10 @@ def train_lenet_gluon_hybrid_export_api(): clean_model_files(files, model_name) def train_lstm_gluon_save_parameters_api(): - ## If this code is being run on version >= 1.2.0 only then execute it, since it uses save_parameters and load_parameters API + ## If this code is being run on version >= 1.2.1 only then execute it, since it uses save_parameters and load_parameters API if compare_versions(str(mxnet_version), '1.2.1') < 0: print ('Found MXNet version %s and exiting because this version does not contain save_parameters and load_parameters functions' %str(mxnet_version)) - sys.exit(1) + return model_name = 'lstm_gluon_save_parameters_api' print ('Saving files for model %s' %model_name) From e764d5a2898d42f263688685bf575234a492a650 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 16 Jul 2018 14:58:21 -0700 Subject: [PATCH 28/59] Automated the process to fetch MXNet versions from git tags --- .../common.py | 2 +- .../train_mxnet_legacy_models.sh | 26 ++++++++++++++++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index 195a4ada95dc..910a7e6d5544 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -36,7 +36,7 @@ from mxnet.test_utils import assert_almost_equal # Set fixed random seeds. -mx.random.seed(7, ctx = mx.cpu()) +mx.random.seed(7) np.random.seed(7) logging.getLogger().setLevel(logging.DEBUG) diff --git a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh index 8d57f7b7d724..b186cf7fcfbb 100755 --- a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh +++ b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh @@ -44,8 +44,26 @@ echo `pwd` install_boto3 -install_mxnet 1.1.0 -run_models +## Fetch the latest release tags, filtering out 'rcs' and filtering out some other irrelevant ones +## This list is sorted in descending order chronologically. Keeping n = 5 for a precautionary check. 
+## Sample output for the below git tag command is : 1.2.0 utils 1.1.0 1.0.0 0.12.1 +previous_versions=($(git tag --sort=-creatordate | grep --invert-match rc | head -n 5)) +count=0 +for version in ${previous_versions[*]} +do + # We just need to train the previous two versions. This logic can be changed later on as welll. + if [[ "$count" -gt 1 ]] + then + echo "Successfully trained files for the previous two MXNet release versions" + exit 1 + fi -install_mxnet 1.2.0 -run_models \ No newline at end of file + ## If MXNet major version starts with a number >=1. with a wildcard match for the minor version numbers + if [[ $version = [1-9]* ]] + then + count=$((count + 1)) + # echo $version + install_mxnet $version + run_models + fi +done \ No newline at end of file From 05ded05f97b933d47789ee41877b7b483ff08992 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 16 Jul 2018 15:13:27 -0700 Subject: [PATCH 29/59] Added defensive checks for the case where the data might not be found --- .../model_backwards_compatibility_check/common.py | 6 ++++++ .../model_backwards_compat_inference.py | 15 ++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index 910a7e6d5544..ea27ed262356 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -94,6 +94,12 @@ def clean_model_files(files, model_name): def download_data_from_s3(model_name): print ('Downloading data files for %s from bucket %s'%(model_name, model_bucket_name + backslash + data_folder)) bucket = s3.Bucket(model_bucket_name) + prefix = data_folder + backslash + model_name + '-data' + data_files_meta = list(bucket.objects.filter(Prefix = prefix)) + if len(data_files_meta) == 0: + print ('No data files found for %s' %model_name) + return None + bucket.download_file(data_folder + backslash + model_name+'-data', model_name+'-data') data = mx.nd.load(model_name+'-data') diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py index 0e8201da44c9..0b15fb009d99 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py @@ -23,6 +23,10 @@ def test_module_checkpoint_api(): model_name = 'module_checkpoint_api' print ('Performing inference for model/API %s' %model_name) data = download_data_from_s3(model_name) + if data is None: + print ('No data files found for %s' %model_name) + return + test_data = data['data'] test_label = data['labels'] @@ -52,6 +56,9 @@ def test_lenet_gluon_load_params_api(): print ('Performing inference for model/API %s' %model_name) ## Get data from S3 data = download_data_from_s3(model_name) + if data is None: + print ('No data files found for %s' %model_name) + return test_data = data['data'] @@ -71,6 +78,9 @@ def test_lenet_gluon_hybrid_imports_api(): print ('Performing inference for model/API %s' %model_name) ## Get data from S3 data = download_data_from_s3(model_name) + if data is None: + print ('No data files found for %s' %model_name) + return test_data = data['data'] @@ -95,7 +105,10 @@ def test_lstm_gluon_load_parameters_api(): print ('Performing inference for model/API %s' %model_name) ## Get data from S3 data = 
download_data_from_s3(model_name) - + if data is None: + print ('No data files found for %s' %model_name) + return + test_data = data['data'] for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): From 60c7be0e52935d1b2e9cd3cf25411156588381c8 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 16 Jul 2018 15:37:33 -0700 Subject: [PATCH 30/59] Fixed issue where we were performing inference on state model files --- .../common.py | 27 ++++++++--------- .../model_backwards_compat_inference.py | 30 +++++++++++++++---- 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index ea27ed262356..877169acffbf 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -105,20 +105,19 @@ def download_data_from_s3(model_name): data = mx.nd.load(model_name+'-data') return data -def download_model_files_from_s3(model_name): - for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): - bucket = s3.Bucket(model_bucket_name) - prefix = folder + backslash + model_name - model_files_meta = list(bucket.objects.filter(Prefix = prefix)) - if len(model_files_meta) == 0: - print ('No trained models found under path : %s' %prefix) - continue - model_files = list() - for obj in model_files_meta: - file_name = obj.key.split('/')[2] - model_files.append(file_name) +def download_model_files_from_s3(model_name, folder_name): + model_files = list() + bucket = s3.Bucket(model_bucket_name) + prefix = folder_name + backslash + model_name + model_files_meta = list(bucket.objects.filter(Prefix = prefix)) + if len(model_files_meta) == 0: + print ('No trained models found under path : %s' %prefix) + return model_files + for obj in model_files_meta: + file_name = obj.key.split('/')[2] + model_files.append(file_name) ## Download this file--- - bucket.download_file(obj.key, file_name) + bucket.download_file(obj.key, file_name) return model_files @@ -130,7 +129,7 @@ def get_top_level_folders_in_bucket(s3client, bucket_name): folder_list = list() if 'CommonPrefixes' not in result: print ('No trained models found in S3 bucket : %s for this file. Please train the models and run inference again' %bucket_name) - return + return folder_list for obj in result['CommonPrefixes']: folder_name = obj['Prefix'].strip(backslash) # The top level folders contain MXNet Version # for trained models. 
Skipping the data folder here diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py index 0b15fb009d99..d4302e70c3ef 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py @@ -34,7 +34,11 @@ def test_module_checkpoint_api(): ## For each MXNet version that has the saved models for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): - model_files = download_model_files_from_s3(model_name) + print ('Fetching files for MXNet version : %s and model %s' %(folder, model_name)) + model_files = download_model_files_from_s3(model_name, folder) + if len(model_files) == 0: + print('No training files found for %s for MXNet version : %s'%(model_name, folder)) + continue ## Load the model and perform inference loaded_model = get_module_api_model_definition() @@ -47,6 +51,7 @@ def test_module_checkpoint_api(): ## Check whether they are equal or not ? assert_almost_equal(inference_results.asnumpy(), old_inference_results.asnumpy()) clean_model_files(model_files, model_name) + print ('=================================') print ('Assertion passed for model : %s' %model_name) @@ -63,7 +68,11 @@ def test_lenet_gluon_load_params_api(): test_data = data['data'] for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): - model_files = download_model_files_from_s3(model_name) + print ('Fetching files for MXNet version : %s and model %s' %(folder, model_name)) + model_files = download_model_files_from_s3(model_name, folder) + if len(model_files) == 0: + print('No training files found for %s for MXNet version : %s'%(model_name, folder)) + continue ## Load the model and perform inference loaded_model = Net() loaded_model.load_params(model_name+'-params') @@ -71,6 +80,7 @@ def test_lenet_gluon_load_params_api(): old_inference_results = mx.nd.load(model_name + '-inference')['inference'] assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) clean_model_files(model_files, model_name) + print ('=================================') print ('Assertion passed for model : %s' %model_name) def test_lenet_gluon_hybrid_imports_api(): @@ -85,7 +95,11 @@ def test_lenet_gluon_hybrid_imports_api(): test_data = data['data'] for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): - model_files = download_model_files_from_s3(model_name) + print ('Fetching files for MXNet version : %s and model %s' %(folder, model_name)) + model_files = download_model_files_from_s3(model_name, folder) + if len(model_files) == 0: + print('No training files found for %s for MXNet version : %s'%(model_name, folder)) + continue ## Load the model and perform inference loaded_model = HybridNet() loaded_model = gluon.SymbolBlock.imports(model_name + '-symbol.json', ['data'], model_name + '-0001.params') @@ -93,6 +107,7 @@ def test_lenet_gluon_hybrid_imports_api(): old_inference_results = mx.nd.load(model_name + '-inference')['inference'] assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) clean_model_files(model_files, model_name) + print ('=================================') print ('Assertion passed for model : %s' %model_name) def test_lstm_gluon_load_parameters_api(): @@ -102,7 +117,7 @@ def test_lstm_gluon_load_parameters_api(): return model_name = 'lstm_gluon_save_parameters_api' - print ('Performing inference for 
model/API %s' %model_name) + print ('Performing inference for model/API %s and model'%model_name) ## Get data from S3 data = download_data_from_s3(model_name) if data is None: @@ -112,7 +127,11 @@ def test_lstm_gluon_load_parameters_api(): test_data = data['data'] for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): - model_files = download_model_files_from_s3(model_name) + print ('Fetching files for MXNet version : %s' %folder) + model_files = download_model_files_from_s3(model_name, folder) + if len(model_files) == 0: + print('No training files found for %s for MXNet version : %s'%(model_name, folder)) + continue ## Load the model and perform inference loaded_model = SimpleLSTMModel() loaded_model.load_parameters(model_name+'-params') @@ -120,6 +139,7 @@ def test_lstm_gluon_load_parameters_api(): old_inference_results = mx.nd.load(model_name + '-inference')['inference'] assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) clean_model_files(model_files, model_name) + print ('=================================') print ('Assertion passed for model : %s' %model_name) if __name__=='__main__': From 9d4d0993b56ec8b3148e1a0ee90c81424bb1f0cb Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Wed, 18 Jul 2018 09:58:20 -0700 Subject: [PATCH 31/59] Replaced print statements with logging ones --- .../common.py | 12 ++--- .../model_backwards_compat_inference.py | 48 +++++++++---------- .../model_backwards_compat_train.py | 10 ++-- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index 877169acffbf..47782959d68a 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -38,7 +38,7 @@ # Set fixed random seeds. 
mx.random.seed(7) np.random.seed(7) -logging.getLogger().setLevel(logging.DEBUG) +logging.basicConfig(level=logging.DEBUG) # get the current mxnet version we are running on mxnet_version = mx.__version__ @@ -76,7 +76,7 @@ def upload_data_and_labels_to_s3(model_name): s3 = boto3.client('s3') file = model_name + '-data' s3.upload_file(file, model_bucket_name, data_folder + backslash + file) - print ('data files successfully uploaded to s3') + logging.info('data files successfully uploaded to s3') def upload_model_files_to_s3(files, folder_name): s3 = boto3.client('s3') @@ -92,12 +92,12 @@ def clean_model_files(files, model_name): os.remove(file) def download_data_from_s3(model_name): - print ('Downloading data files for %s from bucket %s'%(model_name, model_bucket_name + backslash + data_folder)) + logging.info('Downloading data files for %s from bucket %s' %(model_name, model_bucket_name + backslash + data_folder)) bucket = s3.Bucket(model_bucket_name) prefix = data_folder + backslash + model_name + '-data' data_files_meta = list(bucket.objects.filter(Prefix = prefix)) if len(data_files_meta) == 0: - print ('No data files found for %s' %model_name) + logging.error('No data files found for %s', model_name) return None bucket.download_file(data_folder + backslash + model_name+'-data', model_name+'-data') @@ -111,7 +111,7 @@ def download_model_files_from_s3(model_name, folder_name): prefix = folder_name + backslash + model_name model_files_meta = list(bucket.objects.filter(Prefix = prefix)) if len(model_files_meta) == 0: - print ('No trained models found under path : %s' %prefix) + logging.error('No trained models found under path : %s', prefix) return model_files for obj in model_files_meta: file_name = obj.key.split('/')[2] @@ -128,7 +128,7 @@ def get_top_level_folders_in_bucket(s3client, bucket_name): Delimiter=backslash) folder_list = list() if 'CommonPrefixes' not in result: - print ('No trained models found in S3 bucket : %s for this file. Please train the models and run inference again' %bucket_name) + logging.error('No trained models found in S3 bucket : %s for this file. 
Please train the models and run inference again' %bucket_name) return folder_list for obj in result['CommonPrefixes']: folder_name = obj['Prefix'].strip(backslash) diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py index d4302e70c3ef..82aad2ab4b5e 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py @@ -24,7 +24,7 @@ def test_module_checkpoint_api(): print ('Performing inference for model/API %s' %model_name) data = download_data_from_s3(model_name) if data is None: - print ('No data files found for %s' %model_name) + logging.error ('No data files found for %s' %model_name) return test_data = data['data'] @@ -34,10 +34,10 @@ def test_module_checkpoint_api(): ## For each MXNet version that has the saved models for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): - print ('Fetching files for MXNet version : %s and model %s' %(folder, model_name)) + logging.info ('Fetching files for MXNet version : %s and model %s' %(folder, model_name)) model_files = download_model_files_from_s3(model_name, folder) if len(model_files) == 0: - print('No training files found for %s for MXNet version : %s'%(model_name, folder)) + logging.warn ('No training files found for %s for MXNet version : %s'%(model_name, folder)) continue ## Load the model and perform inference loaded_model = get_module_api_model_definition() @@ -51,27 +51,27 @@ def test_module_checkpoint_api(): ## Check whether they are equal or not ? assert_almost_equal(inference_results.asnumpy(), old_inference_results.asnumpy()) clean_model_files(model_files, model_name) - print ('=================================') + logging.info ('=================================') - print ('Assertion passed for model : %s' %model_name) + logging.info ('Assertion passed for model : %s' %model_name) def test_lenet_gluon_load_params_api(): model_name = 'lenet_gluon_save_params_api' - print ('Performing inference for model/API %s' %model_name) + logging.info ('Performing inference for model/API %s' %model_name) ## Get data from S3 data = download_data_from_s3(model_name) if data is None: - print ('No data files found for %s' %model_name) + logging.error ('No data files found for %s' %model_name) return test_data = data['data'] for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): - print ('Fetching files for MXNet version : %s and model %s' %(folder, model_name)) + logging.info ('Fetching files for MXNet version : %s and model %s' %(folder, model_name)) model_files = download_model_files_from_s3(model_name, folder) if len(model_files) == 0: - print('No training files found for %s for MXNet version : %s'%(model_name, folder)) + logging.warn ('No training files found for %s for MXNet version : %s'%(model_name, folder)) continue ## Load the model and perform inference loaded_model = Net() @@ -80,25 +80,25 @@ def test_lenet_gluon_load_params_api(): old_inference_results = mx.nd.load(model_name + '-inference')['inference'] assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) clean_model_files(model_files, model_name) - print ('=================================') - print ('Assertion passed for model : %s' %model_name) + logging.info ('=================================') + logging.info ('Assertion passed for model : %s' %model_name) def 
test_lenet_gluon_hybrid_imports_api(): model_name = 'lenet_gluon_hybrid_export_api' - print ('Performing inference for model/API %s' %model_name) + logging.info ('Performing inference for model/API %s' %model_name) ## Get data from S3 data = download_data_from_s3(model_name) if data is None: - print ('No data files found for %s' %model_name) + logging.error ('No data files found for %s' %model_name) return test_data = data['data'] for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): - print ('Fetching files for MXNet version : %s and model %s' %(folder, model_name)) + logging.info ('Fetching files for MXNet version : %s and model %s' %(folder, model_name)) model_files = download_model_files_from_s3(model_name, folder) if len(model_files) == 0: - print('No training files found for %s for MXNet version : %s'%(model_name, folder)) + logging.warn('No training files found for %s for MXNet version : %s'%(model_name, folder)) continue ## Load the model and perform inference loaded_model = HybridNet() @@ -107,30 +107,30 @@ def test_lenet_gluon_hybrid_imports_api(): old_inference_results = mx.nd.load(model_name + '-inference')['inference'] assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) clean_model_files(model_files, model_name) - print ('=================================') - print ('Assertion passed for model : %s' %model_name) + logging.info ('=================================') + logging.info ('Assertion passed for model : %s' %model_name) def test_lstm_gluon_load_parameters_api(): ## If this code is being run on version >= 1.2.0 only then execute it, since it uses save_parameters and load_parameters API if compare_versions(str(mxnet_version), '1.2.1') < 0: - print ('Found MXNet version %s and exiting because this version does not contain save_parameters and load_parameters functions' %str(mxnet_version)) + logging.warn('Found MXNet version %s and exiting because this version does not contain save_parameters and load_parameters functions' %str(mxnet_version)) return model_name = 'lstm_gluon_save_parameters_api' - print ('Performing inference for model/API %s and model'%model_name) + logging.info ('Performing inference for model/API %s and model'%model_name) ## Get data from S3 data = download_data_from_s3(model_name) if data is None: - print ('No data files found for %s' %model_name) + logging.error ('No data files found for %s' %model_name) return test_data = data['data'] for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): - print ('Fetching files for MXNet version : %s' %folder) + logging.info ('Fetching files for MXNet version : %s' %folder) model_files = download_model_files_from_s3(model_name, folder) if len(model_files) == 0: - print('No training files found for %s for MXNet version : %s'%(model_name, folder)) + logging.warn('No training files found for %s for MXNet version : %s'%(model_name, folder)) continue ## Load the model and perform inference loaded_model = SimpleLSTMModel() @@ -139,8 +139,8 @@ def test_lstm_gluon_load_parameters_api(): old_inference_results = mx.nd.load(model_name + '-inference')['inference'] assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) clean_model_files(model_files, model_name) - print ('=================================') - print ('Assertion passed for model : %s' %model_name) + logging.info ('=================================') + logging.info ('Assertion passed for model : %s' %model_name) if __name__=='__main__': test_module_checkpoint_api() diff --git 
a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py index 92d377eaa381..87a62661b540 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py @@ -21,7 +21,7 @@ def train_module_checkpoint_api(): model_name = 'module_checkpoint_api' - print ('Saving files for model %s' %model_name) + logging.info('Saving files for model %s' %model_name) ### Prepare data test_data = mx.nd.array(np.random.uniform(-1, 1, size=(20, 1))) test_label = mx.nd.array(np.random.randint(0, 2, size=(20,)), dtype='float32') @@ -55,7 +55,7 @@ def train_module_checkpoint_api(): def train_lenet_gluon_save_params_api(): model_name = 'lenet_gluon_save_params_api' - print ('Saving files for model %s' %model_name) + logging.info('Saving files for model %s' %model_name) net = Net() weights = mx.initializer.Xavier(magnitude = 2.57) net.initialize(weights, ctx = [mx.cpu(0)]) @@ -87,7 +87,7 @@ def train_lenet_gluon_save_params_api(): def train_lenet_gluon_hybrid_export_api(): model_name = 'lenet_gluon_hybrid_export_api' - print ('Saving files for model %s' %model_name) + logging.info('Saving files for model %s' %model_name) net = HybridNet() weights = mx.initializer.Xavier(magnitude = 2.57) net.initialize(weights, ctx = [mx.cpu(0)]) @@ -122,11 +122,11 @@ def train_lenet_gluon_hybrid_export_api(): def train_lstm_gluon_save_parameters_api(): ## If this code is being run on version >= 1.2.1 only then execute it, since it uses save_parameters and load_parameters API if compare_versions(str(mxnet_version), '1.2.1') < 0: - print ('Found MXNet version %s and exiting because this version does not contain save_parameters and load_parameters functions' %str(mxnet_version)) + logging.warn('Found MXNet version %s and exiting because this version does not contain save_parameters and load_parameters functions' %str(mxnet_version)) return model_name = 'lstm_gluon_save_parameters_api' - print ('Saving files for model %s' %model_name) + logging.info ('Saving files for model %s' %model_name) net = SimpleLSTMModel() weights = mx.initializer.Xavier(magnitude = 2.57) net.initialize(weights, ctx = [mx.cpu(0)]) From cebfb26a83529afcd31ab4c20e5f1ed0c8156722 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Wed, 25 Jul 2018 13:17:24 -0700 Subject: [PATCH 32/59] Removed boto install statements and move them into ubuntu_python docker --- ci/docker/install/ubuntu_python.sh | 4 ++-- ci/docker/runtime_functions.sh | 1 - .../model_backwards_compatibility_check/JenkinsfileForMBCC | 5 +++-- .../train_mxnet_legacy_models.sh | 7 ------- 4 files changed, 5 insertions(+), 12 deletions(-) diff --git a/ci/docker/install/ubuntu_python.sh b/ci/docker/install/ubuntu_python.sh index f087f07091e6..e71cac8a3898 100755 --- a/ci/docker/install/ubuntu_python.sh +++ b/ci/docker/install/ubuntu_python.sh @@ -29,5 +29,5 @@ wget -nv https://bootstrap.pypa.io/get-pip.py python3 get-pip.py python2 get-pip.py -pip2 install nose cpplint==1.3.0 pylint==1.8.3 'numpy<1.15.0,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 -pip3 install nose cpplint==1.3.0 pylint==1.8.3 'numpy<1.15.0,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 +pip2 install nose cpplint==1.3.0 pylint==1.8.3 'numpy<1.15.0,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 +pip3 install nose 
cpplint==1.3.0 pylint==1.8.3 'numpy<1.15.0,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index b30052e17bed..32b859969f31 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -899,7 +899,6 @@ nightly_test_javascript() { nightly_model_backwards_compat_test() { set -ex export PYTHONPATH=./python/ - pip install boto3 ./tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh } diff --git a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC index 2d312fa4ea96..9e277586dd5f 100644 --- a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC +++ b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC @@ -53,7 +53,8 @@ try { parallel 'ModelBackwardsCompat: CPU': { node('mxnetlinux-cpu') { ws('workspace/modelBackwardsCompat') { - init_git() + init_git() + docker_run('ubuntu_nightly_cpu', 'nightly_model_backwards_compat_test', false) } } @@ -76,4 +77,4 @@ try { throw err } } -} \ No newline at end of file +} diff --git a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh index b186cf7fcfbb..47470333962a 100755 --- a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh +++ b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh @@ -33,17 +33,10 @@ install_mxnet() { pip install mxnet==$version } -install_boto3(){ - echo "Intalling boto3" - pip install boto3 -} - echo `pwd` cd tests/nightly/model_backwards_compatibility_check echo `pwd` -install_boto3 - ## Fetch the latest release tags, filtering out 'rcs' and filtering out some other irrelevant ones ## This list is sorted in descending order chronologically. Keeping n = 5 for a precautionary check. ## Sample output for the below git tag command is : 1.2.0 utils 1.1.0 1.0.0 0.12.1 From f7a36eb7bdd4836c6e0ed6ee58f199d35320f06b Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Wed, 25 Jul 2018 16:50:30 -0700 Subject: [PATCH 33/59] Separated training and uploading of models into separate files so that training runs in Docker and upload runs outside Docker --- ci/docker/runtime_functions.sh | 3 - .../JenkinsfileForMBCC | 12 ++- .../common.py | 31 ++++---- .../model_backwards_compat_inference.py | 41 +++------- .../model_backwards_compat_train.py | 79 +++++-------------- .../upload_models_to_s3.sh | 41 ++++++++++ 6 files changed, 96 insertions(+), 111 deletions(-) create mode 100755 tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 32b859969f31..f7dd32d69758 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -903,15 +903,12 @@ nightly_model_backwards_compat_test() { } #Backfills S3 bucket with models trained on earlier versions of mxnet -# Note : This script should not be called from the regular docker environment because the IAM roles required for S3 uploads -# do not get propagated into the container as of now. 
nightly_model_backwards_compat_train() { set -ex export PYTHONPATH=./python/ VENV=mbcc_py2_venv virtualenv -p `which python2` $VENV source $VENV/bin/activate - pip install boto3 ./tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh #Deactivate the virtual env once we are done with it deactivate diff --git a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC index 9e277586dd5f..f659d9b39c91 100644 --- a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC +++ b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC @@ -53,9 +53,15 @@ try { parallel 'ModelBackwardsCompat: CPU': { node('mxnetlinux-cpu') { ws('workspace/modelBackwardsCompat') { - init_git() - - docker_run('ubuntu_nightly_cpu', 'nightly_model_backwards_compat_test', false) + + init_git() + // Train models on older versions + docker_run('ubuntu_nightly_cpu', 'nightly_model_backwards_compat_train', false) + // upload files to S3 here outside of the docker environment + sh /tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh + // Perform inference on these models + docker_run('ubuntu_nightly_cpu', 'nightly_model_backwards_compat_test', false) + } } } diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index 47782959d68a..26026be0c079 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -38,7 +38,7 @@ # Set fixed random seeds. mx.random.seed(7) np.random.seed(7) -logging.basicConfig(level=logging.DEBUG) +logging.basicConfig(level=logging.INFO) # get the current mxnet version we are running on mxnet_version = mx.__version__ @@ -48,6 +48,9 @@ s3 = boto3.resource('s3') ctx = mx.cpu(0) +def get_model_path(model_name): + return os.path.join(os.getcwd(), 'models', str(mxnet_version), model_name) + def get_module_api_model_definition(): input = mx.symbol.Variable('data') input = mx.symbol.Flatten(data=input) @@ -61,7 +64,9 @@ def get_module_api_model_definition(): def save_inference_results(inference_results, model_name): assert (isinstance(inference_results, mx.ndarray.ndarray.NDArray)) - mx.nd.save(model_name + '-inference', {'inference' : inference_results}) + save_path = os.path.join(get_model_path(model_name), ''.join([model_name, '-inference'])) + + mx.nd.save(save_path, {'inference': inference_results}) def load_inference_results(model_name): inf_dict = mx.nd.load(model_name+'-inference') @@ -70,7 +75,9 @@ def load_inference_results(model_name): def save_data_and_labels(test_data, test_labels, model_name): assert (isinstance(test_data, mx.ndarray.ndarray.NDArray)) assert (isinstance(test_labels, mx.ndarray.ndarray.NDArray)) - mx.nd.save(model_name + '-data', {'data' : test_data, 'labels' : test_labels}) + + save_path = os.path.join(get_model_path(model_name), ''.join([model_name, '-data'])) + mx.nd.save(save_path, {'data': test_data, 'labels': test_labels}) def upload_data_and_labels_to_s3(model_name): s3 = boto3.client('s3') @@ -91,19 +98,6 @@ def clean_model_files(files, model_name): if os.path.isfile(file): os.remove(file) -def download_data_from_s3(model_name): - logging.info('Downloading data files for %s from bucket %s' %(model_name, model_bucket_name + backslash + data_folder)) - bucket = s3.Bucket(model_bucket_name) - prefix = data_folder + backslash + model_name + '-data' - data_files_meta 
= list(bucket.objects.filter(Prefix = prefix)) - if len(data_files_meta) == 0: - logging.error('No data files found for %s', model_name) - return None - - bucket.download_file(data_folder + backslash + model_name+'-data', model_name+'-data') - - data = mx.nd.load(model_name+'-data') - return data def download_model_files_from_s3(model_name, folder_name): model_files = list() @@ -139,6 +133,11 @@ def get_top_level_folders_in_bucket(s3client, bucket_name): return folder_list +def create_model_folder(model_name): + path = get_model_path(model_name) + if not os.path.exists(path): + os.makedirs(path) + class Net(gluon.Block): def __init__(self, **kwargs): super(Net, self).__init__(**kwargs) diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py index 82aad2ab4b5e..5a976e72a1bb 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py @@ -22,15 +22,6 @@ def test_module_checkpoint_api(): model_name = 'module_checkpoint_api' print ('Performing inference for model/API %s' %model_name) - data = download_data_from_s3(model_name) - if data is None: - logging.error ('No data files found for %s' %model_name) - return - - test_data = data['data'] - test_label = data['labels'] - - data_iter = mx.io.NDArrayIter(test_data, test_label, batch_size=10) ## For each MXNet version that has the saved models for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): @@ -39,6 +30,9 @@ def test_module_checkpoint_api(): if len(model_files) == 0: logging.warn ('No training files found for %s for MXNet version : %s'%(model_name, folder)) continue + + data = mx.nd.load(''.join([model_name, '-data'])) + data_iter = mx.io.NDArrayIter(data['data'], data['labels'], batch_size=10) ## Load the model and perform inference loaded_model = get_module_api_model_definition() @@ -59,13 +53,6 @@ def test_module_checkpoint_api(): def test_lenet_gluon_load_params_api(): model_name = 'lenet_gluon_save_params_api' logging.info ('Performing inference for model/API %s' %model_name) - ## Get data from S3 - data = download_data_from_s3(model_name) - if data is None: - logging.error ('No data files found for %s' %model_name) - return - - test_data = data['data'] for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): logging.info ('Fetching files for MXNet version : %s and model %s' %(folder, model_name)) @@ -73,6 +60,9 @@ def test_lenet_gluon_load_params_api(): if len(model_files) == 0: logging.warn ('No training files found for %s for MXNet version : %s'%(model_name, folder)) continue + + data = mx.nd.load(''.join([model_name, '-data'])) + test_data = data['data'] ## Load the model and perform inference loaded_model = Net() loaded_model.load_params(model_name+'-params') @@ -86,13 +76,6 @@ def test_lenet_gluon_load_params_api(): def test_lenet_gluon_hybrid_imports_api(): model_name = 'lenet_gluon_hybrid_export_api' logging.info ('Performing inference for model/API %s' %model_name) - ## Get data from S3 - data = download_data_from_s3(model_name) - if data is None: - logging.error ('No data files found for %s' %model_name) - return - - test_data = data['data'] for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): logging.info ('Fetching files for MXNet version : %s and model %s' %(folder, model_name)) @@ -101,6 +84,8 @@ def 
test_lenet_gluon_hybrid_imports_api(): logging.warn('No training files found for %s for MXNet version : %s'%(model_name, folder)) continue ## Load the model and perform inference + data = mx.nd.load(''.join([model_name, '-data'])) + test_data = data['data'] loaded_model = HybridNet() loaded_model = gluon.SymbolBlock.imports(model_name + '-symbol.json', ['data'], model_name + '-0001.params') output = loaded_model(test_data) @@ -118,13 +103,6 @@ def test_lstm_gluon_load_parameters_api(): model_name = 'lstm_gluon_save_parameters_api' logging.info ('Performing inference for model/API %s and model'%model_name) - ## Get data from S3 - data = download_data_from_s3(model_name) - if data is None: - logging.error ('No data files found for %s' %model_name) - return - - test_data = data['data'] for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): logging.info ('Fetching files for MXNet version : %s' %folder) @@ -132,6 +110,9 @@ def test_lstm_gluon_load_parameters_api(): if len(model_files) == 0: logging.warn('No training files found for %s for MXNet version : %s'%(model_name, folder)) continue + + data = mx.nd.load(''.join([model_name, '-data'])) + test_data = data['data'] ## Load the model and perform inference loaded_model = SimpleLSTMModel() loaded_model.load_parameters(model_name+'-params') diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py index 87a62661b540..6fd16daeea88 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py @@ -21,6 +21,7 @@ def train_module_checkpoint_api(): model_name = 'module_checkpoint_api' + create_model_folder(model_name) logging.info('Saving files for model %s' %model_name) ### Prepare data test_data = mx.nd.array(np.random.uniform(-1, 1, size=(20, 1))) @@ -33,28 +34,17 @@ def train_module_checkpoint_api(): weights = mx.initializer.Xavier(magnitude = 2.57) mod.init_params(weights) - mod.save_checkpoint(model_name, 1) - ### Save the data, labels - save_data_and_labels(test_data, test_label, model_name) - upload_data_and_labels_to_s3(model_name) + mod.save_checkpoint(os.path.join(get_model_path(model_name), model_name), 1) inference_results = mod.predict(data_iter) ### Save inference_results + # Save the model files + save_data_and_labels(test_data, test_label, model_name) save_inference_results(inference_results, model_name) - ### upload model and inference files to S3 - files = list() - files.append(model_name + '-0001.params') - files.append(model_name + '-symbol.json') - files.append(model_name + '-inference') - - mxnet_folder = str(mxnet_version) + backslash + model_name + backslash - - upload_model_files_to_s3(files, mxnet_folder) - - clean_model_files(files, model_name) def train_lenet_gluon_save_params_api(): model_name = 'lenet_gluon_save_params_api' + create_model_folder(model_name) logging.info('Saving files for model %s' %model_name) net = Net() weights = mx.initializer.Xavier(magnitude = 2.57) @@ -68,26 +58,15 @@ def train_lenet_gluon_save_params_api(): # ### Save the inference output ys # ### Save the model params - mx.nd.save(model_name + '-data', {'data' : test_data}) + mx.nd.save(os.path.join(get_model_path(model_name), ''.join([model_name, '-data'])), {'data': test_data}) save_inference_results(output, model_name) - net.save_params(model_name + '-params') + 
net.save_params(os.path.join(get_model_path(model_name), ''.join([model_name, '-params']))) - mxnet_folder = str(mxnet_version) + backslash + model_name + backslash - - files = list() - files.append(model_name + '-data') - files.append(model_name + '-inference') - files.append(model_name + '-params') - - upload_data_and_labels_to_s3(model_name) - - upload_model_files_to_s3(files, mxnet_folder) - - clean_model_files(files, model_name) def train_lenet_gluon_hybrid_export_api(): model_name = 'lenet_gluon_hybrid_export_api' logging.info('Saving files for model %s' %model_name) + create_model_folder(model_name) net = HybridNet() weights = mx.initializer.Xavier(magnitude = 2.57) net.initialize(weights, ctx = [mx.cpu(0)]) @@ -100,24 +79,10 @@ def train_lenet_gluon_hybrid_export_api(): ### Save the inference output ys ### Save the model params - mx.nd.save(model_name + '-data', {'data' : test_data}) + mx.nd.save(os.path.join(get_model_path(model_name), ''.join([model_name, '-data'])), {'data': test_data}) save_inference_results(output, model_name) - net.export(model_name, epoch=1) - - mxnet_folder = str(mxnet_version) + backslash + model_name + backslash + net.export(os.path.join(get_model_path(model_name), model_name), epoch=1) - files = list() - files.append(model_name + '-data') - files.append(model_name + '-inference') - files.append(model_name + '-0001.params') - files.append(model_name + '-symbol.json') - - - upload_data_and_labels_to_s3(model_name) - - upload_model_files_to_s3(files, mxnet_folder) - - clean_model_files(files, model_name) def train_lstm_gluon_save_parameters_api(): ## If this code is being run on version >= 1.2.1 only then execute it, since it uses save_parameters and load_parameters API @@ -127,6 +92,7 @@ def train_lstm_gluon_save_parameters_api(): model_name = 'lstm_gluon_save_parameters_api' logging.info ('Saving files for model %s' %model_name) + create_model_folder(model_name) net = SimpleLSTMModel() weights = mx.initializer.Xavier(magnitude = 2.57) net.initialize(weights, ctx = [mx.cpu(0)]) @@ -134,25 +100,20 @@ def train_lstm_gluon_save_parameters_api(): test_data = mx.nd.array(np.random.uniform(-1, 1, size=(10, 30))) output = net(test_data) # print output - mx.nd.save(model_name + '-data', {'data' : test_data}) + mx.nd.save(os.path.join(get_model_path(model_name), ''.join([model_name, '-data'])), {'data' : test_data}) save_inference_results(output, model_name) - net.save_parameters(model_name + '-params') - - mxnet_folder = str(mxnet_version) + backslash + model_name + backslash + net.save_parameters(os.path.join(get_model_path(model_name) , ''.join([model_name, '-params']))) - files = list() - files.append(model_name + '-data') - files.append(model_name + '-inference') - files.append(model_name + '-params') - - upload_data_and_labels_to_s3(model_name) - - upload_model_files_to_s3(files, mxnet_folder) - - clean_model_files(files, model_name) +def create_root_folder(): + base_path = os.getcwd() + version_path = os.path.join(base_path, 'models') + if not os.path.exists(version_path): + os.mkdir(version_path) if __name__=='__main__': + create_root_folder() + train_module_checkpoint_api() train_lenet_gluon_save_params_api() train_lenet_gluon_hybrid_export_api() diff --git a/tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh b/tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh new file mode 100755 index 000000000000..b6848f7fc12a --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh @@ -0,0 
+1,41 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#Author: Piyush Ghai
+
+echo "uploading model files to s3"
+
+echo `pwd`
+cd ./tests/nightly/model_backwards_compatibility_check/models/
+echo `pwd`
+
+# The directory structure will be as follows :
+# / eg :
+# ls /tests/nightly/model_backwards_compatibility_check/models/
+# 1.1.0/ 1.2.0/ 1.2.1/
+# we upload these folders to S3 and the inference files understand them and pull models off them
+for dir in $(ls `pwd`/)
+do
+    echo $dir
+    aws s3 cp $dir/ s3://mxnet-model-backwards-compatibility-models/$dir/ --recursive
+done
+
+echo "Deleting model files"
+cd ../
+rm -rf `pwd`/models

From 1f63941e9ade0eaa2f4361ed5877bc97defcf6fb Mon Sep 17 00:00:00 2001
From: Piyush Ghai
Date: Wed, 25 Jul 2018 17:14:17 -0700
Subject: [PATCH 34/59] Updated comments and README

---
 .../README.md                    | 17 +++++++++++------
 .../train_mxnet_legacy_models.sh |  5 +++++
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/tests/nightly/model_backwards_compatibility_check/README.md b/tests/nightly/model_backwards_compatibility_check/README.md
index f894eafc322f..7a2116ac564e 100644
--- a/tests/nightly/model_backwards_compatibility_check/README.md
+++ b/tests/nightly/model_backwards_compatibility_check/README.md
@@ -8,13 +8,18 @@ This folder contains the scripts that are required to run the nightly job of ver
 ## JenkinsfileForMBCC
 This is configuration file for jenkins job.
 
-## Details
-- The `model_backward_compat_checker.sh` is a top level script that invokes the inference files in python.
+## Details
 - Currently the APIs that covered for model saving/loading are : do_checkpoint/load_checkpoint, save_params/load_params, save_parameters/load_parameters(added v1.2.1 onwards), export/gluon.SymbolBlock.imports.
-- These APIs are covered over models with architectures such as : MLP, RNNs, LeNet covering the four scenarios described above.
+- These APIs are covered over models with architectures such as : MLP, RNNs, LeNet, LSTMs covering the four scenarios described above.
 - More operators/models will be added in the future to extend the operator coverage.
 - The model train file is suffixed by `_train.py` and the trained models are hosted in AWS S3.
-- The trained models for now are backfilled into S3 starting from every MXNet release version v1.1.0.
-- The script for training the models on older versions of MXNet is : `train_mxnet_legacy_models.sh`.
-- The inference file is suffixed by `_inference.py`.
+- The trained models for now are backfilled into S3 starting from every MXNet release version v1.1.0 via the `train_mxnet_legacy_models.sh`.
+- `train_mxnet_legacy_models.sh` script checks out the previous two releases using git tag command and trains and uploads models to S3 on those MXNet versions.
+- The S3 bucket's folder structure looks like this : + * 1.1.0/ 1.1.0/ + * 1.2.0/ 1.2.0/ +- The is also a folder which contains the trained model symbol definitions, toy datasets it was trained on, weights and parameters of the model and other relevant files required to reload the model. +- Over a period of time, the training script would have accumulated a repository of models trained over several versions of MXNet (both major and minor releases). +- The inference part is checked via the script `model_backwards_compat_inference.sh`. +- The inference script scans the S3 bucket for MXNet version folders as described above and runs the inference code for each model folder found. diff --git a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh index 47470333962a..a8effbe33c3a 100755 --- a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh +++ b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh @@ -40,6 +40,11 @@ echo `pwd` ## Fetch the latest release tags, filtering out 'rcs' and filtering out some other irrelevant ones ## This list is sorted in descending order chronologically. Keeping n = 5 for a precautionary check. ## Sample output for the below git tag command is : 1.2.0 utils 1.1.0 1.0.0 0.12.1 +## so from this sample we will pick up the top two : 1.2.0 and 1.1.0 and train models on them +## Now while performing inference the latest version could be 1.3.0, which will help in validating models trained +## on 1.1.0 and 1.2.0 by loading them on the latest version (1.3.0) +## Over a period of time, the model repository will grow since with every new release we +## upload models trained on newer versions as well through this script previous_versions=($(git tag --sort=-creatordate | grep --invert-match rc | head -n 5)) count=0 for version in ${previous_versions[*]} From fbaf3e00002d203351782d00b01eadc43bd5cc88 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Wed, 25 Jul 2018 17:03:45 -0700 Subject: [PATCH 35/59] Fixed pylint warnings --- .../common.py | 44 ++-- .../model_backwards_compat_inference.py | 211 +++++++++--------- .../model_backwards_compat_train.py | 167 +++++++------- 3 files changed, 222 insertions(+), 200 deletions(-) diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index 26026be0c079..c2c8598b78ba 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -48,30 +48,35 @@ s3 = boto3.resource('s3') ctx = mx.cpu(0) + def get_model_path(model_name): return os.path.join(os.getcwd(), 'models', str(mxnet_version), model_name) + def get_module_api_model_definition(): input = mx.symbol.Variable('data') input = mx.symbol.Flatten(data=input) fc1 = mx.symbol.FullyConnected(data=input, name='fc1', num_hidden=128) act1 = mx.sym.Activation(data=fc1, name='relu1', act_type="relu") - fc2 = mx.symbol.FullyConnected(data=fc1, name='fc2', num_hidden=2) + fc2 = mx.symbol.FullyConnected(data=act1, name='fc2', num_hidden=2) op = mx.symbol.SoftmaxOutput(data=fc2, name='softmax') model = mx.mod.Module(symbol=op, context=ctx, data_names=['data'], label_names=['softmax_label']) return model + def save_inference_results(inference_results, model_name): assert (isinstance(inference_results, mx.ndarray.ndarray.NDArray)) save_path = os.path.join(get_model_path(model_name), 
''.join([model_name, '-inference'])) mx.nd.save(save_path, {'inference': inference_results}) + def load_inference_results(model_name): inf_dict = mx.nd.load(model_name+'-inference') return inf_dict['inference'] + def save_data_and_labels(test_data, test_labels, model_name): assert (isinstance(test_data, mx.ndarray.ndarray.NDArray)) assert (isinstance(test_labels, mx.ndarray.ndarray.NDArray)) @@ -79,17 +84,20 @@ def save_data_and_labels(test_data, test_labels, model_name): save_path = os.path.join(get_model_path(model_name), ''.join([model_name, '-data'])) mx.nd.save(save_path, {'data': test_data, 'labels': test_labels}) + def upload_data_and_labels_to_s3(model_name): s3 = boto3.client('s3') file = model_name + '-data' s3.upload_file(file, model_bucket_name, data_folder + backslash + file) logging.info('data files successfully uploaded to s3') + def upload_model_files_to_s3(files, folder_name): s3 = boto3.client('s3') for file in files: s3.upload_file(file, model_bucket_name, folder_name + file) + def clean_model_files(files, model_name): files.append(model_name + '-inference') files.append(model_name + '-data') @@ -110,19 +118,21 @@ def download_model_files_from_s3(model_name, folder_name): for obj in model_files_meta: file_name = obj.key.split('/')[2] model_files.append(file_name) - ## Download this file--- + # Download this file bucket.download_file(obj.key, file_name) return model_files + def get_top_level_folders_in_bucket(s3client, bucket_name): - '''This function returns the top level folders in the S3Bucket. These folders help us to navigate to the trained model files stored for different MXNet versions. ''' + # This function returns the top level folders in the S3Bucket. + # These folders help us to navigate to the trained model files stored for different MXNet versions. bucket = s3client.Bucket(bucket_name) - result = bucket.meta.client.list_objects(Bucket=bucket.name, - Delimiter=backslash) + result = bucket.meta.client.list_objects(Bucket=bucket.name, Delimiter=backslash) folder_list = list() if 'CommonPrefixes' not in result: - logging.error('No trained models found in S3 bucket : %s for this file. Please train the models and run inference again' %bucket_name) + logging.error('No trained models found in S3 bucket : %s for this file. ' + 'Please train the models and run inference again' % bucket_name) return folder_list for obj in result['CommonPrefixes']: folder_name = obj['Prefix'].strip(backslash) @@ -133,21 +143,23 @@ def get_top_level_folders_in_bucket(s3client, bucket_name): return folder_list + def create_model_folder(model_name): path = get_model_path(model_name) if not os.path.exists(path): os.makedirs(path) + class Net(gluon.Block): def __init__(self, **kwargs): super(Net, self).__init__(**kwargs) with self.name_scope(): # layers created in name_scope will inherit name space # from parent layer. 
- self.conv1 = nn.Conv2D(20, kernel_size=(5,5)) - self.pool1 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) - self.conv2 = nn.Conv2D(50, kernel_size=(5,5)) - self.pool2 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) + self.conv1 = nn.Conv2D(20, kernel_size=(5, 5)) + self.pool1 = nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)) + self.conv2 = nn.Conv2D(50, kernel_size=(5, 5)) + self.pool2 = nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)) self.fc1 = nn.Dense(500) self.fc2 = nn.Dense(2) @@ -161,16 +173,17 @@ def forward(self, x): x = F.tanh(self.fc2(x)) return x + class HybridNet(gluon.HybridBlock): def __init__(self, **kwargs): super(HybridNet, self).__init__(**kwargs) with self.name_scope(): # layers created in name_scope will inherit name space # from parent layer. - self.conv1 = nn.Conv2D(20, kernel_size=(5,5)) - self.pool1 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) - self.conv2 = nn.Conv2D(50, kernel_size=(5,5)) - self.pool2 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) + self.conv1 = nn.Conv2D(20, kernel_size=(5, 5)) + self.pool1 = nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)) + self.conv2 = nn.Conv2D(50, kernel_size=(5, 5)) + self.pool2 = nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)) self.fc1 = nn.Dense(500) self.fc2 = nn.Dense(2) @@ -184,6 +197,7 @@ def hybrid_forward(self, F, x): x = F.tanh(self.fc2(x)) return x + class SimpleLSTMModel(gluon.Block): def __init__(self, **kwargs): super(SimpleLSTMModel, self).__init__(**kwargs) @@ -196,10 +210,10 @@ def __init__(self, **kwargs): self.model.add(mx.gluon.nn.Dropout(0.5)) self.model.add(mx.gluon.nn.Dense(2, flatten=True, activation='tanh')) - def forward(self, x): return self.model(x) + def compare_versions(version1, version2): ''' https://stackoverflow.com/questions/1714027/version-number-comparison-in-python diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py index 5a976e72a1bb..0761a23c181b 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py @@ -19,112 +19,119 @@ from common import * + def test_module_checkpoint_api(): - model_name = 'module_checkpoint_api' - print ('Performing inference for model/API %s' %model_name) - - ## For each MXNet version that has the saved models - for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): - logging.info ('Fetching files for MXNet version : %s and model %s' %(folder, model_name)) - model_files = download_model_files_from_s3(model_name, folder) - if len(model_files) == 0: - logging.warn ('No training files found for %s for MXNet version : %s'%(model_name, folder)) - continue - - data = mx.nd.load(''.join([model_name, '-data'])) - data_iter = mx.io.NDArrayIter(data['data'], data['labels'], batch_size=10) - ## Load the model and perform inference - loaded_model = get_module_api_model_definition() - - sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, 1) - loaded_model.bind(data_shapes=data_iter.provide_data, label_shapes=data_iter.provide_label) - loaded_model.set_params(arg_params, aux_params) - - old_inference_results = load_inference_results(model_name) - inference_results = loaded_model.predict(data_iter) - ## Check whether they are equal or not ? 
- assert_almost_equal(inference_results.asnumpy(), old_inference_results.asnumpy()) - clean_model_files(model_files, model_name) - logging.info ('=================================') - - logging.info ('Assertion passed for model : %s' %model_name) - + model_name = 'module_checkpoint_api' + print ('Performing inference for model/API %s' % model_name) + + # For each MXNet version that has the saved models + for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): + logging.info('Fetching files for MXNet version : %s and model %s' % (folder, model_name)) + model_files = download_model_files_from_s3(model_name, folder) + if len(model_files) == 0: + logging.warn('No training files found for %s for MXNet version : %s' % (model_name, folder)) + continue + + data = mx.nd.load(''.join([model_name, '-data'])) + data_iter = mx.io.NDArrayIter(data['data'], data['labels'], batch_size=10) + # Load the model and perform inference + loaded_model = get_module_api_model_definition() + + sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, 1) + loaded_model.bind(data_shapes=data_iter.provide_data, label_shapes=data_iter.provide_label) + loaded_model.set_params(arg_params, aux_params) + + old_inference_results = load_inference_results(model_name) + inference_results = loaded_model.predict(data_iter) + # Check whether they are equal or not ? + assert_almost_equal(inference_results.asnumpy(), old_inference_results.asnumpy()) + clean_model_files(model_files, model_name) + logging.info('=================================') + + logging.info('Assertion passed for model : %s' % model_name) + def test_lenet_gluon_load_params_api(): - model_name = 'lenet_gluon_save_params_api' - logging.info ('Performing inference for model/API %s' %model_name) - - for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): - logging.info ('Fetching files for MXNet version : %s and model %s' %(folder, model_name)) - model_files = download_model_files_from_s3(model_name, folder) - if len(model_files) == 0: - logging.warn ('No training files found for %s for MXNet version : %s'%(model_name, folder)) - continue - - data = mx.nd.load(''.join([model_name, '-data'])) - test_data = data['data'] - ## Load the model and perform inference - loaded_model = Net() - loaded_model.load_params(model_name+'-params') - output = loaded_model(test_data) - old_inference_results = mx.nd.load(model_name + '-inference')['inference'] - assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) - clean_model_files(model_files, model_name) - logging.info ('=================================') - logging.info ('Assertion passed for model : %s' %model_name) + model_name = 'lenet_gluon_save_params_api' + logging.info('Performing inference for model/API %s' % model_name) + + for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): + logging.info('Fetching files for MXNet version : %s and model %s' % (folder, model_name)) + model_files = download_model_files_from_s3(model_name, folder) + if len(model_files) == 0: + logging.warn('No training files found for %s for MXNet version : %s' % (model_name, folder)) + continue + + data = mx.nd.load(''.join([model_name, '-data'])) + test_data = data['data'] + # Load the model and perform inference + loaded_model = Net() + loaded_model.load_params(model_name + '-params') + output = loaded_model(test_data) + old_inference_results = mx.nd.load(model_name + '-inference')['inference'] + assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) + 
clean_model_files(model_files, model_name) + logging.info('=================================') + logging.info('Assertion passed for model : %s' % model_name) + def test_lenet_gluon_hybrid_imports_api(): - model_name = 'lenet_gluon_hybrid_export_api' - logging.info ('Performing inference for model/API %s' %model_name) - - for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): - logging.info ('Fetching files for MXNet version : %s and model %s' %(folder, model_name)) - model_files = download_model_files_from_s3(model_name, folder) - if len(model_files) == 0: - logging.warn('No training files found for %s for MXNet version : %s'%(model_name, folder)) - continue - ## Load the model and perform inference - data = mx.nd.load(''.join([model_name, '-data'])) - test_data = data['data'] - loaded_model = HybridNet() - loaded_model = gluon.SymbolBlock.imports(model_name + '-symbol.json', ['data'], model_name + '-0001.params') - output = loaded_model(test_data) - old_inference_results = mx.nd.load(model_name + '-inference')['inference'] - assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) - clean_model_files(model_files, model_name) - logging.info ('=================================') - logging.info ('Assertion passed for model : %s' %model_name) + model_name = 'lenet_gluon_hybrid_export_api' + logging.info('Performing inference for model/API %s' % model_name) + + for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): + logging.info('Fetching files for MXNet version : %s and model %s' % (folder, model_name)) + model_files = download_model_files_from_s3(model_name, folder) + if len(model_files) == 0: + logging.warn('No training files found for %s for MXNet version : %s' % (model_name, folder)) + continue + # Load the model and perform inference + data = mx.nd.load(''.join([model_name, '-data'])) + test_data = data['data'] + loaded_model = HybridNet() + loaded_model = gluon.SymbolBlock.imports(model_name + '-symbol.json', ['data'], model_name + '-0001.params') + output = loaded_model(test_data) + old_inference_results = mx.nd.load(model_name + '-inference')['inference'] + assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) + clean_model_files(model_files, model_name) + logging.info('=================================') + logging.info('Assertion passed for model : %s' % model_name) + def test_lstm_gluon_load_parameters_api(): - ## If this code is being run on version >= 1.2.0 only then execute it, since it uses save_parameters and load_parameters API - if compare_versions(str(mxnet_version), '1.2.1') < 0: - logging.warn('Found MXNet version %s and exiting because this version does not contain save_parameters and load_parameters functions' %str(mxnet_version)) - return - - model_name = 'lstm_gluon_save_parameters_api' - logging.info ('Performing inference for model/API %s and model'%model_name) - - for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): - logging.info ('Fetching files for MXNet version : %s' %folder) - model_files = download_model_files_from_s3(model_name, folder) - if len(model_files) == 0: - logging.warn('No training files found for %s for MXNet version : %s'%(model_name, folder)) - continue - - data = mx.nd.load(''.join([model_name, '-data'])) - test_data = data['data'] - ## Load the model and perform inference - loaded_model = SimpleLSTMModel() - loaded_model.load_parameters(model_name+'-params') - output = loaded_model(test_data) - old_inference_results = mx.nd.load(model_name + 
'-inference')['inference'] - assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) - clean_model_files(model_files, model_name) - logging.info ('=================================') - logging.info ('Assertion passed for model : %s' %model_name) - -if __name__=='__main__': - test_module_checkpoint_api() - test_lenet_gluon_load_params_api() - test_lenet_gluon_hybrid_imports_api() - test_lstm_gluon_load_parameters_api() + # If this code is being run on version >= 1.2.0 only then execute it, + # since it uses save_parameters and load_parameters API + + if compare_versions(str(mxnet_version), '1.2.1') < 0: + logging.warn('Found MXNet version %s and exiting because this version does not contain save_parameters' + ' and load_parameters functions' % str(mxnet_version)) + return + + model_name = 'lstm_gluon_save_parameters_api' + logging.info('Performing inference for model/API %s and model' % model_name) + + for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): + logging.info('Fetching files for MXNet version : %s' % folder) + model_files = download_model_files_from_s3(model_name, folder) + if len(model_files) == 0: + logging.warn('No training files found for %s for MXNet version : %s' % (model_name, folder)) + continue + + data = mx.nd.load(''.join([model_name, '-data'])) + test_data = data['data'] + # Load the model and perform inference + loaded_model = SimpleLSTMModel() + loaded_model.load_parameters(model_name + '-params') + output = loaded_model(test_data) + old_inference_results = mx.nd.load(model_name + '-inference')['inference'] + assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) + clean_model_files(model_files, model_name) + logging.info('=================================') + logging.info('Assertion passed for model : %s' % model_name) + + +if __name__ == '__main__': + test_module_checkpoint_api() + test_lenet_gluon_load_params_api() + test_lenet_gluon_hybrid_imports_api() + test_lstm_gluon_load_parameters_api() diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py index 6fd16daeea88..869c73c01f7e 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py @@ -19,102 +19,103 @@ from common import * + def train_module_checkpoint_api(): - model_name = 'module_checkpoint_api' - create_model_folder(model_name) - logging.info('Saving files for model %s' %model_name) - ### Prepare data - test_data = mx.nd.array(np.random.uniform(-1, 1, size=(20, 1))) - test_label = mx.nd.array(np.random.randint(0, 2, size=(20,)), dtype='float32') - data_iter = mx.io.NDArrayIter(test_data, test_label, batch_size=10) + model_name = 'module_checkpoint_api' + create_model_folder(model_name) + logging.info('Saving files for model %s' % model_name) + # Prepare data + test_data = mx.nd.array(np.random.uniform(-1, 1, size=(20, 1))) + test_label = mx.nd.array(np.random.randint(0, 2, size=(20,)), dtype='float32') + data_iter = mx.io.NDArrayIter(test_data, test_label, batch_size=10) + mod = get_module_api_model_definition() + mod.bind(data_shapes=data_iter.provide_data, label_shapes=data_iter.provide_label) + weights = mx.initializer.Xavier(magnitude=2.57) + mod.init_params(weights) - mod = get_module_api_model_definition() - mod.bind(data_shapes=data_iter.provide_data, label_shapes=data_iter.provide_label) - weights = 
mx.initializer.Xavier(magnitude = 2.57) - mod.init_params(weights) + mod.save_checkpoint(os.path.join(get_model_path(model_name), model_name), 1) - mod.save_checkpoint(os.path.join(get_model_path(model_name), model_name), 1) + inference_results = mod.predict(data_iter) + # Save inference_results + # Save the model files + save_data_and_labels(test_data, test_label, model_name) + save_inference_results(inference_results, model_name) - inference_results = mod.predict(data_iter) - ### Save inference_results - # Save the model files - save_data_and_labels(test_data, test_label, model_name) - save_inference_results(inference_results, model_name) def train_lenet_gluon_save_params_api(): - model_name = 'lenet_gluon_save_params_api' - create_model_folder(model_name) - logging.info('Saving files for model %s' %model_name) - net = Net() - weights = mx.initializer.Xavier(magnitude = 2.57) - net.initialize(weights, ctx = [mx.cpu(0)]) - ### Prepare data - - test_data = mx.nd.array(np.random.uniform(-1, 1, size=(20, 1, 30, 30))) - output = net(test_data) - # print (y) - # ### Save the test data as well. - # ### Save the inference output ys - # ### Save the model params - - mx.nd.save(os.path.join(get_model_path(model_name), ''.join([model_name, '-data'])), {'data': test_data}) - save_inference_results(output, model_name) - net.save_params(os.path.join(get_model_path(model_name), ''.join([model_name, '-params']))) + model_name = 'lenet_gluon_save_params_api' + create_model_folder(model_name) + logging.info('Saving files for model %s' % model_name) + net = Net() + weights = mx.initializer.Xavier(magnitude=2.57) + net.initialize(weights, ctx=[mx.cpu(0)]) + # Prepare data + + test_data = mx.nd.array(np.random.uniform(-1, 1, size=(20, 1, 30, 30))) + output = net(test_data) + # print (y) + + mx.nd.save(os.path.join(get_model_path(model_name), ''.join([model_name, '-data'])), {'data': test_data}) + save_inference_results(output, model_name) + net.save_params(os.path.join(get_model_path(model_name), ''.join([model_name, '-params']))) def train_lenet_gluon_hybrid_export_api(): - model_name = 'lenet_gluon_hybrid_export_api' - logging.info('Saving files for model %s' %model_name) - create_model_folder(model_name) - net = HybridNet() - weights = mx.initializer.Xavier(magnitude = 2.57) - net.initialize(weights, ctx = [mx.cpu(0)]) - net.hybridize() - ### Prepare data - test_data = mx.nd.array(np.random.uniform(-1, 1, size=(20, 1, 30, 30))) - output = net(test_data) - # print (y) - ### Save the test data as well. - ### Save the inference output ys - ### Save the model params - - mx.nd.save(os.path.join(get_model_path(model_name), ''.join([model_name, '-data'])), {'data': test_data}) - save_inference_results(output, model_name) - net.export(os.path.join(get_model_path(model_name), model_name), epoch=1) + model_name = 'lenet_gluon_hybrid_export_api' + logging.info('Saving files for model %s' % model_name) + create_model_folder(model_name) + net = HybridNet() + weights = mx.initializer.Xavier(magnitude=2.57) + net.initialize(weights, ctx=[mx.cpu(0)]) + net.hybridize() + # Prepare data + test_data = mx.nd.array(np.random.uniform(-1, 1, size=(20, 1, 30, 30))) + output = net(test_data) + # print (y) + # Save the test data as well. 
+ # Save the inference output ys + # Save the model params + + mx.nd.save(os.path.join(get_model_path(model_name), ''.join([model_name, '-data'])), {'data': test_data}) + save_inference_results(output, model_name) + net.export(os.path.join(get_model_path(model_name), model_name), epoch=1) def train_lstm_gluon_save_parameters_api(): - ## If this code is being run on version >= 1.2.1 only then execute it, since it uses save_parameters and load_parameters API - if compare_versions(str(mxnet_version), '1.2.1') < 0: - logging.warn('Found MXNet version %s and exiting because this version does not contain save_parameters and load_parameters functions' %str(mxnet_version)) - return - - model_name = 'lstm_gluon_save_parameters_api' - logging.info ('Saving files for model %s' %model_name) - create_model_folder(model_name) - net = SimpleLSTMModel() - weights = mx.initializer.Xavier(magnitude = 2.57) - net.initialize(weights, ctx = [mx.cpu(0)]) - - test_data = mx.nd.array(np.random.uniform(-1, 1, size=(10, 30))) - output = net(test_data) - # print output - mx.nd.save(os.path.join(get_model_path(model_name), ''.join([model_name, '-data'])), {'data' : test_data}) - save_inference_results(output, model_name) - net.save_parameters(os.path.join(get_model_path(model_name) , ''.join([model_name, '-params']))) + # If this code is being run on version >= 1.2.1 only then execute it, + # since it uses save_parameters and load_parameters API + if compare_versions(str(mxnet_version), '1.2.1') < 0: + logging.warn('Found MXNet version %s and exiting because this version does not contain save_parameters' + ' and load_parameters functions' % str(mxnet_version)) + return + + model_name = 'lstm_gluon_save_parameters_api' + logging.info('Saving files for model %s' % model_name) + create_model_folder(model_name) + net = SimpleLSTMModel() + weights = mx.initializer.Xavier(magnitude=2.57) + net.initialize(weights, ctx=[mx.cpu(0)]) + + test_data = mx.nd.array(np.random.uniform(-1, 1, size=(10, 30))) + output = net(test_data) + # print output + mx.nd.save(os.path.join(get_model_path(model_name), ''.join([model_name, '-data'])), {'data': test_data}) + save_inference_results(output, model_name) + net.save_parameters(os.path.join(get_model_path(model_name), ''.join([model_name, '-params']))) def create_root_folder(): - base_path = os.getcwd() - version_path = os.path.join(base_path, 'models') - if not os.path.exists(version_path): - os.mkdir(version_path) - -if __name__=='__main__': - create_root_folder() - - train_module_checkpoint_api() - train_lenet_gluon_save_params_api() - train_lenet_gluon_hybrid_export_api() - train_lstm_gluon_save_parameters_api() + base_path = os.getcwd() + version_path = os.path.join(base_path, 'models') + if not os.path.exists(version_path): + os.mkdir(version_path) + + +if __name__ == '__main__': + create_root_folder() + + train_module_checkpoint_api() + train_lenet_gluon_save_params_api() + train_lenet_gluon_hybrid_export_api() + train_lstm_gluon_save_parameters_api() From edd68164526cf591b090322f8f6e0805e17c17ee Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Thu, 26 Jul 2018 09:06:55 -0700 Subject: [PATCH 36/59] Removed the venv for training process --- ci/docker/runtime_functions.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index f7dd32d69758..1d8673c0f321 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -906,12 +906,7 @@ nightly_model_backwards_compat_test() { 
nightly_model_backwards_compat_train() { set -ex export PYTHONPATH=./python/ - VENV=mbcc_py2_venv - virtualenv -p `which python2` $VENV - source $VENV/bin/activate ./tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh - #Deactivate the virtual env once we are done with it - deactivate } # Nightly 'MXNet: The Straight Dope' Single-GPU Tests From 87103d43dee19bb42ac4a7296a9be940ca44dffc Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Thu, 26 Jul 2018 09:15:19 -0700 Subject: [PATCH 37/59] Fixed indentation in the MBCC Jenkins file and also separated out training and inference into two separate stages --- .../JenkinsfileForMBCC | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC index f659d9b39c91..b04a8c7e1a7d 100644 --- a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC +++ b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC @@ -49,21 +49,25 @@ def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') { } try { - stage('MBCC'){ - parallel 'ModelBackwardsCompat: CPU': { - node('mxnetlinux-cpu') { - ws('workspace/modelBackwardsCompat') { - - init_git() - // Train models on older versions - docker_run('ubuntu_nightly_cpu', 'nightly_model_backwards_compat_train', false) - // upload files to S3 here outside of the docker environment - sh /tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh - // Perform inference on these models - docker_run('ubuntu_nightly_cpu', 'nightly_model_backwards_compat_test', false) + stage('MBCC Train'){ + node('mxnetlinux-cpu') { + ws('workspace/modelBackwardsCompat') { + init_git() + // Train models on older versions + docker_run('ubuntu_nightly_cpu', 'nightly_model_backwards_compat_train', false) + // upload files to S3 here outside of the docker environment + sh ./tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh + } + } + } - } - } + stage('MBCC Inference)'{ + node('mxnetlinux-cpu') { + ws('workspace/modelBackwardsCompat') { + init_git() + // Perform inference on the latest version of MXNet + docker_run('ubuntu_nightly_cpu', 'nightly_model_backwards_compat_test', false) + } } } } catch (caughtError) { From eb24e8ed5a4d7d493b2b6b762f8e5a15f83dd83b Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Thu, 26 Jul 2018 09:20:37 -0700 Subject: [PATCH 38/59] Fixed indendation --- .../model_backwards_compatibility_check/JenkinsfileForMBCC | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC index b04a8c7e1a7d..bf2bbb199587 100644 --- a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC +++ b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC @@ -57,7 +57,7 @@ try { docker_run('ubuntu_nightly_cpu', 'nightly_model_backwards_compat_train', false) // upload files to S3 here outside of the docker environment sh ./tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh - } + } } } @@ -67,7 +67,7 @@ try { init_git() // Perform inference on the latest version of MXNet docker_run('ubuntu_nightly_cpu', 'nightly_model_backwards_compat_test', false) - } + } } } } catch (caughtError) { From 352565681308fc7868a0973357987c9a4e16069c Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Thu, 26 
Jul 2018 10:46:25 -0700 Subject: [PATCH 39/59] Fixed erroneous single quote --- .../model_backwards_compatibility_check/JenkinsfileForMBCC | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC index bf2bbb199587..fce0ea727791 100644 --- a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC +++ b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC @@ -61,7 +61,7 @@ try { } } - stage('MBCC Inference)'{ + stage('MBCC Inference'){ node('mxnetlinux-cpu') { ws('workspace/modelBackwardsCompat') { init_git() From 25e7ec797b0f6eaf2eb6ffd23b9909a3f8468b9b Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Thu, 26 Jul 2018 12:18:51 -0700 Subject: [PATCH 40/59] Added --user flag to check for Jenkins error --- .../train_mxnet_legacy_models.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh index a8effbe33c3a..e61c355d9f04 100755 --- a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh +++ b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh @@ -30,7 +30,7 @@ run_models() { install_mxnet() { version=$1 echo "Installing MXNet "$version - pip install mxnet==$version + pip install mxnet==$version --user } echo `pwd` From 00ee6e72ea7f83c33483db361e10808edfaf5341 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Thu, 26 Jul 2018 12:42:54 -0700 Subject: [PATCH 41/59] Removed unused methods --- .../model_backwards_compatibility_check/common.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index c2c8598b78ba..b8c0690170fd 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -85,19 +85,6 @@ def save_data_and_labels(test_data, test_labels, model_name): mx.nd.save(save_path, {'data': test_data, 'labels': test_labels}) -def upload_data_and_labels_to_s3(model_name): - s3 = boto3.client('s3') - file = model_name + '-data' - s3.upload_file(file, model_bucket_name, data_folder + backslash + file) - logging.info('data files successfully uploaded to s3') - - -def upload_model_files_to_s3(files, folder_name): - s3 = boto3.client('s3') - for file in files: - s3.upload_file(file, model_bucket_name, folder_name + file) - - def clean_model_files(files, model_name): files.append(model_name + '-inference') files.append(model_name + '-data') From a3a72b8d5ff125c8c3c3e1b6e63b146108ac0d33 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Thu, 26 Jul 2018 13:32:23 -0700 Subject: [PATCH 42/59] Added force flag in the pip command to install mxnet --- .../train_mxnet_legacy_models.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh index e61c355d9f04..be5a1652aa7f 100755 --- a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh +++ b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh @@ -30,7 +30,7 @@ run_models() { install_mxnet() { version=$1 echo "Installing MXNet 
"$version - pip install mxnet==$version --user + pip install --upgrade --force-reinstall mxnet==$version --user } echo `pwd` From 86e88824e8d990015f2f5865c635720684f119bf Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Thu, 26 Jul 2018 13:35:12 -0700 Subject: [PATCH 43/59] Removed the force-re-install flag --- .../train_mxnet_legacy_models.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh index be5a1652aa7f..e61c355d9f04 100755 --- a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh +++ b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh @@ -30,7 +30,7 @@ run_models() { install_mxnet() { version=$1 echo "Installing MXNet "$version - pip install --upgrade --force-reinstall mxnet==$version --user + pip install mxnet==$version --user } echo `pwd` From ddb672a72bd995708bd6b01a525ab472388434a5 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Thu, 26 Jul 2018 13:35:26 -0700 Subject: [PATCH 44/59] Changed exit 1 to exit 0 --- .../train_mxnet_legacy_models.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh index e61c355d9f04..04861bfbcbe4 100755 --- a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh +++ b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh @@ -53,7 +53,7 @@ do if [[ "$count" -gt 1 ]] then echo "Successfully trained files for the previous two MXNet release versions" - exit 1 + exit 0 fi ## If MXNet major version starts with a number >=1. 
with a wildcard match for the minor version numbers From 9e7706400f802233d9b5b98517f4141ccfa924f9 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Thu, 26 Jul 2018 14:19:57 -0700 Subject: [PATCH 45/59] Added quotes around the shell command --- .../model_backwards_compatibility_check/JenkinsfileForMBCC | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC index fce0ea727791..d2fbb9472ec7 100644 --- a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC +++ b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC @@ -56,7 +56,7 @@ try { // Train models on older versions docker_run('ubuntu_nightly_cpu', 'nightly_model_backwards_compat_train', false) // upload files to S3 here outside of the docker environment - sh ./tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh + sh "./tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh" } } } From 69843fbe4d6669c135d3ae85aa56df144bc6c076 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Thu, 26 Jul 2018 15:28:02 -0700 Subject: [PATCH 46/59] added packlibs and unpack libs for MXNet builds --- .../JenkinsfileForMBCC | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC index d2fbb9472ec7..999f9c8cac30 100644 --- a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC +++ b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC @@ -20,6 +20,7 @@ //This is a Jenkinsfile for the model backwards compatibility checker. The format and some functions have been picked up from the top-level Jenkinsfile. err = null +mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' def init_git() { deleteDir() @@ -38,6 +39,24 @@ def init_git() { } } +// pack libraries for later use +def pack_lib(name, libs=mx_lib) { + sh """ +echo "Packing ${libs} into ${name}" +echo ${libs} | sed -e 's/,/ /g' | xargs md5sum +""" + stash includes: libs, name: name +} + +// unpack libraries saved before +def unpack_lib(name, libs=mx_lib) { + unstash name + sh """ +echo "Unpacked ${libs} from ${name}" +echo ${libs} | sed -e 's/,/ /g' | xargs md5sum +""" +} + def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') { def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? 
'--nvidiadocker' : '') @@ -61,10 +80,21 @@ try { } } + stage('MXNet Build'){ + node('mxnetlinux-cpu') { + ws('workspace/build-cpu') { + init_git() + docker_run('ubuntu_cpu','build_ubuntu_cpu', false) + pack_lib('cpu', mx_lib) + } + } + } + stage('MBCC Inference'){ node('mxnetlinux-cpu') { ws('workspace/modelBackwardsCompat') { init_git() + unpack_lib('cpu', mx_lib) // Perform inference on the latest version of MXNet docker_run('ubuntu_nightly_cpu', 'nightly_model_backwards_compat_test', false) } From fae44fe22e322d928fa735968476d38cfbf26e62 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Fri, 27 Jul 2018 08:23:31 -0700 Subject: [PATCH 47/59] Changed PythonPath from relative to absolute --- ci/docker/runtime_functions.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 1d8673c0f321..5f4357f5edff 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -898,14 +898,14 @@ nightly_test_javascript() { #Tests Model backwards compatibility on MXNet nightly_model_backwards_compat_test() { set -ex - export PYTHONPATH=./python/ + export PYTHONPATH=/work/mxnet/python/ ./tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh } #Backfills S3 bucket with models trained on earlier versions of mxnet nightly_model_backwards_compat_train() { set -ex - export PYTHONPATH=./python/ + export PYTHONPATH=/work/mxnet/python/ ./tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh } From c0999797ce048866fb9e792d9d182095ded68b4f Mon Sep 17 00:00:00 2001 From: Marco de Abreu Date: Mon, 30 Jul 2018 13:29:52 +0200 Subject: [PATCH 48/59] Created dedicated bucket with correct permission --- .../JenkinsfileForMBCC | 10 +++++----- .../model_backwards_compatibility_check/common.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC index 999f9c8cac30..412d68d56ff3 100644 --- a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC +++ b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC @@ -69,7 +69,7 @@ def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') { try { stage('MBCC Train'){ - node('mxnetlinux-cpu') { + node('restricted-mxnetlinux-cpu') { ws('workspace/modelBackwardsCompat') { init_git() // Train models on older versions @@ -81,7 +81,7 @@ try { } stage('MXNet Build'){ - node('mxnetlinux-cpu') { + node('restricted-mxnetlinux-cpu') { ws('workspace/build-cpu') { init_git() docker_run('ubuntu_cpu','build_ubuntu_cpu', false) @@ -91,7 +91,7 @@ try { } stage('MBCC Inference'){ - node('mxnetlinux-cpu') { + node('restricted-mxnetlinux-cpu') { ws('workspace/modelBackwardsCompat') { init_git() unpack_lib('cpu', mx_lib) @@ -101,13 +101,13 @@ try { } } } catch (caughtError) { - node("mxnetlinux-cpu") { + node("restricted-mxnetlinux-cpu") { sh "echo caught ${caughtError}" err = caughtError currentBuild.result = "FAILURE" } } finally { - node("mxnetlinux-cpu") { + node("restricted-mxnetlinux-cpu") { // Only send email if model backwards compat test failed if (currentBuild.result == "FAILURE") { emailext body: 'Nightly tests for model backwards compatibity on MXNet branch : ${BRANCH_NAME} failed. 
Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[MODEL BACKWARDS COMPATIBILITY TEST FAILED] build ${BUILD_NUMBER}', to: '${EMAIL}' diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index b8c0690170fd..2f8ed6ffe26b 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -42,7 +42,7 @@ # get the current mxnet version we are running on mxnet_version = mx.__version__ -model_bucket_name = 'mxnet-model-backwards-compatibility-models' +model_bucket_name = 'mxnet-ci-prod-backwards-compatibility-models' data_folder = 'mxnet-model-backwards-compatibility-data' backslash = '/' s3 = boto3.resource('s3') @@ -207,4 +207,4 @@ def compare_versions(version1, version2): ''' def normalize(v): return [int(x) for x in re.sub(r'(\.0+)*$','', v).split(".")] - return cmp(normalize(version1), normalize(version2)) \ No newline at end of file + return cmp(normalize(version1), normalize(version2)) From ffcc63718582ea15820b8ed9aaaae95cb5f16271 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 30 Jul 2018 08:04:59 -0700 Subject: [PATCH 49/59] Fix for python path in training --- ci/docker/runtime_functions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 5f4357f5edff..4fcde72faa38 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -905,7 +905,7 @@ nightly_model_backwards_compat_test() { #Backfills S3 bucket with models trained on earlier versions of mxnet nightly_model_backwards_compat_train() { set -ex - export PYTHONPATH=/work/mxnet/python/ + export PYTHONPATH=./python/ ./tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh } From 33096c02f56186464b34ba6b88480290eb89263f Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 30 Jul 2018 08:32:24 -0700 Subject: [PATCH 50/59] Changed bucket name to CI bucket --- .../model_backwards_compatibility_check/upload_models_to_s3.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh b/tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh index b6848f7fc12a..d1a0825fb774 100755 --- a/tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh +++ b/tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh @@ -33,7 +33,7 @@ echo `pwd` for dir in $(ls `pwd`/) do echo $dir - aws s3 cp $dir/ s3://mxnet-model-backwards-compatibility-models/$dir/ --recursive + aws s3 cp $dir/ s3://mxnet-ci-prod-backwards-compatibility-models/$dir/ --recursive done echo "Deleting model files" From 8a085b582babab8ceed91f3d2680da46c8192957 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 30 Jul 2018 09:04:52 -0700 Subject: [PATCH 51/59] Added set -ex to the upload shell script --- .../model_backwards_compatibility_check/upload_models_to_s3.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh b/tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh index d1a0825fb774..16923980aca9 100755 --- a/tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh +++ b/tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh @@ -19,6 +19,8 @@ #Author: Piyush Ghai +set -ex + echo "uploading model files to s3" echo `pwd` 
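For reference, the compare_versions helper touched in the common.py hunk above relies on Python 2's built-in cmp, which does not exist in Python 3. A minimal sketch of an equivalent helper, assuming the same normalize-and-compare semantics as the version shown in the diff, could look like this (illustration only, not part of the patch):

import re

def compare_versions(version1, version2):
    # Returns -1, 0 or 1 depending on how version1 compares with version2.
    # Trailing '.0' groups are stripped before comparison, e.g. '1.2.0' -> [1, 2].
    def normalize(v):
        return [int(x) for x in re.sub(r'(\.0+)*$', '', v).split('.')]
    a, b = normalize(version1), normalize(version2)
    # (a > b) - (a < b) reproduces cmp() and works on both Python 2 and Python 3
    return (a > b) - (a < b)
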
From 5207ab1f6e99ea8641c76f3516d0fd727cb47aa2 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 30 Jul 2018 09:14:29 -0700 Subject: [PATCH 52/59] Now raising an exception if no models are found in the S3 bucket --- tests/nightly/model_backwards_compatibility_check/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index 2f8ed6ffe26b..4e20b12eb84d 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -120,6 +120,8 @@ def get_top_level_folders_in_bucket(s3client, bucket_name): if 'CommonPrefixes' not in result: logging.error('No trained models found in S3 bucket : %s for this file. ' 'Please train the models and run inference again' % bucket_name) + raise Exception("No trained models found in S3 bucket : %s for this file. " + "Please train the models and run inference again" % bucket_name) return folder_list for obj in result['CommonPrefixes']: folder_name = obj['Prefix'].strip(backslash) From 5e30f7ab112833c303ee73d25f94838060d56f0b Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 30 Jul 2018 09:50:49 -0700 Subject: [PATCH 53/59] Added regex to train models script --- .../train_mxnet_legacy_models.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh index 04861bfbcbe4..dbc10df9d820 100755 --- a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh +++ b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh @@ -38,14 +38,14 @@ cd tests/nightly/model_backwards_compatibility_check echo `pwd` ## Fetch the latest release tags, filtering out 'rcs' and filtering out some other irrelevant ones -## This list is sorted in descending order chronologically. Keeping n = 5 for a precautionary check. +## This list is sorted in descending order chronologically. ## Sample output for the below git tag command is : 1.2.0 utils 1.1.0 1.0.0 0.12.1 ## so from this sample we will pick up the top two : 1.2.0 and 1.1.0 and train models on them ## Now while performing inference the latest version could be 1.3.0, which will help in validating models trained ## on 1.1.0 and 1.2.0 by loading them on the latest version (1.3.0) ## Over a period of time, the model repository will grow since with every new release we ## upload models trained on newer versions as well through this script -previous_versions=($(git tag --sort=-creatordate | grep --invert-match rc | head -n 5)) +previous_versions=($(git tag --sort=-creatordate | grep --invert-match rc)) count=0 for version in ${previous_versions[*]} do @@ -57,10 +57,10 @@ do fi ## If MXNet major version starts with a number >=1. 
with a wildcard match for the minor version numbers - if [[ $version = [1-9]* ]] + if [[ $version = [1-9].[0-9].[0-9] ]] then count=$((count + 1)) - # echo $version + #echo $version install_mxnet $version run_models fi From e079d3ce47656cceaad92b018c52bc0572bb0367 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 30 Jul 2018 09:52:24 -0700 Subject: [PATCH 54/59] Added check for performing inference only on models trained on same major versions --- .../nightly/model_backwards_compatibility_check/common.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index 4e20b12eb84d..8bf02c4271f2 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -125,11 +125,19 @@ def get_top_level_folders_in_bucket(s3client, bucket_name): return folder_list for obj in result['CommonPrefixes']: folder_name = obj['Prefix'].strip(backslash) + # We only compare models from the same major versions. i.e. 1.x.x compared with latest 1.y.y etc + if str(folder_name).split('.')[0] != str(mxnet_version).split('.')[0]: + continue # The top level folders contain MXNet Version # for trained models. Skipping the data folder here if folder_name == data_folder: continue folder_list.append(obj['Prefix'].strip(backslash)) + if len(folder_list) == 0: + logging.error('No trained models found in S3 bucket : %s for this file. ' + 'Please train the models and run inference again' % bucket_name) + raise Exception("No trained models found in S3 bucket : %s for this file. " + "Please train the models and run inference again" % bucket_name) return folder_list From ceac705fe133cc7100f8c0b5076f943c4315462d Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 30 Jul 2018 10:03:23 -0700 Subject: [PATCH 55/59] Added set -ex flags to shell scripts --- .../model_backward_compat_checker.sh | 2 ++ .../train_mxnet_legacy_models.sh | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh b/tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh index 66bd93102663..23386836ed83 100755 --- a/tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh +++ b/tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh @@ -19,6 +19,8 @@ #Author: Piyush Ghai +set -ex + echo "Invoking model_backwards_compat_checker.sh script" echo `pwd` cd tests/nightly/model_backwards_compatibility_check diff --git a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh index dbc10df9d820..fdcfa97ebefd 100755 --- a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh +++ b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh @@ -19,6 +19,8 @@ #Author: Piyush Ghai +set -ex + run_models() { echo '==========================' echo "Running training files and preparing models" From 16d320a926d4f23265f730d2fc197ecec5bc4a2e Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 30 Jul 2018 11:11:51 -0700 Subject: [PATCH 56/59] Added multi-version regex checks in training --- .../train_mxnet_legacy_models.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh 
b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh index fdcfa97ebefd..b40a112c6e09 100755 --- a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh +++ b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh @@ -59,7 +59,9 @@ do fi ## If MXNet major version starts with a number >=1. with a wildcard match for the minor version numbers - if [[ $version = [1-9].[0-9].[0-9] ]] + ## Could have used a [[:digit:]]+. as well but it was not working as a traditional regex in bash. + ## so had to resort to using [[:digit:]] [[:digit:]]* to indicate multi-digit version regex match + if [[ $version = [[:digit:][[:digit]:]*.[[:digit:]].[[:digit:]] ]] then count=$((count + 1)) #echo $version From 19495d62b6f07ef30a234adc673647e4173ebbd2 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 30 Jul 2018 11:23:22 -0700 Subject: [PATCH 57/59] Fixed typo in regex --- .../train_mxnet_legacy_models.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh index b40a112c6e09..bdeb365e2557 100755 --- a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh +++ b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh @@ -61,7 +61,7 @@ do ## If MXNet major version starts with a number >=1. with a wildcard match for the minor version numbers ## Could have used a [[:digit:]]+. as well but it was not working as a traditional regex in bash. ## so had to resort to using [[:digit:]] [[:digit:]]* to indicate multi-digit version regex match - if [[ $version = [[:digit:][[:digit]:]*.[[:digit:]].[[:digit:]] ]] + if [[ $version = [[:digit:][[:digit:]]*.[[:digit:]].[[:digit:]] ]] then count=$((count + 1)) #echo $version From d8fa75d72bd87466112f58865cebb38883b63171 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 30 Jul 2018 15:35:44 -0700 Subject: [PATCH 58/59] Now we will train models for all the minor versions for a given major version by traversing the tags --- .../common.py | 10 +---- .../model_backwards_compat_inference.py | 2 +- .../model_backwards_compat_train.py | 8 +++- .../train_mxnet_legacy_models.sh | 39 ++++++++++++------- 4 files changed, 35 insertions(+), 24 deletions(-) diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index 8bf02c4271f2..4c61cc4e3267 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -20,19 +20,13 @@ import boto3 import mxnet as mx -import json import os import numpy as np import logging -from mxnet import nd, autograd, gluon -import mxnet.ndarray as nd -from mxnet.gluon.data.vision import transforms, datasets -from mxnet import autograd as ag +from mxnet import gluon import mxnet.ndarray as F -from mxnet.gluon import nn, rnn +from mxnet.gluon import nn import re -import time -import sys from mxnet.test_utils import assert_almost_equal # Set fixed random seeds. 
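The common.py hunks in the preceding commits add two safeguards to get_top_level_folders_in_bucket: an exception when the bucket holds no usable models, and a skip for prefixes trained on a different major version. A standalone sketch of that filtering, assuming result is the parsed response of an S3 listing made with Delimiter='/' and that folder names are illustrative, might look like this:

def filter_model_version_prefixes(result, data_folder, current_version, backslash='/'):
    # Keep only top-level S3 prefixes produced by the same major version of MXNet
    # as 'current_version'; raise if nothing usable is found, as common.py now does.
    folder_list = []
    for obj in result.get('CommonPrefixes', []):
        folder_name = obj['Prefix'].strip(backslash)
        if folder_name == data_folder:
            continue  # shared data folder, not a model-version folder
        if folder_name.split('.')[0] != str(current_version).split('.')[0]:
            continue  # only compare models from the same major version
        folder_list.append(folder_name)
    if not folder_list:
        raise Exception('No trained models found for this major version. '
                        'Please train the models and run inference again')
    return folder_list
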
diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py index 0761a23c181b..ae368e3a0fc6 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py @@ -89,7 +89,7 @@ def test_lenet_gluon_hybrid_imports_api(): data = mx.nd.load(''.join([model_name, '-data'])) test_data = data['data'] loaded_model = HybridNet() - loaded_model = gluon.SymbolBlock.imports(model_name + '-symbol.json', ['data'], model_name + '-0001.params') + loaded_model = gluon.SymbolBlock.imports(model_name + '-symbol.json', ['data'], model_name + '-0000.params') output = loaded_model(test_data) old_inference_results = mx.nd.load(model_name + '-inference')['inference'] assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py index 869c73c01f7e..289d47c705db 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py @@ -79,7 +79,13 @@ def train_lenet_gluon_hybrid_export_api(): mx.nd.save(os.path.join(get_model_path(model_name), ''.join([model_name, '-data'])), {'data': test_data}) save_inference_results(output, model_name) - net.export(os.path.join(get_model_path(model_name), model_name), epoch=1) + if compare_versions(str(mxnet_version) , '1.1.0') < 0: + # v1.0.0 does not have the epoch param in the .exports API. Hence adding this safety net + net.export(os.path.join(get_model_path(model_name), model_name)) + else: + # Saving with 0 since by default on 1.0.0 it was saved with 0, so simplifying things + net.export(os.path.join(get_model_path(model_name), model_name), epoch=0) + def train_lstm_gluon_save_parameters_api(): diff --git a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh index bdeb365e2557..3a94440ccee4 100755 --- a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh +++ b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh @@ -35,6 +35,17 @@ install_mxnet() { pip install mxnet==$version --user } +## Cuts the string and gives only the major version part. +## eg : 12.3.0 ---> 12 +get_major_version() { + major=$(echo $1 | cut -d. -f1) + echo $major +} + +## We read the current major version from libinfo.py file. And we extract the major version from it. +curr_mxnet_version=$(grep -w "__version__" python/mxnet/libinfo.py | grep -o '".*"' | sed 's/"//g') +curr_major_version=$(get_major_version $curr_mxnet_version) + echo `pwd` cd tests/nightly/model_backwards_compatibility_check echo `pwd` @@ -42,7 +53,7 @@ echo `pwd` ## Fetch the latest release tags, filtering out 'rcs' and filtering out some other irrelevant ones ## This list is sorted in descending order chronologically. 
## Sample output for the below git tag command is : 1.2.0 utils 1.1.0 1.0.0 0.12.1 -## so from this sample we will pick up the top two : 1.2.0 and 1.1.0 and train models on them +## so from this sample, we will pick up all the versions matching with the current latest version ## Now while performing inference the latest version could be 1.3.0, which will help in validating models trained ## on 1.1.0 and 1.2.0 by loading them on the latest version (1.3.0) ## Over a period of time, the model repository will grow since with every new release we @@ -51,21 +62,21 @@ previous_versions=($(git tag --sort=-creatordate | grep --invert-match rc)) count=0 for version in ${previous_versions[*]} do - # We just need to train the previous two versions. This logic can be changed later on as welll. - if [[ "$count" -gt 1 ]] - then - echo "Successfully trained files for the previous two MXNet release versions" - exit 0 - fi - ## If MXNet major version starts with a number >=1. with a wildcard match for the minor version numbers ## Could have used a [[:digit:]]+. as well but it was not working as a traditional regex in bash. ## so had to resort to using [[:digit:]] [[:digit:]]* to indicate multi-digit version regex match - if [[ $version = [[:digit:][[:digit:]]*.[[:digit:]].[[:digit:]] ]] + ## Example : #previous_versions=(12.0.0 12.12.0 12.12.12 2.0.0 1.0.4 1.2.0 v.12.0.0 beta.12.0.1) + ## When passed through the regex, the output is : [12.0.0 12.12.0 12.12.12 2.0.0 1.0.4 1.2.0] + if [[ $version = [[:digit:][[:digit:]]*.[[:digit:][[:digit:]]*.[[:digit:][[:digit:]]* ]] then - count=$((count + 1)) - #echo $version - install_mxnet $version - run_models +# echo $version + major_version=$(get_major_version $version) + if [ ${major_version} -eq ${curr_major_version} ] + then +# echo $version + install_mxnet $version + run_models + fi fi -done \ No newline at end of file +done +exit 0 From ca01aa2bd9a183af10a4c3ae7ca5194c395b4ffb Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Mon, 30 Jul 2018 15:51:39 -0700 Subject: [PATCH 59/59] Added check for validating current_version --- .../train_mxnet_legacy_models.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh index 3a94440ccee4..336c61df24f7 100755 --- a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh +++ b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh @@ -44,7 +44,14 @@ get_major_version() { ## We read the current major version from libinfo.py file. And we extract the major version from it. curr_mxnet_version=$(grep -w "__version__" python/mxnet/libinfo.py | grep -o '".*"' | sed 's/"//g') -curr_major_version=$(get_major_version $curr_mxnet_version) +## Expected in .. format +if [[ $curr_mxnet_version = [[:digit:][[:digit:]]*.[[:digit:][[:digit:]]*.[[:digit:][[:digit:]]* ]] +then + curr_major_version=$(get_major_version $curr_mxnet_version) +else + echo "The current major version does not comply with the regex expected. Exiting here." + exit 1 +fi echo `pwd` cd tests/nightly/model_backwards_compatibility_check
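The shell logic above (validate the running version, extract its major component, then keep only numeric X.Y.Z release tags that share that major version) can be hard to read through the bash glob patterns. The following Python sketch restates the same selection under the assumption that release versions follow a numeric major.minor.patch format; the shell script in the patch remains the code that actually runs:

import re

def select_training_versions(tags, current_version):
    # Keep only numeric X.Y.Z release tags whose major version matches the
    # MXNet version running this script. The tag list is assumed to be the
    # output of 'git tag --sort=-creatordate' with 'rc' tags already filtered out.
    version_re = re.compile(r'^\d+\.\d+\.\d+$')
    if not version_re.match(current_version):
        raise ValueError('Unexpected current version format: %s' % current_version)
    current_major = current_version.split('.')[0]
    return [tag for tag in tags
            if version_re.match(tag) and tag.split('.')[0] == current_major]

# Example, mirroring the sample tag list in the comments above:
# select_training_versions(['1.2.0', 'utils', '1.1.0', '1.0.0', '0.12.1'], '1.3.0')
# returns ['1.2.0', '1.1.0', '1.0.0']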