From ca4502900d77b44232aac5aeba4a3ae78b22cf91 Mon Sep 17 00:00:00 2001
From: Lai Wei
Date: Mon, 20 May 2019 11:31:09 -0700
Subject: [PATCH] Revert "[MXNET-1333] Estimator and Fit API (#14629)" (#15008)

This reverts commit 9f451fb6f4265f7e122ca08a386e85595a5030a2.
---
 ci/docker/runtime_functions.sh                |  10 -
 .../mxnet/gluon/contrib/estimator/__init__.py |  21 -
 .../gluon/contrib/estimator/estimator.py      | 408 ----------
 .../gluon/contrib/estimator/event_handler.py  | 705 ------------------
 python/mxnet/gluon/trainer.py                 |   7 -
 tests/nightly/JenkinsfileForBinaries          |   8 -
 tests/nightly/estimator/test_estimator_cnn.py | 151 ----
 tests/nightly/estimator/test_sentiment_rnn.py | 276 -------
 tests/python/unittest/test_gluon_estimator.py | 371 ---------
 .../unittest/test_gluon_event_handler.py      | 198 -----
 10 files changed, 2155 deletions(-)
 delete mode 100644 python/mxnet/gluon/contrib/estimator/__init__.py
 delete mode 100644 python/mxnet/gluon/contrib/estimator/estimator.py
 delete mode 100644 python/mxnet/gluon/contrib/estimator/event_handler.py
 delete mode 100644 tests/nightly/estimator/test_estimator_cnn.py
 delete mode 100644 tests/nightly/estimator/test_sentiment_rnn.py
 delete mode 100644 tests/python/unittest/test_gluon_estimator.py
 delete mode 100644 tests/python/unittest/test_gluon_event_handler.py

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 58e39efc2873..e1da222ca298 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -1350,16 +1350,6 @@ nightly_scala_demo_test_cpu() {
     bash bin/run_im.sh
 }
 
-nightly_estimator() {
-    set -ex
-    cd /work/mxnet/tests/nightly/estimator
-    export PYTHONPATH=/work/mxnet/python/
-    python test_estimator_cnn.py --type gpu
-    python test_sentiment_rnn.py --type gpu
-    python test_estimator_cnn.py --type cpu
-    python test_sentiment_rnn.py --type cpu
-}
-
 # Deploy
 
 deploy_docs() {
diff --git a/python/mxnet/gluon/contrib/estimator/__init__.py b/python/mxnet/gluon/contrib/estimator/__init__.py
deleted file mode 100644
index 58600dadffb4..000000000000
--- a/python/mxnet/gluon/contrib/estimator/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""Gluon Estimator Module"""
-from .estimator import *
-from .event_handler import *
diff --git a/python/mxnet/gluon/contrib/estimator/estimator.py b/python/mxnet/gluon/contrib/estimator/estimator.py
deleted file mode 100644
index da1a3915caec..000000000000
--- a/python/mxnet/gluon/contrib/estimator/estimator.py
+++ /dev/null
@@ -1,408 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# coding: utf-8 -# pylint: disable=wildcard-import, unused-variable -"""Gluon Estimator""" - -import copy -import warnings - -from .event_handler import MetricHandler, ValidationHandler, LoggingHandler, StoppingHandler -from .event_handler import TrainBegin, EpochBegin, BatchBegin, BatchEnd, EpochEnd, TrainEnd -from .... import gluon, autograd -from ....context import Context, cpu, gpu, num_gpus -from ....metric import EvalMetric, Loss, Accuracy - -__all__ = ['Estimator'] - - -class Estimator(object): - """Estimator Class for easy model training - - :py:class:`Estimator` can be used to facilitate the training & validation process - - - Parameters - ---------- - net : Block - The model used for training. - loss : gluon.loss.Loss or list of gluon.loss.Loss - Loss(objective functions) to calculate during training. - metrics : EvalMetric or list of EvalMetric - Metrics for evaluating models. - initializer : Initializer - Initializer to initialize the network. - trainer : Trainer - Trainer to apply optimizer on network parameters. - context : Context or list of Context - Device(s) to run the training on. - """ - - def __init__(self, net, - loss, - metrics=None, - initializer=None, - trainer=None, - context=None): - - self.net = net - self.loss = self._check_loss(loss) - self.train_metrics = self._check_metrics(metrics) - - self.context = self._check_context(context) - self._initialize(initializer) - self.trainer = self._check_trainer(trainer) - - def _check_loss(self, loss): - if isinstance(loss, gluon.loss.Loss): - loss = [loss] - elif isinstance(loss, list) and all([isinstance(l, gluon.loss.Loss) for l in loss]): - loss = loss - else: - raise ValueError("loss must be a Loss or a list of Loss, " - "refer to gluon.loss.Loss:{}".format(loss)) - return loss - - def _check_metrics(self, metrics): - if isinstance(metrics, EvalMetric): - metrics = [metrics] - else: - metrics = metrics or [] - if not all([isinstance(metric, EvalMetric) for metric in metrics]): - raise ValueError("metrics must be a Metric or a list of Metric, " - "refer to mxnet.metric.EvalMetric:{}".format(metrics)) - return metrics - - def _check_context(self, context): - # infer available context - gpus = num_gpus() - available_gpus = [gpu(i) for i in range(gpus)] - - if context: - # check context values, only accept Context or a list of Context - if isinstance(context, Context): - context = [context] - elif isinstance(context, list) and all([isinstance(c, Context) for c in context]): - context = context - else: - raise ValueError("context must be a Context or a list of Context, " - "for example mx.cpu() or [mx.gpu(0), mx.gpu(1)], " - "refer to mxnet.Context:{}".format(context)) - for ctx in context: - assert ctx in available_gpus or str(ctx).startswith('cpu'), \ - "%s is not available, please make sure " \ - "your context is in one of: mx.cpu(), %s" % \ - (ctx, ", ".join([str(ctx) for ctx in available_gpus])) - else: - # provide default context - if gpus > 0: 
- # only use 1 GPU by default - if gpus > 1: - warnings.warn("You have multiple GPUs, gpu(0) will be used by default." - "To utilize all your GPUs, specify context as a list of gpus, " - "e.g. context=[mx.gpu(0), mx.gpu(1)] ") - context = [gpu(0)] - else: - context = [cpu()] - return context - - def _initialize(self, initializer): - # initialize the network - if not self._is_initialized(): - # net is partially or not initialized, - # initialize with user specified initializer - # if initializer is None, default initializer will be used - # do not re-init layers already initialized - if initializer: - self.net.initialize(init=initializer, ctx=self.context) - else: - self.net.initialize(ctx=self.context) - elif initializer: - # net is fully initialized, and user passed not None initializer - # do not force reinitialize, give warning - warnings.warn("Network already fully initialized, skipping initialization. " - "You don't need to pass initializer if you already " - "initialized your net. " - "You can use net.initialize(init=your_initializer, force_reinit=True)" - "to force re-initialize.") - - def _check_trainer(self, trainer): - # handle trainer - if not trainer: - warnings.warn("No trainer specified, default SGD optimizer " - "with learning rate 0.001 is used.") - trainer = gluon.Trainer(self.net.collect_params(), - 'sgd', {'learning_rate': 0.001}) - elif not isinstance(trainer, gluon.Trainer): - raise ValueError("Trainer must be a Gluon Trainer instance, refer to " - "gluon.Trainer:{}".format(trainer)) - return trainer - - def _is_initialized(self): - param_dict = self.net.collect_params() - for param in param_dict: - try: - param_dict[param].list_ctx() - except RuntimeError: - return False - return True - - def _get_data_and_label(self, batch, ctx, batch_axis=0): - data = batch[0] - label = batch[1] - data = gluon.utils.split_and_load(data, ctx_list=ctx, batch_axis=batch_axis) - label = gluon.utils.split_and_load(label, ctx_list=ctx, batch_axis=batch_axis) - return data, label - - def prepare_loss_and_metrics(self): - """ - Based on loss functions and training metrics in estimator - Create metric wrappers to record loss values, - Create copies of train loss/metric objects to record validation values - Returns train_metrics and val_metrics - - """ - if any(not hasattr(self, attribute) for attribute in - ['train_metrics', 'val_metrics']): - # Use default mx.metric.Accuracy() for gluon.loss.SoftmaxCrossEntropyLoss() - if not self.train_metrics and any([isinstance(l, gluon.loss.SoftmaxCrossEntropyLoss) for l in self.loss]): - self.train_metrics = [Accuracy()] - self.val_metrics = [] - for loss in self.loss: - # remove trailing numbers from loss name to avoid confusion - self.train_metrics.append(Loss(loss.name.rstrip('1234567890'))) - for metric in self.train_metrics: - val_metric = copy.deepcopy(metric) - metric.name = "train " + metric.name - val_metric.name = "validation " + val_metric.name - self.val_metrics.append(val_metric) - return self.train_metrics, self.val_metrics - - def evaluate(self, - val_data, - val_metrics, - batch_axis=0): - """Evaluate model on validation data - - Parameters - ---------- - val_data : DataLoader - Validation data loader with data and labels. - val_metrics : EvalMetric or list of EvalMetrics - Metrics to update validation result. - batch_axis : int, default 0 - Batch axis to split the validation data into devices. - """ - if not isinstance(val_data, gluon.data.DataLoader): - raise ValueError("Estimator only support input as Gluon DataLoader. 
Alternatively, you " - "can transform your DataIter or any NDArray into Gluon DataLoader. " - "Refer to gluon.data.dataloader") - - for metric in val_metrics: - metric.reset() - - for _, batch in enumerate(val_data): - data, label = self._get_data_and_label(batch, self.context, batch_axis) - pred = [self.net(x) for x in data] - loss = [self.loss[0](y_hat, y) for y_hat, y in zip(pred, label)] - # update metrics - for metric in val_metrics: - if isinstance(metric, Loss): - metric.update(0, loss) - else: - metric.update(label, pred) - - def fit(self, train_data, - val_data=None, - epochs=None, - event_handlers=None, - batches=None, - batch_axis=0): - """Trains the model with a given :py:class:`DataLoader` for a specified - number of epochs or batches. The batch size is inferred from the - data loader's batch_size. - - Parameters - ---------- - train_data : DataLoader - Training data loader with data and labels. - val_data : DataLoader, default None - Validation data loader with data and labels. - epochs : int, default None - Number of epochs to iterate on the training data. - You can only specify one and only one type of iteration(epochs or batches). - event_handlers : EventHandler or list of EventHandler - List of :py:class:`EventHandlers` to apply during training. - batches : int, default None - Number of batches to iterate on the training data. - You can only specify one and only one type of iteration(epochs or batches). - batch_axis : int, default 0 - Batch axis to split the training data into devices. - """ - if not isinstance(train_data, gluon.data.DataLoader): - raise ValueError("Estimator only support input as Gluon DataLoader. Alternatively, you " - "can transform your DataIter or any NDArray into Gluon DataLoader. " - "Refer to gluon.data.dataloader") - - # must specify one and only one of epochs or batches - if (not epochs) == (not batches): - raise ValueError( - "Fit only support exactly one type of iteration, " - "train by number of epochs or number of batches." 
- "Please specify one and only one of: epochs or batches.") - - self.max_epoch = epochs - self.max_batch = batches - - # provide default handlers - event_handlers = self._prepare_default_handlers(val_data, event_handlers) - - train_begin, epoch_begin, batch_begin, \ - batch_end, epoch_end, train_end = self._categorize_handlers(event_handlers) - - # pass a reference to all event handlers - estimator_ref = self - # training begin - for handler in train_begin: - handler.train_begin(estimator_ref) - - while True: - # epoch begin - for handler in epoch_begin: - handler.epoch_begin(estimator_ref) - - for i, batch in enumerate(train_data): - data, label = self._get_data_and_label(batch, self.context, batch_axis) - - batch_size = batch[0].shape[0] - - # batch begin - for handler in batch_begin: - handler.batch_begin(estimator_ref, batch=batch) - - with autograd.record(): - pred = [self.net(x) for x in data] - loss = [self.loss[0](y_hat, y) for y_hat, y in zip(pred, label)] - - for l in loss: - l.backward() - - self.trainer.step(batch_size) - # batch end - - batch_end_result = [] - for handler in batch_end: - batch_end_result.append(handler.batch_end(estimator_ref, batch=batch, - pred=pred, label=label, loss=loss)) - # if any handler signaled to stop - if any(batch_end_result): - break - - # epoch end - epoch_end_result = [] - for handler in epoch_end: - epoch_end_result.append(handler.epoch_end(estimator_ref)) - # if any handler signaled to stop - if any(epoch_end_result): - break - - # train end - for handler in train_end: - handler.train_end(estimator_ref) - - def _prepare_default_handlers(self, val_data, event_handlers): - event_handlers = event_handlers or [] - default_handlers = [] - train_metrics, val_metrics = self.prepare_loss_and_metrics() - - # no need to add to default handler check as StoppingHandler does not use metrics - event_handlers.append(StoppingHandler(self.max_epoch, self.max_batch)) - - if not any(isinstance(handler, MetricHandler) for handler in event_handlers): - event_handlers.append(MetricHandler(train_metrics=train_metrics)) - default_handlers.append("MetricHandler") - - if val_data and not any(isinstance(handler, ValidationHandler) for handler in event_handlers): - event_handlers.append(ValidationHandler(val_data=val_data, eval_fn=self.evaluate, - val_metrics=val_metrics)) - default_handlers.append("ValidationHandler") - - if not any(isinstance(handler, LoggingHandler) for handler in event_handlers): - event_handlers.append(LoggingHandler(train_metrics=train_metrics, - val_metrics=val_metrics)) - default_handlers.append("LoggingHandler") - - # if there is a mix of user defined event handlers and default event handlers - # they should have the same set of loss and metrics - if default_handlers: - msg = "You are training with the following default event handlers: %s. " \ - "They use loss and metrics from estimator.prepare_loss_and_metrics(). " \ - "Please use the same set of metrics for all your other handlers." 
% \
-                  ", ".join(default_handlers)
-            warnings.warn(msg)
-            # check if all handlers have the same set of references to loss and metrics
-            references = []
-            for handler in event_handlers:
-                for attribute in dir(handler):
-                    if any(keyword in attribute for keyword in ['metric', 'monitor']):
-                        reference = getattr(handler, attribute)
-                        if isinstance(reference, list):
-                            references += reference
-                        else:
-                            references.append(reference)
-            # remove None metric references
-            references = set([ref for ref in references if ref])
-            for metric in references:
-                if metric not in train_metrics + val_metrics:
-                    msg = "We have added the following default handlers for you: %s and used " \
-                          "estimator.prepare_loss_and_metrics() to pass metrics to " \
-                          "those handlers. Please use the same set of metrics " \
-                          "for all your handlers." % \
-                          ", ".join(default_handlers)
-                    raise ValueError(msg)
-
-        event_handlers.sort(key=lambda handler: getattr(handler, 'priority', 0))
-        return event_handlers
-
-    def _categorize_handlers(self, event_handlers):
-        """
-        Categorize handlers into six event lists to avoid calling empty methods;
-        for example, only event handlers with a train_begin method
-        implemented will be called at train begin.
-        """
-
-        train_begin = []
-        epoch_begin = []
-        batch_begin = []
-        batch_end = []
-        epoch_end = []
-        train_end = []
-        for handler in event_handlers:
-            if isinstance(handler, TrainBegin):
-                train_begin.append(handler)
-            if isinstance(handler, EpochBegin):
-                epoch_begin.append(handler)
-            if isinstance(handler, BatchBegin):
-                batch_begin.append(handler)
-            if isinstance(handler, BatchEnd):
-                batch_end.append(handler)
-            if isinstance(handler, EpochEnd):
-                epoch_end.append(handler)
-            if isinstance(handler, TrainEnd):
-                train_end.append(handler)
-        return train_begin, epoch_begin, batch_begin, batch_end, epoch_end, train_end
diff --git a/python/mxnet/gluon/contrib/estimator/event_handler.py b/python/mxnet/gluon/contrib/estimator/event_handler.py
deleted file mode 100644
index ce5890e0bcae..000000000000
--- a/python/mxnet/gluon/contrib/estimator/event_handler.py
+++ /dev/null
@@ -1,705 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# coding: utf-8
-# pylint: disable=wildcard-import, unused-argument
-"""Gluon EventHandlers for Estimators"""
-
-import logging
-import os
-import time
-import warnings
-
-import numpy as np
-
-from ....metric import EvalMetric, Loss
-
-
-class TrainBegin(object):
-    def train_begin(self, estimator, *args, **kwargs):
-        pass
-
-
-class TrainEnd(object):
-    def train_end(self, estimator, *args, **kwargs):
-        pass
-
-
-class EpochBegin(object):
-    def epoch_begin(self, estimator, *args, **kwargs):
-        pass
-
-
-class EpochEnd(object):
-    def epoch_end(self, estimator, *args, **kwargs):
-        return False
-
-
-class BatchBegin(object):
-    def batch_begin(self, estimator, *args, **kwargs):
-        pass
-
-
-class BatchEnd(object):
-    def batch_end(self, estimator, *args, **kwargs):
-        return False
-
-
-class StoppingHandler(TrainBegin, BatchEnd, EpochEnd):
-    """Stop conditions to stop training.
-    Stop training if the maximum number of batches or epochs
-    is reached.
-
-    Parameters
-    ----------
-    max_epoch : int, default None
-        Number of maximum epochs to train.
-    max_batch : int, default None
-        Number of maximum batches to train.
-
-    """
-
-    def __init__(self, max_epoch=None, max_batch=None):
-        self.max_epoch = max_epoch
-        self.max_batch = max_batch
-        self.current_batch = 0
-        self.current_epoch = 0
-        self.stop_training = False
-
-    def train_begin(self, estimator, *args, **kwargs):
-        self.max_epoch = estimator.max_epoch
-        self.max_batch = estimator.max_batch
-        self.current_batch = 0
-        self.current_epoch = 0
-
-    def batch_end(self, estimator, *args, **kwargs):
-        self.current_batch += 1
-        if self.current_batch == self.max_batch:
-            self.stop_training = True
-        return self.stop_training
-
-    def epoch_end(self, estimator, *args, **kwargs):
-        self.current_epoch += 1
-        if self.current_epoch == self.max_epoch:
-            self.stop_training = True
-        return self.stop_training
-
-
-class MetricHandler(EpochBegin, BatchEnd):
-    """Metric Handler that updates metric values at batch end
-
-    :py:class:`MetricHandler` takes model predictions and true labels
-    and updates the metrics; it also updates the metric wrapper for loss with loss values.
-    Validation loss and metrics will be handled by :py:class:`ValidationHandler`.
-
-    Parameters
-    ----------
-    train_metrics : List of EvalMetrics
-        Training metrics to be updated at batch end.
-    """
-
-    def __init__(self, train_metrics):
-        self.train_metrics = train_metrics or []
-        # order to be called among all callbacks
-        # metrics need to be calculated before other callbacks can access them
-        self.priority = -np.Inf
-
-    def epoch_begin(self, estimator, *args, **kwargs):
-        for metric in self.train_metrics:
-            metric.reset()
-
-    def batch_end(self, estimator, *args, **kwargs):
-        pred = kwargs['pred']
-        label = kwargs['label']
-        loss = kwargs['loss']
-        for metric in self.train_metrics:
-            if isinstance(metric, Loss):
-                # metric wrapper for loss values
-                metric.update(0, loss)
-            else:
-                metric.update(label, pred)
-
-
-class ValidationHandler(TrainBegin, BatchEnd, EpochEnd):
-    """Validation Handler that evaluates the model on the validation dataset
-
-    :py:class:`ValidationHandler` takes a validation dataset, an evaluation function,
-    metrics to be evaluated, and how often to run the validation. You can provide a custom
-    evaluation function or use the one provided by :py:class:`Estimator`.
-
-    Parameters
-    ----------
-    val_data : DataLoader
-        Validation data set to run evaluation.
-    eval_fn : function
-        A function that defines how to run evaluation and
-        calculate loss and metrics.
- val_metrics : List of EvalMetrics - Validation metrics to be updated. - epoch_period : int, default 1 - How often to run validation at epoch end, by default - :py:class:`ValidationHandler` validate every epoch. - batch_period : int, default None - How often to run validation at batch end, by default - :py:class:`ValidationHandler` does not validate at batch end. - """ - - def __init__(self, - val_data, - eval_fn, - val_metrics=None, - epoch_period=1, - batch_period=None): - self.val_data = val_data - self.eval_fn = eval_fn - self.epoch_period = epoch_period - self.batch_period = batch_period - self.val_metrics = val_metrics - self.current_batch = 0 - self.current_epoch = 0 - # order to be called among all callbacks - # validation metrics need to be calculated before other callbacks can access them - self.priority = -np.Inf - self.logger = logging.getLogger(__name__) - - def train_begin(self, estimator, *args, **kwargs): - # reset epoch and batch counter - self.current_batch = 0 - self.current_epoch = 0 - - def batch_end(self, estimator, *args, **kwargs): - self.current_batch += 1 - if self.batch_period and self.current_batch % self.batch_period == 0: - self.eval_fn(val_data=self.val_data, - val_metrics=self.val_metrics) - msg = '[Epoch %d] ValidationHandler: %d batches reached, ' \ - % (self.current_epoch, self.current_batch) - for monitor in self.val_metrics: - name, value = monitor.get() - msg += '%s: %.4f, ' % (name, value) - self.logger.info(msg.rstrip(',')) - - def epoch_end(self, estimator, *args, **kwargs): - self.current_epoch += 1 - if self.epoch_period and self.current_epoch % self.epoch_period == 0: - self.eval_fn(val_data=self.val_data, - val_metrics=self.val_metrics) - - -class LoggingHandler(TrainBegin, TrainEnd, EpochBegin, EpochEnd, BatchBegin, BatchEnd): - """Basic Logging Handler that applies to every Gluon estimator by default. - - :py:class:`LoggingHandler` logs hyper-parameters, training statistics, - and other useful information during training - - Parameters - ---------- - file_name : str - File name to save the logs. - file_location : str - File location to save the logs. - filemode : str, default 'a' - Logging file mode, default using append mode. - verbose : int, default LOG_PER_EPOCH - Limit the granularity of metrics displayed during training process. - verbose=LOG_PER_EPOCH: display metrics every epoch - verbose=LOG_PER_BATCH: display metrics every batch - train_metrics : list of EvalMetrics - Training metrics to be logged, logged at batch end, epoch end, train end. - val_metrics : list of EvalMetrics - Validation metrics to be logged, logged at epoch end, train end. - """ - - LOG_PER_EPOCH = 1 - LOG_PER_BATCH = 2 - - def __init__(self, file_name=None, - file_location=None, - filemode='a', - verbose=LOG_PER_EPOCH, - train_metrics=None, - val_metrics=None): - super(LoggingHandler, self).__init__() - self.logger = logging.getLogger(__name__) - self.logger.setLevel(logging.INFO) - stream_handler = logging.StreamHandler() - self.logger.addHandler(stream_handler) - # save logger to file only if file name or location is specified - if file_name or file_location: - file_name = file_name or 'estimator_log' - file_location = file_location or './' - file_handler = logging.FileHandler(os.path.join(file_location, file_name), mode=filemode) - self.logger.addHandler(file_handler) - if verbose not in [self.LOG_PER_EPOCH, self.LOG_PER_BATCH]: - raise ValueError("verbose level must be either LOG_PER_EPOCH or " - "LOG_PER_BATCH, received %s. 
" - "E.g: LoggingHandler(verbose=LoggingHandler.LOG_PER_EPOCH)" - % verbose) - self.verbose = verbose - self.train_metrics = train_metrics or [] - self.val_metrics = val_metrics or [] - self.batch_index = 0 - self.current_epoch = 0 - self.processed_samples = 0 - # logging handler need to be called at last to make sure all states are updated - # it will also shut down logging at train end - self.priority = np.Inf - - def train_begin(self, estimator, *args, **kwargs): - self.train_start = time.time() - trainer = estimator.trainer - optimizer = trainer.optimizer.__class__.__name__ - lr = trainer.learning_rate - self.logger.info("Training begin: using optimizer %s " - "with current learning rate %.4f ", - optimizer, lr) - if estimator.max_epoch: - self.logger.info("Train for %d epochs.", estimator.max_epoch) - else: - self.logger.info("Train for %d batches.", estimator.max_batch) - # reset all counters - self.current_epoch = 0 - self.batch_index = 0 - self.processed_samples = 0 - - def train_end(self, estimator, *args, **kwargs): - train_time = time.time() - self.train_start - msg = 'Train finished using total %ds with %d epochs. ' % (train_time, self.current_epoch) - # log every result in train stats including train/validation loss & metrics - for metric in self.train_metrics + self.val_metrics: - name, value = metric.get() - msg += '%s: %.4f, ' % (name, value) - self.logger.info(msg.rstrip(', ')) - # make a copy of handler list and remove one by one - # as removing handler will edit the handler list - for handler in self.logger.handlers[:]: - handler.close() - self.logger.removeHandler(handler) - logging.shutdown() - - def batch_begin(self, estimator, *args, **kwargs): - if self.verbose == self.LOG_PER_BATCH: - self.batch_start = time.time() - - def batch_end(self, estimator, *args, **kwargs): - if self.verbose == self.LOG_PER_BATCH: - batch_time = time.time() - self.batch_start - msg = '[Epoch %d][Batch %d]' % (self.current_epoch, self.batch_index) - self.processed_samples += kwargs['batch'][0].shape[0] - msg += '[Samples %s] ' % (self.processed_samples) - msg += 'time/batch: %.3fs ' % batch_time - for metric in self.train_metrics: - # only log current training loss & metric after each batch - name, value = metric.get() - msg += '%s: %.4f, ' % (name, value) - self.logger.info(msg.rstrip(', ')) - self.batch_index += 1 - - def epoch_begin(self, estimator, *args, **kwargs): - if self.verbose >= self.LOG_PER_EPOCH: - self.epoch_start = time.time() - self.logger.info("[Epoch %d] Begin, current learning rate: %.4f", - self.current_epoch, estimator.trainer.learning_rate) - - def epoch_end(self, estimator, *args, **kwargs): - if self.verbose >= self.LOG_PER_EPOCH: - epoch_time = time.time() - self.epoch_start - msg = '[Epoch %d] Finished in %.3fs, ' % (self.current_epoch, epoch_time) - for monitor in self.train_metrics + self.val_metrics: - name, value = monitor.get() - msg += '%s: %.4f, ' % (name, value) - self.logger.info(msg.rstrip(', ')) - self.current_epoch += 1 - self.batch_index = 0 - - -class CheckpointHandler(TrainBegin, BatchEnd, EpochEnd): - """Save the model after user define period - - :py:class:`CheckpointHandler` saves the network architecture after first batch if the model - can be fully hybridized, saves model parameters and trainer states after user defined period, - default saves every epoch. - - Parameters - ---------- - model_dir : str - File directory to save all the model related files including model architecture, - model parameters, and trainer states. 
-    model_prefix : str, default 'model'
-        Prefix to add for all checkpoint file names.
-    monitor: EvalMetric, default None
-        The metric to monitor and determine if the model has improved.
-    verbose: int, default 0
-        Verbosity mode; 1 means inform the user every time a checkpoint is saved.
-    save_best: bool, default False
-        If True, monitor must not be None; :py:class:`CheckpointHandler` will save the
-        model parameters and trainer states with the best monitored value.
-    mode: str, default 'auto'
-        One of {auto, min, max}, if `save_best=True`, the comparison to make
-        and determine if the monitored value has improved. If 'auto' mode,
-        :py:class:`CheckpointHandler` will try to use min or max based on
-        the monitored metric name.
-    epoch_period: int, default 1
-        Epoch intervals between saving the network. By default, checkpoints are
-        saved every epoch.
-    batch_period: int, default None
-        Batch intervals between saving the network.
-        By default, checkpoints are not saved based on the number of batches.
-    max_checkpoints : int, default 5
-        Maximum number of checkpoint files to keep in the model_dir; older checkpoints
-        will be removed. The best checkpoint file is not counted.
-    resume_from_checkpoint : bool, default False
-        Whether to resume training from checkpoint in model_dir. If True and checkpoints
-        found, :py:class:`CheckpointHandler` will load net parameters and trainer states,
-        and train for the remaining epochs and batches.
-    """
-
-    def __init__(self,
-                 model_dir,
-                 model_prefix='model',
-                 monitor=None,
-                 verbose=0,
-                 save_best=False,
-                 mode='auto',
-                 epoch_period=1,
-                 batch_period=None,
-                 max_checkpoints=5,
-                 resume_from_checkpoint=False):
-        self.monitor = monitor
-        self.verbose = verbose
-        if not os.path.exists(model_dir):
-            os.makedirs(model_dir)
-        self.model_dir = model_dir
-        self.model_prefix = model_prefix
-        self.save_best = save_best
-        if self.save_best and not isinstance(self.monitor, EvalMetric):
-            raise ValueError("To save best model only, please provide one of the metric objects as monitor. "
-                             "You can get these objects using estimator.prepare_loss_and_metrics()")
-        self.epoch_period = epoch_period
-        self.batch_period = batch_period
-        self.current_batch = 0
-        self.current_epoch = 0
-        self.max_checkpoints = max_checkpoints
-        self.resume_from_checkpoint = resume_from_checkpoint
-        self.saved_checkpoints = []
-        self.logger = logging.getLogger(__name__)
-        if self.save_best:
-            if mode not in ['auto', 'min', 'max']:
-                warnings.warn('ModelCheckpoint mode %s is unknown, '
-                              'fallback to auto mode. CheckpointHandler will use '
-                              'max mode for f1 and accuracy metric comparison and '
-                              'use min mode otherwise' % (mode),
-                              RuntimeWarning)
-                mode = 'auto'
-
-            if mode == 'min':
-                self.monitor_op = np.less
-                self.best = np.Inf
-            elif mode == 'max':
-                self.monitor_op = np.greater
-                self.best = -np.Inf
-            else:
-                # use greater for accuracy and f1 and less otherwise
-                if any(name in self.monitor.get()[0].lower() for name in ('acc', 'f1')):
-                    self.logger.info("`greater` operator will be used to determine "
-                                     "if %s has improved, please use `min` for mode "
-                                     "if you want otherwise", self.monitor.get()[0])
-                    self.monitor_op = np.greater
-                else:
-                    self.logger.info("`less` operator will be used to determine "
-                                     "if %s has improved, please use `max` for mode "
-                                     "if you want otherwise", self.monitor.get()[0])
-                    self.monitor_op = np.less
-
-    def train_begin(self, estimator, *args, **kwargs):
-        # reset all counters
-        self.current_epoch = 0
-        self.current_batch = 0
-        if self.save_best:
-            self.best = np.Inf if self.monitor_op == np.less else -np.Inf
-        if self.resume_from_checkpoint:
-            error_msg = "To use resume from checkpoint, you must only specify " \
-                        "the same type of period you used for training. " \
-                        "For example, if you are training based on number of epochs, " \
-                        "you must save only based on epochs, and set batch_period to None."
-            if estimator.max_batch:
-                assert self.batch_period, error_msg
-                assert not self.epoch_period, error_msg
-            if estimator.max_epoch:
-                assert self.epoch_period, error_msg
-                assert not self.batch_period, error_msg
-
-            self._resume_from_checkpoint(estimator)
-
-    def batch_end(self, estimator, *args, **kwargs):
-        # only save symbol once after first batch
-        if self.current_batch == 0:
-            self._save_symbol(estimator)
-        if self.batch_period and (self.current_batch + 1) % self.batch_period == 0:
-            self._save_checkpoint(estimator)
-        self.current_batch += 1
-
-    def epoch_end(self, estimator, *args, **kwargs):
-        if self.epoch_period and (self.current_epoch + 1) % self.epoch_period == 0:
-            self._save_checkpoint(estimator)
-        self.current_epoch += 1
-
-    def _save_checkpoint(self, estimator):
-        # if resumed from checkpoint, increment checkpoint number
-        if self.resume_from_checkpoint:
-            save_epoch_number = self.current_epoch + self.trained_epoch + 1
-            if estimator.max_epoch:
-                # checkpoint saved at epoch end, batch number already incremented
-                save_batch_number = self.current_batch + self.trained_batch
-            else:
-                save_batch_number = self.current_batch + self.trained_batch + 1
-        else:
-            save_epoch_number = self.current_epoch
-            save_batch_number = self.current_batch
-        prefix = "%s-epoch%dbatch%d" % (self.model_prefix, save_epoch_number, save_batch_number)
-        self._save_params_and_trainer(estimator, prefix)
-        if self.verbose > 0:
-            self.logger.info('[Epoch %d] CheckpointHandler: trained total %d batches, '
-                             'saving model at %s with prefix: %s',
-                             self.current_epoch, self.current_batch + 1, self.model_dir, prefix)
-
-        if self.save_best:
-            monitor_name, monitor_value = self.monitor.get()
-            # check if monitor exists in train stats
-            if np.isnan(monitor_value):
-                warnings.warn(RuntimeWarning('Skipping save best because %s is not updated, make sure you '
-                                             'pass one of the metric objects as monitor, '
-                                             'you can use estimator.prepare_loss_and_metrics to '
-                                             'create all metric objects' % monitor_name))
-            else:
-                if self.monitor_op(monitor_value, self.best):
-                    prefix = self.model_prefix + '-best'
-                    self._save_params_and_trainer(estimator, prefix)
-                    if self.verbose > 0:
-                        self.logger.info('[Epoch %d] CheckpointHandler: '
-                                         '%s improved from %0.5f to %0.5f, '
-                                         'updating best model at %s with prefix: %s',
-                                         self.current_epoch, monitor_name,
-                                         self.best, monitor_value, self.model_dir, prefix)
-                    # update the best value only after logging, so the old best is reported
-                    self.best = monitor_value
-                else:
-                    if self.verbose > 0:
-                        self.logger.info('[Epoch %d] CheckpointHandler: '
-                                         '%s did not improve from %0.5f, '
-                                         'skipping updating best model',
-                                         self.current_epoch, monitor_name,
-                                         self.best)
-
-    def _save_symbol(self, estimator):
-        symbol_file = os.path.join(self.model_dir, self.model_prefix + '-symbol.json')
-        if hasattr(estimator.net, '_cached_graph'):
-            sym = estimator.net._cached_graph[1]
-            sym.save(symbol_file)
-        else:
-            self.logger.info("Model architecture (symbol file) is not saved, please use HybridBlock "
-                             "to construct your model, and call net.hybridize() before passing to "
-                             "Estimator in order to save model architecture as %s.", symbol_file)
-
-    def _save_params_and_trainer(self, estimator, file_prefix):
-        param_file = os.path.join(self.model_dir, file_prefix + '.params')
-        trainer_file = os.path.join(self.model_dir, file_prefix + '.states')
-        estimator.net.save_parameters(param_file)
-        estimator.trainer.save_states(trainer_file)
-
-        # only count checkpoints with epoch or batch number in file name
-        if 'best' not in file_prefix:
-            self.saved_checkpoints.append(file_prefix)
-        # remove old checkpoint when max number of checkpoints reached
-        if len(self.saved_checkpoints) > self.max_checkpoints:
-            prefix = self.saved_checkpoints.pop(0)
-            for fname in os.listdir(self.model_dir):
-                if fname.startswith(prefix):
-                    os.remove(os.path.join(self.model_dir, fname))
-
-    def _resume_from_checkpoint(self, estimator):
-        prefix = self.model_prefix + '-epoch'
-        self.trained_epoch = self._find_max_iteration(
-            dir=self.model_dir,
-            prefix=prefix,
-            start='epoch',
-            end='batch',
-            saved_checkpoints=self.saved_checkpoints)
-        prefix += str(self.trained_epoch)
-        self.trained_batch = self._find_max_iteration(
-            dir=self.model_dir,
-            prefix=prefix,
-            start='batch',
-            end='.params')
-
-        if self.trained_epoch == -1:
-            msg = "CheckpointHandler: No checkpoint found, training from scratch for "
-            if estimator.max_batch:
-                msg += "%d batches" % estimator.max_batch
-            else:
-                msg += "%d epochs" % estimator.max_epoch
-            self.logger.info(msg)
-        else:
-            msg = "CheckpointHandler: Checkpoint resumed from epoch %d batch %d, " \
-                  "continue to train for " % (self.trained_epoch, self.trained_batch)
-            # change maximum number of epoch or batch to train if resumed from epoch checkpoint
-            if estimator.max_epoch:
-                if self.trained_epoch >= estimator.max_epoch - 1:
-                    raise ValueError("Found checkpoint with maximum number of epoch %d reached, please specify "
-                                     "resume_from_checkpoint=False (default value) if you want to train from scratch."
                                     % estimator.max_epoch)
-                estimator.max_epoch = estimator.max_epoch - self.trained_epoch - 1
-                msg += "%d epochs " % estimator.max_epoch
-            if estimator.max_batch:
-                if self.trained_batch >= estimator.max_batch - 1:
-                    raise ValueError("Found checkpoint with maximum number of batch %d reached, please specify "
-                                     "resume_from_checkpoint=False (default value) if you want to train from scratch."
-                                     % estimator.max_batch)
-                estimator.max_batch = estimator.max_batch - self.trained_batch - 1
-                msg += "%d batches " % estimator.max_batch
-            # load checkpoint
-            param_file = "%s-epoch%dbatch%d.params" % (self.model_prefix, self.trained_epoch, self.trained_batch)
-            param_file = os.path.join(self.model_dir, param_file)
-            trainer_file = "%s-epoch%dbatch%d.states" % (self.model_prefix, self.trained_epoch, self.trained_batch)
-            trainer_file = os.path.join(self.model_dir, trainer_file)
-            assert os.path.exists(param_file), "Failed to load checkpoint, %s does not exist" % param_file
-            assert os.path.exists(trainer_file), "Failed to load checkpoint, %s does not exist" % trainer_file
-            estimator.net.load_parameters(param_file, ctx=estimator.context)
-            estimator.trainer.load_states(trainer_file)
-            self.logger.warning(msg)
-
-    def _find_max_iteration(self, dir, prefix, start, end, saved_checkpoints=None):
-        error_msg = "Error parsing checkpoint file, please check your " \
-                    "checkpoints have the format: " \
-                    "{model_name}-epoch{epoch_number}batch{batch_number}.params, " \
-                    "there should also be a .states file for each .params file "
-        max_iter = -1
-        for fname in os.listdir(dir):
-            if fname.startswith(prefix) and '.params' in fname:
-                if saved_checkpoints is not None:
-                    # save prefix of existing checkpoints
-                    saved_checkpoints.append(fname[:fname.find('.params')])
-                try:
-                    # find trained number of epoch
-                    iter = int(fname[fname.find(start) + len(start): fname.find(end)])
-                    if iter > max_iter:
-                        max_iter = iter
-                except ValueError:
-                    raise ValueError(error_msg)
-        return max_iter
-
-
-class EarlyStoppingHandler(TrainBegin, EpochEnd, TrainEnd):
-    """Early stop training if monitored value is not improving
-
-    Parameters
-    ----------
-    monitor: EvalMetric
-        The metric to monitor, and stop training if this metric does not improve.
-    min_delta: float, default 0
-        Minimal change in monitored value to be considered as an improvement.
-    patience: int, default 0
-        Number of epochs to wait for improvement before terminating training.
-    mode: str, default 'auto'
-        One of {auto, min, max}, the comparison to make
-        and determine if the monitored value has improved. If 'auto' mode,
-        :py:class:`EarlyStoppingHandler` will try to use min or max based on
-        the monitored metric name.
-    baseline: float
-        Baseline value to compare the monitored value with.
-    """
-
-    def __init__(self,
-                 monitor,
-                 min_delta=0,
-                 patience=0,
-                 mode='auto',
-                 baseline=None):
-        super(EarlyStoppingHandler, self).__init__()
-
-        if not isinstance(monitor, EvalMetric):
-            raise ValueError("Please provide one of the metric objects as monitor. "
-                             "You can create these objects using estimator.prepare_loss_and_metrics()")
-        self.monitor = monitor
-        self.baseline = baseline
-        self.patience = patience
-        self.min_delta = min_delta
-        self.wait = 0
-        self.stopped_epoch = 0
-        self.current_epoch = 0
-        self.stop_training = False
-        self.logger = logging.getLogger(__name__)
-
-        if mode not in ['auto', 'min', 'max']:
-            warnings.warn('EarlyStopping mode %s is unknown, '
-                          'fallback to auto mode. EarlyStoppingHandler will use '
-                          'max mode for f1 and accuracy metric comparison and '
-                          'use min mode otherwise' % (mode),
-                          RuntimeWarning)
-            mode = 'auto'
-
-        if mode == 'min':
-            self.monitor_op = np.less
-        elif mode == 'max':
-            self.monitor_op = np.greater
-        else:
-            if any(name in self.monitor.get()[0].lower() for name in ('acc', 'f1')):
-                self.logger.info("`greater` operator is used to determine "
-                                 "if %s has improved, please use `min` for mode "
-                                 "if you want otherwise", self.monitor.get()[0])
-                self.monitor_op = np.greater
-            else:
-                self.logger.info("`less` operator is used to determine "
-                                 "if %s has improved, please use `max` for mode "
-                                 "if you want otherwise", self.monitor.get()[0])
-                self.monitor_op = np.less
-
-        if self.monitor_op == np.greater:
-            self.min_delta *= 1
-        else:
-            self.min_delta *= -1
-
-    def train_begin(self, estimator, *args, **kwargs):
-        self.wait = 0
-        self.stopped_epoch = 0
-        self.current_epoch = 0
-        self.stop_training = False
-        if self.baseline is not None:
-            self.best = self.baseline
-        else:
-            self.best = np.Inf if self.monitor_op == np.less else -np.Inf
-
-    def epoch_end(self, estimator, *args, **kwargs):
-        monitor_name, monitor_value = self.monitor.get()
-        if np.isnan(monitor_value):
-            warnings.warn(RuntimeWarning('%s is not updated, make sure you pass one of the metric objects '
-                                         'as monitor, you can use estimator.prepare_loss_and_metrics to '
-                                         'create all metric objects' % monitor_name))
-        else:
-            if self.monitor_op(monitor_value - self.min_delta, self.best):
-                self.best = monitor_value
-                self.wait = 0
-            else:
-                self.wait += 1
-                if self.wait >= self.patience:
-                    self.stopped_epoch = self.current_epoch
-                    self.stop_training = True
-        self.current_epoch += 1
-        return self.stop_training
-
-    def train_end(self, estimator, *args, **kwargs):
-        if self.stopped_epoch > 0:
-            self.logger.info('[Epoch %d] EarlyStoppingHandler: early stopping due to %s not improving',
-                             self.stopped_epoch, self.monitor.get()[0])
diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py
index 0939490a8307..6935c2752e1a 100644
--- a/python/mxnet/gluon/trainer.py
+++ b/python/mxnet/gluon/trainer.py
@@ -255,13 +255,6 @@ def learning_rate(self):
         else:
             return self._optimizer.learning_rate
 
-    @property
-    def optimizer(self):
-        if isinstance(self._optimizer, opt.Optimizer):
-            return self._optimizer
-        else:
-            raise UserWarning("Optimizer has not been initialized yet")
-
     def set_learning_rate(self, lr):
         """Sets a new learning rate of the optimizer.
 
diff --git a/tests/nightly/JenkinsfileForBinaries b/tests/nightly/JenkinsfileForBinaries
index e4b9ff1acbb1..ea6db1a20cbf 100755
--- a/tests/nightly/JenkinsfileForBinaries
+++ b/tests/nightly/JenkinsfileForBinaries
@@ -141,14 +141,6 @@ core_logic: {
           utils.docker_run('ubuntu_nightly_gpu', 'nightly_tutorial_test_ubuntu_python3_gpu', true, '1500m')
         }
       }
-    },
-    'Gluon estimator: GPU': {
-      node(NODE_LINUX_GPU) {
-        ws('workspace/estimator-test-gpu') {
-          utils.unpack_and_init('gpu', mx_lib)
-          utils.docker_run('ubuntu_nightly_gpu', 'nightly_estimator', true)
-        }
-      }
     }
   }
 }
diff --git a/tests/nightly/estimator/test_estimator_cnn.py b/tests/nightly/estimator/test_estimator_cnn.py
deleted file mode 100644
index c60dc544b347..000000000000
--- a/tests/nightly/estimator/test_estimator_cnn.py
+++ /dev/null
@@ -1,151 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Test gluon estimator on CNN models - -import argparse -import numpy as np -import mxnet as mx -from mxnet import gluon, init, nd -from mxnet.gluon import data -from mxnet.gluon.contrib.estimator import estimator -from mxnet.gluon.model_zoo import vision - -def load_data_mnist(batch_size, resize=None, num_workers=4): - ''' - Load MNIST dataset - ''' - transformer = [] - if resize: - transformer += [data.vision.transforms.Resize(resize)] - transformer += [data.vision.transforms.ToTensor()] - transformer = data.vision.transforms.Compose(transformer) - mnist_train = data.vision.MNIST(train=True) - mnist_test = data.vision.MNIST(train=False) - train_iter = data.DataLoader( - mnist_train.transform_first(transformer), batch_size, shuffle=True, - num_workers=num_workers) - test_iter = data.DataLoader( - mnist_test.transform_first(transformer), batch_size, shuffle=False, - num_workers=num_workers) - return train_iter, test_iter - -def bilinear_kernel(in_channels, out_channels, kernel_size): - ''' - Bilinear interpolation using transposed convolution - https://github.com/d2l-ai/d2l-en/blob/master/chapter_computer-vision/fcn.md - ''' - factor = (kernel_size + 1) // 2 - if kernel_size % 2 == 1: - center = factor - 1 - else: - center = factor - 0.5 - og = np.ogrid[:kernel_size, :kernel_size] - filt = (1 - abs(og[0] - center) / factor) * (1 - abs(og[1] - center) / factor) - weight = np.zeros((in_channels, out_channels, kernel_size, kernel_size), dtype='float32') - weight[range(in_channels), range(out_channels), :, :] = filt - return nd.array(weight) - -def get_net(model_name, context): - if model_name == 'FCN': - num_classes = 21 - pretrained_net = vision.resnet18_v2(pretrained=True, ctx=context) - net = gluon.nn.HybridSequential() - for layer in pretrained_net.features[:-2]: - net.add(layer) - net.add(gluon.nn.Conv2D(num_classes, kernel_size=1), - gluon.nn.Conv2DTranspose(num_classes, kernel_size=64, padding=16, strides=32)) - net[-1].initialize(init.Constant(bilinear_kernel(num_classes, num_classes, 64)), ctx=context) - net[-2].initialize(init=init.Xavier(), ctx=context) - input_shape = (1, 3, 320, 480) - label_shape = (1, 320, 480) - loss_axis = 1 - else: - net = vision.get_model(model_name, classes=10) - net.initialize(mx.init.Xavier(), ctx=context) - input_shape = (1, 1, 224, 224) - label_shape = 1 - loss_axis = -1 - return net, input_shape, label_shape, loss_axis - -def test_estimator_cpu(): - ''' - Test estimator by doing one pass over each model with synthetic data - ''' - models = ['resnet18_v1', - 'FCN' - ] - context = mx.cpu() - for model_name in models: - net, input_shape, label_shape, loss_axis = get_net(model_name, context) - train_dataset = gluon.data.dataset.ArrayDataset(mx.nd.random.uniform(shape=input_shape), - mx.nd.zeros(shape=label_shape)) - val_dataset = gluon.data.dataset.ArrayDataset(mx.nd.random.uniform(shape=input_shape), - mx.nd.zeros(shape=label_shape)) - loss = 
gluon.loss.SoftmaxCrossEntropyLoss(axis=loss_axis) - train_data = gluon.data.DataLoader(train_dataset, batch_size=1) - val_data = gluon.data.DataLoader(val_dataset, batch_size=1) - net.hybridize() - trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) - # Define estimator - est = estimator.Estimator(net=net, - loss=loss, - metrics=mx.metric.Accuracy(), - trainer=trainer, - context=context) - # Call fit() - est.fit(train_data=train_data, - val_data=val_data, - epochs=1) - -def test_estimator_gpu(): - ''' - Test estimator by training resnet18_v1 for 5 epochs on MNIST and verify accuracy - ''' - model_name = 'resnet18_v1' - batch_size = 128 - num_epochs = 5 - context = mx.gpu(0) - net, _, _, _ = get_net(model_name, context) - train_data, test_data = load_data_mnist(batch_size, resize=224) - loss = gluon.loss.SoftmaxCrossEntropyLoss() - net.hybridize() - acc = mx.metric.Accuracy() - trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) - # Define estimator - est = estimator.Estimator(net=net, - loss=loss, - metrics=acc, - trainer=trainer, - context=context) - # Call fit() - est.fit(train_data=train_data, - val_data=test_data, - epochs=num_epochs) - - assert acc.get()[1] > 0.80 - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='test gluon estimator') - parser.add_argument('--type', type=str, default='cpu') - opt = parser.parse_args() - if opt.type == 'cpu': - test_estimator_cpu() - elif opt.type == 'gpu': - test_estimator_gpu() - else: - raise RuntimeError("Unknown test type") diff --git a/tests/nightly/estimator/test_sentiment_rnn.py b/tests/nightly/estimator/test_sentiment_rnn.py deleted file mode 100644 index 404bf83fb86f..000000000000 --- a/tests/nightly/estimator/test_sentiment_rnn.py +++ /dev/null @@ -1,276 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -"""Gluon Text Sentiment Classification Example using RNN/CNN -Example modified from below link: -https://github.com/d2l-ai/d2l-en/blob/master/chapter_natural-language-processing/sentiment-analysis-rnn.md -https://github.com/d2l-ai/d2l-en/blob/master/chapter_natural-language-processing/sentiment-analysis-cnn.md""" - -import argparse -import os -import tarfile -import random -import collections -import mxnet as mx -from mxnet import nd, gluon -from mxnet.contrib import text -from mxnet.gluon import nn, rnn -from mxnet.gluon.contrib.estimator import estimator - - -class TextCNN(nn.Block): - def __init__(self, vocab, embed_size, kernel_sizes, num_channels, - **kwargs): - super(TextCNN, self).__init__(**kwargs) - self.embedding = nn.Embedding(len(vocab), embed_size) - # The embedding layer does not participate in training - self.constant_embedding = nn.Embedding(len(vocab), embed_size) - self.dropout = nn.Dropout(0.5) - self.decoder = nn.Dense(2) - # The max-over-time pooling layer has no weight, so it can share an - # instance - self.pool = nn.GlobalMaxPool1D() - # Create multiple one-dimensional convolutional layers - self.convs = nn.Sequential() - for c, k in zip(num_channels, kernel_sizes): - self.convs.add(nn.Conv1D(c, k, activation='relu')) - - def forward(self, inputs): - # Concatenate the output of two embedding layers with shape of - # (batch size, number of words, word vector dimension) by word vector - embeddings = nd.concat( - self.embedding(inputs), self.constant_embedding(inputs), dim=2) - # According to the input format required by Conv1D, the word vector - # dimension, that is, the channel dimension of the one-dimensional - # convolutional layer, is transformed into the previous dimension - embeddings = embeddings.transpose((0, 2, 1)) - # For each one-dimensional convolutional layer, after max-over-time - # pooling, an NDArray with the shape of (batch size, channel size, 1) - # can be obtained. Use the flatten function to remove the last - # dimension and then concatenate on the channel dimension - encoding = nd.concat(*[nd.flatten( - self.pool(conv(embeddings))) for conv in self.convs], dim=1) - # After applying the dropout method, use a fully connected layer to - # obtain the output - outputs = self.decoder(self.dropout(encoding)) - return outputs - - -class BiRNN(nn.Block): - def __init__(self, vocab, embed_size, num_hiddens, num_layers, **kwargs): - super(BiRNN, self).__init__(**kwargs) - self.embedding = nn.Embedding(len(vocab), embed_size) - # Set Bidirectional to True to get a bidirectional recurrent neural - # network - self.encoder = rnn.LSTM(num_hiddens, num_layers=num_layers, - bidirectional=True, input_size=embed_size) - self.decoder = nn.Dense(2) - - def forward(self, inputs): - # The shape of inputs is (batch size, number of words). Because LSTM - # needs to use sequence as the first dimension, the input is - # transformed and the word feature is then extracted. The output shape - # is (number of words, batch size, word vector dimension). - embeddings = self.embedding(inputs.T) - # The shape of states is (number of words, batch size, 2 * number of - # hidden units). - states = self.encoder(embeddings) - # Concatenate the hidden states of the initial time step and final - # time step to use as the input of the fully connected layer. 
Its - # shape is (batch size, 4 * number of hidden units) - encoding = nd.concat(states[0], states[-1]) - outputs = self.decoder(encoding) - return outputs - - -def download_imdb(data_dir='/tmp/data'): - ''' - Download and extract the IMDB dataset - ''' - url = ('http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz') - sha1 = '01ada507287d82875905620988597833ad4e0903' - if not os.path.exists(data_dir): - os.makedirs(data_dir) - file_path = os.path.join(data_dir, 'aclImdb_v1.tar.gz') - if not os.path.isfile(file_path): - file_path = gluon.utils.download(url, data_dir, sha1_hash=sha1) - with tarfile.open(file_path, 'r') as f: - f.extractall(data_dir) - - -def read_imdb(folder='train'): - ''' - Read the IMDB dataset - ''' - data = [] - for label in ['pos', 'neg']: - folder_name = os.path.join('/tmp/data/aclImdb/', folder, label) - for file in os.listdir(folder_name): - with open(os.path.join(folder_name, file), 'rb') as f: - review = f.read().decode('utf-8').replace('\n', '').lower() - data.append([review, 1 if label == 'pos' else 0]) - random.shuffle(data) - return data - - -def get_tokenized_imdb(data): - ''' - Tokenized the words - ''' - - def tokenizer(text): - return [tok.lower() for tok in text.split(' ')] - - return [tokenizer(review) for review, _ in data] - - -def get_vocab_imdb(data): - ''' - Get the indexed tokens - ''' - tokenized_data = get_tokenized_imdb(data) - counter = collections.Counter([tk for st in tokenized_data for tk in st]) - return text.vocab.Vocabulary(counter, min_freq=5) - - -def preprocess_imdb(data, vocab): - ''' - Make the length of each comment 500 by truncating or adding 0s - ''' - max_l = 500 - - def pad(x): - return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x)) - - tokenized_data = get_tokenized_imdb(data) - features = nd.array([pad(vocab.to_indices(x)) for x in tokenized_data]) - labels = nd.array([score for _, score in data]) - return features, labels - - -def run(net, train_dataloader, test_dataloader, **kwargs): - ''' - Train a test sentiment model - ''' - num_epochs = kwargs['epochs'] - ctx = kwargs['ctx'] - batch_size = kwargs['batch_size'] - lr = kwargs['lr'] - - # Define trainer - trainer = mx.gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr}) - # Define loss and evaluation metrics - loss = gluon.loss.SoftmaxCrossEntropyLoss() - acc = mx.metric.Accuracy() - - # Define estimator - est = estimator.Estimator(net=net, loss=loss, metrics=acc, - trainer=trainer, context=ctx) - # Begin training - est.fit(train_data=train_dataloader, val_data=test_dataloader, - epochs=num_epochs) - return acc - - -def test_estimator_cpu(**kwargs): - ''' - Test estimator by doing one pass over each model with synthetic data - ''' - models = ['TextCNN', 'BiRNN'] - ctx = kwargs['ctx'] - batch_size = kwargs['batch_size'] - embed_size = kwargs['embed_size'] - - train_data = mx.nd.random.randint(low=0, high=100, shape=(2 * batch_size, 500)) - train_label = mx.nd.random.randint(low=0, high=2, shape=(2 * batch_size,)) - val_data = mx.nd.random.randint(low=0, high=100, shape=(batch_size, 500)) - val_label = mx.nd.random.randint(low=0, high=2, shape=(batch_size,)) - - train_dataloader = gluon.data.DataLoader(dataset=gluon.data.ArrayDataset(train_data, train_label), - batch_size=batch_size, shuffle=True) - val_dataloader = gluon.data.DataLoader(dataset=gluon.data.ArrayDataset(val_data, val_label), - batch_size=batch_size) - vocab_list = mx.nd.zeros(shape=(100,)) - - # Get the model - for model in models: - if model == 'TextCNN': - kernel_sizes, 
-def test_estimator_cpu(**kwargs):
-    '''
-    Test the estimator by doing one pass over each model with synthetic data
-    '''
-    models = ['TextCNN', 'BiRNN']
-    ctx = kwargs['ctx']
-    batch_size = kwargs['batch_size']
-    embed_size = kwargs['embed_size']
-
-    train_data = mx.nd.random.randint(low=0, high=100, shape=(2 * batch_size, 500))
-    train_label = mx.nd.random.randint(low=0, high=2, shape=(2 * batch_size,))
-    val_data = mx.nd.random.randint(low=0, high=100, shape=(batch_size, 500))
-    val_label = mx.nd.random.randint(low=0, high=2, shape=(batch_size,))
-
-    train_dataloader = gluon.data.DataLoader(dataset=gluon.data.ArrayDataset(train_data, train_label),
-                                             batch_size=batch_size, shuffle=True)
-    val_dataloader = gluon.data.DataLoader(dataset=gluon.data.ArrayDataset(val_data, val_label),
-                                           batch_size=batch_size)
-    vocab_list = mx.nd.zeros(shape=(100,))
-
-    # Get the model
-    for model in models:
-        if model == 'TextCNN':
-            kernel_sizes, nums_channels = [3, 4, 5], [100, 100, 100]
-            net = TextCNN(vocab_list, embed_size, kernel_sizes, nums_channels)
-        else:
-            num_hiddens, num_layers = 100, 2
-            net = BiRNN(vocab_list, embed_size, num_hiddens, num_layers)
-        net.initialize(mx.init.Xavier(), ctx=ctx)
-
-        run(net, train_dataloader, val_dataloader, **kwargs)
-
-
-def test_estimator_gpu(**kwargs):
-    '''
-    Test the estimator by training a bidirectional RNN for 5 epochs on the
-    IMDB dataset and verifying accuracy
-    '''
-    ctx = kwargs['ctx']
-    batch_size = kwargs['batch_size']
-    num_epochs = kwargs['epochs']
-    embed_size = kwargs['embed_size']
-
-    # Data
-    download_imdb()
-    train_data, test_data = read_imdb('train'), read_imdb('test')
-    vocab = get_vocab_imdb(train_data)
-
-    train_set = gluon.data.ArrayDataset(*preprocess_imdb(train_data, vocab))
-    test_set = gluon.data.ArrayDataset(*preprocess_imdb(test_data, vocab))
-    train_dataloader = gluon.data.DataLoader(train_set, batch_size, shuffle=True)
-    test_dataloader = gluon.data.DataLoader(test_set, batch_size)
-
-    # Model
-    num_hiddens, num_layers = 100, 2
-    net = BiRNN(vocab, embed_size, num_hiddens, num_layers)
-    net.initialize(mx.init.Xavier(), ctx=ctx)
-
-    glove_embedding = text.embedding.create(
-        'glove', pretrained_file_name='glove.6B.100d.txt', vocabulary=vocab)
-
-    net.embedding.weight.set_data(glove_embedding.idx_to_vec)
-    net.embedding.collect_params().setattr('grad_req', 'null')
-
-    acc = run(net, train_dataloader, test_dataloader, **kwargs)
-
-    assert acc.get()[1] > 0.70
-
-
-parser = argparse.ArgumentParser(description='test gluon estimator')
-parser.add_argument('--type', type=str, default='cpu')
-opt = parser.parse_args()
-kwargs = {
-    'batch_size': 64,
-    'lr': 0.01,
-    'embed_size': 100
-}
-
-if opt.type == 'cpu':
-    kwargs['ctx'] = mx.cpu()
-    kwargs['epochs'] = 1
-    test_estimator_cpu(**kwargs)
-elif opt.type == 'gpu':
-    kwargs['ctx'] = mx.gpu()
-    kwargs['epochs'] = 5
-    test_estimator_gpu(**kwargs)
-else:
-    raise RuntimeError("Unknown test type")
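One detail worth noting in test_estimator_gpu is the fine-tuning idiom: pretrained GloVe vectors are copied into the embedding layer and then frozen. A short sketch of just that idiom, assuming a vocabulary built with mxnet.contrib.text and a net exposing an `embedding` child as in the deleted model definitions:

    from mxnet.contrib import text

    # Load 100-d GloVe vectors for the vocabulary and copy them into the layer
    glove = text.embedding.create('glove',
                                  pretrained_file_name='glove.6B.100d.txt',
                                  vocabulary=vocab)
    net.embedding.weight.set_data(glove.idx_to_vec)
    # grad_req='null' excludes the embedding weights from gradient updates
    net.embedding.collect_params().setattr('grad_req', 'null')
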
diff --git a/tests/python/unittest/test_gluon_estimator.py b/tests/python/unittest/test_gluon_estimator.py
deleted file mode 100644
index d2e8c082aa08..000000000000
--- a/tests/python/unittest/test_gluon_estimator.py
+++ /dev/null
@@ -1,371 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-''' Unit tests for Gluon Estimator '''
-
-import sys
-import unittest
-import warnings
-
-import mxnet as mx
-from mxnet import gluon
-from mxnet.gluon import nn
-from mxnet.gluon.contrib.estimator import *
-from nose.tools import assert_raises
-
-
-def _get_test_network():
-    net = nn.Sequential()
-    net.add(nn.Dense(4, activation='relu', flatten=False))
-    return net
-
-
-def _get_test_data():
-    batch_size = 4
-    in_data = mx.nd.random.uniform(shape=(10, 3))
-    out_data = mx.nd.random.uniform(shape=(10, 4))
-    # Input dataloader
-    dataset = gluon.data.dataset.ArrayDataset(in_data, out_data)
-    dataloader = gluon.data.DataLoader(dataset, batch_size=batch_size)
-    dataiter = mx.io.NDArrayIter(data=in_data, label=out_data, batch_size=batch_size)
-    return dataloader, dataiter
-
-
-def test_fit():
-    ''' test estimator with different train data types '''
-    net = _get_test_network()
-    dataloader, dataiter = _get_test_data()
-    num_epochs = 1
-    ctx = mx.cpu()
-    loss = gluon.loss.L2Loss()
-    acc = mx.metric.Accuracy()
-    net.initialize(ctx=ctx)
-    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001})
-    est = Estimator(net=net,
-                    loss=loss,
-                    metrics=acc,
-                    trainer=trainer,
-                    context=ctx)
-
-    est.fit(train_data=dataloader,
-            epochs=num_epochs)
-
-    # Input DataIter is not supported
-    with assert_raises(ValueError):
-        est.fit(train_data=dataiter,
-                epochs=num_epochs)
-
-    # Input NDArray is not supported
-    with assert_raises(ValueError):
-        est.fit(train_data=[mx.nd.ones(shape=(10, 3))],
-                epochs=num_epochs)
-
-
-def test_validation():
-    ''' test different validation data types '''
-    net = _get_test_network()
-    dataloader, dataiter = _get_test_data()
-    num_epochs = 1
-    ctx = mx.cpu()
-    loss = gluon.loss.L2Loss()
-    acc = mx.metric.Accuracy()
-    net.initialize(ctx=ctx)
-    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001})
-    est = Estimator(net=net,
-                    loss=loss,
-                    metrics=acc,
-                    trainer=trainer,
-                    context=ctx)
-    # Input dataloader
-    est.fit(train_data=dataloader,
-            val_data=dataloader,
-            epochs=num_epochs)
-
-    # using validation handler
-    train_metrics, val_metrics = est.prepare_loss_and_metrics()
-    validation_handler = ValidationHandler(val_data=dataloader, eval_fn=est.evaluate,
-                                           val_metrics=val_metrics)
-
-    # Input DataIter is not supported
-    with assert_raises(ValueError):
-        est.fit(train_data=dataiter,
-                val_data=dataiter,
-                epochs=num_epochs)
-    # Input NDArray is not supported
-    with assert_raises(ValueError):
-        est.fit(train_data=[mx.nd.ones(shape=(10, 3))],
-                val_data=[mx.nd.ones(shape=(10, 3))],
-                epochs=num_epochs)
-
-
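The ValidationHandler built in test_validation above is only instantiated, never passed to fit. For illustration, a hedged sketch of how it would actually be wired in, reusing the `est` and `dataloader` set up there; the metrics must come from prepare_loss_and_metrics() so the handler and the estimator update the same metric objects:

    train_metrics, val_metrics = est.prepare_loss_and_metrics()
    validation_handler = ValidationHandler(val_data=dataloader,
                                           eval_fn=est.evaluate,
                                           val_metrics=val_metrics)
    # Validation now runs through the handler instead of a val_data argument
    est.fit(train_data=dataloader, epochs=1, event_handlers=[validation_handler])
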
-@unittest.skipIf(sys.version_info.major < 3, 'Test on python 3')
-def test_initializer():
-    ''' test with no initializer, inconsistent initializer '''
-    net = _get_test_network()
-    train_data, _ = _get_test_data()
-    num_epochs = 1
-    ctx = mx.cpu()
-
-    loss = gluon.loss.L2Loss()
-    acc = mx.metric.Accuracy()
-    # no initializer
-    est = Estimator(net=net,
-                    loss=loss,
-                    metrics=acc,
-                    context=ctx)
-    est.fit(train_data=train_data,
-            epochs=num_epochs)
-
-    # different initializer for net and estimator
-    net = _get_test_network()
-    net.initialize(mx.init.Xavier(), ctx=ctx)
-    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001})
-    # catch reinit warning
-    with warnings.catch_warnings(record=True) as w:
-        est = Estimator(net=net,
-                        loss=loss,
-                        metrics=acc,
-                        initializer=mx.init.MSRAPrelu(),
-                        trainer=trainer,
-                        context=ctx)
-        assert 'Network already fully initialized' in str(w[-1].message)
-    # net partially initialized, fine tuning use case
-    net = gluon.model_zoo.vision.resnet18_v1(pretrained=True, ctx=ctx)
-    net.output = gluon.nn.Dense(10)  # last layer not initialized
-    est = Estimator(net, loss=loss, metrics=acc, context=ctx)
-    dataset = gluon.data.ArrayDataset(mx.nd.zeros((10, 3, 224, 224)), mx.nd.zeros((10, 10)))
-    train_data = gluon.data.DataLoader(dataset=dataset, batch_size=5)
-    est.fit(train_data=train_data,
-            epochs=num_epochs)
-
-
-@unittest.skipIf(sys.version_info.major < 3, 'Test on python 3')
-def test_trainer():
-    ''' test with no trainer and invalid trainer '''
-    net = _get_test_network()
-    train_data, _ = _get_test_data()
-    num_epochs = 1
-    ctx = mx.cpu()
-
-    loss = gluon.loss.L2Loss()
-    acc = mx.metric.Accuracy()
-    net.initialize(ctx=ctx)
-    # input no trainer
-    with warnings.catch_warnings(record=True) as w:
-        est = Estimator(net=net,
-                        loss=loss,
-                        metrics=acc,
-                        context=ctx)
-        assert 'No trainer specified' in str(w[-1].message)
-    est.fit(train_data=train_data,
-            epochs=num_epochs)
-
-    # input invalid trainer
-    trainer = 'sgd'
-    with assert_raises(ValueError):
-        est = Estimator(net=net,
-                        loss=loss,
-                        metrics=acc,
-                        trainer=trainer,
-                        context=ctx)
-
-
-def test_metric():
-    ''' test with no metric, list of metrics, invalid metric '''
-    net = _get_test_network()
-    train_data, _ = _get_test_data()
-    num_epochs = 1
-    ctx = mx.cpu()
-
-    loss = gluon.loss.L2Loss()
-    net.initialize(ctx=ctx)
-    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001})
-    # input no metric
-    est = Estimator(net=net,
-                    loss=loss,
-                    trainer=trainer,
-                    context=ctx)
-    est.fit(train_data=train_data,
-            epochs=num_epochs)
-    # input list of metrics
-    metrics = [mx.metric.Accuracy(), mx.metric.Accuracy()]
-    est = Estimator(net=net,
-                    loss=loss,
-                    metrics=metrics,
-                    trainer=trainer,
-                    context=ctx)
-    est.fit(train_data=train_data,
-            epochs=num_epochs)
-    # input invalid metric
-    with assert_raises(ValueError):
-        est = Estimator(net=net,
-                        loss=loss,
-                        metrics='acc',
-                        trainer=trainer,
-                        context=ctx)
-    # test default metric
-    loss = gluon.loss.SoftmaxCrossEntropyLoss()
-    est = Estimator(net=net,
-                    loss=loss,
-                    trainer=trainer,
-                    context=ctx)
-    est.prepare_loss_and_metrics()
-    assert isinstance(est.train_metrics[0], mx.metric.Accuracy)
-
-
-def test_loss():
-    ''' test with invalid loss '''
-    net = _get_test_network()
-    ctx = mx.cpu()
-    acc = mx.metric.Accuracy()
-    net.initialize(ctx=ctx)
-    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001})
-    # input invalid loss
-    with assert_raises(ValueError):
-        est = Estimator(net=net,
-                        loss='mse',
-                        metrics=acc,
-                        trainer=trainer,
-                        context=ctx)
-
-
-def test_context():
-    ''' test with no context, list of context, invalid context '''
-    net = _get_test_network()
-    loss = gluon.loss.L2Loss()
-    metrics = mx.metric.Accuracy()
-    # input no context
-    est = Estimator(net=net,
-                    loss=loss,
-                    metrics=metrics)
-    # input list of context
-    gpus = mx.context.num_gpus()
-    ctx = [mx.gpu(i) for i in range(gpus)] if gpus > 0 else [mx.cpu()]
-    net = _get_test_network()
-    est = Estimator(net=net,
-                    loss=loss,
-                    metrics=metrics,
-                    context=ctx)
-    # input invalid context
-    with assert_raises(ValueError):
-        est = Estimator(net=net,
-                        loss=loss,
-                        metrics=metrics,
-                        context='cpu')
-
-    with assert_raises(AssertionError):
-        est = Estimator(net=net,
-                        loss=loss,
-                        metrics=metrics,
-                        context=[mx.gpu(0), mx.gpu(100)])
-
-
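test_categorize_handlers below buckets handlers by the event mixins they subclass. As a reference for the shape of such a handler, a minimal sketch; the class name and counter are invented for illustration, TrainBegin and BatchEnd come from the wildcard estimator import at the top of this file, and the signatures follow test_custom_handler at the end of this patch:

    class BatchCounter(TrainBegin, BatchEnd):
        ''' Counts processed batches; _categorize_handlers places it in both
            the train_begin and batch_end buckets. '''

        def __init__(self):
            self.num_batch = 0

        def train_begin(self, estimator, *args, **kwargs):
            self.num_batch = 0

        def batch_end(self, estimator, *args, **kwargs):
            self.num_batch += 1
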
print("custom batch begin") - - def train_end(self): - print("custom train end") - - class CustomHandler3(EpochBegin, BatchBegin, BatchEnd, TrainEnd): - - def epoch_begin(self): - print("custom epoch begin") - - def batch_begin(self): - print("custom batch begin") - - def batch_end(self): - print("custom batch end") - - def train_end(self): - print("custom train end") - - net = nn.Sequential() - net.add(nn.Dense(10)) - loss = gluon.loss.SoftmaxCrossEntropyLoss() - est = Estimator(net, loss=loss) - event_handlers = [CustomHandler1(), CustomHandler2(), CustomHandler3()] - train_begin, epoch_begin, batch_begin, \ - batch_end, epoch_end, train_end = est._categorize_handlers(event_handlers) - assert len(train_begin) == 1 - assert len(epoch_begin) == 2 - assert len(batch_begin) == 2 - assert len(batch_end) == 1 - assert len(train_end) == 2 - - -@unittest.skipIf(sys.version_info.major < 3, 'Test on python 3') -def test_default_handlers(): - net = _get_test_network() - train_data, _ = _get_test_data() - - num_epochs = 1 - ctx = mx.cpu() - - net.initialize(ctx=ctx) - trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) - - train_acc = mx.metric.RMSE() - loss = gluon.loss.L2Loss() - - est = Estimator(net=net, - loss=loss, - metrics=train_acc, - trainer=trainer, - context=ctx) - # no handler - with warnings.catch_warnings(record=True) as w: - est.fit(train_data=train_data, epochs=num_epochs) - assert 'You are training with the' in str(w[-1].message) - - # handler with prepared loss and metrics - # use mix of default and user defined handlers - train_metrics, val_metrics = est.prepare_loss_and_metrics() - logging = LoggingHandler(train_metrics=train_metrics, val_metrics=val_metrics) - with warnings.catch_warnings(record=True) as w: - est.fit(train_data=train_data, epochs=num_epochs, event_handlers=[logging]) - assert 'You are training with the' in str(w[-1].message) - # provide metric handler by default - assert 'MetricHandler' in str(w[-1].message) - - # handler with all user defined metrics - # use mix of default and user defined handlers - metric = MetricHandler(train_metrics=[train_acc]) - logging = LoggingHandler(train_metrics=[train_acc], val_metrics=[mx.metric.RMSE("val acc")]) - est.fit(train_data=train_data, epochs=num_epochs, event_handlers=[metric, logging]) - - # handler with mixed metrics, some handler use metrics prepared by estimator - # some handler use metrics user prepared - logging = LoggingHandler(train_metrics=train_metrics, val_metrics=[mx.metric.RMSE("val acc")]) - with assert_raises(ValueError): - est.fit(train_data=train_data, epochs=num_epochs, event_handlers=[logging]) - - # test handler order - train_metrics, val_metrics = est.prepare_loss_and_metrics() - early_stopping = EarlyStoppingHandler(monitor=val_metrics[0]) - handlers = est._prepare_default_handlers(val_data=None, event_handlers=[early_stopping]) - assert len(handlers) == 4 - assert isinstance(handlers[0], MetricHandler) - assert isinstance(handlers[3], LoggingHandler) diff --git a/tests/python/unittest/test_gluon_event_handler.py b/tests/python/unittest/test_gluon_event_handler.py deleted file mode 100644 index 7ea5ff3f4b62..000000000000 --- a/tests/python/unittest/test_gluon_event_handler.py +++ /dev/null @@ -1,198 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
diff --git a/tests/python/unittest/test_gluon_event_handler.py b/tests/python/unittest/test_gluon_event_handler.py
deleted file mode 100644
index 7ea5ff3f4b62..000000000000
--- a/tests/python/unittest/test_gluon_event_handler.py
+++ /dev/null
@@ -1,198 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import os
-
-import mxnet as mx
-from common import TemporaryDirectory
-from mxnet import nd
-from mxnet.gluon import nn, loss
-from mxnet.gluon.contrib.estimator import estimator, event_handler
-
-
-def _get_test_network(net=None):
-    # Default to a fresh Sequential; a mutable default argument would be
-    # shared across calls and accumulate layers.
-    if net is None:
-        net = nn.Sequential()
-    net.add(nn.Dense(128, activation='relu', flatten=False),
-            nn.Dense(64, activation='relu'),
-            nn.Dense(10, activation='relu'))
-    return net
-
-
-def _get_test_data():
-    data = nd.ones((32, 100))
-    label = nd.zeros((32, 1))
-    data_arr = mx.gluon.data.dataset.ArrayDataset(data, label)
-    return mx.gluon.data.DataLoader(data_arr, batch_size=8)
-
-
-def test_checkpoint_handler():
-    with TemporaryDirectory() as tmpdir:
-        model_prefix = 'test_epoch'
-        file_path = os.path.join(tmpdir, model_prefix)
-        test_data = _get_test_data()
-
-        net = _get_test_network()
-        ce_loss = loss.SoftmaxCrossEntropyLoss()
-        acc = mx.metric.Accuracy()
-        est = estimator.Estimator(net, loss=ce_loss, metrics=acc)
-        checkpoint_handler = event_handler.CheckpointHandler(model_dir=tmpdir,
-                                                             model_prefix=model_prefix,
-                                                             monitor=acc,
-                                                             save_best=True,
-                                                             epoch_period=1)
-        est.fit(test_data, event_handlers=[checkpoint_handler], epochs=1)
-        assert checkpoint_handler.current_epoch == 1
-        assert checkpoint_handler.current_batch == 4
-        assert os.path.isfile(file_path + '-best.params')
-        assert os.path.isfile(file_path + '-best.states')
-        assert os.path.isfile(file_path + '-epoch0batch4.params')
-        assert os.path.isfile(file_path + '-epoch0batch4.states')
-
-        model_prefix = 'test_batch'
-        file_path = os.path.join(tmpdir, model_prefix)
-        net = _get_test_network(nn.HybridSequential())
-        net.hybridize()
-        est = estimator.Estimator(net, loss=ce_loss, metrics=acc)
-        checkpoint_handler = event_handler.CheckpointHandler(model_dir=tmpdir,
-                                                             model_prefix=model_prefix,
-                                                             epoch_period=None,
-                                                             batch_period=2,
-                                                             max_checkpoints=2)
-        est.fit(test_data, event_handlers=[checkpoint_handler], batches=10)
-        assert checkpoint_handler.current_batch == 10
-        assert checkpoint_handler.current_epoch == 3
-        assert not os.path.isfile(file_path + 'best.params')
-        assert not os.path.isfile(file_path + 'best.states')
-        assert not os.path.isfile(file_path + '-epoch0batch0.params')
-        assert not os.path.isfile(file_path + '-epoch0batch0.states')
-        assert os.path.isfile(file_path + '-symbol.json')
-        assert os.path.isfile(file_path + '-epoch1batch7.params')
-        assert os.path.isfile(file_path + '-epoch1batch7.states')
-        assert os.path.isfile(file_path + '-epoch2batch9.params')
-        assert os.path.isfile(file_path + '-epoch2batch9.states')
-
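Of the two variants exercised above, the batch-period configuration is the less obvious one. A condensed sketch of just that mode, reusing `est` and `test_data` as set up in the surrounding tests; the directory and prefix are hypothetical:

    # Save every 2 batches, keep only the 2 newest checkpoints, no epoch saves.
    ckpt = event_handler.CheckpointHandler(model_dir='/tmp/ckpt_demo',  # hypothetical
                                           model_prefix='demo',
                                           epoch_period=None,   # disable per-epoch saves
                                           batch_period=2,      # save every 2 batches
                                           max_checkpoints=2)   # prune older files
    est.fit(test_data, event_handlers=[ckpt], batches=10)
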
-def test_resume_checkpoint():
-    with TemporaryDirectory() as tmpdir:
-        model_prefix = 'test_net'
-        file_path = os.path.join(tmpdir, model_prefix)
-        test_data = _get_test_data()
-
-        net = _get_test_network()
-        ce_loss = loss.SoftmaxCrossEntropyLoss()
-        acc = mx.metric.Accuracy()
-        est = estimator.Estimator(net, loss=ce_loss, metrics=acc)
-        checkpoint_handler = event_handler.CheckpointHandler(model_dir=tmpdir,
-                                                             model_prefix=model_prefix,
-                                                             monitor=acc,
-                                                             max_checkpoints=1)
-        est.fit(test_data, event_handlers=[checkpoint_handler], epochs=2)
-        assert os.path.isfile(file_path + '-epoch1batch8.params')
-        assert os.path.isfile(file_path + '-epoch1batch8.states')
-        checkpoint_handler = event_handler.CheckpointHandler(model_dir=tmpdir,
-                                                             model_prefix=model_prefix,
-                                                             monitor=acc,
-                                                             max_checkpoints=1,
-                                                             resume_from_checkpoint=True)
-        est.fit(test_data, event_handlers=[checkpoint_handler], epochs=5)
-        # should only train for 3 more epochs; the last checkpoint file is from epoch 4
-        assert est.max_epoch == 3
-        assert os.path.isfile(file_path + '-epoch4batch20.states')
-
-
-def test_early_stopping():
-    test_data = _get_test_data()
-
-    net = _get_test_network()
-    ce_loss = loss.SoftmaxCrossEntropyLoss()
-    acc = mx.metric.Accuracy()
-    est = estimator.Estimator(net, loss=ce_loss, metrics=acc)
-    early_stopping = event_handler.EarlyStoppingHandler(monitor=acc,
-                                                        patience=0,
-                                                        mode='min')
-    est.fit(test_data, event_handlers=[early_stopping], epochs=5)
-    assert early_stopping.current_epoch == 2
-    assert early_stopping.stopped_epoch == 1
-
-    early_stopping = event_handler.EarlyStoppingHandler(monitor=acc,
-                                                        patience=2,
-                                                        mode='auto')
-    est.fit(test_data, event_handlers=[early_stopping], epochs=1)
-    assert early_stopping.current_epoch == 1
-
-
-def test_logging():
-    with TemporaryDirectory() as tmpdir:
-        test_data = _get_test_data()
-        file_name = 'test_log'
-        output_dir = os.path.join(tmpdir, file_name)
-
-        net = _get_test_network()
-        ce_loss = loss.SoftmaxCrossEntropyLoss()
-        acc = mx.metric.Accuracy()
-        est = estimator.Estimator(net, loss=ce_loss, metrics=acc)
-        train_metrics, val_metrics = est.prepare_loss_and_metrics()
-        logging_handler = event_handler.LoggingHandler(file_name=file_name,
-                                                       file_location=tmpdir,
-                                                       train_metrics=train_metrics,
-                                                       val_metrics=val_metrics)
-        est.fit(test_data, event_handlers=[logging_handler], epochs=3)
-        assert logging_handler.batch_index == 0
-        assert logging_handler.current_epoch == 3
-        assert os.path.isfile(output_dir)
-
-
-def test_custom_handler():
-    class CustomStopHandler(event_handler.TrainBegin,
-                            event_handler.BatchEnd,
-                            event_handler.EpochEnd):
-        def __init__(self, batch_stop=None, epoch_stop=None):
-            self.batch_stop = batch_stop
-            self.epoch_stop = epoch_stop
-            self.num_batch = 0
-            self.num_epoch = 0
-            self.stop_training = False
-
-        def train_begin(self, estimator, *args, **kwargs):
-            self.num_batch = 0
-            self.num_epoch = 0
-
-        def batch_end(self, estimator, *args, **kwargs):
-            self.num_batch += 1
-            if self.num_batch == self.batch_stop:
-                self.stop_training = True
-            return self.stop_training
-
-        def epoch_end(self, estimator, *args, **kwargs):
-            self.num_epoch += 1
-            if self.num_epoch == self.epoch_stop:
-                self.stop_training = True
-            return self.stop_training
-
-    # total data size is 32 and batch size is 8,
-    # so there are 4 batches per epoch
-    test_data = _get_test_data()
-    net = _get_test_network()
-    ce_loss = loss.SoftmaxCrossEntropyLoss()
-    acc = mx.metric.Accuracy()
-    est = estimator.Estimator(net, loss=ce_loss, metrics=acc)
-    custom_handler = CustomStopHandler(3, 2)
-    est.fit(test_data, event_handlers=[custom_handler], epochs=3)
-    assert custom_handler.num_batch == 3
-    assert custom_handler.num_epoch == 1
-    custom_handler = CustomStopHandler(100, 5)
-    est.fit(test_data, event_handlers=[custom_handler], epochs=10)
-    assert custom_handler.num_batch == 5 * 4
-    assert custom_handler.num_epoch == 5
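Taken together, these tests pin down the handler contract of the reverted API: a callback may return a truthy value to stop training, as CustomStopHandler does. As a final illustration, a hedged sketch of early stopping in the direction-inferring 'auto' mode, reusing `est`, `acc`, and `test_data` from the tests above; the epoch count is illustrative:

    # Stop once accuracy fails to improve for 2 consecutive epochs.
    stop = event_handler.EarlyStoppingHandler(monitor=acc, patience=2, mode='auto')
    est.fit(test_data, event_handlers=[stop], epochs=20)
    print('stopped at epoch', stop.stopped_epoch)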