diff --git a/.gitignore b/.gitignore
index d62c63f403e9..549726650e43 100644
--- a/.gitignore
+++ b/.gitignore
@@ -55,4 +55,3 @@ Debug
 .dir-locals.el
 __pycache__
 *.pkl
-*
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 8ebcfa896d62..bdebed0b5ae6 100644
--- a/Makefile
+++ b/Makefile
@@ -64,7 +64,7 @@ endif
 #BIN = test/test_threaded_engine test/api_registry_test
 OBJ = narray_function_cpu.o
 # add threaded engine after it is done
-OBJCXX11 = reshape_cpu.o engine.o narray.o c_api.o operator.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o graph_executor.o softmax_cpu.o elementwise_sum_cpu.o pooling_cpu.o convolution_cpu.o
+OBJCXX11 = reshape_cpu.o engine.o narray.o c_api.o operator.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o graph_executor.o softmax_cpu.o elementwise_sum_cpu.o pooling_cpu.o convolution_cpu.o io.o iter_mnist.o
 CUOBJ =
 SLIB = lib/libmxnet.so
 ALIB = lib/libmxnet.a
@@ -105,12 +105,13 @@ convolution_cpu.o: src/operator/convolution.cc
 convolution_gpu.o: src/operator/convolution.cu
 reshape_cpu.o: src/operator/reshape.cc
 reshape_gpu.o: src/operator/reshape.cu
+io.o: src/io/io.cc
+iter_mnist.o: src/io/iter_mnist.cc
 
-lib/libmxnet.a: $(OBJ) $(OBJCXX11) $(CUOBJ)
-lib/libmxnet.so: $(OBJ) $(OBJCXX11) $(CUOBJ)
+lib/libmxnet.a: $(OBJ) $(OBJCXX11) $(CUOBJ) $(LIB_DEP)
+lib/libmxnet.so: $(OBJ) $(OBJCXX11) $(CUOBJ) $(LIB_DEP)
 
 test/test_storage: test/test_storage.cc lib/libmxnet.a
-#test/test_threaded_engine: test/test_threaded_engine.cc api/libmxnet.a
 
 $(BIN) :
 	$(CXX) $(CFLAGS) -std=c++0x -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS)
diff --git a/doc/python/io.md b/doc/python/io.md
new file mode 100644
index 000000000000..7bff6a83e354
--- /dev/null
+++ b/doc/python/io.md
@@ -0,0 +1,12 @@
+Python IO API
+===================
+MXNet handles IO for you by implementing data iterators.
+A data iterator behaves like an iterable class in Python: you can traverse the data with a for loop.
+
+
+IO API Reference
+----------------------
+```eval_rst
+.. automodule:: mxnet.io
+    :members:
+```
diff --git a/include/mxnet/base.h b/include/mxnet/base.h
index 04b634ff8b87..a7a3a8063a92 100644
--- a/include/mxnet/base.h
+++ b/include/mxnet/base.h
@@ -55,8 +55,6 @@ typedef mshadow::TBlob TBlob;
 namespace dmlc {
 // Add a few patches to support TShape in dmlc/parameter.
 DMLC_DECLARE_TYPE_NAME(mxnet::TShape, "Shape(tuple)");
-DMLC_DECLARE_TYPE_NAME(uint32_t, "unsigned int");
-
 
 namespace parameter {
 template<>
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 5802c32cf75c..f5f0a05169ec 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -36,6 +36,8 @@ typedef void *SymbolHandle;
 typedef void *AtomicSymbolHandle;
 /*! \brief handle to an Executor */
 typedef void *ExecutorHandle;
+/*! \brief handle to a DataIterator creator */
+typedef void *DataIterCreator;
 /*! \brief handle to a DataIterator */
 typedef void *DataIterHandle;
 /*!
@@ -452,49 +454,90 @@ MXNET_DLL int MXExecutorBind(SymbolHandle symbol_handle,
 // Part 5: IO Interface
 //--------------------------------------------
 /*!
- * \brief create an data iterator from configs string
- * \param cfg config string that contains the
- *    configuration about the iterator
- * \param out the handle to the iterator
+ * \brief List all the available iterator entries
+ * \param out_size the size of returned iterators
+ * \param out_array the output iterator entries
  * \return 0 when success, -1 when failure happens
  */
-MXNET_DLL int MXIOCreateFromConfig(const char *cfg,
-                                   DataIterHandle *out);
+MXNET_DLL int MXListDataIters(mx_uint *out_size,
+                              DataIterCreator **out_array);
 /*!
- * \brief move iterator to next position
- * \param handle the handle to iterator
- * \param out return value of next
+ * \brief Create an iterator from a creator entry and initialize it
+ *  with the given key/value parameters
+ * \param handle of the iterator creator
+ * \param num_param number of parameters (length of keys and vals)
+ * \param keys parameter keys
+ * \param vals parameter values
+ * \param out resulting iterator
  * \return 0 when success, -1 when failure happens
  */
-MXNET_DLL int MXIONext(DataIterHandle handle,
-                       int *out);
+MXNET_DLL int MXDataIterCreateIter(DataIterCreator handle,
+                               int num_param,
+                               const char **keys,
+                               const char **vals,
+                               DataIterHandle *out);
 /*!
- * \brief call iterator.BeforeFirst
- * \param handle the handle to iterator
+ * \brief Get the detailed information about data iterator.
+ * \param creator the DataIterCreator.
+ * \param name The returned name of the creator.
+ * \param description The returned description of the iterator.
+ * \param num_args Number of arguments.
+ * \param arg_names Name of the arguments.
+ * \param arg_type_infos Type information about the arguments.
+ * \param arg_descriptions Description information about the arguments.
  * \return 0 when success, -1 when failure happens
  */
-MXNET_DLL int MXIOBeforeFirst(DataIterHandle handle);
+MXNET_DLL int MXDataIterGetIterInfo(DataIterCreator creator,
+                                          const char **name,
+                                          const char **description,
+                                          mx_uint *num_args,
+                                          const char ***arg_names,
+                                          const char ***arg_type_infos,
+                                          const char ***arg_descriptions);
 /*!
- * \brief free the handle to the IO module
+ * \brief Free the handle to the IO module
  * \param handle the handle pointer to the data iterator
  * \return 0 when success, -1 when failure happens
  */
-MXNET_DLL int MXIOFree(DataIterHandle handle);
+MXNET_DLL int MXDataIterFree(DataIterHandle handle);
+/*!
+ * \brief get the name of iterator entry
+ * \param iter iterator entry
+ * \param out_name the name of the iterator
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXDataIterGetName(DataIterCreator iter,
+                            const char **out_name);
+/*!
+ * \brief Move iterator to next position
+ * \param handle the handle to iterator
+ * \param out return value of next
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXDataIterNext(DataIterHandle handle,
+                       int *out);
+/*!
+ * \brief Call iterator.BeforeFirst to reset the iterator
+ * \param handle the handle to iterator
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXDataIterBeforeFirst(DataIterHandle handle);
+
 /*!
- * \brief get the handle to the NArray of underlying data
+ * \brief Get the handle to the NArray of underlying data
  * \param handle the handle pointer to the data iterator
  * \param out handle to underlying data NArray
  * \return 0 when success, -1 when failure happens
  */
-MXNET_DLL int MXIOGetData(DataIterHandle handle,
+MXNET_DLL int MXDataIterGetData(DataIterHandle handle,
                           NArrayHandle *out);
 /*!
- * \brief get the handle to the NArray of underlying label
+ * \brief Get the handle to the NArray of underlying label
  * \param handle the handle pointer to the data iterator
  * \param out the handle to underlying label NArray
  * \return 0 when success, -1 when failure happens
  */
-MXNET_DLL int MXIOGetLabel(DataIterHandle handle,
+MXNET_DLL int MXDataIterGetLabel(DataIterHandle handle,
                            NArrayHandle *out);
 
 #endif  // MXNET_C_API_H_
diff --git a/include/mxnet/io.h b/include/mxnet/io.h
new file mode 100644
index 000000000000..47a59eec54fe
--- /dev/null
+++ b/include/mxnet/io.h
@@ -0,0 +1,113 @@
+/*!
+ *  Copyright (c) 2015 by Contributors
+ * \file io.h
+ * \brief mxnet io data structure and data iterator
+ */
+#ifndef MXNET_IO_H_
+#define MXNET_IO_H_
+#include <dmlc/data.h>
+#include <dmlc/registry.h>
+#include <vector>
+#include <string>
+#include <utility>
+#include "./base.h"
+
+namespace mxnet {
+/*!
+ * \brief iterator interface, extends dmlc::DataIter with kwargs based Init
+ * \tparam DType data type
+ */
+template<typename DType>
+class IIterator : public dmlc::DataIter<DType> {
+ public:
+  /*!
+   * \brief set the parameters and init iter
+   * \param kwargs key-value pairs
+   */
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) = 0;
+  /*! \brief reset the iterator */
+  virtual void BeforeFirst(void) = 0;
+  /*! \brief move to next item */
+  virtual bool Next(void) = 0;
+  /*! \brief get current data */
+  virtual const DType &Value(void) const = 0;
+  /*! \brief destructor */
+  virtual ~IIterator(void) {}
+  /*! \brief store the name of each data, it could be used for making NArrays */
+  std::vector<std::string> data_names;
+  /*! \brief append one name to data_names (taken by reference, no extra copy) */
+  inline void SetDataName(const std::string &data_name) {
+    data_names.push_back(data_name);
+  }
+};  // class IIterator
+
+/*! \brief a single data instance: an index, its TBlob contents and an extra string */
+struct DataInst {
+  /*! \brief unique id for instance */
+  unsigned index;
+  /*! \brief content of the instance, stored as a list of TBlobs */
+  std::vector<TBlob> data;
+  /*! \brief extra data to be fed to the network */
+  std::string extra_data;
+};  // struct DataInst
+
+/*!
+ * \brief a standard batch of data commonly used by iterators.
+ *      A DataBatch holds several TBlobs in `data`. The struct itself makes
+ *      no difference between data and label; how each TBlob is used is
+ *      up to the consumer of the batch.
+ */
+struct DataBatch {
+ public:
+  /*! \brief unique id for instance, can be NULL, sometimes is useful */
+  unsigned *inst_index;
+  /*! \brief number of instance */
+  mshadow::index_t batch_size;
+  /*!
+   * \brief number of padding elements in this batch: the last elements of
+   *  the batch are only padded up to match the batch, and should be discarded
+   */
+  mshadow::index_t num_batch_padd;
+  /*! \brief content of dense data, if this DataBatch is dense */
+  std::vector<TBlob> data;
+  /*! \brief extra data to be fed to the network */
+  std::string extra_data;
+ public:
+  /*! \brief constructor: no instance index, zero batch size and zero padding */
+  DataBatch(void)
+      : inst_index(NULL), batch_size(0), num_batch_padd(0) {
+  }
+  /*! \brief giving name to the data */
+  void Naming(std::vector<std::string> names);
+};  // struct DataBatch
+
+/*! \brief factory function type: takes no argument, returns an IIterator<DataBatch> pointer */
+typedef IIterator<DataBatch> *(*DataIteratorFactory)();
+/*!
+ * \brief Registry entry for DataIterator factory functions (see MXNET_REGISTER_IO_ITER).
+ */
+struct DataIteratorReg
+    : public dmlc::FunctionRegEntryBase<DataIteratorReg,
+                                        DataIteratorFactory> {
+};
+//--------------------------------------------------------------
+// The following part is the API registration of iterators
+//--------------------------------------------------------------
+/*!
+ * \brief Macro to register Iterators; usable from any namespace
+ *
+ * \code
+ * // example of registering a mnist iterator
+ * MXNET_REGISTER_IO_ITER(MNIST, MNISTIterator)
+ * .describe("Mnist data iterator");
+ *
+ * \endcode
+ */
+#define MXNET_REGISTER_IO_ITER(name, DataIteratorType)          \
+  static ::mxnet::IIterator< ::mxnet::DataBatch>* __create__ ## DataIteratorType ## __() { \
+    return new DataIteratorType;                                    \
+  }                                                                     \
+  DMLC_REGISTRY_REGISTER(::mxnet::DataIteratorReg, DataIteratorReg, name) \
+  .set_body(__create__ ## DataIteratorType ## __)
+}  // namespace mxnet
+#endif  // MXNET_IO_H_
diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py
index c7720dcbd935..a8632bfa2ff8 100644
--- a/python/mxnet/__init__.py
+++ b/python/mxnet/__init__.py
@@ -12,6 +12,7 @@
 from .base import MXNetError
 from . import narray
 from . import symbol
+from . import io
 
 __version__ = "0.1.0"
 
diff --git a/python/mxnet/base.py b/python/mxnet/base.py
index 6cf8c616f805..ec9d43dc58aa 100644
--- a/python/mxnet/base.py
+++ b/python/mxnet/base.py
@@ -75,7 +75,8 @@ def _load_lib():
 SymbolCreatorHandle = ctypes.c_void_p
 SymbolHandle = ctypes.c_void_p
 ExecutorHandle = ctypes.c_void_p
-
+DataIterCreatorHandle = ctypes.c_void_p
+DataIterHandle = ctypes.c_void_p
 #----------------------------
 # helper function definition
 #----------------------------
diff --git a/python/mxnet/io.py b/python/mxnet/io.py
new file mode 100644
index 000000000000..58dbe6e6f9a3
--- /dev/null
+++ b/python/mxnet/io.py
@@ -0,0 +1,172 @@
+# coding: utf-8
+
+"""Data iterator (IO) interface of mxnet"""
+from __future__ import absolute_import
+
+import ctypes
+import sys
+from .base import _LIB
+from .base import c_array, c_str, mx_uint, py_str
+from .base import DataIterHandle, NArrayHandle
+from .base import check_call
+from .narray import NArray
+
+class DataIter(object):
+    """Python wrapper around the handle of a C++ data iterator."""
+
+    def __init__(self, handle):
+        """Initialize with handle
+
+        Parameters
+        ----------
+        handle : DataIterHandle
+            the handle to the underlying C++ Data Iterator
+        """
+        self.handle = handle
+
+    def __del__(self):
+        check_call(_LIB.MXDataIterFree(self.handle))  # release the C++ iterator
+
+    def __iter__(self):
+        """Return self, so the object can be used directly in a for loop.
+
+        """
+        return self
+
+    def reset(self):
+        """Reset the iterator by calling MXDataIterBeforeFirst.
+
+        """
+        check_call(_LIB.MXDataIterBeforeFirst(self.handle))
+
+    def next(self):
+        """Advance to the next batch and return it.
+
+        Returns
+        -------
+        tuple (data, label) of NArray; raises StopIteration when no batch is left
+        """
+        next_res = ctypes.c_int(0)
+        check_call(_LIB.MXDataIterNext(self.handle, ctypes.byref(next_res)))
+        if next_res.value:
+            return self.getdata(), self.getlabel()
+        else:
+            raise StopIteration
+
+    # make it work for both python2 and 3
+    __next__ = next
+
+    def iter_next(self):
+        """Advance to the next batch without fetching its content.
+
+        Returns
+        -------
+        the int written by MXDataIterNext: non-zero if a batch is available, else 0
+        """
+        next_res = ctypes.c_int(0)
+        check_call(_LIB.MXDataIterNext(self.handle, ctypes.byref(next_res)))
+        return next_res.value
+
+    def getdata(self):
+        """Return the data of the current batch as an NArray.
+
+        """
+        hdl = NArrayHandle()
+        check_call(_LIB.MXDataIterGetData(self.handle, ctypes.byref(hdl)))
+        return NArray(hdl)
+
+    def getlabel(self):
+        """Return the label of the current batch as an NArray.
+
+        """
+        hdl = NArrayHandle()
+        check_call(_LIB.MXDataIterGetLabel(self.handle, ctypes.byref(hdl)))
+        return NArray(hdl)
+
+def _make_io_iterator(handle):
+    """Create an io iterator by handle."""
+    name = ctypes.c_char_p()
+    desc = ctypes.c_char_p()
+    num_args = mx_uint()
+    arg_names = ctypes.POINTER(ctypes.c_char_p)()
+    arg_types = ctypes.POINTER(ctypes.c_char_p)()
+    arg_descs = ctypes.POINTER(ctypes.c_char_p)()
+
+    check_call(_LIB.MXDataIterGetIterInfo( \
+            handle, ctypes.byref(name), ctypes.byref(desc), \
+            ctypes.byref(num_args), \
+            ctypes.byref(arg_names), \
+            ctypes.byref(arg_types), \
+            ctypes.byref(arg_descs)))
+    iter_name = py_str(name.value)
+    param_str = []
+    for i in range(num_args.value):
+        ret = '%s : %s' % (arg_names[i], arg_types[i])
+        if len(arg_descs[i]) != 0:
+            ret += '\n    ' + py_str(arg_descs[i])
+        param_str.append(ret)
+
+    doc_str = ('%s\n\n' +
+               'Parameters\n' +
+               '----------\n' +
+               '%s\n' +
+               'name : string, required.\n' +
+               '    Name of the resulting data iterator.\n\n' +
+               'Returns\n' +
+               '-------\n' +
+               'iterator: Iterator\n'+
+               '    The result iterator.')
+    doc_str = doc_str % (desc.value, '\n'.join(param_str))
+
+    def creator(*args, **kwargs):
+        """Create an iterator.
+        The parameters listed below can be passed in as keyword arguments.
+
+        Parameters
+        ----------
+        name : string, required.
+            Name of the resulting data iterator.
+
+        Returns
+        -------
+        dataiter: Dataiter
+            the resulting data iterator
+        """
+        param_keys = []
+        param_vals = []
+
+        for k, val in kwargs.items():
+            param_keys.append(c_str(k))
+            param_vals.append(c_str(str(val)))
+        # create atomic symbol
+        param_keys = c_array(ctypes.c_char_p, param_keys)
+        param_vals = c_array(ctypes.c_char_p, param_vals)
+        iter_handle = DataIterHandle()
+        check_call(_LIB.MXDataIterCreateIter(
+            handle, len(param_keys),
+            param_keys, param_vals,
+            ctypes.byref(iter_handle)))
+
+        if len(args):
+            raise TypeError('%s can only accept keyword arguments' % iter_name)
+
+        return DataIter(iter_handle)
+
+    creator.__name__ = iter_name
+    creator.__doc__ = doc_str
+    return creator
+
+
+def _init_io_module():
+    """List and add all the data iterators to current module."""
+    plist = ctypes.POINTER(ctypes.c_void_p)()
+    size = ctypes.c_uint()
+    check_call(_LIB.MXListDataIters(ctypes.byref(size), ctypes.byref(plist)))
+    module_obj = sys.modules[__name__]
+    for i in range(size.value):
+        hdl = ctypes.c_void_p(plist[i])
+        dataiter = _make_io_iterator(hdl)
+        setattr(module_obj, dataiter.__name__, dataiter)
+
+# Initialize the io in startups
+_init_io_module()
diff --git a/python/test_io.py b/python/test_io.py
new file mode 100644
index 000000000000..d15d4cc32fcd
--- /dev/null
+++ b/python/test_io.py
@@ -0,0 +1,21 @@
+#pylint: skip-file
+import mxnet as mx
+import numpy as np
+import os
+
+dataiter = mx.io.MNISTIterator(path_img="/home/tianjun/data/mnist/train-images-idx3-ubyte",
+        path_label="/home/tianjun/data/mnist/train-labels-idx1-ubyte",
+        batch_size=100, shuffle=1, silent=1, input_flat="flat")
+
+dataiter.beforefirst()
+
+idx = 0
+while dataiter.next():
+    info = "Batch %d" % (idx)
+    idx += 1
+    print info
+    '''
+    label = dataiter.getlabel()
+    print label.numpy
+    '''
+
diff --git a/src/c_api.cc b/src/c_api.cc
index b251ba578743..0d496c3855bf 100644
--- a/src/c_api.cc
+++ b/src/c_api.cc
@@ -9,6 +9,7 @@
 #include <mxnet/narray.h>
 #include <mxnet/symbolic.h>
 #include <mxnet/operator.h>
+#include <mxnet/io.h>
 #include <mxnet/c_api.h>
 #include <vector>
 #include <sstream>
@@ -609,3 +610,89 @@ int MXExecutorBind(SymbolHandle symbol_handle,
   *out = Executor::Bind(*symb, ctx, in_args_vec, arg_grad_vec, grad_req_vec);
   API_END();
 }
+
+//--------------------------------------------
+// Part 5: IO Interface
+//--------------------------------------------
+// List all registered data iterator entries. *out_array points directly at
+// the registry's own list (no copy is made).
+int MXListDataIters(mx_uint *out_size,
+                    DataIterCreator **out_array) {
+  API_BEGIN();
+  auto &vec = dmlc::Registry<DataIteratorReg>::List();
+  *out_size = static_cast<mx_uint>(vec.size());
+  *out_array = (DataIterCreator*)(dmlc::BeginPtr(vec));  //  NOLINT(*)
+  API_END();
+}
+
+// Report name, description and argument docs of one iterator registry entry.
+int MXDataIterGetIterInfo(DataIterCreator creator,
+                          const char **name,
+                          const char **description,
+                          mx_uint *num_args,
+                          const char ***arg_names,
+                          const char ***arg_type_infos,
+                          const char ***arg_descriptions) {
+  DataIteratorReg *e = static_cast<DataIteratorReg *>(creator);
+  return MXAPIGetFunctionRegInfo(e, name, description, num_args,
+                                 arg_names, arg_type_infos, arg_descriptions);
+}
+
+// Create the iterator behind `creator`, Init() it with the key/value pairs
+// and call BeforeFirst(). `delete iter` is the cleanup passed to the error macro.
+int MXDataIterCreateIter(DataIterCreator creator,
+                         int num_param,
+                         const char **keys,
+                         const char **vals,
+                         DataIterHandle *out) {
+  IIterator<DataBatch> *iter = nullptr;
+  API_BEGIN();
+  DataIteratorReg *e = static_cast<DataIteratorReg *>(creator);
+  iter = e->body();
+  std::vector<std::pair<std::string, std::string> > kwargs;
+  for (int i = 0; i < num_param; ++i) {
+    kwargs.push_back({std::string(keys[i]), std::string(vals[i])});
+  }
+  iter->Init(kwargs);
+  iter->BeforeFirst();
+  *out = iter;
+  API_END_HANDLE_ERROR(delete iter);
+}
+
+// Delete the iterator behind `handle`.
+int MXDataIterFree(DataIterHandle handle) {
+  API_BEGIN();
+  delete static_cast<IIterator<DataBatch> *>(handle);
+  API_END();
+}
+
+// Reset the iterator by calling BeforeFirst().
+int MXDataIterBeforeFirst(DataIterHandle handle) {
+  API_BEGIN();
+  static_cast<IIterator<DataBatch>* >(handle)->BeforeFirst();
+  API_END();
+}
+
+// Advance the iterator; *out receives the result of Next().
+int MXDataIterNext(DataIterHandle handle, int *out) {
+  API_BEGIN();
+  *out = static_cast<IIterator<DataBatch>* >(handle)->Next();
+  API_END();
+}
+
+// Return a new NArray built from data[1] of the current batch.
+// The batch is bound by reference: copying the whole DataBatch is not needed.
+int MXDataIterGetLabel(DataIterHandle handle, NArrayHandle *out) {
+  API_BEGIN();
+  const DataBatch &db = static_cast<IIterator<DataBatch>* >(handle)->Value();
+  *out = new NArray(db.data[1], 0);
+  API_END();
+}
+
+// Return a new NArray built from data[0] of the current batch.
+int MXDataIterGetData(DataIterHandle handle, NArrayHandle *out) {
+  API_BEGIN();
+  const DataBatch &db = static_cast<IIterator<DataBatch>* >(handle)->Value();
+  *out = new NArray(db.data[0], 0);
+  API_END();
+}
diff --git a/src/common/utils.h b/src/common/utils.h
index f55ebc26535f..cf1fd2f1bb36 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -10,12 +10,18 @@
 #include <memory>
 #include <type_traits>
 #include <utility>
+#include <random>
 #endif  // DMLC_USE_CXX11
 
 namespace common {
 
 #if DMLC_USE_CXX11
 
+/*!
+ * \brief Random Engine
+ */
+typedef std::mt19937 RANDOM_ENGINE;
+
 /*!
  * \brief Helper functions.
  */
diff --git a/src/io/inst_vector.h b/src/io/inst_vector.h
new file mode 100644
index 000000000000..1ae734631680
--- /dev/null
+++ b/src/io/inst_vector.h
@@ -0,0 +1,117 @@
+/*!
+ *  Copyright (c) 2015 by Contributors
+ * \file inst_vector.h
+ * \brief holder of a sequence of DataInst in CPU
+ *        that are not necessarily of same shape
+ */
+#ifndef MXNET_IO_INST_VECTOR_H_
+#define MXNET_IO_INST_VECTOR_H_
+#include <dmlc/base.h>
+#include <mshadow/tensor.h>
+#include <vector>
+#include <string>
+#include "./data.h"
+namespace mxnet {
+/*!
+ * \brief tensor vector that can store sequence of tensor
+ *  in a memory compact way, tensors do not have to be of same shape
+ */
+template<int dim, typename DType>
+class TensorVector {
+ public:
+  TensorVector(void) { this->Clear(); }
+  // get i-th tensor; i must be smaller than Size()
+  inline mshadow::Tensor<cpu, dim, DType>
+  operator[](size_t i) const {
+    // compare against shape_.size(): `i + 1` wraps to 0 for the largest
+    // size_t (e.g. Back() on an empty vector) and would pass the old check
+    CHECK(i < shape_.size());
+    CHECK(shape_[i].Size() == offset_[i + 1] - offset_[i]);
+    return mshadow::Tensor<cpu, dim, DType>
+        (const_cast<DType*>(dmlc::BeginPtr(content_)) + offset_[i], shape_[i]);
+  }
+  inline mshadow::Tensor<cpu, dim, DType> Back() const {
+    return (*this)[Size() - 1];
+  }
+  inline size_t Size(void) const {
+    return shape_.size();
+  }
+  // push a tensor of certain shape: records the shape and grows the content
+  // buffer by shape.Size() elements; access the new tensor through Back()
+  inline void Push(mshadow::Shape<dim> shape) {
+    shape_.push_back(shape);
+    offset_.push_back(offset_.back() + shape.Size());
+    content_.resize(offset_.back());
+  }
+  inline void Clear(void) {
+    offset_.clear();
+    offset_.push_back(0);
+    content_.clear();
+    shape_.clear();
+  }
+
+ private:
+  // offset of the data content; always holds Size() + 1 entries, starting at 0
+  std::vector<size_t> offset_;
+  // data content
+  std::vector<DType> content_;
+  // shape of data
+  std::vector<mshadow::Shape<dim> > shape_;
+};
+
+/*!
+ * \brief tblob vector that can store sequence of tblob
+ *  in a memory compact way, tblobs do not have to be of same shape
+ */
+template<typename DType>
+class TBlobVector {
+ public:
+  TBlobVector(void) {
+    this->Clear();
+  }
+  // get i-th tblob
+  inline TBlob operator[](size_t i) const;
+  // get the last tblob
+  inline TBlob Back();
+  // return the size of the vector
+  inline size_t Size(void) const;
+  // push a tblob of certain shape (returns nothing)
+  // NOTE(review): the member functions are only declared in this header - confirm where they are defined
+  inline void Push(TShape shape_);
+  inline void Clear(void);
+ private:
+  // offset of the data content
+  std::vector<size_t> offset_;
+  // data content
+  std::vector<DType> content_;
+  // shape of data
+  std::vector<TShape > shape_;
+};
+
+/*!
+ * \brief instance vector that can holds
+ * non-uniform shape data instance in a shape efficient way
+ */
+class InstVector {
+ public:
+  inline size_t Size(void) const {
+    return index_.size();
+  }
+  // instance
+  inline DataInst operator[](size_t i) const;
+  // get back of instance vector
+  inline DataInst Back() const;
+  // clear the container
+  inline void Clear(void);
+  // push the newly coming instance
+  inline void Push(unsigned index, TBlob data_);
+
+ private:
+  /*! \brief index of the data */
+  std::vector<unsigned> index_;
+  // data
+  std::vector<TensorVector<real_t> > data_;
+  // extra data
+  std::vector<std::string> extra_data_;
+};
+#endif  // MXNET_IO_INST_VECTOR_H_
diff --git a/src/io/io.cc b/src/io/io.cc
new file mode 100644
index 000000000000..bd5b78dda643
--- /dev/null
+++ b/src/io/io.cc
@@ -0,0 +1,10 @@
+// Copyright (c) 2015 by Contributors
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+
+#include <mxnet/io.h>
+#include <dmlc/registry.h>
+
+namespace dmlc {
+DMLC_REGISTRY_ENABLE(::mxnet::DataIteratorReg);
+}  // namespace dmlc
diff --git a/src/io/iter_mnist.cc b/src/io/iter_mnist.cc
new file mode 100644
index 000000000000..93195061b278
--- /dev/null
+++ b/src/io/iter_mnist.cc
@@ -0,0 +1,207 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file iter_mnist.cc
+ * \brief register mnist iterator
+ * \author Tianjun Xiao
+*/
+#include <mxnet/io.h>
+#include <mxnet/base.h>
+#include <dmlc/io.h>
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <string>
+#include <vector>
+#include <utility>
+#include <map>
+#include "../common/utils.h"
+
+namespace mxnet {
+namespace io {
+// Define mnist io parameters
+struct MNISTParam : public dmlc::Parameter<MNISTParam> {
+  /*! \brief paths of the image file and the label file */
+  std::string image, label;
+  /*! \brief whether to do shuffle */
+  bool shuffle;
+  /*! \brief whether to suppress the data info log */
+  bool silent;
+  /*! \brief batch size */
+  int batch_size;
+  /*! \brief whether to flatten each image into 1D */
+  bool flat;
+  /*! \brief random seed */
+  int seed;
+  // declare each field with its default value and description
+  DMLC_DECLARE_PARAMETER(MNISTParam) {
+    DMLC_DECLARE_FIELD(image).set_default("./train-images-idx3-ubyte")
+        .describe("Mnist image path.");
+    DMLC_DECLARE_FIELD(label).set_default("./train-labels-idx1-ubyte")
+        .describe("Mnist label path.");
+    DMLC_DECLARE_FIELD(batch_size).set_lower_bound(1).set_default(128)
+        .describe("Batch Size.");
+    DMLC_DECLARE_FIELD(shuffle).set_default(true)
+        .describe("Whether to shuffle data.");
+    DMLC_DECLARE_FIELD(flat).set_default(false)
+        .describe("Whether to flat the data into 1D.");
+    DMLC_DECLARE_FIELD(silent).set_default(false)
+        .describe("Whether to suppress the data info printout.");
+    DMLC_DECLARE_FIELD(seed).set_default(0)
+        .describe("Random Seed.");
+  }
+};
+
+/*! \brief iterator that keeps the whole MNIST set in memory and serves fixed-size batches */
+class MNISTIter: public IIterator<DataBatch> {
+ public:
+  MNISTIter(void) {
+    img_.dptr_ = NULL;
+    inst_offset_ = 0;
+    loc_ = 0;  // defined start position even if Next() precedes BeforeFirst()
+    out_.data.resize(2);
+  }
+  virtual ~MNISTIter(void) {
+    if (img_.dptr_ != NULL) delete []img_.dptr_;
+  }
+  // initialize the iterator: parse kwargs, then load images and labels into memory
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    std::map<std::string, std::string> kmap(kwargs.begin(), kwargs.end());
+    param_.Init(kmap);
+    this->LoadImage();
+    this->LoadLabel();
+    CHECK_EQ(labels_.size(), img_.size(0)) << "MNISTIter: image and label counts differ";
+    // set name; NOTE(review): "label" is also passed to SetDataName - confirm intended
+    this->SetDataName(std::string("data"));
+    this->SetDataName(std::string("label"));
+    if (param_.flat) {
+      batch_data_.shape_ = mshadow::Shape4(param_.batch_size, 1, 1, img_.size(1) * img_.size(2));
+    } else {
+      batch_data_.shape_ = mshadow::Shape4(param_.batch_size, 1, img_.size(1), img_.size(2));
+    }
+    out_.inst_index = NULL;
+    batch_label_.shape_ = mshadow::Shape2(param_.batch_size, 1);
+    batch_label_.stride_ = 1;
+    batch_data_.stride_ = batch_data_.size(3);
+    out_.batch_size = param_.batch_size;
+    if (param_.shuffle) this->Shuffle();
+    if (!param_.silent) {
+      mshadow::TShape s;
+      s = batch_data_.shape_;
+      if (param_.flat) {
+        LOG(INFO) << "MNISTIter: load " << (unsigned)img_.size(0) << " images, shuffle="
+            << param_.shuffle << ", shape=" << s.FlatTo2D();
+      } else {
+        LOG(INFO) << "MNISTIter: load " << (unsigned)img_.size(0) << " images, shuffle="
+            << param_.shuffle << ", shape=" << s;
+      }
+    }
+  }
+  virtual void BeforeFirst(void) {
+    this->loc_ = 0;
+  }
+  virtual bool Next(void) {
+    // only full batches are served; a trailing partial batch is dropped
+    if (loc_ + param_.batch_size > img_.size(0)) return false;
+    batch_data_.dptr_ = img_[loc_].dptr_;
+    batch_label_.dptr_ = &labels_[loc_];
+    if (param_.flat)
+      out_.data[0] = TBlob(batch_data_.FlatTo2D());
+    else
+      out_.data[0] = TBlob(batch_data_);
+    out_.data[1] = TBlob(batch_label_);
+    out_.inst_index = &inst_[loc_];
+    loc_ += param_.batch_size;
+    return true;
+  }
+  virtual const DataBatch &Value(void) const {
+    return out_;
+  }
+
+ private:
+  inline void LoadImage(void) {
+    dmlc::Stream *stdimg = dmlc::Stream::Create(param_.image.c_str(), "r");
+    ReadInt(stdimg);  // first header word, value unused
+    int image_count = ReadInt(stdimg);
+    int image_rows  = ReadInt(stdimg);
+    int image_cols  = ReadInt(stdimg);
+
+    img_.shape_ = mshadow::Shape3(image_count, image_rows, image_cols);
+    img_.stride_ = img_.size(2);
+
+    // allocate one contiguous buffer that holds every image
+    img_.dptr_ = new float[img_.MSize()];
+    for (int i = 0; i < image_count; ++i) {
+      for (int j = 0; j < image_rows; ++j) {
+        for (int k = 0; k < image_cols; ++k) {
+          unsigned char ch;
+          CHECK(stdimg->Read(&ch, sizeof(ch)) != 0);
+          img_[i][j][k] = ch;
+        }
+      }
+    }
+    // scale the byte values by 1/256 so they fall in [0, 1)
+    img_ *= 1.0f / 256.0f;
+    delete stdimg;
+  }
+  inline void LoadLabel(void) {
+    dmlc::Stream *stdlabel = dmlc::Stream::Create(param_.label.c_str(), "r");
+    ReadInt(stdlabel);  // first header word, value unused
+    int labels_count = ReadInt(stdlabel);
+    labels_.resize(labels_count);
+    for (int i = 0; i < labels_count; ++i) {
+      unsigned char ch;
+      CHECK(stdlabel->Read(&ch, sizeof(ch)) != 0);
+      labels_[i] = ch;
+      inst_.push_back((unsigned)i + inst_offset_);
+    }
+    delete stdlabel;
+  }
+  inline void Shuffle(void) {
+    std::shuffle(inst_.begin(), inst_.end(), common::RANDOM_ENGINE(kRandMagic+param_.seed));
+    std::vector<float> tmplabel(labels_.size());
+    mshadow::TensorContainer<cpu, 3> tmpimg(img_.shape_);
+    for (size_t i = 0; i < inst_.size(); ++i) {
+      unsigned ridx = inst_[i] - inst_offset_;
+      mshadow::Copy(tmpimg[i], img_[ridx]);
+      tmplabel[i] = labels_[ridx];
+    }
+    // copy back
+    mshadow::Copy(img_, tmpimg);
+    labels_ = tmplabel;
+  }
+
+  // read 4 bytes from the stream and combine them most significant byte first
+  inline static int ReadInt(dmlc::Stream *fi) {
+    unsigned char buf[4];
+    CHECK(fi->Read(buf, sizeof(buf)) == sizeof(buf))
+        << "invalid mnist format";
+    return static_cast<int>(buf[0] << 24 | buf[1] << 16 | buf[2] << 8 | buf[3]);
+  }
+
+  /*! \brief MNIST iter params */
+  MNISTParam param_;
+  /*! \brief output */
+  DataBatch out_;
+  /*! \brief current location */
+  index_t loc_;
+  /*! \brief image content */
+  mshadow::Tensor<cpu, 3> img_;
+  /*! \brief label content */
+  std::vector<float> labels_;
+  /*! \brief batch data tensor */
+  mshadow::Tensor<cpu, 4> batch_data_;
+  /*! \brief batch label tensor  */
+  mshadow::Tensor<cpu, 2> batch_label_;
+  /*! \brief instance index offset */
+  unsigned inst_offset_;
+  /*! \brief instance index */
+  std::vector<unsigned> inst_;
+  // magic number to setup randomness
+  static const int kRandMagic = 0;
+};  // class MNISTIter
+
+DMLC_REGISTER_PARAMETER(MNISTParam);
+MXNET_REGISTER_IO_ITER(MNISTIter, MNISTIter)
+    .describe("Create iterator for MNIST hand-written digit number recognition dataset.")
+    .add_arguments(MNISTParam::__FIELDS__());
+}  // namespace io
+}  // namespace mxnet
diff --git a/tests/python/get_data.py b/tests/python/get_data.py
new file mode 100644
index 000000000000..82d25d9072fb
--- /dev/null
+++ b/tests/python/get_data.py
@@ -0,0 +1,29 @@
+# pylint: skip-file
+import os, gzip
+import pickle as pickle
+import sys
+
+# download mnist.pkl.gz into data/ (via wget) unless it is already there
+def GetMNIST_pkl():
+    if not os.path.isdir("data/"):
+        os.makedirs("data/")
+    if not os.path.exists('data/mnist.pkl.gz'):
+        os.system("wget http://deeplearning.net/data/mnist/mnist.pkl.gz -P data/")
+
+# download the ubyte version of mnist into data/ and gunzip it
+def GetMNIST_ubyte():
+    """Fetch every MNIST idx file that is missing from data/ (wget + gunzip)."""
+    if not os.path.isdir("data/"):
+        os.makedirs("data/")
+    # idx files to fetch, each one is published as <name>.gz
+    names = (
+        'train-images-idx3-ubyte',
+        'train-labels-idx1-ubyte',
+        't10k-images-idx3-ubyte',
+        't10k-labels-idx1-ubyte',
+    )
+    for name in names:
+        if not os.path.exists('data/' + name):
+            os.system("wget http://yann.lecun.com/exdb/mnist/%s.gz -P data/" % name)
+            os.system("gunzip data/%s.gz" % name)
+
diff --git a/tests/python/test_conv.py b/tests/python/test_conv.py
index a456b19982f8..0604476d4bb5 100644
--- a/tests/python/test_conv.py
+++ b/tests/python/test_conv.py
@@ -3,64 +3,12 @@
 import numpy as np
 import os, pickle, gzip
 import sys
-
+import get_data
 
 def CalAcc(out, label):
     pred = np.argmax(out, axis=1)
     return np.sum(pred == label) * 1.0 / out.shape[0]
 
-def IgnorePython3():
-    if sys.version_info[0] >= 3:
-        # TODO(tianjun): use IO instead of pickle
-        # Python3 pickle is not able to load data correctly
-        sys.exit(0)
-
-
-# load data
-class MNISTIter(object):
-    def __init__(self, which_set, batch_size=100, flatten=True):
-        if not os.path.exists('mnist.pkl.gz'):
-            os.system("wget http://deeplearning.net/data/mnist/mnist.pkl.gz")
-        f = gzip.open('mnist.pkl.gz', 'rb')
-        IgnorePython3()
-        train_set, valid_set, test_set = pickle.load(f)
-        f.close()
-        if which_set == 'train':
-            self.data = train_set[0]
-            self.label = np.asarray(train_set[1])
-        elif which_set == 'valid':
-            self.data = valid_set[0]
-            self.label = np.asarray(valid_set[1])
-        else:
-            self.data = test_set[0]
-            self.data = np.asarray(test_set[1])
-        self.flatten = flatten
-        self.batch_size = batch_size
-        self.nbatch = self.data.shape[0] / batch_size
-        assert(self.data.shape[0] % batch_size == 0) # I am lazy
-        self.now_idx = -1
-    def BeforeFirst(self):
-        self.now_idx = -1
-    def Next(self):
-        self.now_idx += 1
-        if self.now_idx == self.nbatch:
-            return False
-        return True
-    def Get(self):
-        if self.now_idx < 0:
-            raise Exception("Iterator is at head")
-        elif self.now_idx >= self.nbatch:
-            raise Exception("Iterator is at end")
-        start = self.now_idx * self.batch_size
-        end = (self.now_idx + 1) * self.batch_size
-        if self.flatten:
-            return (self.data[start:end, :], self.label[start:end])
-        else:
-            return (self.data[start:end, :].reshape(batch_size, 1, 28, 28),
-                    self.label[start:end])
-
-
-
 # symbol net
 batch_size = 100
 data = mx.symbol.Variable('data')
@@ -110,26 +58,38 @@ def Get(self):
 def Update(grad, weight):
     weight.numpy[:] -= lr * grad.numpy[:] / batch_size
 
-block = zip(grad_narrays, arg_narrays)
+block = list(zip(grad_narrays, arg_narrays))
 
+# check data
+get_data.GetMNIST_ubyte()
 
-train = MNISTIter("train", batch_size, False)
-valid = MNISTIter("valid", batch_size, False)
+train_dataiter = mx.io.MNISTIter(
+        image="data/train-images-idx3-ubyte",
+        label="data/train-labels-idx1-ubyte",
+        batch_size=batch_size, shuffle=True, silent=False, seed=10)
+val_dataiter = mx.io.MNISTIter(
+        image="data/t10k-images-idx3-ubyte",
+        label="data/t10k-labels-idx1-ubyte",
+        batch_size=batch_size, shuffle=True, silent=False)
 
 def test_mnist():
     acc_train = 0.0
     acc_val = 0.0
-    for i in xrange(epoch):
+    for i in range(epoch):
         # train
         print("Epoch %d" % i)
         train_acc = 0.0
         val_acc = 0.0
-        while train.Next():
-            data, label = train.Get()
+        train_nbatch = 0
+        val_nbatch = 0
+        for data, label in train_dataiter:
+            data = data.numpy
+            label = label.numpy.flatten()
             inputs["data"].numpy[:] = data
             inputs["sm_label"].numpy[:] = label
             executor.forward()
             train_acc += CalAcc(out_narray.numpy, label)
+            train_nbatch += 1
             grad_narray.numpy[:] = out_narray.numpy
             executor.backward([grad_narray])
 
@@ -137,17 +97,19 @@ def test_mnist():
                 Update(grad, weight)
 
         # evaluate
-        while valid.Next():
-            data, label = valid.Get()
+        for data, label in val_dataiter:
+            data = data.numpy
+            label = label.numpy.flatten()
             inputs["data"].numpy[:] = data
             executor.forward()
             val_acc += CalAcc(out_narray.numpy, label)
-        print("Train Acc: ", train_acc / train.nbatch)
-        print("Valid Acc: ", val_acc / valid.nbatch)
-        acc_train = train_acc / train.nbatch
-        acc_val = val_acc / valid.nbatch
-        train.BeforeFirst()
-        valid.BeforeFirst()
+            val_nbatch += 1
+        print("Train Acc: ", train_acc / train_nbatch)
+        print("Valid Acc: ", val_acc / val_nbatch)
+        acc_train = train_acc / train_nbatch
+        acc_val = val_acc / val_nbatch
+        train_dataiter.reset()
+        val_dataiter.reset()
     assert(acc_train > 0.84)
     assert(acc_val > 0.96)
 
diff --git a/tests/python/test_io.py b/tests/python/test_io.py
new file mode 100644
index 000000000000..dfeb3f67c293
--- /dev/null
+++ b/tests/python/test_io.py
@@ -0,0 +1,41 @@
+# pylint: skip-file
+import mxnet as mx
+import numpy as np
+import os, gzip
+import pickle as pickle
+import sys
+import get_data
+
+# fetch the MNIST ubyte files into data/ (if missing) before the iterators are created
+get_data.GetMNIST_ubyte()
+
+batch_size = 100
+train_dataiter = mx.io.MNISTIter(
+        image="data/train-images-idx3-ubyte",
+        label="data/train-labels-idx1-ubyte",
+        batch_size=batch_size, shuffle=1, flat=1, silent=0, seed=10)
+val_dataiter = mx.io.MNISTIter(
+        image="data/t10k-images-idx3-ubyte",
+        label="data/t10k-labels-idx1-ubyte",
+        batch_size=batch_size, shuffle=0, flat=1, silent=0)
+
+def test_MNISTIter_loop():
+    nbatch = 60000 // batch_size  # floor division: the expected count stays an int on Python 3
+    batch_count = 0
+    for data, label in train_dataiter:
+        batch_count += 1
+    assert(nbatch == batch_count)
+
+def test_MNISTIter_reset():
+    """reset() must rewind the iterator so that the first batch is served again."""
+    train_dataiter.reset()
+    train_dataiter.iter_next()
+    label_0 = train_dataiter.getlabel().numpy.flatten()
+    for _ in range(4):
+        train_dataiter.iter_next()
+    train_dataiter.reset()
+    train_dataiter.iter_next()
+    label_1 = train_dataiter.getlabel().numpy.flatten()
+    # compare element-wise: a plain sum of differences can cancel out to zero
+    assert(np.array_equal(label_0, label_1))
+
diff --git a/tests/python/test_mlp.py b/tests/python/test_mlp.py
index 4770c19b9136..8a84d50536c3 100644
--- a/tests/python/test_mlp.py
+++ b/tests/python/test_mlp.py
@@ -4,61 +4,12 @@
 import os, gzip
 import pickle as pickle
 import sys
+import get_data
+
 def CalAcc(out, label):
     pred = np.argmax(out, axis=1)
     return np.sum(pred == label) * 1.0 / out.shape[0]
 
-def IgnorePython3():
-    if sys.version_info[0] >= 3:
-        # TODO(tianjun): use IO instead of pickle
-        # Python3 pickle is not able to load data correctly
-        sys.exit(0)
-
-
-# load data
-class MNISTIter(object):
-    def __init__(self, which_set, batch_size=100, flatten=True):
-        if not os.path.exists('mnist.pkl.gz'):
-            os.system("wget http://deeplearning.net/data/mnist/mnist.pkl.gz")
-        f = gzip.open('mnist.pkl.gz', 'rb')
-        IgnorePython3()
-        train_set, valid_set, test_set = pickle.load(f)
-        f.close()
-        if which_set == 'train':
-            self.data = train_set[0]
-            self.label = np.asarray(train_set[1])
-        elif which_set == 'valid':
-            self.data = valid_set[0]
-            self.label = np.asarray(valid_set[1])
-        else:
-            self.data = test_set[0]
-            self.data = np.asarray(test_set[1])
-        self.flatten = flatten
-        self.batch_size = batch_size
-        self.nbatch = self.data.shape[0] / batch_size
-        assert(self.data.shape[0] % batch_size == 0) # I am lazy
-        self.now_idx = -1
-    def BeforeFirst(self):
-        self.now_idx = -1
-    def Next(self):
-        self.now_idx += 1
-        if self.now_idx == self.nbatch:
-            return False
-        return True
-    def Get(self):
-        if self.now_idx < 0:
-            raise Exception("Iterator is at head")
-        elif self.now_idx >= self.nbatch:
-            raise Exception("Iterator is at end")
-        start = self.now_idx * self.batch_size
-        end = (self.now_idx + 1) * self.batch_size
-        if self.flatten:
-            return (self.data[start:end, :], self.label[start:end])
-        else:
-            return (self.data[start:end, :].reshape(batch_size, 1, 28, 28),
-                    self.label[start:end])
-
-
 # symbol net
 batch_size = 100
 data = mx.symbol.Variable('data')
@@ -98,12 +49,19 @@ def Get(self):
 def Update(grad, weight):
     weight.numpy[:] -= lr * grad.numpy[:]  / batch_size
 
-block = zip(grad_narrays, arg_narrays)
-
+block = list(zip(grad_narrays, arg_narrays))
 
+#check data
+get_data.GetMNIST_ubyte()
 
-train = MNISTIter("train", batch_size, True)
-valid = MNISTIter("valid", batch_size, True)
+train_dataiter = mx.io.MNISTIter(
+        image="data/train-images-idx3-ubyte",
+        label="data/train-labels-idx1-ubyte",
+        batch_size=batch_size, shuffle=True, flat=True, silent=False, seed=10)
+val_dataiter = mx.io.MNISTIter(
+        image="data/t10k-images-idx3-ubyte",
+        label="data/t10k-labels-idx1-ubyte",
+        batch_size=batch_size, shuffle=True, flat=True, silent=False)
 
 def test_mlp():
     acc_train = 0.
@@ -113,12 +71,16 @@ def test_mlp():
         print("Epoch %d" % i)
         train_acc = 0.0
         val_acc = 0.0
-        while train.Next():
-            data, label = train.Get()
+        train_nbatch = 0
+        val_nbatch = 0
+        for data, label in train_dataiter:
+            data = data.numpy
+            label = label.numpy.flatten()
             inputs["data"].numpy[:] = data
             inputs["sm_label"].numpy[:] = label
             executor.forward()
             train_acc += CalAcc(out_narray.numpy, label)
+            train_nbatch += 1
             grad_narray.numpy[:] = out_narray.numpy
             executor.backward([grad_narray])
 
@@ -126,17 +88,19 @@ def test_mlp():
                 Update(grad, weight)
 
         # evaluate
-        while valid.Next():
-            data, label = valid.Get()
+        for data, label in val_dataiter:
+            data = data.numpy
+            label = label.numpy.flatten()
             inputs["data"].numpy[:] = data
             executor.forward()
             val_acc += CalAcc(out_narray.numpy, label)
-        acc_train = train_acc / train.nbatch
-        acc_val = val_acc / valid.nbatch
-        print("Train Acc: ", train_acc / train.nbatch)
-        print("Valid Acc: ", val_acc / valid.nbatch)
-        train.BeforeFirst()
-        valid.BeforeFirst()
+            val_nbatch += 1
+        acc_train = train_acc / train_nbatch
+        acc_val = val_acc / val_nbatch
+        print("Train Acc: ", train_acc / train_nbatch)
+        print("Valid Acc: ", val_acc / val_nbatch)
+        train_dataiter.reset()
+        val_dataiter.reset()
     assert(acc_train > 0.98)
     assert(acc_val > 0.97)