diff --git a/.gitignore b/.gitignore index d62c63f403e9..549726650e43 100644 --- a/.gitignore +++ b/.gitignore @@ -55,4 +55,3 @@ Debug .dir-locals.el __pycache__ *.pkl -* \ No newline at end of file diff --git a/Makefile b/Makefile index 8ebcfa896d62..bdebed0b5ae6 100644 --- a/Makefile +++ b/Makefile @@ -64,7 +64,7 @@ endif #BIN = test/test_threaded_engine test/api_registry_test OBJ = narray_function_cpu.o # add threaded engine after it is done -OBJCXX11 = reshape_cpu.o engine.o narray.o c_api.o operator.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o graph_executor.o softmax_cpu.o elementwise_sum_cpu.o pooling_cpu.o convolution_cpu.o +OBJCXX11 = reshape_cpu.o engine.o narray.o c_api.o operator.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o graph_executor.o softmax_cpu.o elementwise_sum_cpu.o pooling_cpu.o convolution_cpu.o io.o iter_mnist.o CUOBJ = SLIB = lib/libmxnet.so ALIB = lib/libmxnet.a @@ -105,12 +105,13 @@ convolution_cpu.o: src/operator/convolution.cc convolution_gpu.o: src/operator/convolution.cu reshape_cpu.o: src/operator/reshape.cc reshape_gpu.o: src/operator/reshape.cu +io.o: src/io/io.cc +iter_mnist.o: src/io/iter_mnist.cc -lib/libmxnet.a: $(OBJ) $(OBJCXX11) $(CUOBJ) -lib/libmxnet.so: $(OBJ) $(OBJCXX11) $(CUOBJ) +lib/libmxnet.a: $(OBJ) $(OBJCXX11) $(CUOBJ) $(LIB_DEP) +lib/libmxnet.so: $(OBJ) $(OBJCXX11) $(CUOBJ) $(LIB_DEP) test/test_storage: test/test_storage.cc lib/libmxnet.a -#test/test_threaded_engine: test/test_threaded_engine.cc api/libmxnet.a $(BIN) : $(CXX) $(CFLAGS) -std=c++0x -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS) diff --git a/doc/python/io.md b/doc/python/io.md new file mode 100644 index 000000000000..7bff6a83e354 --- /dev/null +++ b/doc/python/io.md @@ -0,0 +1,12 @@ +Python IO API +=================== +Mxnet handles IO for you by implementing data iterators. +It is like an iterable class in python, you can traverse the data using a for loop. 
+ + +IO API Reference +---------------------- +```eval_rst +.. automodule:: mxnet.io + :members: +``` diff --git a/include/mxnet/base.h b/include/mxnet/base.h index 04b634ff8b87..a7a3a8063a92 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -55,8 +55,6 @@ typedef mshadow::TBlob TBlob; namespace dmlc { // Add a few patches to support TShape in dmlc/parameter. DMLC_DECLARE_TYPE_NAME(mxnet::TShape, "Shape(tuple)"); -DMLC_DECLARE_TYPE_NAME(uint32_t, "unsigned int"); - namespace parameter { template<> diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 5802c32cf75c..f5f0a05169ec 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -36,6 +36,8 @@ typedef void *SymbolHandle; typedef void *AtomicSymbolHandle; /*! \brief handle to an Executor */ typedef void *ExecutorHandle; +/*! \brief handle to a DataIter creator */ +typedef void *DataIterCreator; /*! \brief handle to a DataIterator */ typedef void *DataIterHandle; /*! @@ -452,49 +454,176 @@ MXNET_DLL int MXExecutorBind(SymbolHandle symbol_handle, // Part 5: IO Interface //-------------------------------------------- /*! - * \brief create an data iterator from configs string - * \param cfg config string that contains the - * configuration about the iterator - * \param out the handle to the iterator + * \brief List all the available iterator entries + * \param out_size the size of returned iterators + * \param out_array the output iterator entries * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXIOCreateFromConfig(const char *cfg, - DataIterHandle *out); +MXNET_DLL int MXListDataIters(mx_uint *out_size, + DataIterCreator **out_array); /*! 
- * \brief move iterator to next position - * \param handle the handle to iterator - * \param out return value of next + * \brief Init an iterator, init with parameters + * the array size of passed in arguments + * \param handle of the iterator creator + * \param num_param number of parameter + * \param keys parameter keys + * \param vals parameter values + * \param out resulting iterator * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXIONext(DataIterHandle handle, - int *out); +MXNET_DLL int MXDataIterCreateIter(DataIterCreator handle, + int num_param, + const char **keys, + const char **vals, + DataIterHandle *out); /*! - * \brief call iterator.BeforeFirst - * \param handle the handle to iterator + * \brief Get the detailed information about data iterator. + * \param creator the DataIterCreator. + * \param name The returned name of the creator. + * \param description The returned description of the symbol. + * \param num_args Number of arguments. + * \param arg_names Name of the arguments. + * \param arg_type_infos Type informations about the arguments. + * \param arg_descriptions Description information about the arguments. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXIOBeforeFirst(DataIterHandle handle); +MXNET_DLL int MXDataIterGetIterInfo(AtomicSymbolCreator creator, + const char **name, + const char **description, + mx_uint *num_args, + const char ***arg_names, + const char ***arg_type_infos, + const char ***arg_descriptions); /*! - * \brief free the handle to the IO module + * \brief Free the handle to the IO module * \param handle the handle pointer to the data iterator * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXIOFree(DataIterHandle handle); +MXNET_DLL int MXDataIterFree(DataIterHandle handle); +/*! 
+ * \brief get the name of iterator entry + * \param iter iterator entry + * \param out_name the name of the iterator + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXDataIterGetName(DataIterCreator iter, + const char **out_name); +/*! + * \brief Init an iterator, init with parameters + * the array size of passed in arguments + * \param handle of the iterator creator + * \param num_param number of parameter + * \param keys parameter keys + * \param vals parameter values + * \param out resulting iterator + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXDataIterCreateIter(DataIterCreator handle, + int num_param, + const char **keys, + const char **vals, + DataIterHandle *out); +/*! + * \brief Get the detailed information about data iterator. + * \param creator the DataIterCreator. + * \param name The returned name of the creator. + * \param description The returned description of the symbol. + * \param num_args Number of arguments. + * \param arg_names Name of the arguments. + * \param arg_type_infos Type informations about the arguments. + * \param arg_descriptions Description information about the arguments. + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXDataIterGetIterInfo(AtomicSymbolCreator creator, + const char **name, + const char **description, + mx_uint *num_args, + const char ***arg_names, + const char ***arg_type_infos, + const char ***arg_descriptions); +/*! + * \brief Free the handle to the IO module + * \param handle the handle pointer to the data iterator + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXDataIterFree(DataIterHandle handle); +/*! + * \brief Get the name of iterator entry + * \param iter iterator entry + * \param out_name the name of the iterator + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXDataIterGetName(DataIterCreator iter, + const char **out_name); +/*! 
+ * \brief Init an iterator, init with parameters + * the array size of passed in arguments + * \param handle of the iterator creator + * \param num_param number of parameter + * \param keys parameter keys + * \param vals parameter values + * \param out resulting iterator + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXDataIterCreateIter(DataIterCreator handle, + int num_param, + const char **keys, + const char **vals, + DataIterHandle *out); +/*! + * \brief Get the detailed information about data iterator. + * \param creator the DataIterCreator. + * \param name The returned name of the creator. + * \param description The returned description of the symbol. + * \param num_args Number of arguments. + * \param arg_names Name of the arguments. + * \param arg_type_infos Type informations about the arguments. + * \param arg_descriptions Description information about the arguments. + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXDataIterGetIterInfo(AtomicSymbolCreator creator, + const char **name, + const char **description, + mx_uint *num_args, + const char ***arg_names, + const char ***arg_type_infos, + const char ***arg_descriptions); +/*! + * \brief Free the handle to the IO module + * \param handle the handle pointer to the data iterator + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXDataIterFree(DataIterHandle handle); +/*! + * \brief Move iterator to next position + * \param handle the handle to iterator + * \param out return value of next + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXDataIterNext(DataIterHandle handle, + int *out); +/*! + * \brief Call iterator.Reset + * \param handle the handle to iterator + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXDataIterBeforeFirst(DataIterHandle handle); + /*! 
- * \brief get the handle to the NArray of underlying data + * \brief Get the handle to the NArray of underlying data * \param handle the handle pointer to the data iterator * \param out handle to underlying data NArray * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXIOGetData(DataIterHandle handle, +MXNET_DLL int MXDataIterGetData(DataIterHandle handle, NArrayHandle *out); /*! - * \brief get the handle to the NArray of underlying label + * \brief Get the handle to the NArray of underlying label * \param handle the handle pointer to the data iterator * \param out the handle to underlying label NArray * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXIOGetLabel(DataIterHandle handle, +MXNET_DLL int MXDataIterGetLabel(DataIterHandle handle, NArrayHandle *out); #endif // MXNET_C_API_H_ diff --git a/include/mxnet/io.h b/include/mxnet/io.h new file mode 100644 index 000000000000..47a59eec54fe --- /dev/null +++ b/include/mxnet/io.h @@ -0,0 +1,113 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file io.h + * \brief mxnet io data structure and data iterator + */ +#ifndef MXNET_IO_H_ +#define MXNET_IO_H_ +#include +#include +#include +#include +#include +#include "./base.h" + +namespace mxnet { +/*! + * \brief iterator type + * \tparam DType data type + */ +template +class IIterator : public dmlc::DataIter { + public: + /*! + * \brief set the parameters and init iter + * \param kwargs key-value pairs + */ + virtual void Init(const std::vector >& kwargs) = 0; + /*! \brief reset the iterator */ + virtual void BeforeFirst(void) = 0; + /*! \brief move to next item */ + virtual bool Next(void) = 0; + /*! \brief get current data */ + virtual const DType &Value(void) const = 0; + /*! \brief destructor */ + virtual ~IIterator(void) {} + /*! \brief store the name of each data, it could be used for making NArrays */ + std::vector data_names; + /*! 
\brief set data name to each attribute of data */ + inline void SetDataName(const std::string data_name){ + data_names.push_back(data_name); + } +}; // class IIterator + +/*! \brief a single data instance */ +struct DataInst { + /*! \brief unique id for instance */ + unsigned index; + /*! \brief content of data */ + std::vector data; + /*! \brief extra data to be fed to the network */ + std::string extra_data; +}; // struct DataInst + +/*! + * \brief a standard batch of data commonly used by iterator + * a DataBatch contains multiple TBlobs. Each TBlob has + * a name stored in a map. There is no difference between + * data and label; how they are used is up to the DNN implementation. + */ +struct DataBatch { + public: + /*! \brief unique id for instance, can be NULL, sometimes is useful */ + unsigned *inst_index; + /*! \brief number of instance */ + mshadow::index_t batch_size; + /*! \brief number of padding elements in this batch, + this is used to indicate the last elements in the batch are only padded up to match the batch, and should be discarded */ + mshadow::index_t num_batch_padd; + public: + /*! \brief content of dense data, if this DataBatch is dense */ + std::vector data; + /*! \brief extra data to be fed to the network */ + std::string extra_data; + public: + /*! \brief constructor */ + DataBatch(void) { + inst_index = NULL; + batch_size = 0; num_batch_padd = 0; + } + /*! \brief giving name to the data */ + void Naming(std::vector names); +}; // struct DataBatch + +/*! \brief typedef the factory function of data iterator */ +typedef IIterator *(*DataIteratorFactory)(); +/*! + * \brief Registry entry for DataIterator factory functions. + */ +struct DataIteratorReg + : public dmlc::FunctionRegEntryBase { +}; +//-------------------------------------------------------------- +// The following part is the API registration of iterators +//-------------------------------------------------------------- +/*! 
+ * \brief Macro to register Iterators + * + * \code + * // example of registering a mnist iterator + * MXNET_REGISTER_IO_ITER(MNISTIter, MNISTIter) + * .describe("Mnist data iterator"); + * + * \endcode + */ +#define MXNET_REGISTER_IO_ITER(name, DataIteratorType) \ + static ::mxnet::IIterator* __create__ ## DataIteratorType ## __() { \ + return new DataIteratorType; \ + } \ + DMLC_REGISTRY_REGISTER(::mxnet::DataIteratorReg, DataIteratorReg, name) \ + .set_body(__create__ ## DataIteratorType ## __) +} // namespace mxnet +#endif // MXNET_IO_H_ diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index c7720dcbd935..a8632bfa2ff8 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -12,6 +12,7 @@ from .base import MXNetError from . import narray from . import symbol +from . import io __version__ = "0.1.0" diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 6cf8c616f805..ec9d43dc58aa 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -75,7 +75,8 @@ def _load_lib(): SymbolCreatorHandle = ctypes.c_void_p SymbolHandle = ctypes.c_void_p ExecutorHandle = ctypes.c_void_p - +DataIterCreatorHandle = ctypes.c_void_p +DataIterHandle = ctypes.c_void_p #---------------------------- # helper function definition #---------------------------- diff --git a/python/mxnet/io.py b/python/mxnet/io.py new file mode 100644 index 000000000000..58dbe6e6f9a3 --- /dev/null +++ b/python/mxnet/io.py @@ -0,0 +1,172 @@ +# coding: utf-8 + +"""Data iterator interface of mxnet""" +from __future__ import absolute_import + +import ctypes +import sys +from .base import _LIB +from .base import c_array, c_str, mx_uint, py_str +from .base import DataIterHandle, NArrayHandle +from .base import check_call +from .narray import NArray + +class DataIter(object): + """DataIter object in mxnet. List all the needed functions here. 
""" + + def __init__(self, handle): + """Initialize with handle + + Parameters + ---------- + handle : DataIterHandle + the handle to the underlying C++ Data Iterator + """ + self.handle = handle + + def __del__(self): + check_call(_LIB.MXDataIterFree(self.handle)) + + def __iter__(self): + """make the class iterable + + """ + return self + + def reset(self): + """set loc to 0 + + """ + check_call(_LIB.MXDataIterBeforeFirst(self.handle)) + + def next(self): + """get next data batch from iterator + + Returns + ------- + labels and images for the next batch + """ + next_res = ctypes.c_int(0) + check_call(_LIB.MXDataIterNext(self.handle, ctypes.byref(next_res))) + if next_res.value: + return self.getdata(), self.getlabel() + else: + raise StopIteration + + # make it work for both python2 and 3 + __next__ = next + + def iter_next(self): + """iterate to next data with return value + + Returns + ------- + return true if success + """ + next_res = ctypes.c_int(0) + check_call(_LIB.MXDataIterNext(self.handle, ctypes.byref(next_res))) + return next_res.value + + def getdata(self): + """get data from batch + + """ + hdl = NArrayHandle() + check_call(_LIB.MXDataIterGetData(self.handle, ctypes.byref(hdl))) + return NArray(hdl) + + def getlabel(self): + """get label from batch + + """ + hdl = NArrayHandle() + check_call(_LIB.MXDataIterGetLabel(self.handle, ctypes.byref(hdl))) + return NArray(hdl) + +def _make_io_iterator(handle): + """Create an io iterator by handle.""" + name = ctypes.c_char_p() + desc = ctypes.c_char_p() + num_args = mx_uint() + arg_names = ctypes.POINTER(ctypes.c_char_p)() + arg_types = ctypes.POINTER(ctypes.c_char_p)() + arg_descs = ctypes.POINTER(ctypes.c_char_p)() + + check_call(_LIB.MXDataIterGetIterInfo( \ + handle, ctypes.byref(name), ctypes.byref(desc), \ + ctypes.byref(num_args), \ + ctypes.byref(arg_names), \ + ctypes.byref(arg_types), \ + ctypes.byref(arg_descs))) + iter_name = py_str(name.value) + param_str = [] + for i in range(num_args.value): + 
ret = '%s : %s' % (arg_names[i], arg_types[i]) + if len(arg_descs[i]) != 0: + ret += '\n ' + py_str(arg_descs[i]) + param_str.append(ret) + + doc_str = ('%s\n\n' + + 'Parameters\n' + + '----------\n' + + '%s\n' + + 'name : string, required.\n' + + ' Name of the resulting data iterator.\n\n' + + 'Returns\n' + + '-------\n' + + 'iterator: Iterator\n'+ + ' The result iterator.') + doc_str = doc_str % (desc.value, '\n'.join(param_str)) + + def creator(*args, **kwargs): + """Create an iterator. + The parameters listed below can be passed in as keyword arguments. + + Parameters + ---------- + name : string, required. + Name of the resulting data iterator. + + Returns + ------- + dataiter: Dataiter + the resulting data iterator + """ + param_keys = [] + param_vals = [] + + for k, val in kwargs.items(): + param_keys.append(c_str(k)) + param_vals.append(c_str(str(val))) + # create atomic symbol + param_keys = c_array(ctypes.c_char_p, param_keys) + param_vals = c_array(ctypes.c_char_p, param_vals) + iter_handle = DataIterHandle() + check_call(_LIB.MXDataIterCreateIter( + handle, len(param_keys), + param_keys, param_vals, + ctypes.byref(iter_handle))) + + if len(args): + raise TypeError('%s can only accept keyword arguments' % iter_name) + + return DataIter(iter_handle) + + creator.__name__ = iter_name + creator.__doc__ = doc_str + return creator + + +def _init_io_module(): + """List and add all the data iterators to current module.""" + plist = ctypes.POINTER(ctypes.c_void_p)() + size = ctypes.c_uint() + check_call(_LIB.MXListDataIters(ctypes.byref(size), ctypes.byref(plist))) + module_obj = sys.modules[__name__] + for i in range(size.value): + hdl = ctypes.c_void_p(plist[i]) + dataiter = _make_io_iterator(hdl) + setattr(module_obj, dataiter.__name__, dataiter) + +# Initialize the io in startups +_init_io_module() diff --git a/python/test_io.py b/python/test_io.py new file mode 100644 index 000000000000..d15d4cc32fcd --- /dev/null +++ b/python/test_io.py @@ -0,0 +1,21 @@ 
+#pylint: skip-file +import mxnet as mx +import numpy as np +import os + +dataiter = mx.io.MNISTIterator(path_img="/home/tianjun/data/mnist/train-images-idx3-ubyte", + path_label="/home/tianjun/data/mnist/train-labels-idx1-ubyte", + batch_size=100, shuffle=1, silent=1, input_flat="flat") + +dataiter.beforefirst() + +idx = 0 +while dataiter.next(): + info = "Batch %d" % (idx) + idx += 1 + print info + ''' + label = dataiter.getlabel() + print label.numpy + ''' + diff --git a/src/c_api.cc b/src/c_api.cc index b251ba578743..0d496c3855bf 100644 --- a/src/c_api.cc +++ b/src/c_api.cc @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -609,3 +610,78 @@ int MXExecutorBind(SymbolHandle symbol_handle, *out = Executor::Bind(*symb, ctx, in_args_vec, arg_grad_vec, grad_req_vec); API_END(); } + +//-------------------------------------------- +// Part 5: IO Interface +//-------------------------------------------- +int MXListDataIters(mx_uint *out_size, + DataIterCreator **out_array) { + API_BEGIN(); + auto &vec = dmlc::Registry::List(); + *out_size = static_cast(vec.size()); + *out_array = (DataIterCreator*)(dmlc::BeginPtr(vec)); // NOLINT(*) + API_END(); +} + +int MXDataIterGetIterInfo(DataIterCreator creator, + const char **name, + const char **description, + mx_uint *num_args, + const char ***arg_names, + const char ***arg_type_infos, + const char ***arg_descriptions) { + DataIteratorReg *e = static_cast(creator); + return MXAPIGetFunctionRegInfo(e, name, description, num_args, + arg_names, arg_type_infos, arg_descriptions); +} + +int MXDataIterCreateIter(DataIterCreator creator, + int num_param, + const char **keys, + const char **vals, + DataIterHandle *out) { + IIterator *iter = nullptr; + API_BEGIN(); + DataIteratorReg *e = static_cast(creator); + iter = e->body(); + std::vector > kwargs; + for (int i = 0; i < num_param; ++i) { + kwargs.push_back({std::string(keys[i]), std::string(vals[i])}); + } + iter->Init(kwargs); + iter->BeforeFirst(); 
+ *out = iter; + API_END_HANDLE_ERROR(delete iter); +} + +int MXDataIterFree(DataIterHandle handle) { + API_BEGIN(); + delete static_cast *>(handle); + API_END(); +} + +int MXDataIterBeforeFirst(DataIterHandle handle) { + API_BEGIN(); + static_cast* >(handle)->BeforeFirst(); + API_END(); +} + +int MXDataIterNext(DataIterHandle handle, int *out) { + API_BEGIN(); + *out = static_cast* >(handle)->Next(); + API_END(); +} + +int MXDataIterGetLabel(DataIterHandle handle, NArrayHandle *out) { + API_BEGIN(); + DataBatch db = static_cast* >(handle)->Value(); + *out = new NArray(db.data[1], 0); + API_END(); +} + +int MXDataIterGetData(DataIterHandle handle, NArrayHandle *out) { + API_BEGIN(); + DataBatch db = static_cast* >(handle)->Value(); + *out = new NArray(db.data[0], 0); + API_END(); +} diff --git a/src/common/utils.h b/src/common/utils.h index f55ebc26535f..cf1fd2f1bb36 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -10,12 +10,18 @@ #include #include #include +#include #endif // DMLC_USE_CXX11 namespace common { #if DMLC_USE_CXX11 +/*! + * \brief Random Engine + */ +typedef std::mt19937 RANDOM_ENGINE; + /*! * \brief Helper functions. */ diff --git a/src/io/inst_vector.h b/src/io/inst_vector.h new file mode 100644 index 000000000000..1ae734631680 --- /dev/null +++ b/src/io/inst_vector.h @@ -0,0 +1,117 @@ +/*! + * Copyright (c) 2015 by Contributors + * \inst_vector.h + * \brief holder of a sequence of DataInst in CPU + * that are not necessarily of same shape + */ +#ifndef MXNET_IO_INST_VECTOR_H_ +#define MXNET_IO_INST_VECTOR_H_ +#include +#include +#include +#include +#include "./data.h" +namespace mxnet { +/*! 
+ * \brief tensor vector that can store sequence of tensor + * in a memory compact way, tensors do not have to be of same shape + */ +template +class TensorVector { + public: + TensorVector(void) { + this->Clear(); + } + // get i-th tensor + inline mshadow::Tensor + operator[](size_t i) const { + CHECK(i + 1 < offset_.size()); + CHECK(shape_[i].Size() == offset_[i + 1] - offset_[i]); + return mshadow::Tensor + (reinterpret_cast(BeginPtr(content_)) + offset_[i], shape_[i]); + } + inline mshadow::Tensor Back() const { + return (*this)[Size() - 1]; + } + inline size_t Size(void) const { + return shape_.size(); + } + // push a tensor of certain shape + // return the reference of the pushed tensor + inline void Push(mshadow::Shape shape) { + shape_.push_back(shape); + offset_.push_back(offset_.back() + shape.Size()); + content_.resize(offset_.back()); + } + inline void Clear(void) { + offset_.clear(); + offset_.push_back(0); + content_.clear(); + shape_.clear(); + } + + private: + // offset of the data content + std::vector offset_; + // data content + std::vector content_; + // shape of data + std::vector > shape_; +}; + +/*! + * \brief tblob vector that can store sequence of tblob + * in a memory compact way, tblobs do not have to be of same shape + */ +template +class TBlobVector { + public: + TBlobVector(void) { + this->Clear(); + } + // get i-th tblob + inline TBlob operator[](size_t i) const; + // get the last tblob + inline TBlob Back(); + // return the size of the vector + inline size_t Size(void) const; + // push a tensor of certain shape + // return the reference of the pushed tensor + inline void Push(TShape shape_); + inline void Clear(void); + private: + // offset of the data content + std::vector offset_; + // data content + std::vector content_; + // shape of data + std::vector shape_; +}; + +/*! 
+ * \brief instance vector that can holds + * non-uniform shape data instance in a shape efficient way + */ +class InstVector { + public: + inline size_t Size(void) const { + return index_.size(); + } + // instance + inline DataInst operator[](size_t i) const; + // get back of instance vector + inline DataInst Back() const; + // clear the container + inline void Clear(void); + // push the newly coming instance + inline void Push(unsigned index, TBlob data_); + + private: + /*! \brief index of the data */ + std::vector index_; + // data + std::vector > data_; + // extra data + std::vector extra_data_; +}; +#endif // MXNET_IO_INST_VECTOR_H_ diff --git a/src/io/io.cc b/src/io/io.cc new file mode 100644 index 000000000000..bd5b78dda643 --- /dev/null +++ b/src/io/io.cc @@ -0,0 +1,10 @@ +// Copyright (c) 2015 by Contributors +#define _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_DEPRECATE + +#include +#include + +namespace dmlc { +DMLC_REGISTRY_ENABLE(::mxnet::DataIteratorReg); +} // namespace dmlc diff --git a/src/io/iter_mnist.cc b/src/io/iter_mnist.cc new file mode 100644 index 000000000000..93195061b278 --- /dev/null +++ b/src/io/iter_mnist.cc @@ -0,0 +1,207 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file iter_mnist.cc + * \brief register mnist iterator + * \author Tianjun Xiao +*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../common/utils.h" + +namespace mxnet { +namespace io { +// Define mnist io parameters +struct MNISTParam : public dmlc::Parameter { + /*! \brief path */ + std::string image, label; + /*! \brief whether to do shuffle */ + bool shuffle; + /*! \brief whether to print info */ + bool silent; + /*! \brief batch size */ + int batch_size; + /*! \brief data mode */ + bool flat; + /*! 
\brief random seed */ + int seed; + // declare parameters in header file + DMLC_DECLARE_PARAMETER(MNISTParam) { + DMLC_DECLARE_FIELD(image).set_default("./train-images-idx3-ubyte") + .describe("Mnist image path."); + DMLC_DECLARE_FIELD(label).set_default("./train-labels-idx1-ubyte") + .describe("Mnist label path."); + DMLC_DECLARE_FIELD(batch_size).set_lower_bound(1).set_default(128) + .describe("Batch Size."); + DMLC_DECLARE_FIELD(shuffle).set_default(true) + .describe("Whether to shuffle data."); + DMLC_DECLARE_FIELD(flat).set_default(false) + .describe("Whether to flat the data into 1D."); + DMLC_DECLARE_FIELD(silent).set_default(false) + .describe("Whether to print out data info."); + DMLC_DECLARE_FIELD(seed).set_default(0) + .describe("Random Seed."); + } +}; + +class MNISTIter: public IIterator { + public: + MNISTIter(void) { + img_.dptr_ = NULL; + inst_offset_ = 0; + out_.data.resize(2); + } + virtual ~MNISTIter(void) { + if (img_.dptr_ != NULL) delete []img_.dptr_; + } + // intialize iterator loads data in + virtual void Init(const std::vector >& kwargs) { + std::map kmap(kwargs.begin(), kwargs.end()); + param_.Init(kmap); + this->LoadImage(); + this->LoadLabel(); + // set name + this->SetDataName(std::string("data")); + this->SetDataName(std::string("label")); + if (param_.flat) { + batch_data_.shape_ = mshadow::Shape4(param_.batch_size, 1, 1, img_.size(1) * img_.size(2)); + } else { + batch_data_.shape_ = mshadow::Shape4(param_.batch_size, 1, img_.size(1), img_.size(2)); + } + out_.inst_index = NULL; + batch_label_.shape_ = mshadow::Shape2(param_.batch_size, 1); + batch_label_.stride_ = 1; + batch_data_.stride_ = batch_data_.size(3); + out_.batch_size = param_.batch_size; + if (param_.shuffle) this->Shuffle(); + if (param_.silent == 0) { + mshadow::TShape s; + s = batch_data_.shape_; + if (param_.flat) { + LOG(INFO) << "MNISTIter: load " << (unsigned)img_.size(0) << " images, shuffle=" + << param_.shuffle << ", shape=" << s.FlatTo2D(); + } else { + 
LOG(INFO) << "MNISTIter: load " << (unsigned)img_.size(0) << " images, shuffle=" + << param_.shuffle << ", shape=" << s; + } + } + }
+  virtual void BeforeFirst(void) {  // rewind to the first instance
+    this->loc_ = 0;
+  }
+  virtual bool Next(void) {  // expose the next full batch via out_; false once exhausted
+    if (loc_ + param_.batch_size <= img_.size(0)) {  // a trailing partial batch is never served
+      batch_data_.dptr_ = img_[loc_].dptr_;  // no copy: batch tensors alias img_ / labels_
+      batch_label_.dptr_ = &labels_[loc_];
+      if (param_.flat)
+        out_.data[0] = TBlob(batch_data_.FlatTo2D());
+      else
+        out_.data[0] = TBlob(batch_data_);
+      out_.data[1] = TBlob(batch_label_);
+      out_.inst_index = &inst_[loc_];
+      loc_ += param_.batch_size;
+      return true;
+    } else {
+      return false;
+    }
+  }
+  virtual const DataBatch &Value(void) const {  // batch prepared by the last successful Next()
+    return out_;
+  }
+
+ private:
+  inline void LoadImage(void) {  // read the image file (param_.image) into img_ as floats
+    dmlc::Stream *stdimg = dmlc::Stream::Create(param_.image.c_str(), "r");
+    ReadInt(stdimg);  // leading header word (the magic number in the IDX layout) is discarded
+    int image_count = ReadInt(stdimg);
+    int image_rows = ReadInt(stdimg);
+    int image_cols = ReadInt(stdimg);
+
+    img_.shape_ = mshadow::Shape3(image_count, image_rows, image_cols);
+    img_.stride_ = img_.size(2);
+
+    // allocate contiguous memory
+    img_.dptr_ = new float[img_.MSize()];
+    for (int i = 0; i < image_count; ++i) {
+      for (int j = 0; j < image_rows; ++j) {
+        for (int k = 0; k < image_cols; ++k) {
+          unsigned char ch;
+          CHECK(stdimg->Read(&ch, sizeof(ch)) != 0);  // one byte per pixel; fails on a short read
+          img_[i][j][k] = ch;
+        }
+      }
+    }
+    // normalize to 0-1
+    img_ *= 1.0f / 256.0f;
+    delete stdimg;
+  }
+  inline void LoadLabel(void) {  // read the label file (param_.label) into labels_; fills inst_
+    dmlc::Stream *stdlabel = dmlc::Stream::Create(param_.label.c_str(), "r");
+    ReadInt(stdlabel);  // leading header word is discarded
+    int labels_count = ReadInt(stdlabel);
+    labels_.resize(labels_count);
+    for (int i = 0; i < labels_count; ++i) {
+      unsigned char ch;
+      CHECK(stdlabel->Read(&ch, sizeof(ch)) != 0);  // one byte per label; fails on a short read
+      labels_[i] = ch;
+      inst_.push_back((unsigned)i + inst_offset_);  // instance id = position in file + offset
+    }
+    delete stdlabel;
+  }
+  inline void Shuffle(void) {  // permute inst_, then reorder img_ and labels_ to match it
+    std::shuffle(inst_.begin(), inst_.end(), common::RANDOM_ENGINE(kRandMagic+param_.seed));
+    std::vector tmplabel(labels_.size());
+    mshadow::TensorContainer tmpimg(img_.shape_);
+    for (size_t i = 0; i < inst_.size(); ++i) {
+      unsigned ridx = inst_[i] - inst_offset_;  // pre-shuffle row of the instance now at slot i
+      mshadow::Copy(tmpimg[i], img_[ridx]);
+      tmplabel[i] = labels_[ridx];
+    }
+    // copy back
+    mshadow::Copy(img_, tmpimg);
+    labels_ = tmplabel;
+  }
+
+ private:
+  inline static int ReadInt(dmlc::Stream *fi) {  // read 4 bytes as one big-endian integer
+    unsigned char buf[4];
+    CHECK(fi->Read(buf, sizeof(buf)) == sizeof(buf))
+        << "invalid mnist format";
+    return reinterpret_cast(buf[0] << 24 | buf[1] << 16 | buf[2] << 8 | buf[3]);
+  }
+
+ private:
+  /*! \brief MNIST iter params */
+  MNISTParam param_;
+  /*! \brief output */
+  DataBatch out_;
+  /*! \brief current location */
+  index_t loc_;
+  /*! \brief image content */
+  mshadow::Tensor img_;
+  /*! \brief label content */
+  std::vector labels_;
+  /*! \brief batch data tensor */
+  mshadow::Tensor batch_data_;
+  /*! \brief batch label tensor */
+  mshadow::Tensor batch_label_;
+  /*! \brief instance index offset */
+  unsigned inst_offset_;
+  /*! \brief instance index */
+  std::vector inst_;
+  // magic number to setup randomness
+  static const int kRandMagic = 0;
+};  // class MNISTIter
+
+DMLC_REGISTER_PARAMETER(MNISTParam);
+MXNET_REGISTER_IO_ITER(MNISTIter, MNISTIter)
+    .describe("Create iterator for MNIST hand-written digit number recognition dataset.")
+    .add_arguments(MNISTParam::__FIELDS__());
+}  // namespace io
+}  // namespace mxnet
diff --git a/tests/python/get_data.py b/tests/python/get_data.py
new file mode 100644
index 000000000000..82d25d9072fb
--- /dev/null
+++ b/tests/python/get_data.py
@@ -0,0 +1,29 @@
+# pylint: skip-file
+import os, gzip
+import pickle as pickle
+import sys
+
+# download mnist.pkl.gz into data/ (skipped when the file is already there)
+def GetMNIST_pkl():
+    if not os.path.isdir("data/"):
+        os.system("mkdir data/")
+    if not os.path.exists('data/mnist.pkl.gz'):
+        os.system("wget http://deeplearning.net/data/mnist/mnist.pkl.gz -P data/")
+
+# download the ubyte version of mnist into data/ and gunzip each file (skipped per file if present)
+def GetMNIST_ubyte():
+    if not os.path.isdir("data/"):
+        os.system("mkdir data/")
+    if not os.path.exists('data/train-images-idx3-ubyte'):
+        os.system("wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz -P data/")
+        os.system("gunzip data/train-images-idx3-ubyte.gz")
+    if not os.path.exists('data/train-labels-idx1-ubyte'):
+        os.system("wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz -P data/")
+        os.system("gunzip data/train-labels-idx1-ubyte.gz")
+    if not os.path.exists('data/t10k-images-idx3-ubyte'):
+        os.system("wget http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz -P data/")
+        os.system("gunzip data/t10k-images-idx3-ubyte.gz")
+    if not os.path.exists('data/t10k-labels-idx1-ubyte'):
+        os.system("wget http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz -P data/")
+        os.system("gunzip data/t10k-labels-idx1-ubyte.gz")
+
diff --git a/tests/python/test_conv.py b/tests/python/test_conv.py index a456b19982f8..0604476d4bb5 100644 --- a/tests/python/test_conv.py +++ b/tests/python/test_conv.py @@ -3,64 +3,12 @@ import numpy as np import os, pickle, gzip import sys - +import get_data def CalAcc(out, label): pred = np.argmax(out, axis=1) return np.sum(pred == label) * 1.0 / out.shape[0] -def IgnorePython3(): - if sys.version_info[0] >= 3: - # TODO(tianjun): use IO instead of pickle - # Python3 pickle is not able to load data correctly - sys.exit(0) - - -# load data -class MNISTIter(object): - def __init__(self, which_set, batch_size=100, flatten=True): - if not os.path.exists('mnist.pkl.gz'): - os.system("wget http://deeplearning.net/data/mnist/mnist.pkl.gz") - f = gzip.open('mnist.pkl.gz', 'rb') - IgnorePython3() - train_set, valid_set, test_set = pickle.load(f) - f.close() - if which_set == 'train': - self.data = train_set[0] - self.label = np.asarray(train_set[1]) - elif which_set == 'valid': - self.data = valid_set[0] - self.label = np.asarray(valid_set[1]) - else: - self.data = test_set[0] - self.data = np.asarray(test_set[1]) - self.flatten = flatten - self.batch_size = batch_size - self.nbatch = 
self.data.shape[0] / batch_size - assert(self.data.shape[0] % batch_size == 0) # I am lazy - self.now_idx = -1 - def BeforeFirst(self): - self.now_idx = -1 - def Next(self): - self.now_idx += 1 - if self.now_idx == self.nbatch: - return False - return True - def Get(self): - if self.now_idx < 0: - raise Exception("Iterator is at head") - elif self.now_idx >= self.nbatch: - raise Exception("Iterator is at end") - start = self.now_idx * self.batch_size - end = (self.now_idx + 1) * self.batch_size - if self.flatten: - return (self.data[start:end, :], self.label[start:end]) - else: - return (self.data[start:end, :].reshape(batch_size, 1, 28, 28), - self.label[start:end]) - - - # symbol net batch_size = 100 data = mx.symbol.Variable('data') @@ -110,26 +58,38 @@ def Get(self): def Update(grad, weight): weight.numpy[:] -= lr * grad.numpy[:] / batch_size -block = zip(grad_narrays, arg_narrays) +block = list(zip(grad_narrays, arg_narrays)) +# check data +get_data.GetMNIST_ubyte() -train = MNISTIter("train", batch_size, False) -valid = MNISTIter("valid", batch_size, False) +train_dataiter = mx.io.MNISTIter( + image="data/train-images-idx3-ubyte", + label="data/train-labels-idx1-ubyte", + batch_size=batch_size, shuffle=True, silent=False, seed=10) +val_dataiter = mx.io.MNISTIter( + image="data/t10k-images-idx3-ubyte", + label="data/t10k-labels-idx1-ubyte", + batch_size=batch_size, shuffle=True, silent=False) def test_mnist(): acc_train = 0.0 acc_val = 0.0 - for i in xrange(epoch): + for i in range(epoch): # train print("Epoch %d" % i) train_acc = 0.0 val_acc = 0.0 - while train.Next(): - data, label = train.Get() + train_nbatch = 0 + val_nbatch = 0 + for data, label in train_dataiter: + data = data.numpy + label = label.numpy.flatten() inputs["data"].numpy[:] = data inputs["sm_label"].numpy[:] = label executor.forward() train_acc += CalAcc(out_narray.numpy, label) + train_nbatch += 1 grad_narray.numpy[:] = out_narray.numpy executor.backward([grad_narray]) @@ -137,17 +97,19 @@ 
def test_mnist(): Update(grad, weight) # evaluate - while valid.Next(): - data, label = valid.Get() + for data, label in val_dataiter: + data = data.numpy + label = label.numpy.flatten() inputs["data"].numpy[:] = data executor.forward() val_acc += CalAcc(out_narray.numpy, label) - print("Train Acc: ", train_acc / train.nbatch) - print("Valid Acc: ", val_acc / valid.nbatch) - acc_train = train_acc / train.nbatch - acc_val = val_acc / valid.nbatch - train.BeforeFirst() - valid.BeforeFirst() + val_nbatch += 1 + print("Train Acc: ", train_acc / train_nbatch) + print("Valid Acc: ", val_acc / val_nbatch) + acc_train = train_acc / train_nbatch + acc_val = val_acc / val_nbatch + train_dataiter.reset() + val_dataiter.reset() assert(acc_train > 0.84) assert(acc_val > 0.96)
diff --git a/tests/python/test_io.py b/tests/python/test_io.py
new file mode 100644
index 000000000000..dfeb3f67c293
--- /dev/null
+++ b/tests/python/test_io.py
@@ -0,0 +1,46 @@
+# pylint: skip-file
+import mxnet as mx
+import numpy as np
+import os, gzip
+import pickle as pickle
+import sys
+import get_data
+
+# prepare data
+get_data.GetMNIST_ubyte()
+
+batch_size = 100
+train_dataiter = mx.io.MNISTIter(
+        image="data/train-images-idx3-ubyte",
+        label="data/train-labels-idx1-ubyte",
+        batch_size=batch_size, shuffle=1, flat=1, silent=0, seed=10)
+val_dataiter = mx.io.MNISTIter(
+        image="data/t10k-images-idx3-ubyte",
+        label="data/t10k-labels-idx1-ubyte",
+        batch_size=batch_size, shuffle=0, flat=1, silent=0)
+
+def test_MNISTIter_loop():
+    """A full pass over the training iterator yields 60000 / batch_size batches."""
+    # floor division keeps the expected count an int on Python 3 as well
+    nbatch = 60000 // batch_size
+    batch_count = 0
+    for data, label in train_dataiter:
+        batch_count += 1
+    assert(nbatch == batch_count)
+
+def test_MNISTIter_reset():
+    """After reset() the iterator serves the same first batch of labels again."""
+    train_dataiter.reset()
+    train_dataiter.iter_next()
+    label_0 = train_dataiter.getlabel().numpy.flatten()
+    # move a few batches ahead so that reset() has something to undo
+    train_dataiter.iter_next()
+    train_dataiter.iter_next()
+    train_dataiter.iter_next()
+    train_dataiter.iter_next()
+    train_dataiter.reset()
+    train_dataiter.iter_next()
+    label_1 = train_dataiter.getlabel().numpy.flatten()
+    # compare element-wise: a sum of differences can cancel out to 0 for different labels
+    assert(np.array_equal(label_0, label_1))
+
diff --git a/tests/python/test_mlp.py b/tests/python/test_mlp.py index 4770c19b9136..8a84d50536c3 100644 --- a/tests/python/test_mlp.py +++ b/tests/python/test_mlp.py @@ -4,61 +4,12 @@ import os, gzip import pickle as pickle import sys +import get_data + def CalAcc(out, label): pred = np.argmax(out, axis=1) return np.sum(pred == label) * 1.0 / out.shape[0] -def IgnorePython3(): - if sys.version_info[0] >= 3: - # TODO(tianjun): use IO instead of pickle - # Python3 pickle is not able to load data correctly - sys.exit(0) - - -# load data -class MNISTIter(object): - def __init__(self, which_set, batch_size=100, flatten=True): - if not os.path.exists('mnist.pkl.gz'): - os.system("wget http://deeplearning.net/data/mnist/mnist.pkl.gz") - f = gzip.open('mnist.pkl.gz', 'rb') - IgnorePython3() - train_set, valid_set, test_set = pickle.load(f) - f.close() - if which_set == 'train': - self.data = train_set[0] - self.label = np.asarray(train_set[1]) - elif which_set == 'valid': - self.data = valid_set[0] - self.label = np.asarray(valid_set[1]) - else: - self.data = test_set[0] - self.data = np.asarray(test_set[1]) - self.flatten = flatten - self.batch_size = batch_size - self.nbatch = self.data.shape[0] / batch_size - assert(self.data.shape[0] % batch_size == 0) # I am lazy - self.now_idx = -1 - def BeforeFirst(self): - self.now_idx = -1 - def Next(self): - self.now_idx += 1 - if self.now_idx == self.nbatch: - return False - return True - def Get(self): - if self.now_idx < 0: - raise Exception("Iterator is at head") - elif self.now_idx >= self.nbatch: - raise Exception("Iterator is at end") - start = self.now_idx * self.batch_size - end = (self.now_idx + 1) * self.batch_size - if self.flatten: - return (self.data[start:end, :], self.label[start:end]) - else: - return (self.data[start:end, :].reshape(batch_size, 1, 28, 28), - self.label[start:end]) - - # symbol net batch_size = 100 data = 
mx.symbol.Variable('data') @@ -98,12 +49,19 @@ def Get(self): def Update(grad, weight): weight.numpy[:] -= lr * grad.numpy[:] / batch_size -block = zip(grad_narrays, arg_narrays) - +block = list(zip(grad_narrays, arg_narrays)) +#check data +get_data.GetMNIST_ubyte() -train = MNISTIter("train", batch_size, True) -valid = MNISTIter("valid", batch_size, True) +train_dataiter = mx.io.MNISTIter( + image="data/train-images-idx3-ubyte", + label="data/train-labels-idx1-ubyte", + batch_size=batch_size, shuffle=True, flat=True, silent=False, seed=10) +val_dataiter = mx.io.MNISTIter( + image="data/t10k-images-idx3-ubyte", + label="data/t10k-labels-idx1-ubyte", + batch_size=batch_size, shuffle=True, flat=True, silent=False) def test_mlp(): acc_train = 0. @@ -113,12 +71,16 @@ def test_mlp(): print("Epoch %d" % i) train_acc = 0.0 val_acc = 0.0 - while train.Next(): - data, label = train.Get() + train_nbatch = 0 + val_nbatch = 0 + for data, label in train_dataiter: + data = data.numpy + label = label.numpy.flatten() inputs["data"].numpy[:] = data inputs["sm_label"].numpy[:] = label executor.forward() train_acc += CalAcc(out_narray.numpy, label) + train_nbatch += 1 grad_narray.numpy[:] = out_narray.numpy executor.backward([grad_narray]) @@ -126,17 +88,19 @@ def test_mlp(): Update(grad, weight) # evaluate - while valid.Next(): - data, label = valid.Get() + for data, label in val_dataiter: + data = data.numpy + label = label.numpy.flatten() inputs["data"].numpy[:] = data executor.forward() val_acc += CalAcc(out_narray.numpy, label) - acc_train = train_acc / train.nbatch - acc_val = val_acc / valid.nbatch - print("Train Acc: ", train_acc / train.nbatch) - print("Valid Acc: ", val_acc / valid.nbatch) - train.BeforeFirst() - valid.BeforeFirst() + val_nbatch += 1 + acc_train = train_acc / train_nbatch + acc_val = val_acc / val_nbatch + print("Train Acc: ", train_acc / train_nbatch) + print("Valid Acc: ", val_acc / val_nbatch) + train_dataiter.reset() + val_dataiter.reset() 
assert(acc_train > 0.98) assert(acc_val > 0.97)