From 0a880daddb24b753c77dcb8125c2b336c1770bcc Mon Sep 17 00:00:00 2001 From: Mu Li Date: Tue, 15 Sep 2015 21:18:54 -0400 Subject: [PATCH] fix tiny bug in iter_image_rec, and tiny refactor --- src/io/inst_vector.h | 26 ++++++++++------- src/io/iter_image_recordio.cc | 55 +++++++++++++++-------------------- 2 files changed, 38 insertions(+), 43 deletions(-) diff --git a/src/io/inst_vector.h b/src/io/inst_vector.h index ed560fc2b5da..f2f86751e698 100644 --- a/src/io/inst_vector.h +++ b/src/io/inst_vector.h @@ -17,8 +17,9 @@ namespace mxnet { namespace io { /*! - * \brief tensor vector that can store sequence of tensor - * in a memory compact way, tensors do not have to be of same shape + * \brief a vector of tensors with various shapes + * + * data are stored contiguously in memory */ template class TensorVector { @@ -26,11 +27,11 @@ class TensorVector { TensorVector(void) { this->Clear(); } - // get i-th tensor + /*! \brief get the buffer to the i-th tensor */ inline mshadow::Tensor operator[](size_t i) const { - CHECK(i + 1 < offset_.size()); - CHECK(shape_[i].Size() == offset_[i + 1] - offset_[i]); + CHECK_LT(i + 1, offset_.size()); + CHECK_EQ(shape_[i].Size(), offset_[i + 1] - offset_[i]); return mshadow::Tensor ((DType*)dmlc::BeginPtr(content_) + offset_[i], shape_[i]); // NOLINT(*) } @@ -40,8 +41,7 @@ class TensorVector { inline size_t Size(void) const { return shape_.size(); } - // push a tensor of certain shape - // return the reference of the pushed tensor + /*! \brief allocate space given the shape (data are copied) */ inline void Push(mshadow::Shape shape) { shape_.push_back(shape); offset_.push_back(offset_.back() + shape.Size()); @@ -64,15 +64,15 @@ class TensorVector { }; /*! - * \brief instance vector that can holds - * non-uniform shape data instance in a shape efficient way + * \brief a list of (label, example) pairs, examples can have various shape */ class InstVector { public: + /*! 
\brief return the number of (label, example) pairs */ inline size_t Size(void) const { return index_.size(); } - // instance + /*! \brief get the i-th (label, example) pair */ inline DataInst operator[](size_t i) const { DataInst inst; inst.index = index_[i]; @@ -80,7 +80,7 @@ class InstVector { inst.data.push_back(TBlob(label_[i])); return inst; } - // get back of instance vector + /*! \brief get the last (label, example) pair */ inline DataInst Back() const { return (*this)[Size() - 1]; } @@ -89,6 +89,10 @@ class InstVector { data_.Clear(); label_.Clear(); } + /*! + * \brief push a (label, example) pair + * only reserves the space; the data is not copied + */ inline void Push(unsigned index, mshadow::Shape<3> dshape, mshadow::Shape<1> lshape) { diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc index 701c28deb4c9..6f77bb6aac57 100644 --- a/src/io/iter_image_recordio.cc +++ b/src/io/iter_image_recordio.cc @@ -94,8 +94,10 @@ struct ImageRecParserParam : public dmlc::Parameter { int nthread; /*! \brief whether to remain silent */ bool silent; - /*! \brief number of distributed worker */ - int dist_num_worker, dist_worker_rank; + /*! \brief virtually split the data into n parts */ + int num_parts; + /*! \brief only read the i-th part */ + int part_index; /*! \brief label-width */ int label_width; /*! 
\brief input shape */ @@ -112,10 +114,10 @@ struct ImageRecParserParam : public dmlc::Parameter { .describe("How many labels for an image."); DMLC_DECLARE_FIELD(silent).set_default(false) .describe("Whether to output parser information."); - DMLC_DECLARE_FIELD(dist_num_worker).set_lower_bound(1).set_default(1) - .describe("Dist worker number."); - DMLC_DECLARE_FIELD(dist_worker_rank).set_default(0) - .describe("Dist worker rank."); + DMLC_DECLARE_FIELD(num_parts).set_lower_bound(1).set_default(1) + .describe("virtually split the data into n parts"); + DMLC_DECLARE_FIELD(part_index).set_default(0) + .describe("only read the i-th part"); index_t input_shape_default[] = {3, 224, 224}; DMLC_DECLARE_FIELD(input_shape) .set_default(TShape(input_shape_default, input_shape_default + 3)) @@ -173,12 +175,12 @@ class ImageRecordIOParser { inline void ImageRecordIOParser::Init( const std::vector >& kwargs) { // initialize parameter - std::vector > kwargs_left; // init image rec param - kwargs_left = param_.InitAllowUnknown(kwargs); + param_.InitAllowUnknown(kwargs); int maxthread, threadget; #pragma omp parallel { + // why ? 
(muli) maxthread = std::max(omp_get_num_procs() / 2 - 1, 1); } param_.nthread = std::min(maxthread, param_.nthread); @@ -194,12 +196,6 @@ inline void ImageRecordIOParser::Init( prnds_.push_back(new common::RANDOM_ENGINE((i + 1) * kRandMagic)); } - // handling for hadoop - const char *ps_rank = getenv("PS_RANK"); - if (ps_rank != NULL) { - param_.dist_worker_rank = atoi(ps_rank); - } - if (param_.path_imglist.length() != 0) { label_map_ = new ImageLabelMap(param_.path_imglist.c_str(), param_.label_width, param_.silent != 0); @@ -208,15 +204,10 @@ inline void ImageRecordIOParser::Init( } CHECK(param_.path_imgrec.length() != 0) << "ImageRecordIOIterator: must specify image_rec"; -#if MSHADOW_DIST_PS - param_.dist_num_worker = ::ps::RankSize(); - param_.dist_worker_rank = ::ps::MyRank(); - LOG(INFO) << "rank " << param_.dist_worker_rank - << " in " << param_.dist_num_worker; -#endif - source_ = dmlc::InputSplit::Create - (param_.path_imgrec.c_str(), param_.dist_worker_rank, - param_.dist_num_worker, "recordio"); + + source_ = dmlc::InputSplit::Create( + param_.path_imgrec.c_str(), param_.part_index, + param_.num_parts, "recordio"); // use 64 MB chunk when possible source_->HintChunkSize(8 << 20UL); } @@ -281,25 +272,22 @@ struct ImageRecordParam: public dmlc::Parameter { } }; + // iterator on image recordio class ImageRecordIter : public IIterator { public: - ImageRecordIter() - : data_(NULL) { - } + ImageRecordIter() : data_(NULL) { } virtual ~ImageRecordIter(void) { iter_.Destroy(); - // data can be NULL delete data_; } virtual void Init(const std::vector >& kwargs) { - std::vector > kwargs_left; - // init image rec param - kwargs_left = param_.InitAllowUnknown(kwargs); + param_.InitAllowUnknown(kwargs); // use the kwarg to init parser parser_.Init(kwargs); - // init thread iter + // prefetch at most 4 minibatches iter_.set_max_capacity(4); + // init thread iter iter_.Init([this](std::vector **dptr) { if (*dptr == NULL) { *dptr = new std::vector(); @@ -309,7 +297,8 @@ 
class ImageRecordIter : public IIterator { [this]() { parser_.BeforeFirst(); }); // Check Meanfile if (param_.mean_img.length() != 0) { - dmlc::Stream *fi = dmlc::Stream::Create(param_.mean_img.c_str(), "r", true); + dmlc::Stream *fi = + dmlc::Stream::Create(param_.mean_img.c_str(), "r", true); if (fi == NULL) { this->CreateMeanImg(); } else { @@ -317,6 +306,7 @@ class ImageRecordIter : public IIterator { } } inst_ptr_ = 0; + shuffle_ = param_.shuffle; } virtual void BeforeFirst(void) { iter_.BeforeFirst(); @@ -410,6 +400,7 @@ class ImageRecordIter : public IIterator { // mean image mshadow::TensorContainer meanimg_; }; + DMLC_REGISTER_PARAMETER(ImageRecParserParam); DMLC_REGISTER_PARAMETER(ImageRecordParam); MXNET_REGISTER_IO_CHAINED_ITER(ImageRecordIter, ImageRecordIter, BatchAdaptIter)