Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

Permalink
Merge pull request #82 from mli/master
Browse files Browse the repository at this point in the history
fix tiny bug in iter_image_rec, and tiny refactor
  • Loading branch information
mli committed Sep 16, 2015
2 parents f4207b5 + 0a880da commit 7cbfd57
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 43 deletions.
26 changes: 15 additions & 11 deletions src/io/inst_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,21 @@
namespace mxnet {
namespace io {
/*!
* \brief tensor vector that can store sequence of tensor
* in a memory compact way, tensors do not have to be of same shape
* \brief a vector of tensor with various shape
*
* data are stored in memory contiguously
*/
template<int dim, typename DType>
class TensorVector {
public:
TensorVector(void) {
this->Clear();
}
// get i-th tensor
/*! \brief get the buffer to the i-th tensor */
inline mshadow::Tensor<cpu, dim, DType>
operator[](size_t i) const {
CHECK(i + 1 < offset_.size());
CHECK(shape_[i].Size() == offset_[i + 1] - offset_[i]);
CHECK_LT(i + 1, offset_.size());
CHECK_EQ(shape_[i].Size(), offset_[i + 1] - offset_[i]);
return mshadow::Tensor<cpu, dim, DType>
((DType*)dmlc::BeginPtr(content_) + offset_[i], shape_[i]); // NOLINT(*)
}
Expand All @@ -40,8 +41,7 @@ class TensorVector {
/*! \brief number of shapes (one per pushed tensor) held by the vector */
inline size_t Size(void) const {
  const size_t num_tensors = shape_.size();
  return num_tensors;
}
// push a tensor of certain shape
// return the reference of the pushed tensor
/*! \brief allocate space for a tensor of the given shape (no data are copied) */
inline void Push(mshadow::Shape<dim> shape) {
shape_.push_back(shape);
offset_.push_back(offset_.back() + shape.Size());
Expand All @@ -64,23 +64,23 @@ class TensorVector {
};

/*!
* \brief instance vector that can holds
* non-uniform shape data instance in a shape efficient way
* \brief a list of (label, example) pairs, examples can have various shape
*/
class InstVector {
public:
/*! \brief number of (label, example) pairs currently stored */
inline size_t Size(void) const {
  const size_t num_pairs = index_.size();
  return num_pairs;
}
// instance
/*!
 * \brief get the i-th (label, example) pair
 * \param i position of the pair, must be smaller than Size()
 * \return a DataInst carrying the stored index, with the example blob
 *  pushed first and the label blob pushed second
 */
inline DataInst operator[](size_t i) const {
  // fail loudly on a bad position (e.g. Back() on an empty vector, where
  // Size() - 1 wraps around) instead of reading index_ out of range
  CHECK_LT(i, index_.size());
  DataInst inst;
  inst.index = index_[i];
  inst.data.push_back(TBlob(data_[i]));
  inst.data.push_back(TBlob(label_[i]));
  return inst;
}
// get back of instance vector
/*! \brief get the last (label, example) pair */
inline DataInst Back() const {
  const size_t last = Size() - 1;
  return this->operator[](last);
}
Expand All @@ -89,6 +89,10 @@ class InstVector {
data_.Clear();
label_.Clear();
}
/*!
* \brief push a (label, example) pair
* only reserves the space; the data is not copied
*/
inline void Push(unsigned index,
mshadow::Shape<3> dshape,
mshadow::Shape<1> lshape) {
Expand Down
55 changes: 23 additions & 32 deletions src/io/iter_image_recordio.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,10 @@ struct ImageRecParserParam : public dmlc::Parameter<ImageRecParserParam> {
int nthread;
/*! \brief whether to remain silent */
bool silent;
/*! \brief number of distributed worker */
int dist_num_worker, dist_worker_rank;
/*! \brief virtually split the data into n parts */
int num_parts;
/*! \brief only read the i-th part */
int part_index;
/*! \brief label-width */
int label_width;
/*! \brief input shape */
Expand All @@ -112,10 +114,10 @@ struct ImageRecParserParam : public dmlc::Parameter<ImageRecParserParam> {
.describe("How many labels for an image.");
DMLC_DECLARE_FIELD(silent).set_default(false)
.describe("Whether to output parser information.");
DMLC_DECLARE_FIELD(dist_num_worker).set_lower_bound(1).set_default(1)
.describe("Dist worker number.");
DMLC_DECLARE_FIELD(dist_worker_rank).set_default(0)
.describe("Dist worker rank.");
DMLC_DECLARE_FIELD(num_parts).set_lower_bound(1).set_default(1)
.describe("virtually split the data into n parts");
DMLC_DECLARE_FIELD(part_index).set_default(0)
.describe("only read the i-th part");
index_t input_shape_default[] = {3, 224, 224};
DMLC_DECLARE_FIELD(input_shape)
.set_default(TShape(input_shape_default, input_shape_default + 3))
Expand Down Expand Up @@ -173,12 +175,12 @@ class ImageRecordIOParser {
inline void ImageRecordIOParser::Init(
const std::vector<std::pair<std::string, std::string> >& kwargs) {
// initialize parameter
std::vector<std::pair<std::string, std::string> > kwargs_left;
// init image rec param
kwargs_left = param_.InitAllowUnknown(kwargs);
param_.InitAllowUnknown(kwargs);
int maxthread, threadget;
#pragma omp parallel
{
// why ? (muli)
maxthread = std::max(omp_get_num_procs() / 2 - 1, 1);
}
param_.nthread = std::min(maxthread, param_.nthread);
Expand All @@ -194,12 +196,6 @@ inline void ImageRecordIOParser::Init(
prnds_.push_back(new common::RANDOM_ENGINE((i + 1) * kRandMagic));
}

// handling for hadoop
const char *ps_rank = getenv("PS_RANK");
if (ps_rank != NULL) {
param_.dist_worker_rank = atoi(ps_rank);
}

if (param_.path_imglist.length() != 0) {
label_map_ = new ImageLabelMap(param_.path_imglist.c_str(),
param_.label_width, param_.silent != 0);
Expand All @@ -208,15 +204,10 @@ inline void ImageRecordIOParser::Init(
}
CHECK(param_.path_imgrec.length() != 0)
<< "ImageRecordIOIterator: must specify image_rec";
#if MSHADOW_DIST_PS
param_.dist_num_worker = ::ps::RankSize();
param_.dist_worker_rank = ::ps::MyRank();
LOG(INFO) << "rank " << param_.dist_worker_rank
<< " in " << param_.dist_num_worker;
#endif
source_ = dmlc::InputSplit::Create
(param_.path_imgrec.c_str(), param_.dist_worker_rank,
param_.dist_num_worker, "recordio");

source_ = dmlc::InputSplit::Create(
param_.path_imgrec.c_str(), param_.part_index,
param_.num_parts, "recordio");
// use 8 MB chunk when possible (8 << 20)
source_->HintChunkSize(8 << 20UL);
}
Expand Down Expand Up @@ -281,25 +272,22 @@ struct ImageRecordParam: public dmlc::Parameter<ImageRecordParam> {
}
};


// iterator on image recordio
class ImageRecordIter : public IIterator<DataInst> {
public:
ImageRecordIter()
: data_(NULL) {
}
ImageRecordIter() : data_(NULL) { }
virtual ~ImageRecordIter(void) {
// tear down the threaded iterator before freeing the buffer below
// NOTE(review): presumably Destroy() stops the producer side of iter_ - confirm
iter_.Destroy();
// data_ is initialised to NULL by the constructor and may still be NULL here;
// deleting a NULL pointer is a no-op, so no guard is needed
delete data_;
}
virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
std::vector<std::pair<std::string, std::string> > kwargs_left;
// init image rec param
kwargs_left = param_.InitAllowUnknown(kwargs);
param_.InitAllowUnknown(kwargs);
// use the kwarg to init parser
parser_.Init(kwargs);
// init thread iter
// prefetch at most 4 minibatches
iter_.set_max_capacity(4);
// init thread iter
iter_.Init([this](std::vector<InstVector> **dptr) {
if (*dptr == NULL) {
*dptr = new std::vector<InstVector>();
Expand All @@ -309,14 +297,16 @@ class ImageRecordIter : public IIterator<DataInst> {
[this]() { parser_.BeforeFirst(); });
// Check Meanfile
if (param_.mean_img.length() != 0) {
dmlc::Stream *fi = dmlc::Stream::Create(param_.mean_img.c_str(), "r", true);
dmlc::Stream *fi =
dmlc::Stream::Create(param_.mean_img.c_str(), "r", true);
if (fi == NULL) {
this->CreateMeanImg();
} else {
delete fi;
}
}
inst_ptr_ = 0;
shuffle_ = param_.shuffle;
}
virtual void BeforeFirst(void) {
iter_.BeforeFirst();
Expand Down Expand Up @@ -410,6 +400,7 @@ class ImageRecordIter : public IIterator<DataInst> {
// mean image
mshadow::TensorContainer<cpu, 3> meanimg_;
};

DMLC_REGISTER_PARAMETER(ImageRecParserParam);
DMLC_REGISTER_PARAMETER(ImageRecordParam);
MXNET_REGISTER_IO_CHAINED_ITER(ImageRecordIter, ImageRecordIter, BatchAdaptIter)
Expand Down

0 comments on commit 7cbfd57

Please sign in to comment.