From 49136426c4ff080dc47045891d465626a139b6c1 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 21 Feb 2015 22:12:56 -0800 Subject: [PATCH 1/4] debug mode test_on_server --- mshadow-ps/ps.h | 47 +++++++++++++++++++++++--- mshadow-ps/ps_local-inl.h | 70 +++++++++++++++++++++++++++++++++++---- mshadow-ps/thread_util.h | 4 +-- 3 files changed, 109 insertions(+), 12 deletions(-) diff --git a/mshadow-ps/ps.h b/mshadow-ps/ps.h index 6e6b08d2bd64..e93704a8fd03 100644 --- a/mshadow-ps/ps.h +++ b/mshadow-ps/ps.h @@ -85,7 +85,21 @@ class ISharedModel { * this is unique per device * \param devid the device id this tensor lies in */ - virtual void PullWait(int key, int devid = 0) = 0; + virtual void PullWait(int key, int devid) = 0; + /*! + * \brief check if the weight was correct on the current device + * + * \param data the data + * \param key the unique key to indicate the tensor + * this is unique per device + * \param devid the device id this tensor lies in + */ + template + inline void CheckWeight(Tensor data, + int key, + int devid) { + this->CheckWeight_(data.FlatTo2D(), key, devid); + } /*! * \brief push out a tensor to parameter server * this call is asynchronize and returns immediately @@ -100,7 +114,7 @@ class ISharedModel { template inline void Push(Tensor data, int key, - int devid = 0, + int devid, int priority = 0) { this->Push_(data.FlatTo2D(), key, devid, priority); } @@ -122,7 +136,7 @@ class ISharedModel { template inline void PullReq(Tensor data, int key, - int devid = 0, + int devid, int priority = 0, CallbackFunction callback = NULL, void *callback_arg = NULL) { @@ -155,6 +169,31 @@ class ISharedModel { this->PullReq(data, key, devid, priority, InvokeLambda_, calbk); } #endif // C++11 + + /*! 
+ * \brief set weight of corresponding key in server + * this is a debug function that is not necessarily + * implemented by the server + * \param data the weight content to set for the key + * \param key the unique key to indicate the tensor + * this is unique per device + * \param devid the device id this tensor lies in + */ + virtual void SetWeight_(Tensor data, + int key, + int devid) = 0; + /*! + * \brief check if the weight matches the server side + * this is a debug function that is not necessarily + * implemented by the server + * \param data the weight content to compare with the server side + * \param key the unique key to indicate the tensor + * this is unique per device + * \param devid the device id this tensor lies in + */ + virtual void CheckWeight_(Tensor data, + int key, + int devid) = 0; protected: /*! * \brief initialize a key with certain shape @@ -178,7 +217,7 @@ class ISharedModel { */ virtual void Push_(Tensor data, int key, - int devid = 0, + int devid, int priority = 0) = 0; /*! * \brief send a pull request, to pull parameter into data diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index fa092dc68bce..8964c2369dd8 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -38,6 +38,7 @@ class LocalModel : public ISharedModel { bigarray_bound = 1000 * 1000; nthread_reduction = 8; use_pin_memory = 1; + test_on_server = 0; destroy_signal = false; custom_server = NULL; } @@ -118,6 +119,9 @@ class LocalModel : public ISharedModel { if (!strcmp(name, "update_on_server")) { update_on_server = atoi(val); } + if (!strcmp(name, "test_on_server")) { + test_on_server = atoi(val); + } cfgvec.push_back(std::make_pair(std::string(name), std::string(val))); } @@ -214,6 +218,51 @@ class LocalModel : public ISharedModel { this->init_end = 1; } + // set weight + virtual void SetWeight_(Tensor data, + int key, + int devid) { + utils::Check(test_on_server != 0, + "must be in pair debug mode"); + PushEntry &e = push_map.GetRef(key); + Stream s;
push_lock.Lock(); + mshadow::Copy(e.weight, data, &s); + push_lock.Unlock(); + } + virtual void CheckWeight_(Tensor data, + int key, + int devid) { + utils::Check(test_on_server != 0, + "must be in pair debug mode"); + PushEntry &e = push_map.GetRef(key); + mshadow::TensorContainer tmp(false); + tmp.Resize(data.shape_); + Stream s; + push_lock.Lock(); + // copy data + mshadow::Copy(tmp, data, &s); + index_t count = tmp.shape_.Size(); + double diff = 0.0, ssum = 0.0, maxdiff = 0.0; + index_t mxidx = 0; + for (index_t i = 0; i < count; ++i) { + double d = std::abs(tmp.dptr_[i] - e.weight.dptr_[i]); + if (d > maxdiff) { + maxdiff = d; mxidx = i; + } + diff += d; + ssum += std::abs(tmp.dptr_[i]); + } + push_lock.Unlock(); + // relative absolute error + double rerr = diff / ssum; + if (rerr > 1e-5 || diff != diff) { + fprintf(stderr, "PSLocal:key=%d,dev=%d: err=%f, maxd[%u]=%f, diff=%f, ssum=%f\n", + key, devid, rerr, mxidx, maxdiff, diff, ssum); + } else { + fprintf(stderr, "PSLocal:key=%d,dev=%d:check pass\n", key, devid); + } + } protected: /*! \brief operation performed locally in PS */ enum LocalOp { @@ -230,7 +279,6 @@ class LocalModel : public ISharedModel { this->InitPullMap(key); this->InitPushMap(key, shape); } - virtual void Push_(Tensor data, int key, int devid, int priority) { PullEntry &e = pull_map.GetRef(key); @@ -305,7 +353,9 @@ class LocalModel : public ISharedModel { if (custom_server != NULL) { // intialize server, and ready for pullback custom_server->InitModel(key, weight.dptr_, weight.MSize()); - this->PullReady(weight, key); + if (update_on_server != 0) { + this->PullReady(weight, key); + } } } /*! 
@@ -327,8 +377,13 @@ class LocalModel : public ISharedModel { if (custom_server != NULL) { this->ReduceSum(data); custom_server->Update(key, data[0].dptr_, data[0].MSize()); - PushEntry &e = push_map.GetRef(key); - this->PullReady(e.weight, key); + if (update_on_server != 0) { + PushEntry &e = push_map.GetRef(key); + this->PullReady(e.weight, key); + } else { + utils::Assert(test_on_server != 0, "test mode"); + this->PullReady(data[0], key); + } return; } switch (op) { @@ -346,7 +401,7 @@ class LocalModel : public ISharedModel { } virtual void InitCustomerServer(void) { - if (update_on_server != 0) { + if (update_on_server != 0 || test_on_server != 0) { custom_server = CreateModelUpdater(); for (size_t j = 0; j < cfgvec.size(); ++j) { custom_server->SetParam(cfgvec[j].first.c_str(), @@ -505,6 +560,8 @@ class LocalModel : public ISharedModel { int init_end; // whether perform update on serverside int update_on_server; + // debug option + int test_on_server; // use pinned memory int use_pin_memory; // number of reduction thread @@ -723,7 +780,8 @@ class LocalModel : public ISharedModel { push_lock.Lock(); if (e.copied.size() == 0) { e.Init(devices.size(), shape, - use_pin_memory != 0, update_on_server != 0); + use_pin_memory != 0, + update_on_server != 0 || test_on_server != 0); } this->ServerInitKey(e.weight, key); push_lock.Unlock(); diff --git a/mshadow-ps/thread_util.h b/mshadow-ps/thread_util.h index 607d69f83c3a..729bf7e65581 100644 --- a/mshadow-ps/thread_util.h +++ b/mshadow-ps/thread_util.h @@ -121,7 +121,7 @@ class ThreadSafeMap { } inline TValue &GetRef(int key) { TValue *ret = this->Get(key); - utils::Assert(ret != NULL, "key does not exist"); + utils::Assert(ret != NULL, "key=%d does not exist", key); return *ret; } inline void Init(int key) { @@ -129,7 +129,7 @@ class ThreadSafeMap { if (map_.count(key) == 0) { map_[key] = new TValue(); } - lock_.Unlock(); + lock_.Unlock(); } private: From c9d84de890cba30ddc88d9b14e6a4d9dfddbac16 Mon Sep 17 00:00:00 
2001 From: Bing Xu Date: Mon, 23 Feb 2015 02:10:39 -0700 Subject: [PATCH 2/4] add copy between two type tensor --- mshadow/tensor_cpu-inl.h | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h index 240c65faffd6..b5d70fb10458 100644 --- a/mshadow/tensor_cpu-inl.h +++ b/mshadow/tensor_cpu-inl.h @@ -41,20 +41,20 @@ template<> inline void *AllocHost_(size_t size) { void *dptr; utils::Check(cudaMallocHost(&dptr, size, - cudaHostAllocPortable) == cudaSuccess, + cudaHostAllocPortable) == cudaSuccess, "AllocHost"); return dptr; } template<> inline void FreeHost_(void *dptr) { - cudaFreeHost(dptr); + cudaFreeHost(dptr); } #endif template<> inline void *AllocHost_(size_t size) { size_t pitch; - return sse2::AlignedMallocPitch(&pitch, size, 1); + return sse2::AlignedMallocPitch(&pitch, size, 1); } template<> inline void FreeHost_(void *dptr) { @@ -114,6 +114,21 @@ inline void Copy(Tensor _dst, memcpy(dst[y].dptr_, src[y].dptr_, sizeof(DType) * dst.size(1)); } } + +template +inline void Copy(Tensor _dst, + const Tensor &_src, + Stream *stream) { + utils::Check(_dst.shape_ == _src.shape_, "Copy:shape mismatch"); + Tensor dst = _dst.FlatTo2D(); + Tensor src = _src.FlatTo2D(); + for (index_t i = 0; i < dst.size(0); ++i) { + for (index_t j = 0; j < dst.size(1); ++j) { + dst[i][j] = static_cast(src[i][j]); + } + } +} + template inline void MapPlan(TRValue *dst, @@ -181,7 +196,7 @@ inline void MapReduceKeepLowest(TRValue *dst, ::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); Shape<2> eshape = expr::ShapeCheck::kDim, E> ::Check(exp.self()).FlatTo2D(); - Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self()); + Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self()); utils::Check(eshape[1] == dshape[0], "MapReduceKeepLowest::reduction dimension do not match"); utils::Check(eshape[0] != 0, "can not reduce over empty tensor"); @@ -207,7 +222,7 @@ inline void MapReduceKeepHighDim(TRValue 
*dst, typedef Shape::kDim> EShape; EShape eshape = expr::ShapeCheck::kDim, E> ::Check(exp.self()); - Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self()); + Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self()); utils::Check(eshape[dimkeep] == dshape[0], "MapReduceKeepHighDim::reduction dimension do not match"); // use equvalent form From 7cb37fe3c35332698e20dede4070bf4af6fed832 Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 23 Feb 2015 09:48:55 -0800 Subject: [PATCH 3/4] add copy constructor to tensor container --- mshadow/tensor_container.h | 34 +++++++++++++++++++++++++++++++++- mshadow/tensor_cpu-inl.h | 14 -------------- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/mshadow/tensor_container.h b/mshadow/tensor_container.h index 19ebc2658040..6be47ec5222d 100644 --- a/mshadow/tensor_container.h +++ b/mshadow/tensor_container.h @@ -53,6 +53,24 @@ class TensorContainer: public Tensor { this->AllocByShape(shape); (*this) = initv; } + /*! + * \brief copy constructor + * \param src source value + */ + TensorContainer + (const TensorContainer &src) + : pad_(src.pad_) { + this->dptr_ = data_.dptr_ = NULL; + this->shape_[0] = 0; + this->stride_ = 0; + this->data_.stride_ = 0; + this->data_.shape_[0] = 0; + this->stream_ = src.stream_; + if (src.dptr_ != NULL) { + this->AllocByShape(src.shape_); + mshadow::Copy(*this, src, this->stream_); + } + } ~TensorContainer(void) { this->FreeSpace(); } @@ -109,10 +127,24 @@ class TensorContainer: public Tensor { Copy(*this, tmp, &stream); mshadow::FreeSpace(&tmp); } + /*! 
+ * \brief assign operator from TensorContainer + * \param src source value + */ + inline TensorContainer &operator= + (const TensorContainer &src) { + this->pad_ = src.pad_; + this->stream_ = src.stream_; + if (src.dptr_ != NULL) { + this->Resize(src.shape_); + mshadow::Copy(*this, src, this->stream_); + } + return *this; + } /*!\brief functions to fit expression template */ inline Tensor &operator=(DType s) { return this->__assign(s); - } + } /*!\brief functions to fit expression template */ template inline Tensor & diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h index b5d70fb10458..1ec5fa2c3ad5 100644 --- a/mshadow/tensor_cpu-inl.h +++ b/mshadow/tensor_cpu-inl.h @@ -115,20 +115,6 @@ inline void Copy(Tensor _dst, } } -template -inline void Copy(Tensor _dst, - const Tensor &_src, - Stream *stream) { - utils::Check(_dst.shape_ == _src.shape_, "Copy:shape mismatch"); - Tensor dst = _dst.FlatTo2D(); - Tensor src = _src.FlatTo2D(); - for (index_t i = 0; i < dst.size(0); ++i) { - for (index_t j = 0; j < dst.size(1); ++j) { - dst[i][j] = static_cast(src[i][j]); - } - } -} - template inline void MapPlan(TRValue *dst, From 9afa79ee03381b9878eb13240c4a3bf039566ed1 Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 23 Feb 2015 15:24:26 -0800 Subject: [PATCH 4/4] remove test check --- mshadow-ps/ps_local-inl.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index 8964c2369dd8..f61ac3896ea1 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -222,8 +222,6 @@ class LocalModel : public ISharedModel { virtual void SetWeight_(Tensor data, int key, int devid) { - utils::Check(test_on_server != 0, - "must be in pair debug mode"); PushEntry &e = push_map.GetRef(key); Stream s; push_lock.Lock();