
Merge Baidu Changes into github #48

Merged 10 commits on Sep 8, 2016
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8)
project(paddle CXX C)
set(PADDLE_MAJOR_VERSION 0)
set(PADDLE_MINOR_VERSION 8)
set(PADDLE_PATCH_VERSION 0b)
set(PADDLE_PATCH_VERSION 0b0)
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})

set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
16 changes: 8 additions & 8 deletions doc/build/docker_install.md
@@ -8,12 +8,12 @@ Docker is a tool designed to make it easier to create, deploy, and run applicati
### PaddlePaddle Docker images
There are six Docker images:

- paddledev/paddle:latest-cpu: PaddlePaddle CPU binary image.
- paddledev/paddle:latest-gpu: PaddlePaddle GPU binary image.
- paddledev/paddle:latest-cpu-devel: PaddlePaddle CPU binary image plus source code.
- paddledev/paddle:latest-gpu-devel: PaddlePaddle GPU binary image plus source code.
- paddledev/paddle:latest-cpu-demo: PaddlePaddle CPU binary image plus source code and demo
- paddledev/paddle:latest-gpu-demo: PaddlePaddle GPU binary image plus source code and demo
- paddledev/paddle:cpu-latest: PaddlePaddle CPU binary image.
- paddledev/paddle:gpu-latest: PaddlePaddle GPU binary image.
- paddledev/paddle:cpu-devel-latest: PaddlePaddle CPU binary image plus source code.
- paddledev/paddle:gpu-devel-latest: PaddlePaddle GPU binary image plus source code.
- paddledev/paddle:cpu-demo-latest: PaddlePaddle CPU binary image plus source code and demo
- paddledev/paddle:gpu-demo-latest: PaddlePaddle GPU binary image plus source code and demo

Tags with latest will be replaced by a released version.

Expand All @@ -23,15 +23,15 @@ You have to install Docker in your machine which has linux kernel version 3.10+

You can use ```docker pull``` to download images first, or just launch a container with ```docker run```:
```bash
docker run -it paddledev/paddle:lastest-cpu
docker run -it paddledev/paddle:cpu-latest
```

If you want to launch container with GPU support, you need to set some environment variables at the same time:

```bash
export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
docker run -it paddledev/paddle:latest-gpu
docker run -it paddledev/paddle:gpu-latest
```

### Notice
2 changes: 1 addition & 1 deletion doc/demo/imagenet_model/resnet_model.md
@@ -165,7 +165,7 @@ We provide both C++ and Python interfaces to extract features. The following exa

### C++ Interface

First, specify image data list in `define_py_data_sources` in the config, see example `demo/model_zoo/resnet/resnet.py`.
First, specify image data list in `define_py_data_sources2` in the config, see example `demo/model_zoo/resnet/resnet.py`.

```
train_list = 'train.list' if not is_test else None
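For context, a `define_py_data_sources2` declaration in a Paddle v1-style config looks roughly like the sketch below. This is an illustration only, not taken from `demo/model_zoo/resnet/resnet.py`; the `module` and `obj` values are hypothetical placeholders for the data-provider module and function.

```
# Illustrative config fragment; 'example' and 'process' are hypothetical names.
define_py_data_sources2(
    train_list='train.list' if not is_test else None,  # image list for training
    test_list='test.list',                             # image list for testing
    module='example',   # hypothetical module defining the data provider
    obj='process')      # hypothetical data-provider function in that module
```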
2 changes: 1 addition & 1 deletion doc/demo/rec/ml_regression.rst
@@ -257,7 +257,7 @@ In these network, we use several api in `trainer_config_helpers
* Text Convolution Pooling Layer, `text_conv_pool
<../../ui/api/trainer_config_helpers/networks.html
#trainer_config_helpers.networks.text_conv_pool>`_
* Declare Python Data Sources, `define_py_data_sources
* Declare Python Data Sources, `define_py_data_sources2
<../../ui/api/trainer_config_helpers/data_sources.html>`_

Data Provider
8 changes: 4 additions & 4 deletions doc/ui/predict/predict_sample.py
@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from py_paddle import swig_paddle, DataProviderWrapperConverter
from paddle.trainer.PyDataProviderWrapper import DenseSlot
from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import dense_vector
from paddle.trainer.config_parser import parse_config

TEST_DATA = [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -89,12 +89,12 @@


def main():
conf = parse_config("./mnist_model/trainer_config.conf.norm", "")
conf = parse_config("./mnist_model/trainer_config.py", "")
print conf.data_config.load_data_args
network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
assert isinstance(network, swig_paddle.GradientMachine) # For code hint.
network.loadParameters("./mnist_model/")
converter = DataProviderWrapperConverter(False, [DenseSlot(784)])
converter = DataProviderConverter([dense_vector(784)])
inArg = converter(TEST_DATA)
print network.forwardTest(inArg)

30 changes: 19 additions & 11 deletions doc/ui/predict/swig_py_paddle_en.rst
@@ -10,27 +10,35 @@ SWIG. The main steps of predict values in python are:
* Predict

Here is a sample python script that shows the typical prediction process for the
MNIST classification problem.
MNIST classification problem. The complete sample code can be found at
:code:`src_root/doc/ui/predict/predict_sample.py`.

.. literalinclude:: ./predict_sample.py
:language: python
:linenos:
:lines: 15-18,90-100,101-104

The module that does most of the work is py_paddle.swig_paddle; it is
generated by SWIG and has complete documentation. For more details you can use
Python's :code:`help()` function. Let's walk through the above python script:

* At the beginning, initialize PaddlePaddle with command line arguments(line 90).
* Parse the configuration file that is used in training(line 93).
* Create a neural network at line 95 according the parsed configuration, then
load the trained parameters from model at line 97.
* A utility class for data transformation is created at line 98.
* At the beginning, use :code:`swig_paddle.initPaddle()` to initialize
PaddlePaddle with command line arguments. For more about command line
arguments, see `Command Line Arguments <../cmd_argument/detail_introduction.html>`_.
* Parse the configuration file that is used in training with :code:`parse_config()`.
Because the data to predict usually has no label, and the desired output of
prediction is normally the output layer rather than the cost layer, you should
modify the configuration file accordingly before using it for prediction.
* Create a neural network with
:code:`swig_paddle.GradientMachine.createFromConfigProto()`, which takes the
parsed configuration :code:`conf.model_config` as argument. Then load the
trained parameters from the model with :code:`network.loadParameters()`.
* Create a data converter object of utility class :code:`DataProviderConverter`.
- Note: As swig_paddle can only accept C++ matrices, we offer a utility
class DataProviderWraaperConverter that can accept the same input data with
PyDataProviderWrapper, for more information please refer to document
class DataProviderConverter that can accept the same input data with
PyDataProvider2, for more information please refer to document
of `PyDataProvider2 <../data_provider/pydataprovider2.html>`_.
* Do the prediction and output the result at line 100, forwardTest is another
utility class that directly takes the activations of the output layer.
* Do the prediction with :code:`forwardTest()`, which takes the converted
input data and outputs the activations of the output layer.

Here is a typical output:

3 changes: 3 additions & 0 deletions paddle/CMakeLists.txt
@@ -7,6 +7,9 @@ add_subdirectory(pserver)
add_subdirectory(trainer)
add_subdirectory(scripts)

configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_SOURCE_DIR}/setup.py)

if(WITH_PREDICT_SDK)
add_subdirectory(predict)
endif()
33 changes: 13 additions & 20 deletions paddle/cuda/src/hl_cuda_matrix.cu
@@ -266,25 +266,21 @@ template<int blockSize>
__global__ void KeMatrixClassificationError(real* in_A,
int* in_B,
real* out_C,
int dimM,
int dimN) {
__shared__ real max_s[blockSize];
__shared__ int max_l[blockSize];
int cnt = (dimN + blockSize -1) / blockSize;
int tid = threadIdx.x;
int lmt = tid;
int index = 0;
real t;
const int tid = threadIdx.x;
const int rowId = blockIdx.x;

max_s[tid] = -1e30f;
for (int ii = 0; ii < cnt && lmt < dimN; ii++) {
index = blockIdx.y*dimN + lmt;
t = in_A[index];
if (max_s[tid] < t) {
max_s[tid] = t;
max_l[tid] = lmt;
in_A += rowId * dimN;
real tmp;
for (int colId = tid; colId < dimN; colId += blockSize) {
tmp = in_A[colId];
if (max_s[tid] < tmp) {
max_s[tid] = tmp;
max_l[tid] = colId;
}
lmt += blockSize;
}
__syncthreads();

@@ -300,7 +296,7 @@ __global__ void KeMatrixClassificationError(real* in_A,
__syncthreads();

if (tid == 0) {
out_C[blockIdx.y] = (max_l[0] == in_B[blockIdx.y] ? 0 : 1.0f);
out_C[rowId] = (max_l[0] == in_B[rowId] ? 0 : 1.0f);
}
}

@@ -313,12 +309,9 @@ void hl_matrix_classification_error(real* A_d,
CHECK_NOTNULL(B_d);
CHECK_NOTNULL(C_d);

int blocksX = 1;
int blocksY = dimM;
dim3 threads(1024, 1);
dim3 grid(blocksX, blocksY);
KeMatrixClassificationError<1024><<< grid, threads, 0, STREAM_DEFAULT >>>
(A_d, B_d, C_d, dimM, dimN);
// each sample is calculated by one block
KeMatrixClassificationError<1024><<< dimM, 1024, 0, STREAM_DEFAULT >>>
(A_d, B_d, C_d, dimN);
CHECK_SYNC("hl_matrix_classification_error");
}

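The refactored kernel assigns one block per sample (row): each thread scans columns with stride `blockSize`, a shared-memory reduction finds the row's argmax, and thread 0 writes 1.0 to `out_C[row]` when the argmax disagrees with the label. A minimal CPU reference of those semantics, as a Python sketch (not part of the PR; on ties it keeps the first maximum, which may differ from the GPU reduction order):

```python
def classification_error(A, B):
    """Reference semantics of KeMatrixClassificationError.

    A: dimM rows of dimN scores each.
    B: dimM integer labels.
    Returns C with C[row] == 0.0 if argmax(A[row]) == B[row], else 1.0.
    """
    C = []
    for row, label in zip(A, B):
        # Sequential stand-in for the strided scan + shared-memory reduction.
        best_col = max(range(len(row)), key=lambda j: row[j])
        C.append(0.0 if best_col == label else 1.0)
    return C

print(classification_error([[0.1, 0.7, 0.2], [0.9, 0.05, 0.05]], [1, 2]))
# -> [0.0, 1.0]
```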
81 changes: 14 additions & 67 deletions paddle/gserver/layers/CRFLayer.cpp
@@ -47,104 +47,49 @@ bool CRFLayer::init(const LayerMap& layerMap,
// We don't need sequenceStartPositions because each sample of output_ is
// for the cost of one sequence.
setNeedSequenceInfo(false);
if (useGpu_) {
tmpCpuInput_.reserve(inputLayers_.size());
for (size_t i = 0; i < inputLayers_.size(); i++) {
tmpCpuInput_.push_back(Argument());
}
}

return true;
}

void CRFLayer::forward(PassType passType) {
Layer::forward(passType);
if (useGpu_) {
for (size_t i = 0; i < inputLayers_.size(); i++) {
tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
}
VectorPtr cpuParameterValue;
VectorPtr cpuParameterGradient;
cpuParameterValue =
Vector::create(parameter_->getBuf(PARAMETER_VALUE)->getSize(), false);
cpuParameterValue->
copyFrom(*parameter_->getBuf(PARAMETER_VALUE), HPPL_STREAM_1);
if (parameter_->getBuf(PARAMETER_GRADIENT)) {
cpuParameterGradient =
Vector::create(parameter_->getBuf(PARAMETER_GRADIENT)->getSize(),
false);
cpuParameterGradient->
copyFrom(*parameter_->getBuf(PARAMETER_GRADIENT), HPPL_STREAM_1);
} else {
cpuParameterGradient = nullptr;
}
forwardImp(tmpCpuInput_[0], tmpCpuInput_[1], cpuParameterValue,
cpuParameterGradient);
parameter_->getBuf(PARAMETER_VALUE)->copyFrom(*cpuParameterValue,
HPPL_STREAM_1);
if (parameter_->getBuf(PARAMETER_GRADIENT)) {
parameter_->getBuf(PARAMETER_GRADIENT)->copyFrom(*cpuParameterGradient,
HPPL_STREAM_1);
}
} else {
forwardImp(getInput(0), getInput(1), parameter_->getBuf(PARAMETER_VALUE),
parameter_->getBuf(PARAMETER_GRADIENT));
}
}

void CRFLayer::forwardImp(const Argument&output,
const Argument& label,
VectorPtr parameterValue,
VectorPtr parameterGradient) {
CHECK(!useGpu_) << "GPU is not supported";

const Argument& output = getInput(0);
const Argument& label = getInput(1);
CHECK(label.sequenceStartPositions);
CHECK(label.ids);

int batchSize = output.getBatchSize();
size_t numSequences = label.sequenceStartPositions->getSize() - 1;
resizeOutput(numSequences, 1);
std::vector<real> out(numSequences);

const int* starts = label.sequenceStartPositions->getData(false);
CHECK_EQ(starts[numSequences], batchSize);
VectorPtr cpuParameterValue;
VectorPtr cpuParameterGradient;


for (size_t i = 0; i < numSequences; ++i) {
if (i >= crfs_.size()) {
crfs_.emplace_back(numClasses_,
parameterValue->getData(),
parameterGradient
? parameterGradient->getData()
parameter_->getBuf(PARAMETER_VALUE)->getData(),
parameter_->getBuf(PARAMETER_GRADIENT)
? parameter_->getBuf(PARAMETER_GRADIENT)->getData()
: nullptr);
}
out[i] = crfs_[i].forward(
output_.value->getData()[i] = crfs_[i].forward(
output.value->getData() + numClasses_ * starts[i],
label.ids->getData() + starts[i], starts[i + 1] - starts[i]);
}
output_.value->copyFrom(out.data(), numSequences);

if (weightLayer_) {
const MatrixPtr& weight = getInputValue(*weightLayer_);
getOutputValue()->dotMul(*getOutputValue(), *weight);
}
}

void CRFLayer::backward(const UpdateCallback &callback) {
(void)callback;
if (useGpu_) {
backwardImp(callback, tmpCpuInput_[0], tmpCpuInput_[1]);
const_cast<Argument&>(getInput(0)).
resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_1);
const_cast<Argument&>(getInput(1)).
resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_1);

} else {
backwardImp(callback, getInput(0), getInput(1));
}
}

void CRFLayer::backwardImp(const UpdateCallback& callback,
const Argument&output,
const Argument& label) {
const Argument& output = getInput(0);
const Argument& label = getInput(1);
const int* starts = label.sequenceStartPositions->getData(false);
int numSequences = label.sequenceStartPositions->getSize() - 1;

@@ -159,9 +104,11 @@ void CRFLayer::backwardImp(const UpdateCallback& callback,
grad->mulScalar(weight);
}
}

if (coeff_ != real(1.0f)) {
output.grad->mulScalar(coeff_);
}

parameter_->incUpdate(callback);
}

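The CPU-only forward above walks the batch via `sequenceStartPositions`: sequence `i` covers rows `starts[i] .. starts[i+1]-1`, those rows are fed to `crfs_[i].forward`, and one cost per sequence lands in `output_.value`. The slicing logic can be sketched in Python (illustrative only; `seq_cost` is a stand-in for `LinearChainCRF::forward`, not its real implementation):

```python
def per_sequence_costs(batch_rows, labels, starts, seq_cost):
    """Split a flat batch by sequence start positions and score each sequence.

    batch_rows: per-timestep emission rows for the whole batch.
    labels:     per-timestep label ids for the whole batch.
    starts:     sequence start offsets; starts[-1] == len(batch_rows).
    seq_cost:   callable standing in for LinearChainCRF::forward.
    """
    # Mirrors CHECK_EQ(starts[numSequences], batchSize) in forward().
    assert starts[-1] == len(batch_rows)
    num_sequences = len(starts) - 1
    return [
        seq_cost(batch_rows[starts[i]:starts[i + 1]],
                 labels[starts[i]:starts[i + 1]])
        for i in range(num_sequences)
    ]

# Toy stand-in cost (sequence length), just to show the slicing:
costs = per_sequence_costs(
    batch_rows=[[0.1], [0.2], [0.3], [0.4], [0.5]],
    labels=[0, 1, 0, 1, 1],
    starts=[0, 2, 5],
    seq_cost=lambda rows, ids: float(len(rows)))
print(costs)  # -> [2.0, 3.0]
```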
5 changes: 0 additions & 5 deletions paddle/gserver/layers/CRFLayer.h
@@ -32,19 +32,14 @@ class CRFLayer : public Layer {
explicit CRFLayer(const LayerConfig& config) : Layer(config) {}
virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
virtual void forward(PassType passType);
void forwardImp(const Argument&output, const Argument& label,
VectorPtr parameterValue, VectorPtr parameterGradient);
virtual void backward(const UpdateCallback& callback);
void backwardImp(const UpdateCallback& callback, const Argument&output,
const Argument& label);

protected:
size_t numClasses_;
ParameterPtr parameter_;
std::vector<LinearChainCRF> crfs_;
LayerPtr weightLayer_; // weight for each sequence
real coeff_; // weight for the layer
std::vector<Argument> tmpCpuInput_;
};

} // namespace paddle