From 65969dad641a95a1ac0f744b11c1166a173d169b Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Fri, 16 Jun 2017 16:29:08 +0800
Subject: [PATCH 01/79] Add DetectionOutputLayer and MultiBoxLossLayer.

---
 .../gserver/layers/DetectionOutputLayer.cpp   | 154 ++++++++
 paddle/gserver/layers/DetectionOutputLayer.h  |  81 ++++
 paddle/gserver/layers/MultiBoxLossLayer.cpp   | 365 ++++++++++++++++++
 paddle/gserver/layers/MultiBoxLossLayer.h     | 103 +++++
 paddle/gserver/tests/CMakeLists.txt           |   7 +
 paddle/gserver/tests/LayerGradUtil.cpp        |  25 ++
 paddle/gserver/tests/LayerGradUtil.h          |  18 +-
 paddle/gserver/tests/test_DetectionOutput.cpp | 191 +++++++++
 paddle/gserver/tests/test_LayerGrad.cpp       |  64 +++
 proto/ModelConfig.proto                       |  25 ++
 python/paddle/trainer/config_parser.py        |  46 +++
 .../paddle/trainer_config_helpers/layers.py   | 161 ++++++++
 12 files changed, 1239 insertions(+), 1 deletion(-)
 create mode 100644 paddle/gserver/layers/DetectionOutputLayer.cpp
 create mode 100644 paddle/gserver/layers/DetectionOutputLayer.h
 create mode 100644 paddle/gserver/layers/MultiBoxLossLayer.cpp
 create mode 100644 paddle/gserver/layers/MultiBoxLossLayer.h
 create mode 100644 paddle/gserver/tests/test_DetectionOutput.cpp

diff --git a/paddle/gserver/layers/DetectionOutputLayer.cpp b/paddle/gserver/layers/DetectionOutputLayer.cpp
new file mode 100644
index 0000000000..2a4d7f8b5b
--- /dev/null
+++ b/paddle/gserver/layers/DetectionOutputLayer.cpp
@@ -0,0 +1,154 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DetectionOutputLayer.h"
+
+namespace paddle {
+
+REGISTER_LAYER(detection_output, DetectionOutputLayer);
+
+bool DetectionOutputLayer::init(const LayerMap& layerMap,
+                                const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  auto& layerConf = config_.inputs(0).detection_output_conf();
+  numClasses_ = layerConf.num_classes();
+  inputNum_ = layerConf.input_num();
+  nmsThreshold_ = layerConf.nms_threshold();
+  confidenceThreshold_ = layerConf.confidence_threshold();
+  nmsTopK_ = layerConf.nms_top_k();
+  keepTopK_ = layerConf.keep_top_k();
+  backgroundId_ = layerConf.background_id();
+  return true;
+}
+
+void DetectionOutputLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
+
+  locSizeSum_ = 0;
+  confSizeSum_ = 0;
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
+    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
+    locSizeSum_ += inLoc->getElementCnt();
+    confSizeSum_ += inConf->getElementCnt();
+  }
+
+  Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      confTmpBuffer_, confSizeSum_ / numClasses_, numClasses_, false, useGpu_);
+  locBuffer_ = locTmpBuffer_;
+  confBuffer_ = confTmpBuffer_;
+
+  size_t locOffset = 0;
+  size_t confOffset = 0;
+  auto& layerConf = config_.inputs(0).detection_output_conf();
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
+    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
+
+    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
+    if (!height) height = layerConf.height();
+    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
+    if (!width) width = layerConf.width();
+    locOffset += appendWithPermute(*inLoc,
+                                   height,
+                                   width,
+                                   locSizeSum_,
+                                   locOffset,
+                                   batchSize,
+                                   *locBuffer_,
+                                   kNCHWToNHWC);
+    confOffset += appendWithPermute(*inConf,
+                                    height,
+                                    width,
+                                    confSizeSum_,
+                                    confOffset,
+                                    batchSize,
+                                    *confBuffer_,
+                                    kNCHWToNHWC);
+  }
+  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
+  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
+
+  MatrixPtr priorValue;
+  if (useGpu_) {
+    Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false);
+    Matrix::resizeOrCreate(
+        confCpuBuffer_, confSizeSum_ / numClasses_, numClasses_, false, false);
+    MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer());
+    Matrix::resizeOrCreate(
+        priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false);
+
+    locCpuBuffer_->copyFrom(*locTmpBuffer_);
+    confCpuBuffer_->copyFrom(*confTmpBuffer_);
+    priorCpuValue_->copyFrom(*priorTmpValue);
+
+    locBuffer_ = locCpuBuffer_;
+    confBuffer_ = confCpuBuffer_;
+    priorValue = priorCpuValue_;
+  } else {
+    priorValue = getInputValue(*getPriorBoxLayer());
+  }
+  confBuffer_->softmax(*confBuffer_);
+
+  size_t numPriors = priorValue->getElementCnt() / 8;
+  vector<vector<NormalizedBBox>> allDecodedBBoxes;
+  for (size_t n = 0; n < batchSize; ++n) {
+    vector<NormalizedBBox> decodedBBoxes;
+    for (size_t i = 0; i < numPriors; ++i) {
+      size_t priorOffset = i * 8;
+      size_t locPredOffset = n * numPriors * 4 + i * 4;
+      vector<NormalizedBBox> priorBBoxVec;
+      getBBoxFromPriorData(
+          priorValue->getData() + priorOffset, 1, priorBBoxVec);
+      vector<vector<real>> priorBBoxVar;
+      getBBoxVarFromPriorData(
+          priorValue->getData() + priorOffset, 1, priorBBoxVar);
+      vector<real> locPredData;
+      for (size_t j = 0; j < 4; ++j)
+        locPredData.push_back(*(locBuffer_->getData() + locPredOffset + j));
+      NormalizedBBox bbox =
+          decodeBBoxWithVar(priorBBoxVec[0], priorBBoxVar[0], locPredData);
+      decodedBBoxes.push_back(bbox);
+    }
+    allDecodedBBoxes.push_back(decodedBBoxes);
+  }
+
+  vector<map<size_t, vector<size_t>>> allIndices;
+  size_t numKept = getDetectionIndices(confBuffer_->getData(),
+                                       numPriors,
+                                       numClasses_,
+                                       backgroundId_,
+                                       batchSize,
+                                       confidenceThreshold_,
+                                       nmsTopK_,
+                                       nmsThreshold_,
+                                       keepTopK_,
+                                       allDecodedBBoxes,
+                                       &allIndices);
+
+  resetOutput(numKept, 7);
+  MatrixPtr outV = getOutputValue();
+  getDetectionOutput(confBuffer_->getData(),
+                     numKept,
+                     numPriors,
+                     numClasses_,
+                     batchSize,
+                     allIndices,
+                     allDecodedBBoxes,
+                     *outV);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/DetectionOutputLayer.h b/paddle/gserver/layers/DetectionOutputLayer.h
new file mode 100644
index 0000000000..38271cb054
--- /dev/null
+++ b/paddle/gserver/layers/DetectionOutputLayer.h
@@ -0,0 +1,81 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <vector>
+#include "DetectionUtil.h"
+#include "Layer.h"
+
+using std::vector;
+using std::map;
+using std::pair;
+
+namespace paddle {
+
+/**
+ * The detection output layer for an SSD detection task. This layer applies
+ * non-maximum suppression (NMS) to all of the predicted bounding boxes and
+ * keeps the top-K bounding boxes.
+ * - Input: This layer needs three input layers: the first input layer
+ *   is the priorbox layer. The remaining two input layers are convolution
+ *   layers that generate the bbox location offsets and the classification
+ *   confidences.
+ * - Output: The predicted bounding box locations.
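+ *   As a sketch of the layout (exercised by test_DetectionOutput.cpp),
+ *   each kept detection is one output row of seven values:
+ *   | image_id | label | confidence | xmin | ymin | xmax | ymax |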
+ */
+
+class DetectionOutputLayer : public Layer {
+public:
+  explicit DetectionOutputLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+
+  void backward(const UpdateCallback& callback = nullptr) {}
+
+protected:
+  inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
+
+  inline LayerPtr getLocInputLayer(size_t index) {
+    return inputLayers_[1 + index];
+  }
+
+  inline LayerPtr getConfInputLayer(size_t index) {
+    return inputLayers_[1 + inputNum_ + index];
+  }
+
+private:
+  size_t numClasses_;  // number of classes
+  size_t inputNum_;    // number of input layers
+  real nmsThreshold_;
+  real confidenceThreshold_;
+  size_t nmsTopK_;
+  size_t keepTopK_;
+  size_t backgroundId_;
+
+  size_t locSizeSum_;
+  size_t confSizeSum_;
+
+  MatrixPtr locBuffer_;
+  MatrixPtr confBuffer_;
+  MatrixPtr locTmpBuffer_;
+  MatrixPtr confTmpBuffer_;
+  MatrixPtr priorCpuValue_;
+  MatrixPtr locCpuBuffer_;
+  MatrixPtr confCpuBuffer_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MultiBoxLossLayer.cpp b/paddle/gserver/layers/MultiBoxLossLayer.cpp
new file mode 100644
index 0000000000..27a2cc3fa4
--- /dev/null
+++ b/paddle/gserver/layers/MultiBoxLossLayer.cpp
@@ -0,0 +1,365 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MultiBoxLossLayer.h"
+#include <float.h>
+#include <vector>
+#include "DataLayer.h"
+
+using std::vector;
+using std::map;
+using std::pair;
+
+namespace paddle {
+
+REGISTER_LAYER(multibox_loss, MultiBoxLossLayer);
+
+bool MultiBoxLossLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  auto layerConf = config_.inputs(0).multibox_loss_conf();
+  numClasses_ = layerConf.num_classes();
+  inputNum_ = layerConf.input_num();
+  overlapThreshold_ = layerConf.overlap_threshold();
+  negPosRatio_ = layerConf.neg_pos_ratio();
+  negOverlap_ = layerConf.neg_overlap();
+  backgroundId_ = layerConf.background_id();
+  return true;
+}
+
+void MultiBoxLossLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
+  resetOutput(batchSize, 1);
+
+  // total sizes of the location data and confidence score data
+  locSizeSum_ = 0;
+  confSizeSum_ = 0;
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
+    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
+    locSizeSum_ += inLoc->getElementCnt();
+    confSizeSum_ += inConf->getElementCnt();
+  }
+
+  // locBuffer layout:
+  // | xmin1 | ymin1 | xmax1 | ymax1 | xmin2 ......
+  Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
+  locBuffer_ = locTmpBuffer_;
+
+  // confBuffer layout:
+  // | class1 score | class2 score | ... | classN score | class1 score | ......
+  Matrix::resizeOrCreate(confTmpBuffer_, 1, confSizeSum_, false, useGpu_);
+  confBuffer_ = confTmpBuffer_;
+
+  // concatenate location data and confidence score data
+  size_t locOffset = 0;
+  size_t confOffset = 0;
+  auto& layerConf = config_.inputs(0).multibox_loss_conf();
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
+    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
+    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
+    if (!height) height = layerConf.height();
+    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
+    if (!width) width = layerConf.width();
+    locOffset += appendWithPermute(*inLoc,
+                                   height,
+                                   width,
+                                   locSizeSum_,
+                                   locOffset,
+                                   batchSize,
+                                   *locBuffer_,
+                                   kNCHWToNHWC);
+    confOffset += appendWithPermute(*inConf,
+                                    height,
+                                    width,
+                                    confSizeSum_,
+                                    confOffset,
+                                    batchSize,
+                                    *confBuffer_,
+                                    kNCHWToNHWC);
+  }
+  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
+  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
+
+  // priorValue layout:
+  // | xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var |
+  // ymax1Var | xmin2 | ......
+  MatrixPtr priorValue;
+
+  // labelValue layout:
+  // | class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ......
+  MatrixPtr labelValue;
+
+  // Copy data from GPU to CPU when using GPU
+  if (useGpu_) {
+    Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false);
+    Matrix::resizeOrCreate(confCpuBuffer_, 1, confSizeSum_, false, false);
+    MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer());
+    Matrix::resizeOrCreate(
+        priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false);
+    MatrixPtr labelTmpValue = getInputValue(*getLabelLayer());
+    Matrix::resizeOrCreate(labelCpuValue_,
+                           labelTmpValue->getHeight(),
+                           labelTmpValue->getWidth(),
+                           false,
+                           false);
+
+    locCpuBuffer_->copyFrom(*locTmpBuffer_);
+    confCpuBuffer_->copyFrom(*confTmpBuffer_);
+    priorCpuValue_->copyFrom(*priorTmpValue);
+    labelCpuValue_->copyFrom(*labelTmpValue);
+
+    locBuffer_ = locCpuBuffer_;
+    confBuffer_ = confCpuBuffer_;
+    priorValue = priorCpuValue_;
+    labelValue = labelCpuValue_;
+  } else {
+    priorValue = getInputValue(*getPriorBoxLayer());
+    labelValue = getInputValue(*getLabelLayer());
+  }
+
+  // Get max scores for each prior bbox. Used in negative mining
+  vector<vector<real>> allMaxConfScore;
+  numPriors_ = priorValue->getElementCnt() / 8;
+  getMaxConfidenceScores(confBuffer_->getData(),
+                         batchSize,
+                         numPriors_,
+                         numClasses_,
+                         backgroundId_,
+                         &allMaxConfScore);
+
+  // Match prior bbox to groundtruth bbox
+  Argument label = getInput(*getLabelLayer());
+  const int* labelIndex = label.sequenceStartPositions->getData(false);
+  size_t seqNum = label.getNumSequences();
+  numMatches_ = 0;
+  numNegs_ = 0;
+  allMatchIndices_.clear();
+  allNegIndices_.clear();
+
+  pair<size_t, size_t> retPair = generateMatchIndices(*priorValue,
+                                                      numPriors_,
+                                                      *labelValue,
+                                                      labelIndex,
+                                                      seqNum,
+                                                      allMaxConfScore,
+                                                      batchSize,
+                                                      overlapThreshold_,
+                                                      negOverlap_,
+                                                      negPosRatio_,
+                                                      &allMatchIndices_,
+                                                      &allNegIndices_);
+  numMatches_ = retPair.first;
+  numNegs_ = retPair.second;
+
+  // BBox location L1 smooth loss
+  locLoss_ = 0.0;
+  if (numMatches_ >= 1) {
+    size_t count = 0;
+    MatrixPtr locLossOutput;
+    Matrix::resizeOrCreate(locLossOutput, numMatches_ * 4, 1, false, false);
+    Matrix::resizeOrCreate(locGTData_, numMatches_ * 4, 1, false, false);
+    Matrix::resizeOrCreate(locDiff_, numMatches_ * 4, 1, false, false);
+    locDiff_->zeroMem();
+    vector<real> locGTData;
+
+    for (size_t n = 0; n < batchSize; ++n) {
+      for (size_t i = 0; i < numPriors_; ++i) {
+        if (allMatchIndices_[n][i] == -1) continue;  // match none
+        size_t locOffset =
+            n * (locBuffer_->getElementCnt() / batchSize) + i * 4;
+        locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[0];
+        locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[1];
+        locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[2];
+        locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[3];
+
+        const int gtIdx = allMatchIndices_[n][i];
+        size_t priorOffset = i * 8;
+        vector<NormalizedBBox> priorBBoxVec;
+        getBBoxFromPriorData(
+            priorValue->getData() + priorOffset, 1, priorBBoxVec);
+        vector<vector<real>> priorBBoxVar;
+        getBBoxVarFromPriorData(
+            priorValue->getData() + priorOffset, 1, priorBBoxVar);
+        size_t labelOffset = (labelIndex[n] + gtIdx) * 6;
+        vector<NormalizedBBox> gtBBoxVec;
+        getBBoxFromLabelData(labelValue->getData() + labelOffset, 1, gtBBoxVec);
+        vector<real> gtEncode;
+        encodeBBoxWithVar(
+            priorBBoxVec[0], priorBBoxVar[0], gtBBoxVec[0], gtEncode);
+        locGTData.insert(locGTData.end(), gtEncode.begin(), gtEncode.end());
+      }
+    }
+    locGTData_->copyFrom(&locGTData[0], numMatches_ * 4);
+    locLossOutput->smoothL1(*locDiff_, *locGTData_, 0.0);
+    locLoss_ = locLossOutput->getSum() / numMatches_;
+  }
+
+  // BBox confidence softmax loss
+  confLoss_ = 0;
+  numConf_ = numMatches_ + numNegs_;
+  if (numConf_ >= 1) {
+    Matrix::resizeOrCreate(confProb_, numConf_, numClasses_, false, false);
+    IVector::resizeOrCreate(confGTData_, numConf_, false);
+    confProb_->zeroMem();
+    size_t count = 0;
+
+    vector<real> confPredData;
+    for (size_t n = 0; n < batchSize; ++n) {
+      for (size_t i = 0; i < numPriors_; ++i) {
+        if (allMatchIndices_[n][i] == -1) continue;
+        size_t labelOffset = (labelIndex[n] + allMatchIndices_[n][i]) * 6;
+        const int gtLabel = (labelValue->getData() + labelOffset)[0];
+        confGTData_->getData()[count] = gtLabel;
+        size_t confOffset = n * numPriors_ * numClasses_ + i * numClasses_;
+        for (size_t j = 0; j < numClasses_; ++j) {
+          confProb_->getData()[count * numClasses_ + j] =
+              (confBuffer_->getData() + confOffset)[j];
+          confPredData.push_back((confBuffer_->getData() + confOffset)[j]);
+        }
+        ++count;
+      }
+      // Negative mining samples
+      for (size_t i = 0; i < allNegIndices_[n].size(); ++i) {
+        confGTData_->getData()[count] = backgroundId_;
+        size_t confOffset =
+            n * numPriors_ * numClasses_ + allNegIndices_[n][i] * numClasses_;
+        for (size_t j = 0; j < numClasses_; ++j) {
+          confProb_->getData()[count * numClasses_ + j] =
+              (confBuffer_->getData() + confOffset)[j];
+          confPredData.push_back((confBuffer_->getData() + confOffset)[j]);
+        }
+        count++;
+      }
+    }
+    confProb_->softmax(*confProb_);
+    MatrixPtr confLossOutput;
+    Matrix::resizeOrCreate(confLossOutput, numConf_, 1, false, false);
+    confLossOutput->oneHotCrossEntropy(*confProb_, *confGTData_);
+    confLoss_ = confLossOutput->getSum() / numMatches_;
+  }
+  real loss = locLoss_ + confLoss_;
+  MatrixPtr outV = getOutputValue();
+  vector<real> tmp(batchSize, loss);
+  outV->copyFrom(&tmp[0], batchSize);
+}
+
+void MultiBoxLossLayer::backward(const UpdateCallback& callback) {
+  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
+  locBuffer_->zeroMem();
+  confBuffer_->zeroMem();
+
+  // Back propagate on location prediction
+  if (numMatches_ >= 1) {
+    MatrixPtr locDiffBuffer;
+    Matrix::resizeOrCreate(locDiffBuffer, numMatches_ * 4, 1, false, false);
+    locDiffBuffer->smoothL1Bp(*locDiff_, *locGTData_, 0.0);
+    locDiff_->copyFrom(*locDiffBuffer);
+    // scale gradient
+    for (size_t i = 0; i < numMatches_ * 4; ++i)
+      locDiff_->getData()[i] *= (1. / numMatches_);
+    // Copy gradient back
+    size_t count = 0;
+    for (size_t n = 0; n < batchSize; ++n)
+      for (size_t i = 0; i < numPriors_; ++i) {
+        if (allMatchIndices_[n][i] == -1) continue;
+        real* locDiffData = locBuffer_->getData() + n * numPriors_ * 4 + i * 4;
+        locDiffData[0] = (locDiff_->getData() + count * 4)[0];
+        locDiffData[1] = (locDiff_->getData() + count * 4)[1];
+        locDiffData[2] = (locDiff_->getData() + count * 4)[2];
+        locDiffData[3] = (locDiff_->getData() + count * 4)[3];
+        ++count;
+      }
+    CHECK_EQ(count, numMatches_);
+  }
+
+  if (numConf_ >= 1) {
+    for (size_t i = 0; i < numConf_; ++i)
+      confProb_->getData()[i * numClasses_ + confGTData_->getData()[i]] -= 1;
+    for (size_t i = 0; i < numConf_ * numClasses_; ++i)
+      confProb_->getData()[i] *= (1. / numMatches_);
+    size_t count = 0;
+    for (size_t n = 0; n < batchSize; ++n) {
+      for (size_t i = 0; i < numPriors_; ++i) {
+        if (allMatchIndices_[n][i] == -1) continue;
+        real* confDiffData = confBuffer_->getData() +
+                             n * numPriors_ * numClasses_ + i * numClasses_;
+        for (size_t j = 0; j < numClasses_; ++j)
+          confDiffData[j] = (confProb_->getData() + count * numClasses_)[j];
+        ++count;
+      }
+      for (size_t i = 0; i < allNegIndices_[n].size(); ++i) {
+        int idx = allNegIndices_[n][i];
+        real* confDiffData = confBuffer_->getData() +
+                             n * numPriors_ * numClasses_ + idx * numClasses_;
+        for (size_t j = 0; j < numClasses_; ++j)
+          confDiffData[j] = (confProb_->getData() + count * numClasses_)[j];
+        ++count;
+      }
+    }
+    CHECK_EQ(count, numConf_);
+  }
+  if (useGpu_) {
+    locTmpBuffer_->copyFrom(*locCpuBuffer_);
+    confTmpBuffer_->copyFrom(*confCpuBuffer_);
+    locBuffer_ = locTmpBuffer_;
+    confBuffer_ = confTmpBuffer_;
+  }
+  // copy back
+  size_t locOffset = 0;
+  size_t confOffset = 0;
+  auto layerConf = config_.inputs(0).multibox_loss_conf();
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLocG = getInputGrad(*getLocInputLayer(n));
+    const MatrixPtr inConfG = getInputGrad(*getConfInputLayer(n));
+    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
+    if (!height) height = layerConf.height();
+    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
+    if (!width) width = layerConf.width();
+
+    // NHWC to NCHW
+    MatrixPtr locGBuffer;
+    Matrix::resizeOrCreate(
+        locGBuffer, inLocG->getHeight(), inLocG->getWidth(), false, useGpu_);
+    MatrixPtr confGBuffer;
+    Matrix::resizeOrCreate(
+        confGBuffer, inConfG->getHeight(), inConfG->getWidth(), false, useGpu_);
+
+    locOffset += decomposeWithPermute(*locBuffer_,
+                                      height,
+                                      width,
+                                      locSizeSum_,
+                                      locOffset,
+                                      batchSize,
+                                      *locGBuffer,
+                                      kNHWCToNCHW);
+    inLocG->add(*locGBuffer);
+    confOffset += decomposeWithPermute(*confBuffer_,
+                                       height,
+                                       width,
+                                       confSizeSum_,
+                                       confOffset,
+                                       batchSize,
+                                       *confGBuffer,
+                                       kNHWCToNCHW);
+    inConfG->add(*confGBuffer);
+  }
+  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
+  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MultiBoxLossLayer.h b/paddle/gserver/layers/MultiBoxLossLayer.h
new file mode 100644
index 0000000000..9767fed7f1
--- /dev/null
+++ b/paddle/gserver/layers/MultiBoxLossLayer.h
@@ -0,0 +1,103 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "CostLayer.h"
+#include "DataLayer.h"
+#include "DetectionUtil.h"
+#include "Layer.h"
+
+using std::vector;
+using std::pair;
+
+namespace paddle {
+
+/**
+ * The multibox loss layer for an SSD detection task.
+ * The loss is composed of the location loss and the confidence loss.
+ * The location loss is a smooth L1 loss and the confidence loss is
+ * a softmax loss.
+ * - Input: This layer needs four input layers: the first input layer
+ *   is the priorbox layer and the second layer is the label layer.
+ *   The remaining two input layers are convolution layers that generate
+ *   the bbox location offsets and the classification confidences.
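+ *   As a rough sketch, the cost computed in forward() is
+ *     cost = locLoss + confLoss
+ *          = SmoothL1(locPred, encodedGTBBox) / numMatches
+ *          + CrossEntropy(softmax(confPred), gtLabel) / numMatches,
+ *   i.e. both terms are normalized by the number of matched priors.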
+ * - Output: The Single Shot Multibox Detection loss value.
+ * Reference:
+ *   Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
+ *   Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
+ */
+
+class MultiBoxLossLayer : public CostLayer {
+public:
+  explicit MultiBoxLossLayer(const LayerConfig& config) : CostLayer(config) {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+
+  void backward(const UpdateCallback& callback = nullptr);
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) {}
+
+  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {}
+
+protected:
+  inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
+  inline LayerPtr getLabelLayer() { return inputLayers_[1]; }
+  inline LayerPtr getLocInputLayer(size_t index) {
+    return inputLayers_[2 + index];
+  }
+  inline LayerPtr getConfInputLayer(size_t index) {
+    return inputLayers_[2 + inputNum_ + index];
+  }
+
+protected:
+  size_t numClasses_;
+  real overlapThreshold_;
+  real negPosRatio_;
+  real negOverlap_;
+  size_t inputNum_;
+  size_t backgroundId_;
+
+  real locLoss_;
+  real confLoss_;
+
+  size_t numPriors_;
+  size_t numMatches_;
+  size_t numNegs_;
+  size_t numConf_;
+  size_t locSizeSum_;
+  size_t confSizeSum_;
+
+  vector<vector<int>> allMatchIndices_;
+  vector<vector<int>> allNegIndices_;
+  MatrixPtr locGTData_;
+  IVectorPtr confGTData_;
+
+  MatrixPtr locBuffer_;
+  MatrixPtr confBuffer_;
+  MatrixPtr locDiff_;
+  MatrixPtr confProb_;
+
+  MatrixPtr labelCpuValue_;
+  MatrixPtr priorCpuValue_;
+  MatrixPtr locCpuBuffer_;
+  MatrixPtr confCpuBuffer_;
+  MatrixPtr locTmpBuffer_;
+  MatrixPtr confTmpBuffer_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 3c4128b5b8..92f6cbcfe5 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -45,6 +45,13 @@ add_unittest_without_exec(test_PriorBox
 add_test(NAME test_PriorBox
     COMMAND test_PriorBox)
 
+################# test_DetectionOutput #######################
+add_unittest_without_exec(test_DetectionOutput
+    test_DetectionOutput.cpp
+    LayerGradUtil.cpp)
+
+add_test(NAME test_DetectionOutput
+    COMMAND test_DetectionOutput)
 ################# test_ConvUnify #######################
 add_unittest_without_exec(test_ConvUnify
     test_ConvUnify.cpp
diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp
index a0b1cd471d..e3591ba4df 100644
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -387,6 +387,31 @@ void initDataLayer(TestConfig testConf,
       data.value->sigmoid(*data.value);
       data.grad->zeroMem();
       break;
+    case INPUT_SELF_DEFINE_DATA: {
+      size_t height = testConf.inputDefs[i].selfDefinedData->getHeight();
+      size_t width = testConf.inputDefs[i].selfDefinedData->getWidth();
+      CHECK_GT(static_cast<int>(height), 0);
+      CHECK_GT(static_cast<int>(width), 0);
+      data.value = Matrix::create(height, width, false, useGpu);
+      data.grad = Matrix::create(height, width, false, useGpu);
+      data.value->copyFrom(*testConf.inputDefs[i].selfDefinedData);
+      data.grad->zeroMem();
+
+      const std::vector<int>& labelSeqStartPositions =
+          testConf.inputDefs[i].labelSeqStartPositions;
+      if (labelSeqStartPositions.size() != 0) {
+        CHECK(!sequenceStartPositions);
+        CHECK_GE(static_cast<int>(labelSeqStartPositions.size()), 2);
+
+        sequenceStartPositions =
+            ICpuGpuVector::create(labelSeqStartPositions.size(), useGpu);
+        sequenceStartPositions->copyFrom(labelSeqStartPositions.data(),
+                                         labelSeqStartPositions.size(),
+                                         useGpu);
+        data.sequenceStartPositions = sequenceStartPositions;
+      }
+      break;
+    }
     default:
       LOG(FATAL) << " unknown inputType ";
       return;
diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h
index 9f68eb64d0..18a6525a14 100644
--- a/paddle/gserver/tests/LayerGradUtil.h
+++ b/paddle/gserver/tests/LayerGradUtil.h
@@ -31,7 +31,8 @@ enum InputType {
   INPUT_SEQUENCE_LABEL,
   INPUT_SPARSE_NON_VALUE_DATA,
   INPUT_SPARSE_FLOAT_VALUE_DATA,
-  INPUT_DENSE_DIM_DATA,  // using sequence length to init dense data
+  INPUT_DENSE_DIM_DATA,    // using sequence length to init dense data
+  INPUT_SELF_DEFINE_DATA,  // support customizing for input value
 };
 
 struct ParaSparse {
@@ -66,6 +67,7 @@ struct InputDef {
   bool isStatic;
   std::vector<int> labelInitValue;
   std::vector<int> labelSeqStartPositions;
+  MatrixPtr selfDefinedData;
 
   InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) {
     inputType = type;
@@ -76,6 +78,20 @@ struct InputDef {
     isStatic = false;
   }
 
+  InputDef(InputType type,
+           string nameIn,
+           MatrixPtr selfDefinedData,
+           std::vector<int> selfDefinedSeqStartPos = {})
+      : labelSeqStartPositions(selfDefinedSeqStartPos),
+        selfDefinedData(selfDefinedData) {
+    inputType = type;
+    name = nameIn;
+    dim = 0;
+    sparse = {""};
+    paraSize = 0;
+    isStatic = false;
+  }
+
   InputDef(InputType type,
            string nameIn,
            size_t dimIn,
diff --git a/paddle/gserver/tests/test_DetectionOutput.cpp b/paddle/gserver/tests/test_DetectionOutput.cpp
new file mode 100644
index 0000000000..8ec7a28450
--- /dev/null
+++ b/paddle/gserver/tests/test_DetectionOutput.cpp
@@ -0,0 +1,191 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+// Do one forward pass of the detection output layer and check to see if its
+// output matches the given result
+void doOneDetectionOutputTest(MatrixPtr& inputLoc,
+                              MatrixPtr& inputConf,
+                              MatrixPtr& inputPriorBox,
+                              size_t feature_map_width,
+                              size_t feature_map_height,
+                              real nms_threshold,
+                              bool use_gpu,
+                              MatrixPtr& result) {
+  // Setting up the detection output layer
+  TestConfig configt;
+  configt.layerConfig.set_type("detection_output");
+  LayerInputConfig* input = configt.layerConfig.add_inputs();
+  configt.layerConfig.add_inputs();
+  configt.layerConfig.add_inputs();
+
+  DetectionOutputConfig* detOutput = input->mutable_detection_output_conf();
+  detOutput->set_width(feature_map_width);
+  detOutput->set_height(feature_map_height);
+  detOutput->set_nms_threshold(nms_threshold);
+  detOutput->set_num_classes(2);
+  detOutput->set_nms_top_k(20);
+  detOutput->set_keep_top_k(10);
+  detOutput->set_background_id(0);
+  detOutput->set_confidence_threshold(0.01);
+  detOutput->set_input_num(1);
+  configt.inputDefs.push_back({INPUT_DATA_TARGET, "priorbox", 32, 0});
+  configt.inputDefs.push_back({INPUT_DATA, "input_loc", 16, 0});
+  configt.inputDefs.push_back({INPUT_DATA, "input_conf", 8, 0});
+
+  // data layer initialize
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+  initDataLayer(
+      configt, &dataLayers, &datas, &layerMap, "priorbox", 1, false, use_gpu);
+
+  dataLayers[0]->getOutputValue()->copyFrom(*inputPriorBox);
+  dataLayers[1]->getOutputValue()->copyFrom(*inputLoc);
+  dataLayers[2]->getOutputValue()->copyFrom(*inputConf);
+
+  // test layer initialize
+  std::vector<ParameterPtr> parameters;
+  LayerPtr detectionOutputLayer;
+  initTestLayer(configt, &layerMap, &parameters, &detectionOutputLayer);
+  detectionOutputLayer->forward(PASS_GC);
+  checkMatrixEqual(detectionOutputLayer->getOutputValue(), result);
+}
+
+TEST(Layer, detectionOutputLayerFwd) {
+  bool useGpu = false;
+  // CPU case 1.
+  MatrixPtr inputLoc;
+  MatrixPtr inputConf;
+  MatrixPtr inputPriorBox;
+  MatrixPtr result, result2, result3, result4;
+  real nmsThreshold = 0.01;
+  real inputLocData[] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
+                         0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
+  real inputConfData[] = {0.1, 0.9, 0.2, 0.8, 0.3, 0.7, 0.4, 0.6};
+  real inputPriorBoxData[] = {0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2,
+                              0.2, 0.2, 0.6, 0.6, 0.1, 0.1, 0.2, 0.2,
+                              0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2,
+                              0.4, 0.4, 0.8, 0.8, 0.1, 0.1, 0.2, 0.2};
+  real resultData[] = {
+      0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031};
+  inputLoc = Matrix::create(1, 16, false, useGpu);
+  inputConf = Matrix::create(1, 8, false, useGpu);
+  inputPriorBox = Matrix::create(1, 32, false, useGpu);
+  result = Matrix::create(1, 7, false, useGpu);
+  inputLoc->setData(inputLocData);
+  inputConf->setData(inputConfData);
+  inputPriorBox->setData(inputPriorBoxData);
+  result->setData(resultData);
+  doOneDetectionOutputTest(inputLoc,
+                           inputConf,
+                           inputPriorBox,
+                           /* feature_map_width */ 1,
+                           /* feature_map_height */ 1,
+                           nmsThreshold,
+                           useGpu,
+                           result);
+
+  // CPU case 2.
+  nmsThreshold = 0.2;
+  result2 = Matrix::create(2, 7, false, useGpu);
+  real resultData2[] = {0, 1, 0.68997443, 0.099959746, 0.099959746,
+                        0.50804031, 0.50804031, 0, 1, 0.59868765,
+                        0.29995975, 0.29995975, 0.70804024, 0.70804024};
+  result2->setData(resultData2);
+  doOneDetectionOutputTest(inputLoc,
+                           inputConf,
+                           inputPriorBox,
+                           /* feature_map_width */ 1,
+                           /* feature_map_height */ 1,
+                           nmsThreshold,
+                           useGpu,
+                           result2);
+
+#ifndef PADDLE_ONLY_CPU
+  // GPU case 1.
+  useGpu = true;
+  inputLoc = Matrix::create(1, 16, false, useGpu);
+  inputConf = Matrix::create(1, 8, false, useGpu);
+  inputPriorBox = Matrix::create(1, 32, false, useGpu);
+  inputLoc->copyFrom(inputLocData, 16);
+  inputConf->copyFrom(inputConfData, 8);
+  inputPriorBox->copyFrom(inputPriorBoxData, 32);
+
+  nmsThreshold = 0.01;
+  result3 = Matrix::create(1, 7, false, useGpu);
+  result3->copyFrom(resultData, 7);
+  doOneDetectionOutputTest(inputLoc,
+                           inputConf,
+                           inputPriorBox,
+                           /* feature_map_width */ 1,
+                           /* feature_map_height */ 1,
+                           nmsThreshold,
+                           useGpu,
+                           result3);
+
+  // GPU case 2.
+  nmsThreshold = 0.2;
+  result4 = Matrix::create(2, 7, false, useGpu);
+  result4->copyFrom(resultData2, 14);
+  doOneDetectionOutputTest(inputLoc,
+                           inputConf,
+                           inputPriorBox,
+                           /* feature_map_width */ 1,
+                           /* feature_map_height */ 1,
+                           nmsThreshold,
+                           useGpu,
+                           result4);
+#endif
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 6adffcf53b..9c79bd19ee 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1689,6 +1689,70 @@ TEST(Layer, smooth_l1) {
   }
 }
 
+TEST(Layer, multibox_loss) {
+  TestConfig config;
+  config.layerConfig.set_type("multibox_loss");
+  config.biasSize = 0;
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  MultiBoxLossConfig* multiboxLoss = input->mutable_multibox_loss_conf();
+  multiboxLoss->set_num_classes(21);
+  multiboxLoss->set_input_num(1);
+  multiboxLoss->set_overlap_threshold(0.5);
+  multiboxLoss->set_neg_pos_ratio(3);
+  multiboxLoss->set_neg_overlap(0.5);
+  multiboxLoss->set_background_id(0);
+  multiboxLoss->set_height(3);
+  multiboxLoss->set_width(3);
+
+  size_t gtNum = 1;
+  MatrixPtr labelValue = Matrix::create(gtNum, 6, false, false);
+  labelValue->randomizeUniform();
+  labelValue->add(-0.5);
+  labelValue->sigmoid(*labelValue);
+  real* labelData = labelValue->getData();
+  size_t labelWidth = labelValue->getWidth();
+  for (size_t i = 0; i < gtNum; ++i) {
+    *(labelData + i * labelWidth) = std::rand() % 20 + 1;
+    *(labelData + i * labelWidth + 1) = 0.400259;
+    *(labelData + i * labelWidth + 2) = 0.377857;
+    *(labelData + i * labelWidth + 3) = 0.525712;
+    *(labelData + i * labelWidth + 4) = 0.519368;
+  }
+  vector<int> seqStartPositions(gtNum + 1, 0);
+  for (size_t i = 1; i <= gtNum; ++i) {
+    seqStartPositions[i] = i;
+  }
+
+  // Ensure at least one matched bbox
+  MatrixPtr priorValue = Matrix::create(1, 72, false, false);
+  priorValue->randomizeUniform();
+  priorValue->add(-0.5);
+  priorValue->sigmoid(*priorValue);
+  real* priorData = priorValue->getData();
+  *(priorData) = 0.424811;
+  *(priorData + 1) = 0.397059;
+  *(priorData + 2) = 0.538905;
+  *(priorData + 3) = 0.447091;
+  *(priorData + 4) = 0.425720;
+  *(priorData + 5) = 0.515228;
+  *(priorData + 6) = 0.519452;
+  *(priorData + 7) = 0.591065;
+
+  config.inputDefs.push_back(
"priorbox", priorValue, {}}); + config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, "label", labelValue, seqStartPositions}); + config.inputDefs.push_back({INPUT_DATA, "locPred", 36, 0}); + config.inputDefs.push_back({INPUT_DATA, "confPred", 189, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "multibox_loss", 1, false, useGpu, false); + } +} + TEST(Layer, TransLayer) { TestConfig config; const int height = 128; diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 29270829bb..3d01c23bf9 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -266,6 +266,29 @@ message PadConfig { repeated uint32 pad_w = 4; } +message MultiBoxLossConfig { + required uint32 num_classes = 1; + required float overlap_threshold = 2; + required float neg_pos_ratio = 3; + required float neg_overlap = 4; + required uint32 background_id = 5; + required uint32 input_num = 6; + optional uint32 height = 7 [default = 1]; + optional uint32 width = 8 [default = 1]; +} + +message DetectionOutputConfig { + required uint32 num_classes = 1; + required float nms_threshold = 2; + required uint32 nms_top_k = 3; + required uint32 background_id = 4; + required uint32 input_num = 5; + required uint32 keep_top_k = 6; + required float confidence_threshold = 7; + optional uint32 height = 8 [default = 1]; + optional uint32 width = 9 [default = 1]; +} + message LayerInputConfig { required string input_layer_name = 1; optional string input_parameter_name = 2; @@ -284,6 +307,8 @@ message LayerInputConfig { optional PriorBoxConfig priorbox_conf = 13; optional PadConfig pad_conf = 14; optional RowConvConfig row_conv_conf = 15; + optional MultiBoxLossConfig multibox_loss_conf = 16; + optional DetectionOutputConfig detection_output_conf = 17; } message LayerConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index fc2e3bbcde..c46b335d99 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1676,6 +1676,52 @@ class PriorBoxLayer(LayerBase): self.config.size = size +@config_layer('multibox_loss') +class MultiBoxLossLayer(LayerBase): + def __init__(self, name, inputs, input_num, num_classes, overlap_threshold, + neg_pos_ratio, neg_overlap, background_id): + super(MultiBoxLossLayer, self).__init__(name, 'multibox_loss', 0, + inputs) + config_assert( + len(inputs) == (input_num * 2 + 2), + 'MultiBoxLossLayer does not have enough inputs') + config_assert(num_classes > background_id, + 'Classes number must greater than background ID') + self.config.inputs[0].multibox_loss_conf.num_classes = num_classes + self.config.inputs[ + 0].multibox_loss_conf.overlap_threshold = overlap_threshold + self.config.inputs[0].multibox_loss_conf.neg_pos_ratio = neg_pos_ratio + self.config.inputs[0].multibox_loss_conf.neg_overlap = neg_overlap + self.config.inputs[0].multibox_loss_conf.background_id = background_id + self.config.inputs[0].multibox_loss_conf.input_num = input_num + self.config.size = 1 + + +@config_layer('detection_output') +class DetectionOutputLayer(LayerBase): + def __init__(self, name, inputs, size, input_num, num_classes, + nms_threshold, nms_top_k, keep_top_k, confidence_threshold, + background_id): + super(DetectionOutputLayer, self).__init__(name, 'detection_output', 0, + inputs) + config_assert( + len(inputs) == (input_num * 2 + 1), + 'DetectionOutputLayer does not have enough inputs') + 
+        config_assert(
+            num_classes > background_id,
+            'The number of classes must be greater than the background ID')
+        self.config.inputs[0].detection_output_conf.num_classes = num_classes
+        self.config.inputs[
+            0].detection_output_conf.nms_threshold = nms_threshold
+        self.config.inputs[0].detection_output_conf.nms_top_k = nms_top_k
+        self.config.inputs[0].detection_output_conf.keep_top_k = keep_top_k
+        self.config.inputs[
+            0].detection_output_conf.confidence_threshold = confidence_threshold
+        self.config.inputs[
+            0].detection_output_conf.background_id = background_id
+        self.config.inputs[0].detection_output_conf.input_num = input_num
+        self.config.size = size
+
+
 @config_layer('data')
 class DataLayer(LayerBase):
     def __init__(self, name, size, height=None, width=None, device=None):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 2d8ddbb900..770559dc77 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -115,6 +115,8 @@ __all__ = [
     'print_layer',
     'priorbox_layer',
     'cross_channel_norm_layer',
+    'multibox_loss_layer',
+    'detection_output_layer',
     'spp_layer',
     'pad_layer',
     'eos_layer',
@@ -195,6 +197,8 @@ class LayerType(object):
     PRINT_LAYER = 'print'
     PRIORBOX_LAYER = 'priorbox'
+    MULTIBOX_LOSS_LAYER = 'multibox_loss'
+    DETECTION_OUTPUT_LAYER = 'detection_output'
 
     CTC_LAYER = 'ctc'
     WARP_CTC_LAYER = 'warp_ctc'
@@ -1052,6 +1056,163 @@ def priorbox_layer(input,
         size=size)
 
 
+@wrap_name_default("multibox_loss")
+def multibox_loss_layer(input_loc,
+                        input_conf,
+                        priorbox,
+                        label,
+                        num_classes,
+                        overlap_threshold=0.5,
+                        neg_pos_ratio=3.0,
+                        neg_overlap=0.5,
+                        background_id=0,
+                        name=None):
+    """
+    Compute the location loss and the confidence loss for SSD.
+
+    :param name: The layer name.
+    :type name: basestring
+    :param input_loc: The input predicted locations.
+    :type input_loc: LayerOutput
+    :param input_conf: The input classification confidences.
+    :type input_conf: LayerOutput
+    :param priorbox: The input priorbox locations and variances.
+    :type priorbox: LayerOutput
+    :param label: The input label.
+    :type label: LayerOutput
+    :param num_classes: The number of classes.
+    :type num_classes: int
+    :param overlap_threshold: The threshold of the bbox overlap.
+    :type overlap_threshold: float
+    :param neg_pos_ratio: The ratio of negative bboxes to positive bboxes.
+    :type neg_pos_ratio: float
+    :param neg_overlap: The negative bbox overlap threshold.
+    :type neg_overlap: float
+    :param background_id: The background class index.
+    :type background_id: int
+    :return: LayerOutput
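+
+    The example usage is a sketch only; the ``loc_conv``, ``conf_conv``,
+    ``priorbox`` and ``label`` layers are assumed to be defined elsewhere
+    in the config:
+
+    .. code-block:: python
+
+       loss = multibox_loss_layer(input_loc=loc_conv,
+                                  input_conf=conf_conv,
+                                  priorbox=priorbox,
+                                  label=label,
+                                  num_classes=21)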
+    """
+    input_loc_num = 0
+    input_conf_num = 0
+
+    if isinstance(input_loc, LayerOutput):
+        input_loc = [input_loc]
+    assert isinstance(input_loc, collections.Sequence)  # list or tuple
+    for each in input_loc:
+        assert isinstance(each, LayerOutput)
+        input_loc_num += 1
+
+    if isinstance(input_conf, LayerOutput):
+        input_conf = [input_conf]
+    assert isinstance(input_conf, collections.Sequence)  # list or tuple
+    for each in input_conf:
+        assert isinstance(each, LayerOutput)
+        input_conf_num += 1
+    # Check the input layer number.
+    assert input_loc_num == input_conf_num
+
+    inputs = [priorbox.name, label.name]
+    inputs.extend([l.name for l in input_loc])
+    inputs.extend([l.name for l in input_conf])
+    parents = [priorbox, label]
+    parents.extend(input_loc)
+    parents.extend(input_conf)
+
+    Layer(
+        name=name,
+        type=LayerType.MULTIBOX_LOSS_LAYER,
+        inputs=inputs,
+        input_num=input_loc_num,
+        num_classes=num_classes,
+        overlap_threshold=overlap_threshold,
+        neg_pos_ratio=neg_pos_ratio,
+        neg_overlap=neg_overlap,
+        background_id=background_id)
+    return LayerOutput(
+        name, LayerType.MULTIBOX_LOSS_LAYER, parents=parents, size=1)
+
+
+@wrap_name_default("detection_output")
+def detection_output_layer(input_loc,
+                           input_conf,
+                           priorbox,
+                           num_classes,
+                           nms_threshold=0.45,
+                           nms_top_k=400,
+                           keep_top_k=200,
+                           confidence_threshold=0.01,
+                           background_id=0,
+                           name=None):
+    """
+    Apply non-maximum suppression (NMS) to the output of the network and
+    compute the predicted bounding box locations.
+
+    :param name: The layer name.
+    :type name: basestring
+    :param input_loc: The input predicted locations.
+    :type input_loc: LayerOutput
+    :param input_conf: The input classification confidences.
+    :type input_conf: LayerOutput
+    :param priorbox: The input priorbox locations and variances.
+    :type priorbox: LayerOutput
+    :param num_classes: The number of classes.
+    :type num_classes: int
+    :param nms_threshold: The non-maximum suppression threshold.
+    :type nms_threshold: float
+    :param nms_top_k: The number of bboxes kept in the NMS output.
+    :type nms_top_k: int
+    :param keep_top_k: The number of bboxes kept in the layer's output.
+    :type keep_top_k: int
+    :param confidence_threshold: The classification confidence threshold.
+    :type confidence_threshold: float
+    :param background_id: The background class index.
+    :type background_id: int
+    :return: LayerOutput
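+
+    The example usage is a sketch only; the ``loc_conv``, ``conf_conv`` and
+    ``priorbox`` layers are assumed to be defined elsewhere in the config:
+
+    .. code-block:: python
+
+       detections = detection_output_layer(input_loc=loc_conv,
+                                           input_conf=conf_conv,
+                                           priorbox=priorbox,
+                                           num_classes=21)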
+    """
+    input_loc_num = 0
+    input_conf_num = 0
+
+    if isinstance(input_loc, LayerOutput):
+        input_loc = [input_loc]
+    assert isinstance(input_loc, collections.Sequence)  # list or tuple
+    for each in input_loc:
+        assert isinstance(each, LayerOutput)
+        input_loc_num += 1
+
+    if isinstance(input_conf, LayerOutput):
+        input_conf = [input_conf]
+    assert isinstance(input_conf, collections.Sequence)  # list or tuple
+    for each in input_conf:
+        assert isinstance(each, LayerOutput)
+        input_conf_num += 1
+    # Check the input layer number.
+    assert input_loc_num == input_conf_num
+
+    inputs = [priorbox.name]
+    inputs.extend([l.name for l in input_loc])
+    inputs.extend([l.name for l in input_conf])
+    parents = [priorbox]
+    parents.extend(input_loc)
+    parents.extend(input_conf)
+
+    size = keep_top_k * 7
+
+    Layer(
+        name=name,
+        type=LayerType.DETECTION_OUTPUT_LAYER,
+        inputs=inputs,
+        size=size,
+        input_num=input_loc_num,
+        num_classes=num_classes,
+        nms_threshold=nms_threshold,
+        nms_top_k=nms_top_k,
+        keep_top_k=keep_top_k,
+        confidence_threshold=confidence_threshold,
+        background_id=background_id)
+    return LayerOutput(
+        name, LayerType.DETECTION_OUTPUT_LAYER, parents=parents, size=size)
+
+
 @wrap_name_default("cross_channel_norm")
 def cross_channel_norm_layer(input, name=None, param_attr=None):
     """

From b233ed135352de1260b644112f939938798048ec Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Mon, 19 Jun 2017 14:53:59 +0800
Subject: [PATCH 02/79] Set FLAGS_use_gpu in test_DetectionOutput.

---
 paddle/gserver/tests/test_DetectionOutput.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/paddle/gserver/tests/test_DetectionOutput.cpp b/paddle/gserver/tests/test_DetectionOutput.cpp
index 8ec7a28450..af43dc51fa 100644
--- a/paddle/gserver/tests/test_DetectionOutput.cpp
+++ b/paddle/gserver/tests/test_DetectionOutput.cpp
@@ -65,9 +65,12 @@ void doOneDetectionOutputTest(MatrixPtr& inputLoc,
   dataLayers[2]->getOutputValue()->copyFrom(*inputConf);
 
   // test layer initialize
+  bool store_FLAGS_use_gpu = FLAGS_use_gpu;
+  FLAGS_use_gpu = use_gpu;
   std::vector<ParameterPtr> parameters;
   LayerPtr detectionOutputLayer;
   initTestLayer(configt, &layerMap, &parameters, &detectionOutputLayer);
+  FLAGS_use_gpu = store_FLAGS_use_gpu;
   detectionOutputLayer->forward(PASS_GC);
   checkMatrixEqual(detectionOutputLayer->getOutputValue(), result);
 }

From 3919b75884749684e0bd8b502e426fa4949f2c1f Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Wed, 28 Jun 2017 12:01:32 +0000
Subject: [PATCH 03/79] modify cmake

---
 go/master/c/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/go/master/c/CMakeLists.txt b/go/master/c/CMakeLists.txt
index acce698051..3eb598a877 100644
--- a/go/master/c/CMakeLists.txt
+++ b/go/master/c/CMakeLists.txt
@@ -6,7 +6,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake")
 
 project(cxx_go C Go)
 
-include(golang)
+#include(golang)
 include(flags)
 
 set(MASTER_LIB_NAME "paddle_master")

From fc3d03142582dcd673cc97fb3b0239bac59815f4 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Thu, 29 Jun 2017 09:38:25 +0800
Subject: [PATCH 04/79] first add

---
 go/master/c/client.go                         |  5 ++
 go/master/client.go                           |  3 +-
 python/paddle/v2/master/client.py             |  3 ++
 python/paddle/v2/reader/creator.py            | 49 ++++++++++++++-----
 python/paddle/v2/reader/tests/creator_test.py |  2 +-
 5 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/go/master/c/client.go b/go/master/c/client.go
index b186474dc3..b88911b858 100644
--- a/go/master/c/client.go
+++ b/go/master/c/client.go
@@ -88,7 +88,12 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int
 func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
 	c := get(client)
 	r := c.NextRecord()
+	if r == nil {
+		// EOF
+		return -1
+	}
 	if len(r) == 0 {
+		// Empty record
 		*record = (*C.uchar)(nullPtr)
 		return 0
 	}
diff --git a/go/master/client.go b/go/master/client.go
index 8451820c19..4f8df5ba66 100644
--- a/go/master/client.go
+++ b/go/master/client.go
@@ -60,6 +60,7 @@ func (c *Client) getRecords() {
 		}
 
 		err = f.Close()
+		c.ch <- nil
 		if err != nil {
 			log.Errorln(err)
 		}
@@ -112,7 +113,7 @@ func (c *Client) monitorMaster(addr Addresser) {
 //
 // SetDataset can be called multiple times from different nodes. But
 // only the first call will be honored.
-func (c *Client) SetDataset(globPaths []string) error {
+func (c *Client) SetDataset(globPaths ...string) error {
 	return c.conn.Call("Service.SetDataset", globPaths, nil)
 }
diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py
index de8e9bb88e..9fd3ef0860 100644
--- a/python/paddle/v2/master/client.py
+++ b/python/paddle/v2/master/client.py
@@ -30,6 +30,9 @@ class client(object):
         p = ctypes.c_char_p()
         ret = ctypes.pointer(p)
         size = lib.paddle_next_record(self.c, ret)
+        if size < 0:
+            # EOF
+            return None
         if size == 0:
             # Empty record
             return ""
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
index 9f888b16d6..669867fd10 100644
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
@@ -57,22 +57,49 @@ def text_file(path):
     return reader
 
 
-def recordio(path):
+def recordio_local(paths):
     """
-    Creates a data reader that outputs record one one by one from given recordio file
-    :path: path of recordio file
-    :returns: data reader of recordio file
+    Creates a data reader that outputs records one by one
+    from the given local recordio file paths.
+    :paths: paths of recordio files.
+    :returns: data reader of recordio files.
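+
+    A minimal usage sketch (the recordio file path below is only
+    illustrative):
+
+        reader = recordio_local(["/tmp/part-00000.recordio"])
+        for record in reader():
+            print(record)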
     """
     import recordio as rec
 
     def reader():
-        f = rec.reader(path)
-        while True:
-            r = f.read()
-            if r is None:
-                break
-            yield r
-        f.close()
+        for i, path in enumerate(paths):
+            f = rec.reader(path)
+            while True:
+                r = f.read()
+                if r is None:
+                    break
+                yield r
+            f.close()
 
     return reader
+
+
+def recordio(paths, addr="", buf_size=100):
+    """
+    Creates a data reader that outputs records one by one
+    from the given local or cloud recordio paths.
+    :paths: paths of recordio files.
+    :returns: data reader of recordio files.
+    """
+    import os
+    import paddle.v2.master.client as cloud
+
+    if len(os.environ["KUBERNETES_SERVICE_HOST"]) == 0:
+        return recordio_local(paths)
+
+    c = cloud(addr, buf_size)
+    c.set_dataset(paths)
+
+    while True:
+        r = c.next_record()
+        if r is None:
+            break
+        yield r
+
+    c.close()
diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py
index ba4f558874..b42d273ecf 100644
--- a/python/paddle/v2/reader/tests/creator_test.py
+++ b/python/paddle/v2/reader/tests/creator_test.py
@@ -38,7 +38,7 @@ class TestRecordIO(unittest.TestCase):
     def test_recordio(self):
         path = os.path.join(
             os.path.dirname(__file__), "test_recordio_creator.dat")
-        reader = paddle.v2.reader.creator.recordio(path)
+        reader = paddle.v2.reader.creator.recordio([path])
         for idx, r in enumerate(reader()):
             self.assertSequenceEqual(r, str(idx))

From 4874810ba5a1e6f8f6b4a9530e6854f65077a59e Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Thu, 29 Jun 2017 04:28:44 +0000
Subject: [PATCH 05/79] fix bugs

---
 go/master/client.go                |  2 +-
 python/paddle/v2/reader/creator.py | 20 ++++++++++++--------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/go/master/client.go b/go/master/client.go
index 4f8df5ba66..fa479338c5 100644
--- a/go/master/client.go
+++ b/go/master/client.go
@@ -113,7 +113,7 @@ func (c *Client) monitorMaster(addr Addresser) {
 //
 // SetDataset can be called multiple times from different nodes. But
 // only the first call will be honored.
-func (c *Client) SetDataset(globPaths ...string) error {
+func (c *Client) SetDataset(globPaths []string) error {
 	return c.conn.Call("Service.SetDataset", globPaths, nil)
 }
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
index 669867fd10..3376d7accb 100644
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
@@ -93,13 +93,17 @@ def recordio(paths, addr="", buf_size=100):
     if len(os.environ["KUBERNETES_SERVICE_HOST"]) == 0:
         return recordio_local(paths)
 
-    c = cloud(addr, buf_size)
-    c.set_dataset(paths)
+    def reader():
+        c = cloud(addr, buf_size)
+        c.set_dataset(paths)
+
+        while True:
+            r = c.next_record()
+            if r is None:
+                break
+            yield r
 
-    while True:
-        r = c.next_record()
-        if r is None:
-            break
-        yield r
+        c.close()
 
-    c.close()
+    return reader
-

From b5ab4b69bcfa604a1ebbb964da1765ff2c586a6a Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Thu, 29 Jun 2017 15:11:40 +0800
Subject: [PATCH 06/79] Follow comments, mainly use std::copy to simplify
 logic.

---
 .../gserver/layers/DetectionOutputLayer.cpp   |  20 ++--
 paddle/gserver/layers/DetectionOutputLayer.h  |   6 +-
 paddle/gserver/layers/MultiBoxLossLayer.cpp   | 109 ++++++++++--------
 python/paddle/trainer/config_parser.py        |   4 +-
 .../paddle/trainer_config_helpers/layers.py   |   7 +-
 5 files changed, 74 insertions(+), 72 deletions(-)

diff --git a/paddle/gserver/layers/DetectionOutputLayer.cpp b/paddle/gserver/layers/DetectionOutputLayer.cpp
index 2a4d7f8b5b..8ab838e191 100644
--- a/paddle/gserver/layers/DetectionOutputLayer.cpp
+++ b/paddle/gserver/layers/DetectionOutputLayer.cpp
@@ -48,8 +48,6 @@ void DetectionOutputLayer::forward(PassType passType) {
   Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
   Matrix::resizeOrCreate(
       confTmpBuffer_, confSizeSum_ / numClasses_, numClasses_, false, useGpu_);
-  locBuffer_ = locTmpBuffer_;
-  confBuffer_ = confTmpBuffer_;
 
   size_t locOffset = 0;
   size_t confOffset = 0;
@@ -68,7 +66,7 @@ void DetectionOutputLayer::forward(PassType passType) {
                                    locSizeSum_,
                                    locOffset,
                                    batchSize,
-                                   *locBuffer_,
+                                   *locTmpBuffer_,
                                    kNCHWToNHWC);
     confOffset += appendWithPermute(*inConf,
                                     height,
@@ -76,7 +74,7 @@ void DetectionOutputLayer::forward(PassType passType) {
                                     confSizeSum_,
                                     confOffset,
                                     batchSize,
-                                    *confBuffer_,
+                                    *confTmpBuffer_,
                                     kNCHWToNHWC);
   }
   CHECK_EQ(locOffset, locSizeSum_ / batchSize);
   CHECK_EQ(confOffset, confSizeSum_ / batchSize);
@@ -100,23 +98,25 @@ void DetectionOutputLayer::forward(PassType passType) {
     priorValue = priorCpuValue_;
   } else {
     priorValue = getInputValue(*getPriorBoxLayer());
+    locBuffer_ = locTmpBuffer_;
+    confBuffer_ = confTmpBuffer_;
   }
   confBuffer_->softmax(*confBuffer_);
 
   size_t numPriors = priorValue->getElementCnt() / 8;
-  vector<vector<NormalizedBBox>> allDecodedBBoxes;
+  std::vector<std::vector<NormalizedBBox>> allDecodedBBoxes;
   for (size_t n = 0; n < batchSize; ++n) {
-    vector<NormalizedBBox> decodedBBoxes;
+    std::vector<NormalizedBBox> decodedBBoxes;
     for (size_t i = 0; i < numPriors; ++i) {
       size_t priorOffset = i * 8;
       size_t locPredOffset = n * numPriors * 4 + i * 4;
-      vector<NormalizedBBox> priorBBoxVec;
+      std::vector<NormalizedBBox> priorBBoxVec;
       getBBoxFromPriorData(
           priorValue->getData() + priorOffset, 1, priorBBoxVec);
-      vector<vector<real>> priorBBoxVar;
+      std::vector<std::vector<real>> priorBBoxVar;
       getBBoxVarFromPriorData(
          priorValue->getData() + priorOffset, 1, priorBBoxVar);
-      vector<real> locPredData;
+      std::vector<real> locPredData;
       for (size_t j = 0; j < 4; ++j)
         locPredData.push_back(*(locBuffer_->getData() + locPredOffset + j));
       NormalizedBBox bbox =
           decodeBBoxWithVar(priorBBoxVec[0], priorBBoxVar[0], locPredData);
       decodedBBoxes.push_back(bbox);
     }
     allDecodedBBoxes.push_back(decodedBBoxes);
   }
 
-  vector<map<size_t, vector<size_t>>> allIndices;
+  std::vector<std::map<size_t, std::vector<size_t>>> allIndices;
   size_t numKept = getDetectionIndices(confBuffer_->getData(),
                                        numPriors,
                                        numClasses_,
diff --git a/paddle/gserver/layers/DetectionOutputLayer.h b/paddle/gserver/layers/DetectionOutputLayer.h
index 38271cb054..9cc568219c 100644
--- a/paddle/gserver/layers/DetectionOutputLayer.h
+++ b/paddle/gserver/layers/DetectionOutputLayer.h
@@ -19,17 +19,13 @@ limitations under the License. */
 #include "DetectionUtil.h"
 #include "Layer.h"
 
-using std::vector;
-using std::map;
-using std::pair;
-
 namespace paddle {
 
 /**
  * The detection output layer for an SSD detection task. This layer applies
  * non-maximum suppression (NMS) to all of the predicted bounding boxes and
  * keeps the top-K bounding boxes.
  * - Input: This layer needs three input layers: the first input layer
  *   is the priorbox layer. The remaining two input layers are convolution
  *   layers that generate the bbox location offsets and the classification
  *   confidences.
diff --git a/paddle/gserver/layers/MultiBoxLossLayer.cpp b/paddle/gserver/layers/MultiBoxLossLayer.cpp
index 27a2cc3fa4..f2d7b8eb1d 100644
--- a/paddle/gserver/layers/MultiBoxLossLayer.cpp
+++ b/paddle/gserver/layers/MultiBoxLossLayer.cpp
@@ -17,10 +17,6 @@ limitations under the License. */
 #include "MultiBoxLossLayer.h"
 #include <float.h>
 #include <vector>
 #include "DataLayer.h"
 
-using std::vector;
-using std::map;
-using std::pair;
-
 namespace paddle {
 
 REGISTER_LAYER(multibox_loss, MultiBoxLossLayer);
@@ -133,7 +129,7 @@ void MultiBoxLossLayer::forward(PassType passType) {
   }
 
   // Get max scores for each prior bbox. Used in negative mining
-  vector<vector<real>> allMaxConfScore;
+  std::vector<std::vector<real>> allMaxConfScore;
   numPriors_ = priorValue->getElementCnt() / 8;
   getMaxConfidenceScores(confBuffer_->getData(),
                          batchSize,
@@ -151,18 +147,18 @@ void MultiBoxLossLayer::forward(PassType passType) {
   allMatchIndices_.clear();
   allNegIndices_.clear();
 
-  pair<size_t, size_t> retPair = generateMatchIndices(*priorValue,
-                                                      numPriors_,
-                                                      *labelValue,
-                                                      labelIndex,
-                                                      seqNum,
-                                                      allMaxConfScore,
-                                                      batchSize,
-                                                      overlapThreshold_,
-                                                      negOverlap_,
-                                                      negPosRatio_,
-                                                      &allMatchIndices_,
-                                                      &allNegIndices_);
+  std::pair<size_t, size_t> retPair = generateMatchIndices(*priorValue,
+                                                           numPriors_,
+                                                           *labelValue,
+                                                           labelIndex,
+                                                           seqNum,
+                                                           allMaxConfScore,
+                                                           batchSize,
+                                                           overlapThreshold_,
+                                                           negOverlap_,
+                                                           negPosRatio_,
+                                                           &allMatchIndices_,
+                                                           &allNegIndices_);
   numMatches_ = retPair.first;
   numNegs_ = retPair.second;
 
@@ -175,30 +171,31 @@ void MultiBoxLossLayer::forward(PassType passType) {
     locDiff_->zeroMem();
-    vector<real> locGTData;
+    std::vector<real> locGTData;
+    real* locDiffData = locDiff_->getData();
+    const real* locBufferData = locBuffer_->getData();
 
     for (size_t n = 0; n < batchSize; ++n) {
       for (size_t i = 0; i < numPriors_; ++i) {
         if (allMatchIndices_[n][i] == -1) continue;  // match none
         size_t locOffset =
             n * (locBuffer_->getElementCnt() / batchSize) + i * 4;
-        locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[0];
-        locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[1];
-        locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[2];
-        locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[3];
-
+        std::copy(locBufferData + locOffset,
+                  locBufferData + locOffset + 4,
+                  locDiffData + count);
+        count += 4;
         const int gtIdx = allMatchIndices_[n][i];
         size_t priorOffset = i * 8;
-        vector<NormalizedBBox> priorBBoxVec;
+        std::vector<NormalizedBBox> priorBBoxVec;
         getBBoxFromPriorData(
             priorValue->getData() + priorOffset, 1, priorBBoxVec);
+ priorOffset, 1, priorBBoxVec); - vector> priorBBoxVar; + std::vector> priorBBoxVar; getBBoxVarFromPriorData( priorValue->getData() + priorOffset, 1, priorBBoxVar); size_t labelOffset = (labelIndex[n] + gtIdx) * 6; - vector gtBBoxVec; + std::vector gtBBoxVec; getBBoxFromLabelData(labelValue->getData() + labelOffset, 1, gtBBoxVec); - vector gtEncode; + std::vector gtEncode; encodeBBoxWithVar( priorBBoxVec[0], priorBBoxVar[0], gtBBoxVec[0], gtEncode); locGTData.insert(locGTData.end(), gtEncode.begin(), gtEncode.end()); @@ -218,7 +215,9 @@ void MultiBoxLossLayer::forward(PassType passType) { confProb_->zeroMem(); size_t count = 0; - vector confPredData; + std::vector confPredData; + real* confProbData = confProb_->getData(); + const real* confBufferData = confBuffer_->getData(); for (size_t n = 0; n < batchSize; ++n) { for (size_t i = 0; i < numPriors_; ++i) { if (allMatchIndices_[n][i] == -1) continue; @@ -226,11 +225,13 @@ void MultiBoxLossLayer::forward(PassType passType) { const int gtLabel = (labelValue->getData() + labelOffset)[0]; confGTData_->getData()[count] = gtLabel; size_t confOffset = n * numPriors_ * numClasses_ + i * numClasses_; - for (size_t j = 0; j < numClasses_; ++j) { - confProb_->getData()[count * numClasses_ + j] = - (confBuffer_->getData() + confOffset)[j]; - confPredData.push_back((confBuffer_->getData() + confOffset)[j]); - } + std::copy(confBufferData + confOffset, + confBufferData + confOffset + numClasses_, + confProbData + count * numClasses_); + confPredData.reserve(confPredData.size() + numClasses_); + confPredData.insert(confPredData.end(), + confBufferData + confOffset, + confBufferData + confOffset + numClasses_); ++count; } // Negative mining samples @@ -238,14 +239,17 @@ void MultiBoxLossLayer::forward(PassType passType) { confGTData_->getData()[count] = backgroundId_; size_t confOffset = n * numPriors_ * numClasses_ + allNegIndices_[n][i] * numClasses_; - for (size_t j = 0; j < numClasses_; ++j) { - confProb_->getData()[count * numClasses_ + j] = - (confBuffer_->getData() + confOffset)[j]; - confPredData.push_back((confBuffer_->getData() + confOffset)[j]); - } - count++; + std::copy(confBufferData + confOffset, + confBufferData + confOffset + numClasses_, + confProbData + count * numClasses_); + confPredData.reserve(confPredData.size() + numClasses_); + confPredData.insert(confPredData.end(), + confBufferData + confOffset, + confBufferData + confOffset + numClasses_); + ++count; } } + CHECK_EQ(numConf_, count); confProb_->softmax(*confProb_); MatrixPtr confLossOutput; Matrix::resizeOrCreate(confLossOutput, numConf_, 1, false, false); @@ -254,7 +258,7 @@ void MultiBoxLossLayer::forward(PassType passType) { } real loss = locLoss_ + confLoss_; MatrixPtr outV = getOutputValue(); - vector tmp(batchSize, loss); + std::vector tmp(batchSize, loss); outV->copyFrom(&tmp[0], batchSize); } @@ -274,16 +278,18 @@ void MultiBoxLossLayer::backward(const UpdateCallback& callback) { locDiff_->getData()[i] *= (1. 
/ numMatches_); // Copy gradient back size_t count = 0; - for (size_t n = 0; n < batchSize; ++n) + const real* locDiffData = locDiff_->getData(); + for (size_t n = 0; n < batchSize; ++n) { for (size_t i = 0; i < numPriors_; ++i) { if (allMatchIndices_[n][i] == -1) continue; - real* locDiffData = locBuffer_->getData() + n * numPriors_ * 4 + i * 4; - locDiffData[0] = (locDiff_->getData() + count * 4)[0]; - locDiffData[1] = (locDiff_->getData() + count * 4)[1]; - locDiffData[2] = (locDiff_->getData() + count * 4)[2]; - locDiffData[3] = (locDiff_->getData() + count * 4)[3]; + real* locBufferData = + locBuffer_->getData() + n * numPriors_ * 4 + i * 4; + std::copy(locDiffData + count * 4, + locDiffData + (count + 1) * 4, + locBufferData); ++count; } + } CHECK_EQ(count, numMatches_); } @@ -293,21 +299,24 @@ void MultiBoxLossLayer::backward(const UpdateCallback& callback) { for (size_t i = 0; i < numConf_ * numClasses_; ++i) confProb_->getData()[i] *= (1. / numMatches_); size_t count = 0; + const real* confProbData = confProb_->getData(); for (size_t n = 0; n < batchSize; ++n) { for (size_t i = 0; i < numPriors_; ++i) { if (allMatchIndices_[n][i] == -1) continue; real* confDiffData = confBuffer_->getData() + n * numPriors_ * numClasses_ + i * numClasses_; - for (size_t j = 0; j < numClasses_; ++j) - confDiffData[j] = (confProb_->getData() + count * numClasses_)[j]; + std::copy(confProbData + count * numClasses_, + confProbData + (count + 1) * numClasses_, + confDiffData); ++count; } for (size_t i = 0; i < allNegIndices_[n].size(); ++i) { int idx = allNegIndices_[n][i]; real* confDiffData = confBuffer_->getData() + n * numPriors_ * numClasses_ + idx * numClasses_; - for (size_t j = 0; j < numClasses_; ++j) - confDiffData[j] = (confProb_->getData() + count * numClasses_)[j]; + std::copy(confProbData + count * numClasses_, + confProbData + (count + 1) * numClasses_, + confDiffData); ++count; } } diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index c46b335d99..17f6704ea1 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1679,7 +1679,7 @@ class PriorBoxLayer(LayerBase): @config_layer('multibox_loss') class MultiBoxLossLayer(LayerBase): def __init__(self, name, inputs, input_num, num_classes, overlap_threshold, - neg_pos_ratio, neg_overlap, background_id): + neg_pos_ratio, neg_overlap, background_id, **xargs): super(MultiBoxLossLayer, self).__init__(name, 'multibox_loss', 0, inputs) config_assert( @@ -1701,7 +1701,7 @@ class MultiBoxLossLayer(LayerBase): class DetectionOutputLayer(LayerBase): def __init__(self, name, inputs, size, input_num, num_classes, nms_threshold, nms_top_k, keep_top_k, confidence_threshold, - background_id): + background_id, **xargs): super(DetectionOutputLayer, self).__init__(name, 'detection_output', 0, inputs) config_assert( diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 770559dc77..1286ed198e 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -1092,22 +1092,19 @@ def multibox_loss_layer(input_loc, :type background_id: int :return: LayerOutput """ - input_loc_num = 0 - input_conf_num = 0 - if isinstance(input_loc, LayerOutput): input_loc = [input_loc] assert isinstance(input_loc, collections.Sequence) # list or tuple for each in input_loc: assert isinstance(each, LayerOutput) - input_loc_num += 1 + input_loc_num = len(input_loc) if 
isinstance(input_conf, LayerOutput): input_conf = [input_conf] assert isinstance(input_conf, collections.Sequence) # list or tuple for each in input_conf: assert isinstance(each, LayerOutput) - input_conf_num += 1 + input_conf_num = len(input_conf) # Check the input layer number. assert input_loc_num == input_conf_num From 0fa409246b98c636d4dd32553782ca962f70a6f7 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 29 Jun 2017 09:43:00 +0000 Subject: [PATCH 07/79] fix bugs --- go/master/c/client.go | 18 ++++++++++++++++-- go/master/client.go | 21 +++++++++++++++------ go/master/client_test.go | 18 ++++++++++++++---- python/paddle/v2/reader/creator.py | 6 ++---- 4 files changed, 47 insertions(+), 16 deletions(-) diff --git a/go/master/c/client.go b/go/master/c/client.go index b88911b858..79e13e4b63 100644 --- a/go/master/c/client.go +++ b/go/master/c/client.go @@ -13,6 +13,7 @@ typedef int paddle_master_client; import "C" import ( + "io" "sync" "unsafe" @@ -84,14 +85,27 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int return C.PADDLE_MASTER_OK } +// return value: +// 0:ok +// -1:EOF +// -2:error //export paddle_next_record func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { c := get(client) - r := c.NextRecord() - if r == nil { + r, err := c.NextRecord() + if err == io.EOF { // EOF + *record = (*C.uchar)(nullPtr) return -1 } + + if err != nil { + // Error + // TODO: return the type of error? + *record = (*C.uchar)(nullPtr) + return -2 + } + if len(r) == 0 { // Empty record *record = (*C.uchar)(nullPtr) diff --git a/go/master/client.go b/go/master/client.go index fa479338c5..c122d17c8f 100644 --- a/go/master/client.go +++ b/go/master/client.go @@ -1,6 +1,7 @@ package master import ( + "io" "os" "time" @@ -17,7 +18,12 @@ type Addresser interface { // Client is the client of the master server. type Client struct { conn *connection.Conn - ch chan []byte + ch chan record +} + +type record struct { + r []byte + err error } // NewClient creates a new Client. @@ -27,7 +33,7 @@ type Client struct { func NewClient(addr Addresser, bufSize int) *Client { c := &Client{} c.conn = connection.New() - c.ch = make(chan []byte, bufSize) + c.ch = make(chan record, bufSize) go c.monitorMaster(addr) go c.getRecords() return c @@ -52,18 +58,20 @@ func (c *Client) getRecords() { s := recordio.NewRangeScanner(f, &chunk.Index, -1, -1) for s.Scan() { - c.ch <- s.Record() + c.ch <- record{s.Record(), nil} } if s.Err() != nil { + c.ch <- record{nil, s.Err()} log.Errorln(err, chunk.Path) } err = f.Close() - c.ch <- nil if err != nil { log.Errorln(err) } + + c.ch <- record{nil, io.EOF} } // We treat a task as finished whenever the last data @@ -133,6 +141,7 @@ func (c *Client) taskFinished(taskID int) error { // // NextRecord will block until the next record is available. It is // thread-safe. 
-func (c *Client) NextRecord() []byte {
-	return <-c.ch
+func (c *Client) NextRecord() ([]byte, error) {
+	r := <-c.ch
+	return r.r, r.err
 }
diff --git a/go/master/client_test.go b/go/master/client_test.go
index 85a86761c2..05201941e3 100644
--- a/go/master/client_test.go
+++ b/go/master/client_test.go
@@ -2,6 +2,7 @@ package master_test
 
 import (
 	"fmt"
+	"io"
 	"net"
 	"net/http"
 	"net/rpc"
@@ -69,13 +70,22 @@ func TestNextRecord(t *testing.T) {
 
 	for pass := 0; pass < 50; pass++ {
 		received := make(map[byte]bool)
-		for i := 0; i < total; i++ {
-			r := c.NextRecord()
+		for i := 0; i <= total; i++ {
+			r, err := c.NextRecord()
+			if err == io.EOF {
+				break
+			}
+
+			if err != nil {
+				t.Fatal(pass, i, "Read error:", err)
+			}
+
 			if len(r) != 1 {
-				t.Fatal("Length should be 1.", r)
+				t.Fatal(pass, i, "Length should be 1.", r)
 			}
+
 			if received[r[0]] {
-				t.Fatal("Received duplicate.", received, r)
+				t.Fatal(pass, i, "Received duplicate.", received, r)
 			}
 			received[r[0]] = true
 		}
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
index 3376d7accb..b575f57dc6 100644
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
@@ -79,7 +79,6 @@ def recordio_local(paths):
 
     return reader
 
-
 def recordio(paths, addr="", buf_size=100):
     """
     Creates a data reader that outputs record one one by one
@@ -90,8 +89,8 @@ def recordio(paths, addr="", buf_size=100):
     import os
     import paddle.v2.master.client as cloud
 
-    if len(os.environ["KUBERNETES_SERVICE_HOST"]) == 0:
-        return recordio_local(path)
+    if "KUBERNETES_SERVICE_HOST" not in os.environ.keys():
+        return recordio_local(paths)
 
     def reader():
         c = cloud(addr, buf_size)
@@ -106,4 +105,3 @@ def recordio(paths, addr="", buf_size=100):
         c.close()
 
     return reader
-
From 9f408dfb1b81daee795d9c0d8ed177e6ab4e10a8 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Thu, 29 Jun 2017 09:52:21 +0000
Subject: [PATCH 08/79] fix bugs

---
 python/paddle/v2/master/client.py  | 18 ++++++++++++++----
 python/paddle/v2/reader/creator.py |  2 +-
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py
index 9fd3ef0860..0cc01b7310 100644
--- a/python/paddle/v2/master/client.py
+++ b/python/paddle/v2/master/client.py
@@ -26,17 +26,27 @@ class client(object):
             holder[idx] = c_ptr
         lib.paddle_set_dataset(self.c, holder, len(paths))
 
+    # return format: (record, errno)
+    # errno = 0: ok
+    #       = -1: EOF
+    #       < -1: error
     def next_record(self):
         p = ctypes.c_char_p()
         ret = ctypes.pointer(p)
         size = lib.paddle_next_record(self.c, ret)
-        if size < 0:
+        if size == -1:
             # EOF
-            return None
+            return None, -1
+
+        if size < -1:
+            # Error
+            return None, size
+
         if size == 0:
             # Empty record
-            return ""
+            return "", 0
+
         record = ret.contents.value[:size]
         # Memory created from C should be freed.
         lib.mem_free(ret.contents)
-        return record
+        return record, 0
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
index b575f57dc6..2e8626e565 100644
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
@@ -97,7 +97,7 @@ def recordio(paths, addr="", buf_size=100):
         c.set_dataset(paths)
 
         while True:
-            r = client.next_record()
+            r, err = c.next_record()
             if r is None:
                 break
             yield r
From b79784ee9e0fd67933d4793e8ab4564f7a30c780 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Thu, 29 Jun 2017 21:31:42 +0800
Subject: [PATCH 09/79] Add test for configuration and add doc.
---
 doc/api/v2/config/layer.rst                        | 13 ++++++++++
 .../tests/configs/file_list.sh                     |  2 +-
 .../configs/test_detection_output_layer.py         | 23 +++++++++++++++++
 .../tests/configs/test_multibox_loss_layer.py      | 25 +++++++++++++++++++
 4 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py
 create mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py

diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index c7b017bc07..0a8465919d 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -445,6 +445,11 @@ smooth_l1_cost
 .. autoclass:: paddle.v2.layer.smooth_l1_cost
   :noindex:
 
+multibox_loss
+--------------
+.. autoclass:: paddle.v2.layer.multibox_loss
+  :noindex:
+
 Check Layer
 ============
 
@@ -468,3 +473,11 @@ prelu
 --------
 .. autoclass:: paddle.v2.layer.prelu
   :noindex:
+
+Detection Output Layer
+======================
+
+detection_output
+----------------
+.. autoclass:: paddle.v2.layer.detection_output
+  :noindex:
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index c24102255f..45fb848886 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -6,6 +6,6 @@ img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cos
 test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
 test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
-test_prelu_layer test_row_conv)
+test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py
new file mode 100644
index 0000000000..3572a2cb07
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py
@@ -0,0 +1,23 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+input_loc = data_layer(name='input_loc', size=16, height=16, width=1)
+
+input_conf = data_layer(name='input_conf', size=8, height=1, width=8)
+
+priorbox = data_layer(name='priorbox', size=32, height=4, width=8)
+
+detout = detection_output_layer(
+    input_loc=input_loc,
+    input_conf=input_conf,
+    priorbox=priorbox,
+    num_classes=21,
+    nms_threshold=0.45,
+    nms_top_k=400,
+    keep_top_k=200,
+    confidence_threshold=0.01,
+    background_id=0,
+    name='test_detection_output')
+
+outputs(detout)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py
new file mode 100644
index 0000000000..c3376c47bd
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py
@@ -0,0 +1,25 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+input_loc = data_layer(name='input_loc', size=16, height=16, width=1)
+
+input_conf = data_layer(name='input_conf', size=8, height=1, width=8)
+
+priorbox = data_layer(name='priorbox', size=32, height=4, width=8)
+
+label = data_layer(name='label', size=24, height=4, width=6)
+
+multibox_loss =
multibox_loss_layer( + input_loc=input_loc, + input_conf=input_conf, + priorbox=priorbox, + label=label, + num_classes=21, + overlap_threshold=0.5, + neg_pos_ratio=3.0, + neg_overlap=0.5, + background_id=0, + name='test_multibox_loss') + +outputs(multibox_loss) From b3c5808e13bc94fbc933c803c59fed979a11f515 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 30 Jun 2017 03:11:57 +0000 Subject: [PATCH 10/79] rm cloud EOF --- go/master/c/client.go | 7 ------- go/master/client.go | 3 --- go/master/client_test.go | 7 +------ python/paddle/v2/master/client.py | 5 ----- 4 files changed, 1 insertion(+), 21 deletions(-) diff --git a/go/master/c/client.go b/go/master/c/client.go index 79e13e4b63..a37894fefe 100644 --- a/go/master/c/client.go +++ b/go/master/c/client.go @@ -13,7 +13,6 @@ typedef int paddle_master_client; import "C" import ( - "io" "sync" "unsafe" @@ -93,12 +92,6 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { c := get(client) r, err := c.NextRecord() - if err == io.EOF { - // EOF - *record = (*C.uchar)(nullPtr) - return -1 - } - if err != nil { // Error // TODO: return the type of error? diff --git a/go/master/client.go b/go/master/client.go index c122d17c8f..985b96b0af 100644 --- a/go/master/client.go +++ b/go/master/client.go @@ -1,7 +1,6 @@ package master import ( - "io" "os" "time" @@ -70,8 +69,6 @@ func (c *Client) getRecords() { if err != nil { log.Errorln(err) } - - c.ch <- record{nil, io.EOF} } // We treat a task as finished whenever the last data diff --git a/go/master/client_test.go b/go/master/client_test.go index 05201941e3..0a401d8a43 100644 --- a/go/master/client_test.go +++ b/go/master/client_test.go @@ -2,7 +2,6 @@ package master_test import ( "fmt" - "io" "net" "net/http" "net/rpc" @@ -70,12 +69,8 @@ func TestNextRecord(t *testing.T) { for pass := 0; pass < 50; pass++ { received := make(map[byte]bool) - for i := 0; i <= total; i++ { + for i := 0; i < total; i++ { r, err := c.NextRecord() - if err == io.EOF { - break - } - if err != nil { t.Fatal(pass, i, "Read error:", err) } diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py index 0cc01b7310..6ddb09e4e8 100644 --- a/python/paddle/v2/master/client.py +++ b/python/paddle/v2/master/client.py @@ -28,16 +28,11 @@ class client(object): # return format: (record, errno) # errno = 0: ok - # = -1: EOF # < -1: error def next_record(self): p = ctypes.c_char_p() ret = ctypes.pointer(p) size = lib.paddle_next_record(self.c, ret) - if size == -1: - # EOF - return None, -1 - if size < -1: # Error return None, size From 97bbd179569f48bfcf1a3ff3225c331ad8e3fbf4 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 30 Jun 2017 03:14:29 +0000 Subject: [PATCH 11/79] rm cloud EOF --- go/master/c/client.go | 1 - 1 file changed, 1 deletion(-) diff --git a/go/master/c/client.go b/go/master/c/client.go index a37894fefe..13ed3b7680 100644 --- a/go/master/c/client.go +++ b/go/master/c/client.go @@ -86,7 +86,6 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int // return value: // 0:ok -// -1:EOF // -2:error //export paddle_next_record func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { From 26e661bc51e2fac36c3692d748b7db8a950cb370 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Mon, 3 Jul 2017 03:05:36 +0000 Subject: [PATCH 12/79] fix by helin's comments --- go/master/c/client.go | 4 ++-- python/paddle/v2/master/client.py | 4 ++-- 
python/paddle/v2/reader/creator.py | 34 ++++++++++++++++++------------ 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/go/master/c/client.go b/go/master/c/client.go index 635688f196..31f4311974 100644 --- a/go/master/c/client.go +++ b/go/master/c/client.go @@ -106,7 +106,7 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int // return value: // 0:ok -// -2:error +// -1:error //export paddle_next_record func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { c := get(client) @@ -115,7 +115,7 @@ func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { // Error // TODO: return the type of error? *record = (*C.uchar)(nullPtr) - return -2 + return -1 } if len(r) == 0 { diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py index 6ddb09e4e8..70f9e43c96 100644 --- a/python/paddle/v2/master/client.py +++ b/python/paddle/v2/master/client.py @@ -28,12 +28,12 @@ class client(object): # return format: (record, errno) # errno = 0: ok - # < -1: error + # < 0: error def next_record(self): p = ctypes.c_char_p() ret = ctypes.pointer(p) size = lib.paddle_next_record(self.c, ret) - if size < -1: + if size < 0: # Error return None, size diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py index 2e8626e565..20624d5286 100644 --- a/python/paddle/v2/reader/creator.py +++ b/python/paddle/v2/reader/creator.py @@ -57,29 +57,31 @@ def text_file(path): return reader -def recordio_local(paths): +def recordio_local(paths, buf_size=100): """ - Creates a data reader that outputs record one one by one - from given local recordio fils path. + Creates a data reader from given RecordIO file paths separated by ",", + glob pattern is supported. :path: path of recordio files. :returns: data reader of recordio files. """ import recordio as rec + import paddle.v2.reader.decorator as dec def reader(): - for i, path in enumerate(paths): - f = rec.reader(path) - while True: - r = f.read() - if r is None: - break - yield r - f.close() + a = ','.join(paths) + f = rec.reader(a) + while True: + r = f.read() + if r is None: + break + yield r + f.close() + + return dec.buffered(reader, buf_size) - return reader -def recordio(paths, addr="", buf_size=100): +def recordio(paths, buf_size=100): """ Creates a data reader that outputs record one one by one from given local or cloud recordio path. 
@@ -92,6 +94,12 @@ def recordio(paths, addr="", buf_size=100):
     if "KUBERNETES_SERVICE_HOST" not in os.environ.keys():
         return recordio_local(paths)
 
+    host_name = "MASTER_SERVICE_HOST"
+    if host_name not in os.environ.keys():
+        raise Exception('cannot find ' + host_name + ' in the environment.')
+
+    addr = os.environ[host_name]
+
     def reader():
         c = cloud(addr, buf_size)
         c.set_dataset(paths)
From 16b8e59e1ab8cb33d175ce6d4bfe3f19419acb06 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Mon, 3 Jul 2017 15:32:51 +0800
Subject: [PATCH 13/79] Update new authors

---
 AUTHORS.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/AUTHORS.md b/AUTHORS.md
index d5baee2161..08eaab10ea 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -1,5 +1,23 @@
 | Github account | name |
 |---|---|
+| beckett1124 | Bin Qi |
+| Canpio | Jiayi Feng |
+| chengxiaohua1105 | Xiaohua Cheng |
+| xushaoyong | Shaoyong Xu |
+| liuyuan | Yuan Liu |
+| xujun05 | Jun Xu |
+| dzhwinter | Zhihong Dong |
+| Guo Sheng | Sheng Guo |
+| kuke | Yibing Liu |
+| llxxxll | YongFeng Liu |
+| cxysteven | Xingyi Cheng |
+| NHZlX | Zhaolong Xing |
+| pakchoi | Chuanjiang Song |
+| pkuyym | Yaming Yang |
+| Superjom | Chunwei Yan |
+| wanghaoshuang | Haoshuang Wang |
+| wangzhen-nlp | Zhen Wang |
+| wwhu | Weiwei Hu |
 | reyoung | Yang Yu |
 | gangliao | Gang Liao |
 | luotao01 | Tao Luo |
From 696ba1d2e1f3fdac763c4dd29b5353b512f9b7fa Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Mon, 3 Jul 2017 16:01:50 +0800
Subject: [PATCH 14/79] init tensor_test.cc

---
 paddle/framework/CMakeLists.txt |  1 +
 paddle/framework/tensor.h       |  5 +--
 paddle/framework/tensor_test.cc | 71 +++++++++++++++++++++++++++++++++
 3 files changed, 74 insertions(+), 3 deletions(-)
 create mode 100644 paddle/framework/tensor_test.cc

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 6aa6b9bc2d..41bf3837aa 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -2,6 +2,7 @@ cc_library(ddim SRCS ddim.cc)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
+cc_test(tensor_test SRCS tensor_test.cc DEPS ddim)
 cc_test(variable_test SRCS variable_test.cc)
 cc_test(scope_test SRCS scope_test.cc)
 cc_test(enforce_test SRCS enforce_test.cc)
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 067f2a8526..8d658d5097 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -19,13 +19,12 @@ namespace framework {
 
 class Tensor {
   using paddle::platform::Place;
-  using paddle::platform::get_place;
 
  public:
  template <typename T>
  const T* data() const {
-    PADDLE_ASSERT(holder_ != nullptr,
-                  "Tensor::data must be called after Tensor::mutable_data");
+    PADDLE_ENFORCE(holder_ != nullptr,
+                   "Tensor::data must be called after Tensor::mutable_data");
     return static_cast<const T*>(holder->Ptr());
   }
 
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
new file mode 100644
index 0000000000..fa44b24b64
--- /dev/null
+++ b/paddle/framework/tensor_test.cc
@@ -0,0 +1,71 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#include "paddle/framework/tensor.h"
+#include <gtest/gtest.h>
+
+TEST(Tensor, Data) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  Tensor cpu_tensor;
+}
+
+/* mutable_data() is not tested at present
+   because Memory::Alloc() and Memory::Free() are not ready yet.
+
+TEST(Tensor, MutableData) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  Tensor cpu_tensor;
+  float* p1 = nullptr;
+  float* p2 = nullptr;
+  // initialization
+  p1 = cpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
+  EXPECT_NE(p1, nullptr);
+  // set cpu_tensor a new dim with large size
+  // memory is supposed to be re-allocated
+  p2 = cpu_tensor.mutable_data<float>(make_ddim({3, 4}));
+  EXPECT_NE(p2, nullptr);
+  EXPECT_NE(p1, p2);
+  // set cpu_tensor a new dim with same size
+  // memory block is supposed to be unchanged
+  p1 = cpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
+  EXPECT_EQ(p1, p2);
+  // set cpu_tensor a new dim with smaller size
+  // memory block is supposed to be unchanged
+  p2 = cpu_tensor.mutable_data<float>(make_ddim({2, 2}));
+  EXPECT_EQ(p1, p2);
+
+  Tensor gpu_tensor;
+  float* p1 = nullptr;
+  float* p2 = nullptr;
+  // initialization
+  p1 = gpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
+  EXPECT_NE(p1, nullptr);
+  // set gpu_tensor a new dim with large size
+  // memory is supposed to be re-allocated
+  p2 = gpu_tensor.mutable_data<float>(make_ddim({3, 4}));
+  EXPECT_NE(p2, nullptr);
+  EXPECT_NE(p1, p2);
+  // set gpu_tensor a new dim with same size
+  // memory block is supposed to be unchanged
+  p1 = gpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
+  EXPECT_EQ(p1, p2);
+  // set gpu_tensor a new dim with smaller size
+  // memory block is supposed to be unchanged
+  p2 = gpu_tensor.mutable_data<float>(make_ddim({2, 2}));
+  EXPECT_EQ(p1, p2);
+}
+*/
\ No newline at end of file
From 9f408dfb1b81daee795d9c0d8ed177e6ab4e10a8 Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Mon, 3 Jul 2017 16:20:25 +0800
Subject: [PATCH 15/79] fix some compile error

---
 paddle/framework/tensor.h | 33 ++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 8d658d5097..7fa662fbb5 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -14,32 +14,39 @@ limitations under the License.
*/
 
 #pragma once
+#include <memory>
+#include <type_traits>
+#include <typeinfo>
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/enforce.h"
+#include "paddle/memory/memory.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/place.h"
+
 namespace paddle {
 namespace framework {
 
 class Tensor {
-  using paddle::platform::Place;
-
  public:
  template <typename T>
  const T* data() const {
     PADDLE_ENFORCE(holder_ != nullptr,
                    "Tensor::data must be called after Tensor::mutable_data");
-    return static_cast<const T*>(holder->Ptr());
+    return static_cast<const T*>(holder_->Ptr());
   }
 
   template <typename T,
-            typename std::enable_if<std::is_pod<T>::value>::type>
-  T* mutable_data(DDim dims, Place place) {
+            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
+  T* mutable_data(DDim dims, paddle::platform::Place place) {
     if (holder_ == nullptr || holder_->Place() != place ||
-        holder_->Size() < dims.product() * sizeof(T)) {
-      holder_.reset(new PlaceholderImpl<T>(place, dims.product() * sizeof(T)));
+        holder_->Size() < product(dims) * sizeof(T)) {
+      holder_.reset(new PlaceholderImpl<T>(place, product(dims) * sizeof(T)));
     }
     return static_cast<T*>(holder_->Ptr());
   }
 
   template <typename T,
-            typename std::enable_if<std::is_pod<T>::value>::type>
+            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
   T* mutable_data(DDim dims) {
     return mutable_data<T>(dims, paddle::platform::get_place());
   }
@@ -50,24 +57,24 @@ class Tensor {
   struct Placeholder {
     virtual ~Placeholder() {}
     virtual void* Ptr() const = 0;
-    virtual Place Place() const = 0;
+    virtual paddle::platform::Place Place() const = 0;
     virtual size_t Size() const = 0;
   };
 
   template <typename T>
   struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(Place pl, size_t size)
+    PlaceholderImpl(paddle::platform::Place pl, size_t size)
         : ptr_(paddle::memory::Alloc(pl, size), paddle::memory::Deleter(pl)),
           place_(pl),
           size_(size) {}
 
    virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); }
    virtual size_t Size() const { return size_; }
-    virtual Place Place() const { return place_; }
+    virtual paddle::platform::Place Place() const { return place_; }
 
    std::unique_ptr<T, paddle::memory::Deleter> ptr_;
-    Place place_;  // record the place of ptr_.
-    size_t size_;  // size of the memory block.
+    paddle::platform::Place place_;  // record the place of ptr_.
+    size_t size_;                    // size of the memory block.
  };
 
  std::unique_ptr<Placeholder> holder_;  // holds the memory block if allocated.
From bdd27208778e82ca037b2b3f6d25337403db4092 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Mon, 3 Jul 2017 16:26:33 +0800
Subject: [PATCH 16/79] Add OpProto implementation

OpProto is a proto message that helps 3rd-party language bindings,
e.g. `Python`, to generate operator creation methods. The operator
creation method is the low-level API for 3rd-party language bindings.
Op creation methods take the user's input in that language, convert
the user's input into an `OpDesc` message, then pass that `OpDesc`
message to Paddle's C++ core to create an operator.

* A separate `attr_type.proto` is added, because that file would be
  included by `op_desc.proto` in the future.
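As a rough sketch only (not code in this patch), a binding generator
could walk an `OpProto` to assemble the doc-string of the generated op
creation method. `DocString` below is a hypothetical helper invented
for illustration, not an API introduced here:

    // Hypothetical sketch: build a human-readable doc-string from OpProto.
    #include <string>
    #include <paddle/framework/op_proto.pb.h>

    std::string DocString(const paddle::framework::OpProto& proto) {
      // Stitch the op-level comment and each field's comment together.
      std::string doc = proto.comment() + "\n";
      for (const auto& input : proto.inputs())
        doc += "input " + input.name() + ": " + input.comment() + "\n";
      for (const auto& output : proto.outputs())
        doc += "output " + output.name() + ": " + output.comment() + "\n";
      for (const auto& attr : proto.attrs())
        doc += "attr " + attr.name() + ": " + attr.comment() + "\n";
      return doc;
    }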
---
 paddle/framework/CMakeLists.txt  |  1 +
 paddle/framework/attr_type.proto | 28 +++++++++++++
 paddle/framework/op_proto.proto  | 69 ++++++++++++++++++++++++++++++++
 3 files changed, 98 insertions(+)
 create mode 100644 paddle/framework/attr_type.proto
 create mode 100644 paddle/framework/op_proto.proto

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 6aa6b9bc2d..3284015908 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -5,3 +5,4 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 cc_test(variable_test SRCS variable_test.cc)
 cc_test(scope_test SRCS scope_test.cc)
 cc_test(enforce_test SRCS enforce_test.cc)
+proto_library(op_proto SRCS op_proto.proto attr_type.proto)
diff --git a/paddle/framework/attr_type.proto b/paddle/framework/attr_type.proto
new file mode 100644
index 0000000000..2d8e0476d7
--- /dev/null
+++ b/paddle/framework/attr_type.proto
@@ -0,0 +1,28 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax="proto2";
+package paddle.framework;
+
+// Attribute Type for paddle's Op.
+// Op contains many attributes. Each type of attributes could be different.
+// The AttrType will be shared between AttrDesc and AttrProto.
+enum AttrType {
+    INT = 0;
+    FLOAT = 1;
+    STRING = 2;
+    INTS = 3;
+    FLOATS = 4;
+    STRINGS = 5;
+}
\ No newline at end of file
diff --git a/paddle/framework/op_proto.proto b/paddle/framework/op_proto.proto
new file mode 100644
index 0000000000..22df6f9c6b
--- /dev/null
+++ b/paddle/framework/op_proto.proto
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// Protocol Message for 3rd-party language binding.
+//
+// Paddle Python package will use `OpProto` to generate op creation methods.
+// The op creation methods take the user's input, generate an `OpDesc` proto
+// message, then pass `OpDesc` to the C++ side to create the Op pointer.
+//
+syntax="proto2";
+package paddle.framework;
+
+import "attr_type.proto";
+
+// Attribute protocol message for 3rd-party language binding.
+// It stores which attributes an Op supports and the type of each attribute.
+message AttrProto {
+    // Supported attribute name. e.g. `scale` for cosine op.
+    required string name = 1;
+
+    // Supported attribute type.
+    required AttrType type = 2;
+
+    // Supported attribute comments. It helps 3rd-party languages generate doc-strings.
+    required string comment = 3;
+}
+
+// Input or output message for 3rd-party language binding.
+// It contains the parameter name and its comment.
+message VarProto {
+    // Input or output name in that op creation function.
+    // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names.
+    required string name = 1;
+
+    // The comment for that input. It helps 3rd-party languages generate doc-strings.
+    required string comment = 2;
+}
+
+// Op protocol message for 3rd-party language binding.
+// It contains all information for generating op creation method.
+message OpProto {
+    // The input information to generate op creation method.
+    repeated VarProto inputs = 1;
+
+    // The output information to generate op creation method.
+    repeated VarProto outputs = 2;
+
+    // The attribute information to generate op creation method.
+    repeated AttrProto attrs = 3;
+
+    // The comments for that Op. It helps 3rd-party languages generate
+    // doc-strings. The whole documentation of that Op is generated by comment,
+    // inputs, outputs, attrs together.
+    required string comment = 4;
+
+    // The type of that Op.
+    required string type = 5;
+}
From c9cd5b6e9dd9c92ae236709c61e3cde7a17ee2b9 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Mon, 3 Jul 2017 16:39:36 +0800
Subject: [PATCH 17/79] Update Authors.md

---
 AUTHORS.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/AUTHORS.md b/AUTHORS.md
index 08eaab10ea..09698ac140 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -18,6 +18,8 @@
 | wanghaoshuang | Haoshuang Wang |
 | wangzhen-nlp | Zhen Wang |
 | wwhu | Weiwei Hu |
+| xinghai-sun | XingHai Sun |
+| zhaopu7 | Pu Zhao |
 | reyoung | Yang Yu |
 | gangliao | Gang Liao |
 | luotao01 | Tao Luo |
From 9bf98168281952efee1ed5fd1a61b743b0847834 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Mon, 3 Jul 2017 16:47:11 +0800
Subject: [PATCH 18/79] Add OpProto unittest.

---
 paddle/framework/CMakeLists.txt   |  4 +++-
 paddle/framework/op_proto_test.cc | 31 +++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 1 deletion(-)
 create mode 100644 paddle/framework/op_proto_test.cc

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 3284015908..50107faaed 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -5,4 +5,6 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 cc_test(variable_test SRCS variable_test.cc)
 cc_test(scope_test SRCS scope_test.cc)
 cc_test(enforce_test SRCS enforce_test.cc)
-proto_library(op_proto SRCS op_proto.proto attr_type.proto)
+proto_library(attr_type SRCS attr_type.proto)
+proto_library(op_proto SRCS op_proto.proto)
+cc_test(op_proto_test SRCS op_proto_test.cc DEPS attr_type op_proto protobuf)
diff --git a/paddle/framework/op_proto_test.cc b/paddle/framework/op_proto_test.cc
new file mode 100644
index 0000000000..9c054bde44
--- /dev/null
+++ b/paddle/framework/op_proto_test.cc
@@ -0,0 +1,31 @@
+#include <gtest/gtest.h>
+#include <paddle/framework/op_proto.pb.h>
+
+TEST(TestOpProto, ALL) {
+  paddle::framework::OpProto proto;
+  {
+    auto ipt = proto.mutable_inputs()->Add();
+    *ipt->mutable_name() = "a";
+    *ipt->mutable_comment() = "the one input of cosine op";
+  }
+  {
+    auto ipt = proto.mutable_inputs()->Add();
+    *ipt->mutable_name() = "b";
+    *ipt->mutable_comment() = "the other input of cosine op";
+  }
+  {
+    auto opt = proto.mutable_outputs()->Add();
+    *opt->mutable_name() = "output";
+    *opt->mutable_comment() = "the output of cosine op";
+  }
+  {
+    auto attr = proto.mutable_attrs()->Add();
+    *attr->mutable_name() = "scale";
+    attr->set_type(paddle::framework::AttrType::FLOAT);
+    *attr->mutable_comment() = "the scale attribute of cosine op";
+  }
+  proto.set_type("cos");
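+  // `comment`, together with inputs, outputs and attrs, forms the whole
+  // documentation of this op (see op_proto.proto).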
+  *proto.mutable_comment() = "cosine op, output = scale * cos(a, b)";
+
+  ASSERT_TRUE(proto.IsInitialized());
+}
\ No newline at end of file
From 0e61730039b11861d5a90188987bad2241a08f95 Mon Sep 17 00:00:00 2001
From: Luo Tao
Date: Mon, 3 Jul 2017 12:05:38 +0800
Subject: [PATCH 19/79] stride pooling for max and average layer

---
 paddle/gserver/layers/MaxLayer.h                   |  5 ++
 .../layers/SequenceLastInstanceLayer.cpp           |  3 +-
 paddle/gserver/layers/SequencePoolLayer.cpp        |  5 +-
 paddle/gserver/layers/SequencePoolLayer.h          |  2 -
 paddle/gserver/tests/test_LayerGrad.cpp            | 12 ++++-
 paddle/parameter/Argument.cpp                      |  6 +--
 paddle/parameter/Argument.h                        |  2 +-
 paddle/parameter/tests/test_argument.cpp           |  4 +-
 python/paddle/trainer/config_parser.py             |  8 +++
 .../paddle/trainer_config_helpers/layers.py        | 12 +++++
 .../protostr/test_sequence_pooling.protostr        | 51 +++++++++++++++++++
 .../tests/configs/test_sequence_pooling.py         |  8 +++
 12 files changed, 103 insertions(+), 15 deletions(-)

diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h
index baa58ca2d7..adf7ab4ae4 100644
--- a/paddle/gserver/layers/MaxLayer.h
+++ b/paddle/gserver/layers/MaxLayer.h
@@ -26,6 +26,11 @@ namespace paddle {
  * If SequenceLevel = kNonSeq:
  *   Output: output size is the number of input sequences (NOT input instances)
  *   output[i] = max_{for each instance in this sequence}{input[i]}
+ * If stride_ > 0:
+ *   Output: a shortened sequence. The operation of getting the max instance
+ *           of a sequence is independently performed on every slice of the
+ *           input sequence, which is obtained by sliding a window with the
+ *           window size set to stride_.
  * If SequenceLevel = kSeq:
  *   Check input sequence must has sub-sequence
  *   Output: output size is the number of input sub-sequences
diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
index 944c705166..8127cbf09c 100644
--- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
+++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
@@ -73,8 +73,7 @@ bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
 void SequenceLastInstanceLayer::forward(PassType passType) {
   SequencePoolLayer::forward(passType);
 
-  auto starts = (stride_ > 0) ? stridePositions_->getData()
-                              : startPositions_->getData(false);
+  auto starts = startPositions_->getData(false);
   MatrixPtr inputValue = getInputValue(0);
   MatrixPtr outputValue = getOutputValue();
 
diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp
index 4179a9e7e0..2a693b110a 100644
--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
@@ -72,9 +72,8 @@ void SequencePoolLayer::forward(PassType passType) {
   if (stride_ > 0) {
     CHECK_EQ(input.hasSubseq(), 0UL)
         << "sequence stride pooling is invalid for hasSubseq now";
-    output_.poolSequenceWithStride(
-        input, stride_, &stridePositions_, reversed_);
-    newBatchSize_ = stridePositions_->getSize() - 1;
+    output_.poolSequenceWithStride(input, stride_, &startPositions_, reversed_);
+    newBatchSize_ = startPositions_->getSize() - 1;
   }
 
   resetOutput(newBatchSize_, dim);
diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h
index 293d1bf278..058627def8 100644
--- a/paddle/gserver/layers/SequencePoolLayer.h
+++ b/paddle/gserver/layers/SequencePoolLayer.h
@@ -47,8 +47,6 @@ protected:
   size_t newBatchSize_;
   ICpuGpuVectorPtr startPositions_;
   int stride_;
-  // Store the start position of each window.
-  IVectorPtr stridePositions_;
   // Whether the input sequence is reversed or not.
   bool reversed_ = false;
 
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 297756025b..ed067e7c3a 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -845,8 +845,12 @@ void testDegradeLayer(bool hasSubseq,
 
 TEST(Layer, MaxLayer) {
   testDegradeLayer(false, "max", "non-seq", -1);  // seq max to non-seq
-  testDegradeLayer(true, "max", "non-seq", -1);   // hasSubseq max to non-seq
-  testDegradeLayer(true, "max", "seq", -1);       // hasSubseq max to seq
+  testDegradeLayer(false,
+                   "max",
+                   "non-seq",
+                   5);  // seq max to a shortened seq, stride window = 5
+  testDegradeLayer(true, "max", "non-seq", -1);  // hasSubseq max to non-seq
+  testDegradeLayer(true, "max", "seq", -1);      // hasSubseq max to seq
 }
 
 TEST(Layer, SequenceLastInstanceLayer) {
@@ -868,6 +872,10 @@ TEST(Layer, SequenceLastInstanceLayer) {
 
 TEST(Layer, AverageLayer) {
   testDegradeLayer(false, "average", "non-seq", -1);  // seq average to non-seq
+  testDegradeLayer(false,
+                   "average",
+                   "non-seq",
+                   5);  // seq average to a shortened seq, stride window = 5
   testDegradeLayer(
       true, "average", "non-seq", -1);  // hasSubseq average to non-seq
   testDegradeLayer(true, "average", "seq", -1);  // hasSubseq average to seq
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 5beced3bb5..ef72b973c1 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -561,7 +561,7 @@ void Argument::degradeSequence(const Argument& input) {
 
 void Argument::poolSequenceWithStride(const Argument& input,
                                       size_t stride,
-                                      IVectorPtr* stridePostions,
+                                      ICpuGpuVectorPtr* stridePositions,
                                       bool reversed) {
   // If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5,
   // then sequenceStartPositions = [0, 2, 3, 4, 7].
@@ -598,8 +598,8 @@ void Argument::poolSequenceWithStride(const Argument& input,
   stridePos.emplace_back(starts[numSequences]);
   int size = stridePos.size();
   CHECK_EQ(size - 1, tgtBuf[numSequences]);
-  IVector::resizeOrCreate(*stridePostions, size, false);
-  (*stridePostions)->copyFrom(stridePos.data(), size);
+  ICpuGpuVector::resizeOrCreate(*stridePositions, size, false);
+  (*stridePositions)->getMutableVector(false)->copyFrom(stridePos.data(), size);
 }
 
 void Argument::getValueString(
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index 09bd633616..0ccdef802e 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -299,7 +299,7 @@ struct Argument {
    */
   void poolSequenceWithStride(const Argument& input,
                               size_t stride,
-                              IVectorPtr* stridePositions,
+                              ICpuGpuVectorPtr* stridePositions,
                               bool reversed = false);
   /**
    * @brief getValueString will return the argument's output in string. There
diff --git a/paddle/parameter/tests/test_argument.cpp b/paddle/parameter/tests/test_argument.cpp
index 98ab013548..19df6ea957 100644
--- a/paddle/parameter/tests/test_argument.cpp
+++ b/paddle/parameter/tests/test_argument.cpp
@@ -31,7 +31,7 @@ TEST(Argument, poolSequenceWithStride) {
   int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30};
 
   for (auto reversed : {false, true}) {
-    IVectorPtr stridePositions;
+    ICpuGpuVectorPtr stridePositions;
     output.poolSequenceWithStride(
         input, 5 /* stride */, &stridePositions, reversed);
 
@@ -45,7 +45,7 @@ TEST(Argument, poolSequenceWithStride) {
     CHECK_EQ(stridePositions->getSize(), 8UL);
     auto result = reversed ?
strideResultReversed : strideResult;
     for (int i = 0; i < 8; i++) {
-      CHECK_EQ(stridePositions->getData()[i], result[i]);
+      CHECK_EQ(stridePositions->getData(false)[i], result[i]);
     }
   }
 }
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index b7418101d8..5ca7df7476 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2420,10 +2420,14 @@ class MaxLayer(LayerBase):
                  trans_type='non-seq',
                  bias=False,
                  output_max_index=None,
+                 stride=-1,
                  **xargs):
         super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, **xargs)
         config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input')
+        if trans_type == 'seq':
+            config_assert(stride == -1, 'subseq does not support stride window')
         self.config.trans_type = trans_type
+        self.config.seq_pool_stride = stride
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             self.set_layer_size(input_layer.size)
@@ -2685,11 +2689,15 @@ class AverageLayer(LayerBase):
                  average_strategy='average',
                  trans_type='non-seq',
                  bias=False,
+                 stride=-1,
                  **xargs):
         super(AverageLayer, self).__init__(
             name, 'average', 0, inputs=inputs, **xargs)
         self.config.average_strategy = average_strategy
+        if trans_type == 'seq':
+            config_assert(stride == -1, 'subseq does not support stride window')
         self.config.trans_type = trans_type
+        self.config.seq_pool_stride = stride
         config_assert(len(inputs) == 1, 'AverageLayer must have 1 input')
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index a601d5c84a..5e8bf4b203 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1090,10 +1090,16 @@ def pooling_layer(input,
                   name=None,
                   bias_attr=None,
                   agg_level=AggregateLevel.TO_NO_SEQUENCE,
+                  stride=-1,
                   layer_attr=None):
     """
     Pooling layer for sequence inputs, not used for Image.
 
+    If stride > 0, this layer slides a window whose size is determined by
+    stride, and returns the pooling value of the window as the output. Thus,
+    a long sequence will be shortened. Note that for a sequence with
+    sub-sequences, the default value of stride is -1.
+
     The example usage is:
 
     .. code-block:: python
@@ -1112,6 +1118,8 @@ def pooling_layer(input,
     :param pooling_type: Type of pooling, MaxPooling(default), AvgPooling,
                          SumPooling, SquareRootNPooling.
     :type pooling_type: BasePoolingType|None
+    :param stride: window size.
+    :type stride: int
     :param bias_attr: Bias parameter attribute. False if no bias.
     :type bias_attr: ParameterAttribute|None|False
     :param layer_attr: The Extra Attributes for layer, such as dropout.
@@ -1129,12 +1137,16 @@ def pooling_layer(input, extra_dict['output_max_index'] = pooling_type.output_max_index extra_dict.update(ExtraLayerAttribute.to_kwargs(layer_attr)) + if agg_level == AggregateLevel.TO_SEQUENCE: + assert stride == -1 + Layer( name=name, type=pooling_type.name, inputs=[Input(input.name)], bias=ParamAttr.to_bias(bias_attr), trans_type=agg_level, + stride=stride, **extra_dict) return LayerOutput( diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr index 5a217f5544..8989561df0 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr @@ -14,6 +14,7 @@ layers { input_layer_name: "dat_in" } trans_type: "seq" + seq_pool_stride: -1 } layers { name: "__seq_pooling_1__" @@ -24,6 +25,7 @@ layers { input_layer_name: "dat_in" } trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__seq_pooling_2__" @@ -35,6 +37,7 @@ layers { } average_strategy: "average" trans_type: "seq" + seq_pool_stride: -1 } layers { name: "__seq_pooling_3__" @@ -46,6 +49,7 @@ layers { } average_strategy: "average" trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__seq_pooling_4__" @@ -57,6 +61,7 @@ layers { } average_strategy: "sum" trans_type: "seq" + seq_pool_stride: -1 } layers { name: "__seq_pooling_5__" @@ -68,6 +73,7 @@ layers { } average_strategy: "sum" trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__seq_pooling_6__" @@ -77,8 +83,44 @@ layers { inputs { input_layer_name: "dat_in" } + trans_type: "non-seq" + seq_pool_stride: 5 +} +layers { + name: "__seq_pooling_7__" + type: "average" + size: 100 + active_type: "" + inputs { + input_layer_name: "dat_in" + } + average_strategy: "average" + trans_type: "non-seq" + seq_pool_stride: 5 +} +layers { + name: "__seq_pooling_8__" + type: "average" + size: 100 + active_type: "" + inputs { + input_layer_name: "dat_in" + } + average_strategy: "sum" + trans_type: "non-seq" + seq_pool_stride: 5 +} +layers { + name: "__seq_pooling_9__" + type: "max" + size: 100 + active_type: "" + inputs { + input_layer_name: "dat_in" + } output_max_index: true trans_type: "non-seq" + seq_pool_stride: -1 } input_layer_names: "dat_in" output_layer_names: "__seq_pooling_0__" @@ -88,6 +130,9 @@ output_layer_names: "__seq_pooling_3__" output_layer_names: "__seq_pooling_4__" output_layer_names: "__seq_pooling_5__" output_layer_names: "__seq_pooling_6__" +output_layer_names: "__seq_pooling_7__" +output_layer_names: "__seq_pooling_8__" +output_layer_names: "__seq_pooling_9__" sub_models { name: "root" layer_names: "dat_in" @@ -98,6 +143,9 @@ sub_models { layer_names: "__seq_pooling_4__" layer_names: "__seq_pooling_5__" layer_names: "__seq_pooling_6__" + layer_names: "__seq_pooling_7__" + layer_names: "__seq_pooling_8__" + layer_names: "__seq_pooling_9__" input_layer_names: "dat_in" output_layer_names: "__seq_pooling_0__" output_layer_names: "__seq_pooling_1__" @@ -106,6 +154,9 @@ sub_models { output_layer_names: "__seq_pooling_4__" output_layer_names: "__seq_pooling_5__" output_layer_names: "__seq_pooling_6__" + output_layer_names: "__seq_pooling_7__" + output_layer_names: "__seq_pooling_8__" + output_layer_names: "__seq_pooling_9__" is_recurrent_layer_group: false } diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py 
b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py index 3c49eb56c1..3c205eabd8 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py @@ -14,6 +14,14 @@ for pt in POOL_TYPE: for al in AGG_LEVEL: opts.append(pooling_layer(input=din, agg_level=al, pooling_type=pt())) +for pt in POOL_TYPE: + opts.append( + pooling_layer( + input=din, + agg_level=AggregateLevel.TO_NO_SEQUENCE, + pooling_type=pt(), + stride=5)) + opts.append( pooling_layer( input=din, pooling_type=MaxPooling(output_max_index=True))) From e146fe836bc5178b497329dacddc7a1dc5063bcd Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 3 Jul 2017 17:22:58 +0800 Subject: [PATCH 20/79] fix compile errors and add assert test --- paddle/framework/tensor.h | 25 ++++++-- paddle/framework/tensor_test.cc | 100 ++++++++++++++++++-------------- 2 files changed, 76 insertions(+), 49 deletions(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 7fa662fbb5..73eedd7375 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -20,7 +20,6 @@ limitations under the License. */ #include "paddle/framework/ddim.h" #include "paddle/framework/enforce.h" #include "paddle/memory/memory.h" -#include "paddle/platform/assert.h" #include "paddle/platform/place.h" namespace paddle { @@ -63,21 +62,35 @@ class Tensor { template struct PlaceholderImpl : public Placeholder { - PlaceholderImpl(paddle::platform::Place pl, size_t size) - : ptr_(paddle::memory::Alloc(pl, size), paddle::memory::Deleter(pl)), - place_(pl), + private: + class Deleter { + public: + Deleter(platform::Place place) : place_(place) {} + void operator()(T* ptr) { + paddle::memory::Free(place_, static_cast(ptr)); + } + + private: + paddle::platform::Place place_; + }; + + public: + PlaceholderImpl(paddle::platform::Place place, size_t size) + : ptr_(static_cast(paddle::memory::Alloc(place, size)), + Deleter(place)), + place_(place), size_(size) {} virtual void* Ptr() const { return static_cast(ptr_.get()); } virtual size_t Size() const { return size_; } virtual paddle::platform::Place Place() const { return place_; } - std::unique_ptr ptr_; + std::unique_ptr ptr_; paddle::platform::Place place_; // record the place of ptr_. size_t size_; // size of the memory block. }; - std::unique_ptr holder_; // holds the memory block if allocated. + std::shared_ptr holder_; // holds the memory block if allocated. 
 };
 
 } // namespace framework

diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index fa44b24b64..f76a31e921 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -13,12 +13,23 @@
 #include "paddle/framework/tensor.h"
 #include <gtest/gtest.h>
+#include <string>
 
-TEST(Tensor, Data) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
+TEST(Tensor, ASSERT) {
+  paddle::framework::Tensor cpu_tensor;
 
-  Tensor cpu_tensor;
+  bool caught = false;
+  try {
+    const double* p __attribute__((unused)) = cpu_tensor.data<double>();
+  } catch (paddle::framework::EnforceNotMet err) {
+    caught = true;
+    std::string msg = "Tensor::data must be called after Tensor::mutable_data";
+    const char* what = err.what();
+    for (size_t i = 0; i < msg.length(); ++i) {
+      ASSERT_EQ(what[i], msg[i]);
+    }
+  }
+  ASSERT_TRUE(caught);
 }
 
 /* mutable_data() is not tested at present
@@ -27,45 +38,48 @@ TEST(Tensor, MutableData) {
   using namespace paddle::framework;
   using namespace paddle::platform;
+  {
+    Tensor cpu_tensor;
+    float* p1 = nullptr;
+    float* p2 = nullptr;
+    // initialization
+    p1 = cpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
+    EXPECT_NE(p1, nullptr);
+    // set cpu_tensor a new dim with large size
+    // memory is supposed to be re-allocated
+    p2 = cpu_tensor.mutable_data<float>(make_ddim({3, 4}));
+    EXPECT_NE(p2, nullptr);
+    EXPECT_NE(p1, p2);
+    // set cpu_tensor a new dim with same size
+    // memory block is supposed to be unchanged
+    p1 = cpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
+    EXPECT_EQ(p1, p2);
+    // set cpu_tensor a new dim with smaller size
+    // memory block is supposed to be unchanged
+    p2 = cpu_tensor.mutable_data<float>(make_ddim({2, 2}));
+    EXPECT_EQ(p1, p2);
+  }
 
-  Tensor cpu_tensor;
-  float* p1 = nullptr;
-  float* p2 = nullptr;
-  // initialization
-  p1 = cpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
-  EXPECT_NE(p1, nullptr);
-  // set cpu_tensor a new dim with large size
-  // memory is supposed to be re-allocated
-  p2 = cpu_tensor.mutable_data<float>(make_ddim({3, 4}));
-  EXPECT_NE(p2, nullptr);
-  EXPECT_NE(p1, p2);
-  // set cpu_tensor a new dim with same size
-  // memory block is supposed to be unchanged
-  p1 = cpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
-  EXPECT_EQ(p1, p2);
-  // set cpu_tensor a new dim with smaller size
-  // memory block is supposed to be unchanged
-  p2 = cpu_tensor.mutable_data<float>(make_ddim({2, 2}));
-  EXPECT_EQ(p1, p2);
-
-  Tensor gpu_tensor;
-  float* p1 = nullptr;
-  float* p2 = nullptr;
-  // initialization
-  p1 = gpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
-  EXPECT_NE(p1, nullptr);
-  // set gpu_tensor a new dim with large size
-  // memory is supposed to be re-allocated
-  p2 = gpu_tensor.mutable_data<float>(make_ddim({3, 4}));
-  EXPECT_NE(p2, nullptr);
-  EXPECT_NE(p1, p2);
-  // set gpu_tensor a new dim with same size
-  // memory block is supposed to be unchanged
-  p1 = gpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
-  EXPECT_EQ(p1, p2);
-  // set gpu_tensor a new dim with smaller size
-  // memory block is supposed to be unchanged
-  p2 = gpu_tensor.mutable_data<float>(make_ddim({2, 2}));
-  EXPECT_EQ(p1, p2);
+  {
+    Tensor gpu_tensor;
+    float* p1 = nullptr;
+    float* p2 = nullptr;
+    // initialization
+    p1 = gpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
+    EXPECT_NE(p1, nullptr);
+    // set gpu_tensor a new dim with large size
+    // memory is supposed to be re-allocated
+    p2 = gpu_tensor.mutable_data<float>(make_ddim({3, 4}));
+    EXPECT_NE(p2, nullptr);
+    EXPECT_NE(p1, p2);
+    // set gpu_tensor a new dim with same size
+    // memory block is supposed to be unchanged
+    p1 = gpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
+    EXPECT_EQ(p1, p2);
+    // set gpu_tensor a new dim with smaller size
+    // memory block is supposed to be unchanged
+    p2 = gpu_tensor.mutable_data<float>(make_ddim({2, 2}));
+    EXPECT_EQ(p1, p2);
+  }
 }
 */
\ No newline at end of file

From d054a5eef806d76458f9155bf5a4ffb98ba474d3 Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Mon, 3 Jul 2017 19:08:27 +0800
Subject: [PATCH 21/79] re-submit

---
 paddle/framework/tensor.h       | 2 +-
 paddle/framework/tensor_test.cc | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 73eedd7375..f777661a1c 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -30,7 +30,7 @@ class Tensor {
   template <typename T>
   const T* data() const {
     PADDLE_ENFORCE(holder_ != nullptr,
-                   "Tensor::data must be called after Tensor::mutable_data");
+                   "Tensor::data must be called after Tensor::mutable_data.");
     return static_cast<const T*>(holder_->Ptr());
   }
 
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index f76a31e921..727d81f8d7 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -23,7 +23,7 @@ TEST(Tensor, ASSERT) {
     const double* p __attribute__((unused)) = cpu_tensor.data<double>();
   } catch (paddle::framework::EnforceNotMet err) {
     caught = true;
-    std::string msg = "Tensor::data must be called after Tensor::mutable_data";
+    std::string msg = "Tensor::data must be called after Tensor::mutable_data.";
     const char* what = err.what();
     for (size_t i = 0; i < msg.length(); ++i) {
       ASSERT_EQ(what[i], msg[i]);
@@ -82,4 +82,4 @@ TEST(Tensor, MutableData) {
     EXPECT_EQ(p1, p2);
   }
 }
-*/
\ No newline at end of file
+*/

From 2d1f95de873542ae591b4575e14539f26945b162 Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Mon, 3 Jul 2017 19:33:33 +0800
Subject: [PATCH 22/79] fix a compile error

---
 paddle/framework/tensor.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index f777661a1c..6a152f6a6d 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -37,8 +37,10 @@ class Tensor {
   template <typename T,
             typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
   T* mutable_data(DDim dims, paddle::platform::Place place) {
-    if (holder_ == nullptr || holder_->Place() != place ||
-        holder_->Size() < product(dims) * sizeof(T)) {
+    if (holder_ == nullptr ||
+        !(holder_->Place() ==
+          place) /* some versions of boost::variant don't have operator!= */
+        || holder_->Size() < product(dims) * sizeof(T)) {
       holder_.reset(new PlaceholderImpl<T>(place, product(dims) * sizeof(T)));
     }
     return static_cast<T*>(holder_->Ptr());

From e48e21da2b2522e4a9e1bca589d68eb02a419fb0 Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Mon, 3 Jul 2017 20:14:30 +0800
Subject: [PATCH 23/79] remove unnecessary include

---
 paddle/framework/tensor.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 6a152f6a6d..ce5d98b04e 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -16,7 +16,6 @@ limitations under the License.
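Taken together, patches 20-23 leave `mutable_data<T>` re-allocating only when one of three things holds: there is no holder yet, the requested place differs, or the existing block is too small. `std::enable_if<std::is_pod<T>::value>` rejects non-POD element types at compile time, and `!(a == b)` stands in for `operator!=`, which some boost::variant versions lack. A minimal sketch of just that guard — the `Place` stand-in and the `NeedsRealloc` helper are illustrative, not part of the patch:

```c++
#include <cstddef>
#include <type_traits>

// Illustrative stand-in for paddle::platform::Place.
struct Place {
  int device = -1;  // -1 means CPU
  bool operator==(const Place& o) const { return device == o.device; }
};

// Hypothetical helper mirroring the condition inside Tensor::mutable_data<T>;
// restricted to POD element types, exactly as in patch 22.
template <typename T,
          typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
bool NeedsRealloc(const void* holder, Place holder_place, size_t holder_size,
                  Place place, size_t numel) {
  return holder == nullptr ||
         // written as !(a == b): some boost::variant versions lack operator!=
         !(holder_place == place) || holder_size < numel * sizeof(T);
}
```

Under this rule, resizing a `{1, 2, 3}` float tensor to `{2, 2}` reuses the block (same place, smaller size), which is exactly what the commented-out `MutableData` test pins down with `EXPECT_EQ(p1, p2)`.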
*/ #include #include -#include #include "paddle/framework/ddim.h" #include "paddle/framework/enforce.h" #include "paddle/memory/memory.h" From 3ba7a738f3f3e77240d026db57692d66bc9481ed Mon Sep 17 00:00:00 2001 From: qijun Date: Mon, 3 Jul 2017 20:37:42 +0800 Subject: [PATCH 24/79] add dynamic_load --- paddle/platform/cublas.h | 87 +++++++++++++++++ paddle/platform/cudnn.h | 114 ++++++++++++++++++++++ paddle/platform/curand.h | 42 ++++++++ paddle/platform/dynamic_loader.cc | 157 ++++++++++++++++++++++++++++++ paddle/platform/dynamic_loader.h | 63 ++++++++++++ 5 files changed, 463 insertions(+) create mode 100644 paddle/platform/cublas.h create mode 100644 paddle/platform/cudnn.h create mode 100644 paddle/platform/curand.h create mode 100644 paddle/platform/dynamic_loader.cc create mode 100644 paddle/platform/dynamic_loader.h diff --git a/paddle/platform/cublas.h b/paddle/platform/cublas.h new file mode 100644 index 0000000000..70c9713325 --- /dev/null +++ b/paddle/platform/cublas.h @@ -0,0 +1,87 @@ +#include +#include "paddle/platform/dynamic_loader.h" + +namespace paddle { +namespace dyload { +namespace dynload { + +std::once_flag cublas_dso_flag; +void *cublas_dso_handle = nullptr; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cublas routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#ifdef PADDLE_USE_DSO +#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + cublasStatus_t operator()(Args... args) { \ + typedef cublasStatus_t (*cublasFunc)(Args...); \ + std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle); \ + void *p_##__name = dlsym(cublas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + } __name; // struct DynLoad__##__name +#else +#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + cublasStatus_t operator()(Args... 
args) { \ + return __name(args...); \ + } \ + } __name; // struct DynLoad__##__name +#endif + +#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name) + +// include all needed cublas functions in HPPL +// clang-format off +#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasSgemv) \ + __macro(cublasDgemv) \ + __macro(cublasSgemm) \ + __macro(cublasDgemm) \ + __macro(cublasSgeam) \ + __macro(cublasDgeam) \ + +DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasCreate) +DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasDestroy) +DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetStream) +DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetPointerMode) +DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasGetPointerMode) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetriBatched) +CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) + +#undef DYNAMIC_LOAD_CUBLAS_WRAP +#undef DYNAMIC_LOAD_CUBLAS_V2_WRAP +#undef CUBLAS_BLAS_ROUTINE_EACH + +} /* namespace dynload */ + +// clang-format on +#ifndef PADDLE_TYPE_DOUBLE +#define CUBLAS_GEAM dynload::cublasSgeam +#define CUBLAS_GEMV dynload::cublasSgemv +#define CUBLAS_GEMM dynload::cublasSgemm +#define CUBLAS_GETRF dynload::cublasSgetrfBatched +#define CUBLAS_GETRI dynload::cublasSgetriBatched +#else +#define CUBLAS_GEAM dynload::cublasDgeam +#define CUBLAS_GEMV dynload::cublasDgemv +#define CUBLAS_GEMM dynload::cublasDgemm +#define CUBLAS_GETRF dynload::cublasDgetrfBatched +#define CUBLAS_GETRI dynload::cublasDgetriBatched +#endif +} // namespace dyload +} // namespace paddle diff --git a/paddle/platform/cudnn.h b/paddle/platform/cudnn.h new file mode 100644 index 0000000000..ab878cd555 --- /dev/null +++ b/paddle/platform/cudnn.h @@ -0,0 +1,114 @@ +#include +#include "paddle/platform/dynamic_loader.h" + +namespace paddle { +namespace dyload { + +std::once_flag cudnn_dso_flag; +void* cudnn_dso_handle = nullptr; + +#ifdef PADDLE_USE_DSO + +#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using cudnn_func = decltype(__name(args...)) (*)(Args...); \ + std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); \ + void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + } __name; /* struct DynLoad__##__name */ + +#else + +#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + return __name(args...); \ + } \ + } __name; /* struct DynLoad__##__name */ + +#endif + +/** + * include all needed cudnn functions in HPPL + * different cudnn version has different interfaces + **/ +// clang-format off +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetTensor4dDescriptor) \ + __macro(cudnnSetTensor4dDescriptorEx) \ + __macro(cudnnGetConvolutionNdForwardOutputDim) \ + __macro(cudnnGetConvolutionForwardAlgorithm) \ + __macro(cudnnCreateTensorDescriptor) \ + __macro(cudnnDestroyTensorDescriptor) \ + __macro(cudnnCreateFilterDescriptor) \ + __macro(cudnnSetFilter4dDescriptor) \ + __macro(cudnnSetPooling2dDescriptor) \ + __macro(cudnnDestroyFilterDescriptor) \ + __macro(cudnnCreateConvolutionDescriptor) \ + __macro(cudnnCreatePoolingDescriptor) \ + __macro(cudnnDestroyPoolingDescriptor) \ + __macro(cudnnSetConvolution2dDescriptor) \ + __macro(cudnnDestroyConvolutionDescriptor) \ + __macro(cudnnCreate) \ + __macro(cudnnDestroy) \ + __macro(cudnnSetStream) \ + __macro(cudnnActivationForward) \ + __macro(cudnnConvolutionForward) \ + __macro(cudnnConvolutionBackwardBias) \ + __macro(cudnnGetConvolutionForwardWorkspaceSize) \ + __macro(cudnnTransformTensor) \ + __macro(cudnnPoolingForward) \ + __macro(cudnnPoolingBackward) \ + __macro(cudnnSoftmaxBackward) \ + __macro(cudnnSoftmaxForward) \ + __macro(cudnnGetVersion) \ + __macro(cudnnGetErrorString) +CUDNN_DNN_ROUTINE_EACH(DYNAMIC_LOAD_CUDNN_WRAP) + +#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ + __macro(cudnnAddTensor) \ + __macro(cudnnConvolutionBackwardData) \ + __macro(cudnnConvolutionBackwardFilter) +CUDNN_DNN_ROUTINE_EACH_R2(DYNAMIC_LOAD_CUDNN_WRAP) + +// APIs available after R3: +#if CUDNN_VERSION >= 3000 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ + __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize) \ + __macro(cudnnGetConvolutionBackwardDataAlgorithm) \ + __macro(cudnnGetConvolutionBackwardFilterAlgorithm) \ + __macro(cudnnGetConvolutionBackwardDataWorkspaceSize) +CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DYNAMIC_LOAD_CUDNN_WRAP) +#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R3 +#endif + + +// APIs available after R4: +#if CUDNN_VERSION >= 4007 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ + __macro(cudnnBatchNormalizationForwardTraining) \ + __macro(cudnnBatchNormalizationForwardInference) \ + __macro(cudnnBatchNormalizationBackward) +CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DYNAMIC_LOAD_CUDNN_WRAP) +#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R4 +#endif + +// APIs in R5 +#if CUDNN_VERSION >= 5000 +#define CUDNN_DNN_ROUTINE_EACH_R5(__macro) \ + __macro(cudnnCreateActivationDescriptor) \ + __macro(cudnnSetActivationDescriptor) \ + __macro(cudnnGetActivationDescriptor) \ + __macro(cudnnDestroyActivationDescriptor) +CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP) +#undef CUDNN_DNN_ROUTINE_EACH_R5 +#endif + +#undef CUDNN_DNN_ROUTINE_EACH +// clang-format on +} // namespace dyload +} // namespace paddle diff --git a/paddle/platform/curand.h b/paddle/platform/curand.h new file mode 100644 index 0000000000..692c024e6e --- /dev/null +++ b/paddle/platform/curand.h @@ -0,0 +1,42 @@ +#include +#include "paddle/platform/dynamic_loader.h" + +namespace paddle { +namespace dyload { +#ifdef PADDLE_USE_DSO +#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + curandStatus_t operator()(Args... 
args) { \
+      typedef curandStatus_t (*curandFunc)(Args...); \
+      std::call_once(curand_dso_flag, GetCurandDsoHandle, &curand_dso_handle); \
+      void *p_##__name = dlsym(curand_dso_handle, #__name); \
+      return reinterpret_cast<curandFunc>(p_##__name)(args...); \
+    } \
+  } __name; /* struct DynLoad__##__name */
+#else
+#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
+  struct DynLoad__##__name { \
+    template <typename... Args> \
+    curandStatus_t operator()(Args... args) { \
+      return __name(args...); \
+    } \
+  } __name; /* struct DynLoad__##__name */
+#endif
+
+/* include all needed curand functions in HPPL */
+// clang-format off
+#define CURAND_RAND_ROUTINE_EACH(__macro) \
+  __macro(curandCreateGenerator) \
+  __macro(curandSetStream) \
+  __macro(curandSetPseudoRandomGeneratorSeed)\
+  __macro(curandGenerateUniform) \
+  __macro(curandGenerateUniformDouble)
+// clang-format on
+
+CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
+
+#undef CURAND_RAND_ROUTINE_EACH
+#undef DYNAMIC_LOAD_CURAND_WRAP
+}
+} // namespace paddle

diff --git a/paddle/platform/dynamic_loader.cc b/paddle/platform/dynamic_loader.cc
new file mode 100644
index 0000000000..9036eaf642
--- /dev/null
+++ b/paddle/platform/dynamic_loader.cc
@@ -0,0 +1,157 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <dlfcn.h>
+#include "DynamicLoader.h"
+#include "Logging.h"
+
+DEFINE_string(cudnn_dir, "",
+              "Specify path for loading libcudnn.so. For instance, "
+              "/usr/local/cudnn/lib. If empty [default], dlopen "
+              "will search cudnn from LD_LIBRARY_PATH");
+
+DEFINE_string(cuda_dir, "",
+              "Specify path for loading cuda library, such as libcublas, "
+              "libcurand. For instance, /usr/local/cuda/lib64. If default, "
+              "dlopen will search cuda from LD_LIBRARY_PATH");
+
+DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
+
+DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
+
+static inline std::string join(const std::string& part1,
+                               const std::string& part2) {
+  // directory separator
+  const char sep = '/';
+  if (!part2.empty() && part2.front() == sep) {
+    return part2;
+  }
+  std::string ret;
+  ret.reserve(part1.size() + part2.size() + 1);
+  ret = part1;
+  if (!ret.empty() && ret.back() != sep) {
+    ret += sep;
+  }
+  ret += part2;
+  return ret;
+}
+
+static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
+                                               void** dso_handle,
+                                               int dynload_flags) {
+  VLOG(3) << "Try to find library: " << dso_path
+          << " from default system path.";
+  // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
+  *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+
+// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
+// bring in System Integrity Protection (SIP); if dso_handle
+// is null, search from the default package path on Mac OS.
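// The overall lookup that GetDsoHandleFromDefaultPath and the
// GetDsoHandleFromSearchPath helper below implement, compressed into one
// illustrative sketch (logging and the final CHECK omitted):
//
//   void* OpenDso(const std::string& search_root, const std::string& name) {
//     const int flags = RTLD_LAZY | RTLD_LOCAL;
//     if (!search_root.empty()) {
//       std::string full = join(search_root, name);
//       if (void* h = dlopen(full.c_str(), flags)) return h;  // 1. --xxx_dir
//     }
//     if (void* h = dlopen(name.c_str(), flags)) return h;  // 2. LD_LIBRARY_PATH
//   #if defined(__APPLE__) || defined(__OSX__)
//     std::string mac = join("/usr/local/cuda/lib/", name);  // 3. SIP fallback
//     if (void* h = dlopen(mac.c_str(), flags)) return h;
//   #endif
//     return nullptr;  // the caller reports a fatal error with dlerror()
//   }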
+#if defined(__APPLE__) || defined(__OSX__) + if (nullptr == *dso_handle) { + dso_path = join("/usr/local/cuda/lib/", dso_path); + *dso_handle = dlopen(dso_path.c_str(), dynload_flags); + if (nullptr == *dso_handle) { + if (dso_path == "libcudnn.dylib") { + LOG(FATAL) + << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n" // NOLINT + << "For instance, sudo tar -xzf " + "cudnn-7.5-osx-x64-v5.0-ga.tgz -C " // NOLINT + << "/usr/local \n sudo chmod a+r " + "/usr/local/cuda/include/cudnn.h " // NOLINT + << "/usr/local/cuda/lib/libcudnn*"; + } + } + } +#endif +} + +static inline void GetDsoHandleFromSearchPath(const std::string& search_root, + const std::string& dso_name, + void** dso_handle) { + int dynload_flags = RTLD_LAZY | RTLD_LOCAL; + *dso_handle = nullptr; + + std::string dlPath = dso_name; + if (search_root.empty()) { + GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); + } else { + // search xxx.so from custom path + dlPath = join(search_root, dso_name); + *dso_handle = dlopen(dlPath.c_str(), dynload_flags); + // if not found, search from default path + if (nullptr == *dso_handle) { + LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" + << dlerror() << ")"; + dlPath = dso_name; + GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); + } + } + + CHECK(nullptr != *dso_handle) << "Failed to find dynamic library: " << dlPath + << " (" << dlerror() << ") \n" + << "Please specify its path correctly using " + "following ways: \n" + + << "Method. set environment variable " + "LD_LIBRARY_PATH on Linux or " + << "DYLD_LIBRARY_PATH on Mac OS. \n" + << "For instance, issue command: export " + "LD_LIBRARY_PATH=... \n" + + << "Note: After Mac OS 10.11, using the " + "DYLD_LIBRARY_PATH is impossible " + << "unless System Integrity Protection (SIP) " + "is disabled."; +} + +void GetCublasDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle); +#endif +} + +void GetCudnnDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle); +#endif +} + +void GetCurandDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle); +#endif +} + +void GetWarpCTCDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle); +#endif +} + +void GetLapackDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle); +#endif +} diff --git a/paddle/platform/dynamic_loader.h b/paddle/platform/dynamic_loader.h new file mode 100644 index 0000000000..9b5ad21724 --- /dev/null +++ b/paddle/platform/dynamic_loader.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef DYNAMIC_LOAD_H_ +#define DYNAMIC_LOAD_H_ + +#include +#include +#include +#include + +/** + * @brief load the DSO of CUBLAS + * + * @param **dso_handle dso handler + * + */ +void GetCublasDsoHandle(void** dso_handle); + +/** + * @brief load the DSO of CUDNN + * + * @param **dso_handle dso handler + * + */ +void GetCudnnDsoHandle(void** dso_handle); + +/** + * @brief load the DSO of CURAND + * + * @param **dso_handle dso handler + * + */ +void GetCurandDsoHandle(void** dso_handle); + +/** + * @brief load the DSO of warp-ctc + * + * @param **dso_handle dso handler + * + */ +void GetWarpCTCDsoHandle(void** dso_handle); + +/** + * @brief load the DSO of lapack + * + * @param **dso_handle dso handler + * + */ +void GetLapackDsoHandle(void** dso_handle); + +#endif // DYNAMIC_LOAD_H_ From a30754b05e1ef58b5803c3d9996ed0cc69100ac5 Mon Sep 17 00:00:00 2001 From: qijun Date: Mon, 3 Jul 2017 20:41:31 +0800 Subject: [PATCH 25/79] test device_context --- paddle/platform/CMakeLists.txt | 3 + paddle/platform/device_context.h | 166 +++++++++++++++++++++++++ paddle/platform/device_context_test.cu | 29 +++++ 3 files changed, 198 insertions(+) create mode 100644 paddle/platform/device_context.h create mode 100644 paddle/platform/device_context_test.cu diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index c7d7b14518..c95b54a4df 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -2,3 +2,6 @@ nv_test(cuda_test SRCS cuda_test.cu) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) + +cc_library(dynamic_loader SRCS dynamic_loader.cc) +nv_test(device_context_test SRCS device_context_test.cu DEPS place dynamic_loader glog gflags) diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h new file mode 100644 index 0000000000..f95aac4a36 --- /dev/null +++ b/paddle/platform/device_context.h @@ -0,0 +1,166 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
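These entry points make by-hand symbol resolution a two-step affair, which is what the `DYNAMIC_LOAD_*_WRAP` structs automate. A hedged sketch — the explicit `cublasSgemm_v2` lookup is illustrative only; real callers go through the generated wrappers:

```c++
#include <dlfcn.h>

void GetCublasDsoHandle(void** dso_handle);  // from dynamic_loader.h

void ResolveByHand() {
  void* cublas_dso = nullptr;
  // Honors the --cuda_dir flag first, then falls back to LD_LIBRARY_PATH.
  GetCublasDsoHandle(&cublas_dso);
  // dlsym yields the raw entry point; a real caller would cast it to the
  // matching cublas function-pointer type before invoking it.
  void* sym = dlsym(cublas_dso, "cublasSgemm_v2");
  (void)sym;
}
```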
*/ + +#pragma once + +#ifndef PADDLE_ONLY_CPU +#include "paddle/platform/cublas.h" +#include "paddle/platform/cuda.h" +#include "paddle/platform/cudnn.h" +#include "paddle/platform/curand.h" +#define EIGEN_USE_GPU +#endif + +#include "paddle/framework/enforce.h" +#include "paddle/platform/place.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace platform { + +class DeviceContext { + public: + virtual ~DeviceContext() {} +}; + +class CpuDeviceContext : public DeviceContext { + Eigen::DefaultDevice eigen_device() { + if (!eigen_device_) { + eigen_device_ = new Eigen::DefaultDevice(); + } + return *eigen_device_; + } + + private: + Eigen::DefaultDevice* eigen_device_{nullptr}; +}; + +#ifndef PADDLE_ONLY_CPU +class DeviceGuard { + public: + explicit DeviceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) { + if (previous_ != new_place) { + paddle::platform::SetDeviceId(new_place.device); + } + } + + ~DeviceGuard() { paddle::platform::SetDeviceId(previous_.device); } + + private: + GPUPlace previous_; +}; + +class CudaDeviceContext : public DeviceContext { + public: + explicit CudaDeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) { + DeviceGuard guard(gpu_place_); + paddle::platform::throw_on_error(cudaStreamCreate(&stream_), + "cudaStreamCreate failed"); + eigen_stream_ = new Eigen::CudaStreamDevice(&stream_); + eigen_device_ = new Eigen::GpuDevice(eigen_stream_); + } + + void Wait() { + paddle::platform::throw_on_error(cudaStreamSynchronize(stream_), + "cudaStreamSynchronize failed"); + } + + cudaStream_t stream() { return stream_; } + + Eigen::GpuDevice eigen_device() { return *eigen_device_; } + + cublasHandle_t cublas_handle() { + if (!blas_handle_) { + DeviceGuard guard(gpu_place_); + PADDLE_ENFORCE(cublasCreate(&blas_handle_) == CUBLAS_STATUS_SUCCESS, + "cublasCreate failed"); + PADDLE_ENFORCE( + cublasSetStream(blas_handle_, stream_) == CUBLAS_STATUS_SUCCESS, + "cublasSetStream failed"); + } + return blas_handle_; + } + + cudnnHandle_t cudnn_handle() { + if (!dnn_handle_) { + DeviceGuard guard(gpu_place_); + PADDLE_ENFORCE(cudnnCreate(&dnn_handle_) == CUDNN_STATUS_SUCCESS, + "cudnnCreate failed"); + PADDLE_ENFORCE( + cudnnSetStream(dnn_handle_, stream_) == CUDNN_STATUS_SUCCESS, + "cudnnSetStream failed"); + } + return dnn_handle_; + } + + curandGenerator_t curand_generator() { + if (!rand_generator_) { + DeviceGuard guard(gpu_place_); + PADDLE_ENFORCE( + curandCreateGenerator(&rand_generator_, CURAND_RNG_PSEUDO_DEFAULT) == + CURAND_STATUS_SUCCESS, + "curandCreateGenerator failed"); + PADDLE_ENFORCE( + curandSetPseudoRandomGeneratorSeed(rand_generator_, random_seed_) == + CURAND_STATUS_SUCCESS, + "curandSetPseudoRandomGeneratorSeed failed"); + PADDLE_ENFORCE( + curandSetStream(rand_generator_, stream_) == CURAND_STATUS_SUCCESS, + "curandSetStream failed"); + } + return rand_generator_; + } + + ~CudaDeviceContext() { + Wait(); + if (blas_handle_) { + PADDLE_ENFORCE(cublasDestroy(blas_handle_) == CUBLAS_STATUS_SUCCESS, + "cublasDestroy failed"); + } + + if (dnn_handle_) { + PADDLE_ENFORCE(cudnnDestroy(dnn_handle_) == CUDNN_STATUS_SUCCESS, + "cudnnDestroy failed"); + } + + if (rand_generator_) { + PADDLE_ENFORCE( + curandDestroyGenerator(rand_generator_) == CURAND_STATUS_SUCCESS, + "curandDestroyGenerator failed"); + } + + delete eigen_stream_; + delete eigen_device_; + + paddle::platform::throw_on_error(cudaStreamDestroy(stream_), + "cudaStreamDestroy failed"); + } + + private: + GPUPlace gpu_place_; + cudaStream_t stream_; + + 
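// DeviceGuard above is a save-and-restore RAII wrapper: it snapshots the
// current device, switches if needed, and restores on scope exit, so each
// lazy handle getter can temporarily bind to gpu_place_ without disturbing
// its caller. The same shape reduced to its core, using the
// GetCurrentDeviceId/SetDeviceId helpers this patch series adds to cuda.h
// (sketch, not the actual class):
//
//   class ScopedDevice {
//    public:
//     explicit ScopedDevice(int new_device)
//         : previous_(paddle::platform::GetCurrentDeviceId()) {
//       if (previous_ != new_device) paddle::platform::SetDeviceId(new_device);
//     }
//     ~ScopedDevice() { paddle::platform::SetDeviceId(previous_); }
//
//    private:
//     int previous_;
//   };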
Eigen::CudaStreamDevice* eigen_stream_; + Eigen::GpuDevice* eigen_device_; + + cublasHandle_t blas_handle_{nullptr}; + + cudnnHandle_t dnn_handle_{nullptr}; + + int random_seed_; + curandGenerator_t rand_generator_{nullptr}; +}; +#endif +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/device_context_test.cu b/paddle/platform/device_context_test.cu new file mode 100644 index 0000000000..a15fb53b71 --- /dev/null +++ b/paddle/platform/device_context_test.cu @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/platform/device_context.h" +#include "gtest/gtest.h" + + +TEST(DeviceContext, CudaDevice) { + int count = paddle::platform::GetDeviceCount(); + for (int i = 0; i < count; i++) { + paddle::platform::CudaDeviceContext* device_context = new paddle::platform::CudaDeviceContext(i); + __attribute__((unused)) Eigen::GpuDevice gpu_device = device_context->eigen_device(); + __attribute__((unused)) cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); + __attribute__((unused)) cublasHandle_t cublas_handle = device_context->cublas_handle(); + __attribute__((unused)) curandGenerator_t curand_handle = device_context->curand_generator(); + delete device_context; + } +} From a77fcef3f99724e85e2239ad91683b7afe913cd8 Mon Sep 17 00:00:00 2001 From: qijun Date: Mon, 3 Jul 2017 12:55:39 +0000 Subject: [PATCH 26/79] fix cuda compile error --- paddle/platform/cublas.h | 3 -- paddle/platform/cuda.h | 9 ++++++ paddle/platform/curand.h | 5 ++- paddle/platform/device_context.h | 52 +++++++++++++++++-------------- paddle/platform/dynamic_loader.cc | 4 +-- 5 files changed, 43 insertions(+), 30 deletions(-) diff --git a/paddle/platform/cublas.h b/paddle/platform/cublas.h index 70c9713325..d60eb501e9 100644 --- a/paddle/platform/cublas.h +++ b/paddle/platform/cublas.h @@ -3,7 +3,6 @@ namespace paddle { namespace dyload { -namespace dynload { std::once_flag cublas_dso_flag; void *cublas_dso_handle = nullptr; @@ -67,8 +66,6 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) #undef DYNAMIC_LOAD_CUBLAS_V2_WRAP #undef CUBLAS_BLAS_ROUTINE_EACH -} /* namespace dynload */ - // clang-format on #ifndef PADDLE_TYPE_DOUBLE #define CUBLAS_GEAM dynload::cublasSgeam diff --git a/paddle/platform/cuda.h b/paddle/platform/cuda.h index 8fe891f9ce..05290b0e1e 100644 --- a/paddle/platform/cuda.h +++ b/paddle/platform/cuda.h @@ -33,6 +33,15 @@ int GetDeviceCount(void) { throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed"); return count; } +int GetCurrentDeviceId(void) { + int device_id; + throw_on_error(cudaGetDevice(&device_id), "cudaGetDevice failed"); + return device_id; +} + +void SetDeviceId(int device_id) { + throw_on_error(cudaSetDevice(device_id), "cudaSetDevice failed"); +} } // namespace platform } // namespace paddle diff --git a/paddle/platform/curand.h b/paddle/platform/curand.h index 692c024e6e..edff6526bd 100644 --- a/paddle/platform/curand.h +++ b/paddle/platform/curand.h @@ -3,6 +3,8 @@ 
namespace paddle { namespace dyload { +std::once_flag curand_dso_flag; +void *curand_dso_handle = nullptr; #ifdef PADDLE_USE_DSO #define DYNAMIC_LOAD_CURAND_WRAP(__name) \ struct DynLoad__##__name { \ @@ -31,7 +33,8 @@ namespace dyload { __macro(curandSetStream) \ __macro(curandSetPseudoRandomGeneratorSeed)\ __macro(curandGenerateUniform) \ - __macro(curandGenerateUniformDouble) + __macro(curandGenerateUniformDouble) \ + __macro(curandDestroyGenerator) // clang-format on CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP) diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index f95aac4a36..65e76666a7 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -83,11 +83,12 @@ class CudaDeviceContext : public DeviceContext { cublasHandle_t cublas_handle() { if (!blas_handle_) { DeviceGuard guard(gpu_place_); - PADDLE_ENFORCE(cublasCreate(&blas_handle_) == CUBLAS_STATUS_SUCCESS, - "cublasCreate failed"); PADDLE_ENFORCE( - cublasSetStream(blas_handle_, stream_) == CUBLAS_STATUS_SUCCESS, - "cublasSetStream failed"); + paddle::dyload::cublasCreate(&blas_handle_) == CUBLAS_STATUS_SUCCESS, + "cublasCreate failed"); + PADDLE_ENFORCE(paddle::dyload::cublasSetStream(blas_handle_, stream_) == + CUBLAS_STATUS_SUCCESS, + "cublasSetStream failed"); } return blas_handle_; } @@ -95,11 +96,12 @@ class CudaDeviceContext : public DeviceContext { cudnnHandle_t cudnn_handle() { if (!dnn_handle_) { DeviceGuard guard(gpu_place_); - PADDLE_ENFORCE(cudnnCreate(&dnn_handle_) == CUDNN_STATUS_SUCCESS, - "cudnnCreate failed"); PADDLE_ENFORCE( - cudnnSetStream(dnn_handle_, stream_) == CUDNN_STATUS_SUCCESS, - "cudnnSetStream failed"); + paddle::dyload::cudnnCreate(&dnn_handle_) == CUDNN_STATUS_SUCCESS, + "cudnnCreate failed"); + PADDLE_ENFORCE(paddle::dyload::cudnnSetStream(dnn_handle_, stream_) == + CUDNN_STATUS_SUCCESS, + "cudnnSetStream failed"); } return dnn_handle_; } @@ -107,17 +109,17 @@ class CudaDeviceContext : public DeviceContext { curandGenerator_t curand_generator() { if (!rand_generator_) { DeviceGuard guard(gpu_place_); + PADDLE_ENFORCE(paddle::dyload::curandCreateGenerator( + &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT) == + CURAND_STATUS_SUCCESS, + "curandCreateGenerator failed"); PADDLE_ENFORCE( - curandCreateGenerator(&rand_generator_, CURAND_RNG_PSEUDO_DEFAULT) == - CURAND_STATUS_SUCCESS, - "curandCreateGenerator failed"); - PADDLE_ENFORCE( - curandSetPseudoRandomGeneratorSeed(rand_generator_, random_seed_) == - CURAND_STATUS_SUCCESS, + paddle::dyload::curandSetPseudoRandomGeneratorSeed( + rand_generator_, random_seed_) == CURAND_STATUS_SUCCESS, "curandSetPseudoRandomGeneratorSeed failed"); - PADDLE_ENFORCE( - curandSetStream(rand_generator_, stream_) == CURAND_STATUS_SUCCESS, - "curandSetStream failed"); + PADDLE_ENFORCE(paddle::dyload::curandSetStream( + rand_generator_, stream_) == CURAND_STATUS_SUCCESS, + "curandSetStream failed"); } return rand_generator_; } @@ -125,19 +127,21 @@ class CudaDeviceContext : public DeviceContext { ~CudaDeviceContext() { Wait(); if (blas_handle_) { - PADDLE_ENFORCE(cublasDestroy(blas_handle_) == CUBLAS_STATUS_SUCCESS, - "cublasDestroy failed"); + PADDLE_ENFORCE( + paddle::dyload::cublasDestroy(blas_handle_) == CUBLAS_STATUS_SUCCESS, + "cublasDestroy failed"); } if (dnn_handle_) { - PADDLE_ENFORCE(cudnnDestroy(dnn_handle_) == CUDNN_STATUS_SUCCESS, - "cudnnDestroy failed"); + PADDLE_ENFORCE( + paddle::dyload::cudnnDestroy(dnn_handle_) == CUDNN_STATUS_SUCCESS, + "cudnnDestroy failed"); } if (rand_generator_) { - 
PADDLE_ENFORCE( - curandDestroyGenerator(rand_generator_) == CURAND_STATUS_SUCCESS, - "curandDestroyGenerator failed"); + PADDLE_ENFORCE(paddle::dyload::curandDestroyGenerator(rand_generator_) == + CURAND_STATUS_SUCCESS, + "curandDestroyGenerator failed"); } delete eigen_stream_; diff --git a/paddle/platform/dynamic_loader.cc b/paddle/platform/dynamic_loader.cc index 9036eaf642..c34abc392c 100644 --- a/paddle/platform/dynamic_loader.cc +++ b/paddle/platform/dynamic_loader.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "dynamic_loader.h" #include -#include "DynamicLoader.h" -#include "Logging.h" +#include DEFINE_string(cudnn_dir, "", "Specify path for loading libcudnn.so. For instance, " From 3f63d96abec165426bcd464f7aff32e2e42ed021 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 3 Jul 2017 23:16:11 +0800 Subject: [PATCH 27/79] Fix link error in op_proto_test. --- paddle/framework/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 50107faaed..f7e5753ac2 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -7,4 +7,4 @@ cc_test(scope_test SRCS scope_test.cc) cc_test(enforce_test SRCS enforce_test.cc) proto_library(attr_type SRCS attr_type.proto) proto_library(op_proto SRCS op_proto.proto) -cc_test(op_proto_test SRCS op_proto_test.cc DEPS attr_type op_proto protobuf) +cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto attr_type protobuf) From e12d7269ff473db5cc87de1344630eb348017a4a Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 4 Jul 2017 01:22:01 +0000 Subject: [PATCH 28/79] fix by helin's comments --- python/paddle/v2/reader/creator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py index 20624d5286..61b5cc134f 100644 --- a/python/paddle/v2/reader/creator.py +++ b/python/paddle/v2/reader/creator.py @@ -106,7 +106,7 @@ def recordio(paths, buf_size=100): while True: r, err = client.next_record() - if r is None: + if err < 0: break yield r From ed18647e37f4e345f02171f29af6e22fab4790ea Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 4 Jul 2017 11:00:59 +0800 Subject: [PATCH 29/79] finish test --- paddle/platform/CMakeLists.txt | 1 - paddle/platform/cuda.h | 1 + paddle/platform/device_context.h | 170 ------------------------- paddle/platform/device_context_test.cu | 29 ----- 4 files changed, 1 insertion(+), 200 deletions(-) delete mode 100644 paddle/platform/device_context.h delete mode 100644 paddle/platform/device_context_test.cu diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index c95b54a4df..ffdc23d599 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -4,4 +4,3 @@ cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) cc_library(dynamic_loader SRCS dynamic_loader.cc) -nv_test(device_context_test SRCS device_context_test.cu DEPS place dynamic_loader glog gflags) diff --git a/paddle/platform/cuda.h b/paddle/platform/cuda.h index 05290b0e1e..5ed36c0f02 100644 --- a/paddle/platform/cuda.h +++ b/paddle/platform/cuda.h @@ -33,6 +33,7 @@ int GetDeviceCount(void) { throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed"); return count; } + int GetCurrentDeviceId(void) { int device_id; 
throw_on_error(cudaGetDevice(&device_id), "cudaGetDevice failed"); diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h deleted file mode 100644 index 65e76666a7..0000000000 --- a/paddle/platform/device_context.h +++ /dev/null @@ -1,170 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifndef PADDLE_ONLY_CPU -#include "paddle/platform/cublas.h" -#include "paddle/platform/cuda.h" -#include "paddle/platform/cudnn.h" -#include "paddle/platform/curand.h" -#define EIGEN_USE_GPU -#endif - -#include "paddle/framework/enforce.h" -#include "paddle/platform/place.h" -#include "unsupported/Eigen/CXX11/Tensor" - -namespace paddle { -namespace platform { - -class DeviceContext { - public: - virtual ~DeviceContext() {} -}; - -class CpuDeviceContext : public DeviceContext { - Eigen::DefaultDevice eigen_device() { - if (!eigen_device_) { - eigen_device_ = new Eigen::DefaultDevice(); - } - return *eigen_device_; - } - - private: - Eigen::DefaultDevice* eigen_device_{nullptr}; -}; - -#ifndef PADDLE_ONLY_CPU -class DeviceGuard { - public: - explicit DeviceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) { - if (previous_ != new_place) { - paddle::platform::SetDeviceId(new_place.device); - } - } - - ~DeviceGuard() { paddle::platform::SetDeviceId(previous_.device); } - - private: - GPUPlace previous_; -}; - -class CudaDeviceContext : public DeviceContext { - public: - explicit CudaDeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) { - DeviceGuard guard(gpu_place_); - paddle::platform::throw_on_error(cudaStreamCreate(&stream_), - "cudaStreamCreate failed"); - eigen_stream_ = new Eigen::CudaStreamDevice(&stream_); - eigen_device_ = new Eigen::GpuDevice(eigen_stream_); - } - - void Wait() { - paddle::platform::throw_on_error(cudaStreamSynchronize(stream_), - "cudaStreamSynchronize failed"); - } - - cudaStream_t stream() { return stream_; } - - Eigen::GpuDevice eigen_device() { return *eigen_device_; } - - cublasHandle_t cublas_handle() { - if (!blas_handle_) { - DeviceGuard guard(gpu_place_); - PADDLE_ENFORCE( - paddle::dyload::cublasCreate(&blas_handle_) == CUBLAS_STATUS_SUCCESS, - "cublasCreate failed"); - PADDLE_ENFORCE(paddle::dyload::cublasSetStream(blas_handle_, stream_) == - CUBLAS_STATUS_SUCCESS, - "cublasSetStream failed"); - } - return blas_handle_; - } - - cudnnHandle_t cudnn_handle() { - if (!dnn_handle_) { - DeviceGuard guard(gpu_place_); - PADDLE_ENFORCE( - paddle::dyload::cudnnCreate(&dnn_handle_) == CUDNN_STATUS_SUCCESS, - "cudnnCreate failed"); - PADDLE_ENFORCE(paddle::dyload::cudnnSetStream(dnn_handle_, stream_) == - CUDNN_STATUS_SUCCESS, - "cudnnSetStream failed"); - } - return dnn_handle_; - } - - curandGenerator_t curand_generator() { - if (!rand_generator_) { - DeviceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::dyload::curandCreateGenerator( - &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT) == - CURAND_STATUS_SUCCESS, - "curandCreateGenerator failed"); - PADDLE_ENFORCE( 
- paddle::dyload::curandSetPseudoRandomGeneratorSeed( - rand_generator_, random_seed_) == CURAND_STATUS_SUCCESS, - "curandSetPseudoRandomGeneratorSeed failed"); - PADDLE_ENFORCE(paddle::dyload::curandSetStream( - rand_generator_, stream_) == CURAND_STATUS_SUCCESS, - "curandSetStream failed"); - } - return rand_generator_; - } - - ~CudaDeviceContext() { - Wait(); - if (blas_handle_) { - PADDLE_ENFORCE( - paddle::dyload::cublasDestroy(blas_handle_) == CUBLAS_STATUS_SUCCESS, - "cublasDestroy failed"); - } - - if (dnn_handle_) { - PADDLE_ENFORCE( - paddle::dyload::cudnnDestroy(dnn_handle_) == CUDNN_STATUS_SUCCESS, - "cudnnDestroy failed"); - } - - if (rand_generator_) { - PADDLE_ENFORCE(paddle::dyload::curandDestroyGenerator(rand_generator_) == - CURAND_STATUS_SUCCESS, - "curandDestroyGenerator failed"); - } - - delete eigen_stream_; - delete eigen_device_; - - paddle::platform::throw_on_error(cudaStreamDestroy(stream_), - "cudaStreamDestroy failed"); - } - - private: - GPUPlace gpu_place_; - cudaStream_t stream_; - - Eigen::CudaStreamDevice* eigen_stream_; - Eigen::GpuDevice* eigen_device_; - - cublasHandle_t blas_handle_{nullptr}; - - cudnnHandle_t dnn_handle_{nullptr}; - - int random_seed_; - curandGenerator_t rand_generator_{nullptr}; -}; -#endif -} // namespace platform -} // namespace paddle diff --git a/paddle/platform/device_context_test.cu b/paddle/platform/device_context_test.cu deleted file mode 100644 index a15fb53b71..0000000000 --- a/paddle/platform/device_context_test.cu +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/platform/device_context.h" -#include "gtest/gtest.h" - - -TEST(DeviceContext, CudaDevice) { - int count = paddle::platform::GetDeviceCount(); - for (int i = 0; i < count; i++) { - paddle::platform::CudaDeviceContext* device_context = new paddle::platform::CudaDeviceContext(i); - __attribute__((unused)) Eigen::GpuDevice gpu_device = device_context->eigen_device(); - __attribute__((unused)) cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); - __attribute__((unused)) cublasHandle_t cublas_handle = device_context->cublas_handle(); - __attribute__((unused)) curandGenerator_t curand_handle = device_context->curand_generator(); - delete device_context; - } -} From 76b7be46da5fe211d25e62712673cc01bea98d54 Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 4 Jul 2017 11:16:49 +0800 Subject: [PATCH 30/79] add deps for dyload cc_library --- paddle/platform/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index ffdc23d599..4f6381b8af 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -3,4 +3,4 @@ nv_test(cuda_test SRCS cuda_test.cu) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) -cc_library(dynamic_loader SRCS dynamic_loader.cc) +cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) From 3b073fdc2be1c808db27519e01e3a61c07927959 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 4 Jul 2017 11:25:11 +0800 Subject: [PATCH 31/79] fix error in test_LayerGrad --- paddle/gserver/tests/test_LayerGrad.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index ed067e7c3a..d3c99eb8b9 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -873,7 +873,7 @@ TEST(Layer, SequenceLastInstanceLayer) { TEST(Layer, AverageLayer) { testDegradeLayer(false, "average", "non-seq", -1); // seq average to non-seq testDegradeLayer(false, - "max", + "average", "non-seq", 5); // seq average to a shorten seq, stride window = 5 testDegradeLayer( From f535b79820ae97ade802053dc421a893460367c8 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 4 Jul 2017 12:05:52 +0800 Subject: [PATCH 32/79] sort the Author.md with Alphabetical order --- AUTHORS.md | 74 +++++++++++++++++++++++++++--------------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/AUTHORS.md b/AUTHORS.md index 09698ac140..4db4a4a8e7 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -1,48 +1,48 @@ | Github account | name | |---|---| +| backyes | Yan-Fei Wang | | beckett1124 | Bin Qi | -| Canpio | Jiayi Feng | -| chengxiaohua1105 | Xiaohua Cheng | -| xushaoyong | Shaoyong Xu | -| liuyuan | Yuan Liu | -| xujun05 | Jun Xu | -| dzhwinter | Zhihong Dong | -| Guo Sheng | Sheng Guo | -| kuke | Yibing Liu | -| llxxxll | YongFeng Liu | -| cxysteven | Xingyi Cheng | -| NHZlX | Zhaolong Xing | -| pakchoi | Chuanjiang Song | -| pkuyym | Yaming Yang | -| Superjom | Chunwei Yan | -| wanghaoshuang | Haoshuang Wang | -| wangzhen-nlp | Zhen Wang | -| wwhu | Weiwei Hu | -| xinghai-sun | XingHai Sun | -| zhaopu7 | Pu Zhao | -| reyoung | Yang Yu | +| Canpio | Jia-Yi Feng | +| chengxiaohua1105 | Xiao-Hua Cheng | +| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang | +| cxysteven | Xing-Yi Cheng | +| dzhwinter | Zhi-Hong Dong | +| emailweixu | Wei Xu | | gangliao | Gang Liao | -| luotao01 | Tao Luo | -| jacquesqiao | Long-Fei 
Qiao | -| qingqing01 | Qing-Qing Dang | +| gongweibao | Wei-Bao Gong | +| Guo Sheng | Sheng Guo | +| Haichao-Zhang | Hai-Chao Zhang | | hedaoyuan | Dao-Yuan He | -| wangyang59 | Yang Wang | +| helinwang | He-Lin Wang | +| jacquesqiao | Long-Fei Qiao | +| kuke | Yi-Bing Liu | +| lcy-seso | Ying Cao | +| lipeng-unisound | Peng Li | +| liuyuan | Yuan Liu | +| livc | Zhao Li | +| llxxxll | Yong-Feng Liu | +| luotao01 | Tao Luo | +| lzhao4ever | Liang Zhao | +| NHZlX | Zhao-Long Xing | +| pakchoi | Chuan-Jiang Song | +| pengli09 | Peng Li | +| pkuyym | Ya-Ming Yang | | QiJune | Jun Qi | +| qingqing01 | Qing-Qing Dang | +| reyoung | Yang Yu | +| Superjom | Chun-Wei Yan | | tianbingsz | Tian-Bing Xu | -| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang | | typhoonzero | Yi Wu | -| backyes | Yan-Fei Wang | -| pengli09 | Peng Li | -| livc | Zhao Li | +| wanghaoshuang | Hao-Shuang Wang | +| wangyang59 | Yang Wang | +| wangzhen-nlp | Zhen Wang | +| wen-bo-yang | Wen-Bo Yang | +| wwhu | Wei-Wei Hu | +| xinghai-sun | Xing-Hai Sun | | Xreki | Yi-Qun Liu | +| xujun05 | Jun Xu | +| xushaoyong | Shao-Yong Xu | | Yancey1989 | Xu Yan | -| emailweixu | Wei Xu | -| wen-bo-yang | Wen-Bo Yang | -| helinwang | He-Lin Wang | -| lcy-seso | Ying Cao | -| Zrachel | Rui-Qing Zhang | -| Haichao-Zhang | Hai-Chao Zhang | -| gongweibao | Wei-Bao Gong | -| lzhao4ever | Liang Zhao | +| zhaopu7 | Pu Zhao | | zhouxiao-coder | Xiao Zhou | -| lipeng-unisound | Peng Li | +| Zrachel | Rui-Qing Zhang | From 06156daa281e55fe5d06217cc545cd8c09aa4c9d Mon Sep 17 00:00:00 2001 From: "Superjom (Chunwei Yan)" Date: Tue, 4 Jul 2017 12:07:16 +0800 Subject: [PATCH 33/79] net design with NetBuilder (#2598) * move net_design to framework * change CreateNet result to unique_ptr * rename "ScratchNet" -> "PlainNet" * add three methods to NetBase * add NetBuilder * add InferShape to NetBuilder.Run * rename ApplyGradient, ApplyOptimizer -> AddGradientOps, AddOptimiz * rename PlainNet::CreateNet -> BuildNet * add Error and other rename actions --- paddle/framework/net_design.md | 250 +++++++++++++++++++++++++++++++++ 1 file changed, 250 insertions(+) create mode 100644 paddle/framework/net_design.md diff --git a/paddle/framework/net_design.md b/paddle/framework/net_design.md new file mode 100644 index 0000000000..a5f0483081 --- /dev/null +++ b/paddle/framework/net_design.md @@ -0,0 +1,250 @@ +# Network Design + +`Network` is the container and controller of a set of operators, +user can build a real network from a `NetDesc` which is a protobuf message +and use `Network.Run()` to run all the operators in the network. + +A network object knows all Operators belonging to this network. Variables, +which are inputs and outputs of these operators, +are created and managed by a hierarchy of Scope objects. + +# API + +## Net +To make the `Network` extendable, a base class is defined like this + +```c++ +// operator's index stored in a network. +typedef int OpIndex; + +// The minimum a network should be implemented. +class Net { + public: + // run all the operators and return success(true) or not, with all the + // variables are located in `scope`. `context` describes the detail execution + // environment for ops. `begin` and `end` specify the scope of `ops_` to run, + // If no positive indexes are provided, all operators in `ops_` will run. + virtual Error Run(Scope *scope, OpContext *context, OpIndex begin = -1, + OpIndex end = -1) const = 0; + + // Add an Operator according to `def`. 
+  virtual OpIndex AddOp(const proto::OpDef &def) = 0;
+
+  // Add optimizer operators according to `attrs`.
+  virtual Error AddOptimizerOps(const OptAttrs &attrs) = 0;
+
+  // Add backward operators.
+  virtual Error AddBackwardOps() = 0;
+
+  // Infer the shapes of variables required by operators in the network. The
+  // `scope` will be mutated according to the inferred shapes.
+  virtual Error InferShape(Scope *scope) = 0;
+
+  static std::unique_ptr<Net> Create(const NetDesc &def = NetDesc());
+};
+```
+
+All network implementations should build networks from a protobuf message which
+describes the structure of a real network; the `Run` method should be implemented by
+all implementations to offer a universal way to forward- or backward-compute a network.
+
+`Net::Create` is a factory method and can be implemented like
+
+```c++
+std::unique_ptr<Net> Net::Create(const NetDesc& def) {
+  switch (def.model_type()) {
+    case NN:
+      return std::unique_ptr<Net>(new Network(def));
+    case Recursive:
+      return std::unique_ptr<Net>(new RecursiveNet(def));
+    case Recurrent:
+      return std::unique_ptr<Net>(new RecurrentNet(def));
+  }
+  return nullptr;
+}
+```
+
+Network is designed as the container of operators. To make it more extendable,
+we decouple it from the related variable resources.
+
+`Run(Scope* scope)` takes the scope as an argument so that it can run in different scopes.
+
+Finally, `Net` can be used as follows
+
+```c++
+Scope default_scope;
+OpContext default_context;
+auto net = Net::Create(def);
+
+if (net) {
+  net->Run(&default_scope, &default_context);
+}
+```
+
+## `PlainNet` as a simple implementation of `Net`
+
+A very basic implementation is as follows. All it does is simply run every operator in sequence.
+
+```c++
+class PlainNet : public Net {
+ public:
+  // Create a network described by `def`. NetDesc is the definition of a network.
+  PlainNet(const NetDesc &def);
+
+  // Infer all the operators' input and output variables' shapes; will be
+  // called before every mini-batch of training.
+  virtual Error InferShape(Scope *scope) override;
+
+  // Run all the operators with the `scope`. If no scope is provided, the default
+  // scope will be used instead. If no OpContext is provided, a default context will be used.
+  virtual Error Run(Scope *scope = nullptr, OpContext *context = nullptr, OpIndex begin = -1,
+                    OpIndex end = -1) const override;
+
+  virtual OpIndex AddOp(const proto::OpDef &def) override;
+
+  virtual Error AddOptimizerOps(const OptAttrs &attrs) override;
+
+  virtual Error AddBackwardOps() override;
+
+ protected:
+  // Create operators according to `def`; will be called by the constructor.
+  Error BuildNet(const NetDesc &def);
+
+  // Add an operator which is identified as `type` and has attributes described
+  // in `attrs`; the `inputs` are the keys of read-only input variables,
+  // `outputs` are keys of mutable output variables. An `OpIndex` will be
+  // returned to indicate the offset of the new operator in `ops_`.
+  OpIndex AddOp(const std::string &type, const std::vector<std::string> &inputs,
+                const std::vector<std::string> &outputs,
+                const OprAttr &attrs = OprAttr());
+
+ private:
+  // the operators owned by `Network`.
+  std::vector<Operator> ops_;
+};
+```
+
+`PlainNet` will create operators so that a private member `ops_` is defined;
+the operators are created by `BuildNet`, and each operator is created by `AddOp`.
+
+
+## PlainNet Usage
+`PlainNet` can be used to define and run a network as follows (see the sketch inside the example for what `Run` boils down to)
+
+```c++
+// create an empty scope located on the CPU device.
+Scope scope(CPUPlace());
+
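// As a concrete reading of "run every operator in sequence" above, the body
// of PlainNet::Run could be as small as the following sketch (assumed
// details: ops_ is kept in topological order and each operator exposes
// Run(Scope*, OpContext*)):
//
//   Error PlainNet::Run(Scope *scope, OpContext *context, OpIndex begin,
//                       OpIndex end) const {
//     OpIndex first = begin < 0 ? 0 : begin;
//     OpIndex last = end < 0 ? static_cast<OpIndex>(ops_.size()) : end;
//     for (OpIndex i = first; i < last; ++i) {
//       ops_[i].Run(scope, context);
//     }
//     return Error();
//   }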
+// create and init variables described in `net_desc`.
+scope.CreateVariables(net_desc);
+scope.InitVariables(net_desc);
+
+// create a network according to `net_desc`
+auto net = Net::Create(net_desc);
+// Add more operators if needed.
+net->AddOp(add...);
+net->AddOp(fc...);
+
+net->AddBackwardOps();
+net->AddOptimizerOps();
+
+// run the network providing the `scope`.
+net->Run(&scope);
+```
+
+## `NetBuilder` as a C++ syntax wrapper
+This is a detailed description of the user-facing C++ network API, and may not be needed in the prototype development stage.
+
+The `NetBuilder` will give users a much simpler syntax, shown below, to create a network, and demonstrates how to use `Net`'s raw interfaces.
+
+```c++
+Variable* fc_out = builder.AddOp("fc", input=image, size=100, activation="Sigmoid");
+Variable* prediction = builder.AddOp("fc", input=fc_out, size=10, activation="Sigmoid");
+Variable* loss = builder.AddOp("cross_entropy", input=prediction, label=label);
+Variable* avg_loss = builder.AddOp("mean", loss);
+
+builder.BackwardFrom(avg_loss);
+builder.AddOptimization(1e-4, "adam");
+builder.Run();
+```
+
+`NetBuilder` will call `Net`'s virtual functions to change the real network structure; here is a sample definition
+
+```c++
+class NetBuilder final {
+ public:
+  NetBuilder(Net* net) : net_(net) {}
+
+  Variable* AddOp(const string& type, const vector<Variable*>& inputs,
+                  size_t size, Activation act) {
+    // much code here.
+    // ...
+    net_->AddOp(def);
+    need_rebuild_net_ = true;
+    net_->InferShape();
+    // ...
+  }
+
+  Error BackwardFrom(const Variable& cost);
+
+  Error Run(Scope* scope, OpContext* context, bool need_backward = true) {
+    // backward.
+    if (need_backward) {
+      if (need_rebuild_net_) {
+        AddBackwardOps();
+        AddOptimizerOps();
+      }
+      return net_->Run(scope, context);
+    }
+    // just forward.
+    return net_->Run(scope, context, 0, last_forward_op_);
+  }
+
+ protected:
+  Error AddBackwardOps();
+  Error AddOptimizerOps();
+
+ private:
+  Net* net_;
+  OpIndex last_forward_op_{-1};
+  bool need_rebuild_net_{true};
+};
+```
+
+## Compatibility with RNN
+
+Benefiting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with a future RNN design;
+for example, we can implement a simple recurrent neural network as follows
+
+```c++
+// copy some `vars` from `source` to `target`
+void Copy(const Scope &source, Scope &target,
+          const std::vector<std::string> &vars);
+
+Scope default_scope;
+// some initial mutations on `default_scope` here.
+
+auto rnn_step_net = PlainNet(rnn_step_net_def);
+
+// Create the rnn's states; the last scope is used to store the rnn outputs.
+Scope *rnn_states = new Scope[num_states + 1];
+
+for (int i = 0; i < num_states + 1; i++) {
+  // Initialize all rnn state scopes, copy parameters and so on.
+  rnn_states[i].CreateVars(rnn_step_net_def);
+  Copy(default_scope, rnn_states[i], rnn_related_vars);
+  // Prepare the rnn's inlinks: just copy inlink variables to each state.
+  Copy(default_scope, rnn_states[i], inlink_vars);
+}
+
+// Run the rnn.
+for (int i = 0; i < num_states; i++) {
+  rnn_step_net.Run(&rnn_states[i]);
+  // Copy the current state's state variables to the next state; the related
+  // variables are named like "previous_state_xxx".
+  Copy(rnn_states[i], rnn_states[i + 1], pre_state_vars);
+}
+
+// Copy the rnn's final outputs to `default_scope`.
+Copy(rnn_states[num_states], default_scope, outlink_vars); +``` From 3de3894b821c06daf596c0818b6c89f4dd185928 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 4 Jul 2017 12:53:00 +0800 Subject: [PATCH 34/79] Add DEPS to `proto_library` Missing DEPS will cause compile error when parallel is large. --- cmake/generic.cmake | 4 ++-- paddle/framework/CMakeLists.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index ca358da8f1..fb2222440c 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -323,10 +323,10 @@ endfunction(go_test) function(proto_library TARGET_NAME) set(oneValueArgs "") - set(multiValueArgs SRCS) + set(multiValueArgs SRCS DEPS) cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(proto_srcs) set(proto_hdrs) protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS}) - cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS protobuf) + cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf) endfunction() diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index f7e5753ac2..e781866759 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -6,5 +6,5 @@ cc_test(variable_test SRCS variable_test.cc) cc_test(scope_test SRCS scope_test.cc) cc_test(enforce_test SRCS enforce_test.cc) proto_library(attr_type SRCS attr_type.proto) -proto_library(op_proto SRCS op_proto.proto) +proto_library(op_proto SRCS op_proto.proto DEPS attr_type) cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto attr_type protobuf) From b8cc07920e3cf623250ea0b9b078049ff1348279 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 13:20:55 +0800 Subject: [PATCH 35/79] FIX: add eigen3 interface deps --- cmake/external/eigen.cmake | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 45f44f617d..39b16c3b2b 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -5,7 +5,7 @@ SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/eigen3) ExternalProject_Add( - eigen3 + extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} # for latest version, please get from official website # URL "https://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz" @@ -26,4 +26,7 @@ ExternalProject_Add( TEST_COMMAND "" ) -LIST(APPEND external_project_dependencies eigen3) +ADD_LIBRARY(eigen3 INTERFACE) +ADD_DEPENDENCIES(eigen3 extern_eigen3) + +LIST(APPEND external_project_dependencies extern_eigen3) From 414c2b1734bcf39135935a201f7244d79a72e172 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 13:25:30 +0800 Subject: [PATCH 36/79] FIX: add any as interface dep --- cmake/external/any.cmake | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake index 62eea42692..b61e421871 100644 --- a/cmake/external/any.cmake +++ b/cmake/external/any.cmake @@ -5,7 +5,7 @@ SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any) INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/linb_any) ExternalProject_Add( - linb_any + extern_lib_any ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/thelink2012/any.git" GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020" @@ -17,5 +17,8 @@ ExternalProject_Add( TEST_COMMAND "" ) +ADD_LIBRARY(lib_any INTERFACE) +ADD_DEPENDENCIES(lib_any extern_lib_any) + add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE) -LIST(APPEND 
external_project_dependencies linb_any)
\ No newline at end of file
+LIST(APPEND external_project_dependencies extern_lib_any)

From b7397031e9c03363c4e3e3119ff371b0d8a13e7c Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Tue, 4 Jul 2017 13:47:02 +0800
Subject: [PATCH 37/79] Add target_link_libraries for cc_library

It will fix #2728.

It may seem silly to call `target_link_libraries` on a static library,
because a static library does not need to link against other libraries.
But it tells CMake how to propagate dependencies.

The solution comes from [here](http://floooh.github.io/2016/01/12/cmake-dependency-juggling.html).

* Also change op_proto_test DEPS to verify that this fix works.

---
 cmake/generic.cmake             | 2 ++
 paddle/framework/CMakeLists.txt | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index fb2222440c..cae9524b2f 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -162,6 +162,7 @@ function(cc_library TARGET_NAME)
     endif()
     if (cc_library_DEPS)
       add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
+      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
     endif()
   else(cc_library_SRCS)
     if (cc_library_DEPS)
@@ -211,6 +212,7 @@ function(nv_library TARGET_NAME)
     endif()
     if (nv_library_DEPS)
       add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
+      target_link_libraries(${TARGET_NAME} ${nv_library_DEPS})
     endif()
   else(nv_library_SRCS)
     if (nv_library_DEPS)
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index e781866759..baad38e3c1 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -7,4 +7,4 @@ cc_test(scope_test SRCS scope_test.cc)
 cc_test(enforce_test SRCS enforce_test.cc)
 proto_library(attr_type SRCS attr_type.proto)
 proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
-cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto attr_type protobuf)
+cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)

From 9eeabe986d039b3fe3b28e5ef98f66d6dd2a3e31 Mon Sep 17 00:00:00 2001
From: qijun
Date: Tue, 4 Jul 2017 14:03:58 +0800
Subject: [PATCH 38/79] follow comments

---
 paddle/platform/cublas.h          | 58 +++++++++++++++++++++----------
 paddle/platform/cudnn.h           | 38 +++++++++++++++-----
 paddle/platform/curand.h          | 40 +++++++++++++++-------
 paddle/platform/dynamic_loader.cc | 16 +++++++--
 paddle/platform/dynamic_loader.h  | 14 ++++----
 5 files changed, 119 insertions(+), 47 deletions(-)

diff --git a/paddle/platform/cublas.h b/paddle/platform/cublas.h
index d60eb501e9..90704f37e6 100644
--- a/paddle/platform/cublas.h
+++ b/paddle/platform/cublas.h
@@ -1,7 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#pragma once + #include #include "paddle/platform/dynamic_loader.h" namespace paddle { +namespace platform { namespace dyload { std::once_flag cublas_dso_flag; @@ -15,15 +32,17 @@ void *cublas_dso_handle = nullptr; * note: default dynamic linked libs */ #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - cublasStatus_t operator()(Args... args) { \ - typedef cublasStatus_t (*cublasFunc)(Args...); \ - std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle); \ - void *p_##__name = dlsym(cublas_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ +#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + cublasStatus_t operator()(Args... args) { \ + typedef cublasStatus_t (*cublasFunc)(Args...); \ + std::call_once(cublas_dso_flag, \ + paddle::platform::dyload::GetCublasDsoHandle, \ + &cublas_dso_handle); \ + void *p_##__name = dlsym(cublas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ } __name; // struct DynLoad__##__name #else #define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ @@ -68,17 +87,18 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) // clang-format on #ifndef PADDLE_TYPE_DOUBLE -#define CUBLAS_GEAM dynload::cublasSgeam -#define CUBLAS_GEMV dynload::cublasSgemv -#define CUBLAS_GEMM dynload::cublasSgemm -#define CUBLAS_GETRF dynload::cublasSgetrfBatched -#define CUBLAS_GETRI dynload::cublasSgetriBatched +#define CUBLAS_GEAM paddle::platform::dynload::cublasSgeam +#define CUBLAS_GEMV paddle::platform::dynload::cublasSgemv +#define CUBLAS_GEMM paddle::platform::dynload::cublasSgemm +#define CUBLAS_GETRF paddle::platform::dynload::cublasSgetrfBatched +#define CUBLAS_GETRI paddle::platform::dynload::cublasSgetriBatched #else -#define CUBLAS_GEAM dynload::cublasDgeam -#define CUBLAS_GEMV dynload::cublasDgemv -#define CUBLAS_GEMM dynload::cublasDgemm -#define CUBLAS_GETRF dynload::cublasDgetrfBatched -#define CUBLAS_GETRI dynload::cublasDgetriBatched +#define CUBLAS_GEAM paddle::platform::dynload::cublasDgeam +#define CUBLAS_GEMV paddle::platform::dynload::cublasDgemv +#define CUBLAS_GEMM paddle::platform::dynload::cublasDgemm +#define CUBLAS_GETRF paddle::platform::dynload::cublasDgetrfBatched +#define CUBLAS_GETRI paddle::platform::dynload::cublasDgetriBatched #endif } // namespace dyload +} // namespace platform } // namespace paddle diff --git a/paddle/platform/cudnn.h b/paddle/platform/cudnn.h index ab878cd555..06e2a05d86 100644 --- a/paddle/platform/cudnn.h +++ b/paddle/platform/cudnn.h @@ -1,7 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + #include #include "paddle/platform/dynamic_loader.h" namespace paddle { +namespace platform { namespace dyload { std::once_flag cudnn_dso_flag; @@ -9,15 +26,17 @@ void* cudnn_dso_handle = nullptr; #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - using cudnn_func = decltype(__name(args...)) (*)(Args...); \ - std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); \ - void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ +#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using cudnn_func = decltype(__name(args...)) (*)(Args...); \ + std::call_once(cudnn_dso_flag, \ + paddle::platform::dyload::GetCudnnDsoHandle, \ + &cudnn_dso_handle); \ + void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ } __name; /* struct DynLoad__##__name */ #else @@ -111,4 +130,5 @@ CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP) #undef CUDNN_DNN_ROUTINE_EACH // clang-format on } // namespace dyload +} // namespace platform } // namespace paddle diff --git a/paddle/platform/curand.h b/paddle/platform/curand.h index edff6526bd..a9cbe48ef8 100644 --- a/paddle/platform/curand.h +++ b/paddle/platform/curand.h @@ -1,20 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + #include #include "paddle/platform/dynamic_loader.h" namespace paddle { +namespace platform { namespace dyload { std::once_flag curand_dso_flag; void *curand_dso_handle = nullptr; #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - curandStatus_t operator()(Args... args) { \ - typedef curandStatus_t (*curandFunc)(Args...); \ - std::call_once(curand_dso_flag, GetCurandDsoHandle, &curand_dso_handle); \ - void *p_##__name = dlsym(curand_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ +#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + curandStatus_t operator()(Args... 
args) { \ + typedef curandStatus_t (*curandFunc)(Args...); \ + std::call_once(curand_dso_flag, \ + paddle::platform::dyload::GetCurandDsoHandle, \ + &curand_dso_handle); \ + void *p_##__name = dlsym(curand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ } __name; /* struct DynLoad__##__name */ #else #define DYNAMIC_LOAD_CURAND_WRAP(__name) \ @@ -41,5 +60,6 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP) #undef CURAND_RAND_ROUTINE_EACH #undef DYNAMIC_LOAD_CURAND_WRAP -} +} // namespace dyload +} // namespace platform } // namespace paddle diff --git a/paddle/platform/dynamic_loader.cc b/paddle/platform/dynamic_loader.cc index c34abc392c..9e0aadf8e2 100644 --- a/paddle/platform/dynamic_loader.cc +++ b/paddle/platform/dynamic_loader.cc @@ -13,8 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "dynamic_loader.h" -#include -#include +#include +#include +#include +#include +#include "gflags/gflags.h" +#include "glog/logging.h" DEFINE_string(cudnn_dir, "", "Specify path for loading libcudnn.so. For instance, " @@ -30,6 +34,10 @@ DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so."); +namespace paddle { +namespace platform { +namespace dyload { + static inline std::string join(const std::string& part1, const std::string& part2) { // directory separator @@ -155,3 +163,7 @@ void GetLapackDsoHandle(void** dso_handle) { GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle); #endif } + +} // namespace dyload +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/dynamic_loader.h b/paddle/platform/dynamic_loader.h index 9b5ad21724..bb58fcba17 100644 --- a/paddle/platform/dynamic_loader.h +++ b/paddle/platform/dynamic_loader.h @@ -12,13 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifndef DYNAMIC_LOAD_H_ -#define DYNAMIC_LOAD_H_ +#pragma once -#include -#include -#include -#include +namespace paddle { +namespace platform { +namespace dyload { /** * @brief load the DSO of CUBLAS @@ -60,4 +58,6 @@ void GetWarpCTCDsoHandle(void** dso_handle); */ void GetLapackDsoHandle(void** dso_handle); -#endif // DYNAMIC_LOAD_H_ +} // namespace dyload +} // namespace platform +} // namespace paddle From 3567ea6d7c85bdfdd42f57f6db98e5bc9bbac5e2 Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 4 Jul 2017 14:58:20 +0800 Subject: [PATCH 39/79] move to dynload directory --- paddle/platform/CMakeLists.txt | 4 +-- paddle/platform/dynload/CMakeLists.txt | 1 + paddle/platform/{ => dynload}/cublas.h | 26 +++++++++---------- paddle/platform/{ => dynload}/cudnn.h | 26 +++++++++---------- paddle/platform/{ => dynload}/curand.h | 26 +++++++++---------- .../platform/{ => dynload}/dynamic_loader.cc | 4 +-- .../platform/{ => dynload}/dynamic_loader.h | 4 +-- 7 files changed, 46 insertions(+), 45 deletions(-) create mode 100644 paddle/platform/dynload/CMakeLists.txt rename paddle/platform/{ => dynload}/cublas.h (95%) rename paddle/platform/{ => dynload}/cudnn.h (97%) rename paddle/platform/{ => dynload}/curand.h (93%) rename paddle/platform/{ => dynload}/dynamic_loader.cc (99%) rename paddle/platform/{ => dynload}/dynamic_loader.h (96%) diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 4f6381b8af..cc6b52e927 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -1,6 +1,6 @@ +add_subdirectory(dynload) + nv_test(cuda_test SRCS cuda_test.cu) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) - -cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt new file mode 100644 index 0000000000..9f829b7012 --- /dev/null +++ b/paddle/platform/dynload/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) diff --git a/paddle/platform/cublas.h b/paddle/platform/dynload/cublas.h similarity index 95% rename from paddle/platform/cublas.h rename to paddle/platform/dynload/cublas.h index 90704f37e6..c9150ac573 100644 --- a/paddle/platform/cublas.h +++ b/paddle/platform/dynload/cublas.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace platform { -namespace dyload { +namespace dynload { std::once_flag cublas_dso_flag; void *cublas_dso_handle = nullptr; @@ -32,17 +32,17 @@ void *cublas_dso_handle = nullptr; * note: default dynamic linked libs */ #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - cublasStatus_t operator()(Args... args) { \ - typedef cublasStatus_t (*cublasFunc)(Args...); \ - std::call_once(cublas_dso_flag, \ - paddle::platform::dyload::GetCublasDsoHandle, \ - &cublas_dso_handle); \ - void *p_##__name = dlsym(cublas_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ +#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + cublasStatus_t operator()(Args... 
args) { \ + typedef cublasStatus_t (*cublasFunc)(Args...); \ + std::call_once(cublas_dso_flag, \ + paddle::platform::dynload::GetCublasDsoHandle, \ + &cublas_dso_handle); \ + void *p_##__name = dlsym(cublas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ } __name; // struct DynLoad__##__name #else #define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ @@ -99,6 +99,6 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) #define CUBLAS_GETRF paddle::platform::dynload::cublasDgetrfBatched #define CUBLAS_GETRI paddle::platform::dynload::cublasDgetriBatched #endif -} // namespace dyload +} // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/cudnn.h b/paddle/platform/dynload/cudnn.h similarity index 97% rename from paddle/platform/cudnn.h rename to paddle/platform/dynload/cudnn.h index 06e2a05d86..c03424b375 100644 --- a/paddle/platform/cudnn.h +++ b/paddle/platform/dynload/cudnn.h @@ -19,24 +19,24 @@ limitations under the License. */ namespace paddle { namespace platform { -namespace dyload { +namespace dynload { std::once_flag cudnn_dso_flag; void* cudnn_dso_handle = nullptr; #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - using cudnn_func = decltype(__name(args...)) (*)(Args...); \ - std::call_once(cudnn_dso_flag, \ - paddle::platform::dyload::GetCudnnDsoHandle, \ - &cudnn_dso_handle); \ - void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ +#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using cudnn_func = decltype(__name(args...)) (*)(Args...); \ + std::call_once(cudnn_dso_flag, \ + paddle::platform::dynload::GetCudnnDsoHandle, \ + &cudnn_dso_handle); \ + void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ } __name; /* struct DynLoad__##__name */ #else @@ -129,6 +129,6 @@ CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP) #undef CUDNN_DNN_ROUTINE_EACH // clang-format on -} // namespace dyload +} // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/curand.h b/paddle/platform/dynload/curand.h similarity index 93% rename from paddle/platform/curand.h rename to paddle/platform/dynload/curand.h index a9cbe48ef8..1ef7a8c833 100644 --- a/paddle/platform/curand.h +++ b/paddle/platform/dynload/curand.h @@ -19,21 +19,21 @@ limitations under the License. */ namespace paddle { namespace platform { -namespace dyload { +namespace dynload { std::once_flag curand_dso_flag; void *curand_dso_handle = nullptr; #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - curandStatus_t operator()(Args... args) { \ - typedef curandStatus_t (*curandFunc)(Args...); \ - std::call_once(curand_dso_flag, \ - paddle::platform::dyload::GetCurandDsoHandle, \ - &curand_dso_handle); \ - void *p_##__name = dlsym(curand_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ +#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + curandStatus_t operator()(Args... 
args) { \ + typedef curandStatus_t (*curandFunc)(Args...); \ + std::call_once(curand_dso_flag, \ + paddle::platform::dynload::GetCurandDsoHandle, \ + &curand_dso_handle); \ + void *p_##__name = dlsym(curand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ } __name; /* struct DynLoad__##__name */ #else #define DYNAMIC_LOAD_CURAND_WRAP(__name) \ @@ -60,6 +60,6 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP) #undef CURAND_RAND_ROUTINE_EACH #undef DYNAMIC_LOAD_CURAND_WRAP -} // namespace dyload +} // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/dynamic_loader.cc b/paddle/platform/dynload/dynamic_loader.cc similarity index 99% rename from paddle/platform/dynamic_loader.cc rename to paddle/platform/dynload/dynamic_loader.cc index 9e0aadf8e2..8ef67bad8c 100644 --- a/paddle/platform/dynamic_loader.cc +++ b/paddle/platform/dynload/dynamic_loader.cc @@ -36,7 +36,7 @@ DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so."); namespace paddle { namespace platform { -namespace dyload { +namespace dynload { static inline std::string join(const std::string& part1, const std::string& part2) { @@ -164,6 +164,6 @@ void GetLapackDsoHandle(void** dso_handle) { #endif } -} // namespace dyload +} // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/dynamic_loader.h b/paddle/platform/dynload/dynamic_loader.h similarity index 96% rename from paddle/platform/dynamic_loader.h rename to paddle/platform/dynload/dynamic_loader.h index bb58fcba17..a99b05443f 100644 --- a/paddle/platform/dynamic_loader.h +++ b/paddle/platform/dynload/dynamic_loader.h @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -namespace dyload { +namespace dynload { /** * @brief load the DSO of CUBLAS @@ -58,6 +58,6 @@ void GetWarpCTCDsoHandle(void** dso_handle); */ void GetLapackDsoHandle(void** dso_handle); -} // namespace dyload +} // namespace dynload } // namespace platform } // namespace paddle From a211374d53090733667f2be2cf629cf858757c6d Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 15:01:05 +0800 Subject: [PATCH 40/79] FIX: interface deps under cmake < 3.3 --- cmake/external/any.cmake | 13 ++++++++++--- cmake/external/eigen.cmake | 13 ++++++++++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake index b61e421871..edf6edc0bd 100644 --- a/cmake/external/any.cmake +++ b/cmake/external/any.cmake @@ -17,8 +17,15 @@ ExternalProject_Add( TEST_COMMAND "" ) -ADD_LIBRARY(lib_any INTERFACE) -ADD_DEPENDENCIES(lib_any extern_lib_any) +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c) + file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";") + add_library(lib_any STATIC ${dummyfile}) +else() + add_library(lib_any INTERFACE) +endif() + +add_dependencies(lib_any extern_lib_any) add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE) -LIST(APPEND external_project_dependencies extern_lib_any) +LIST(APPEND external_project_dependencies lib_any) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 39b16c3b2b..1f2fdcac65 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -26,7 +26,14 @@ ExternalProject_Add( TEST_COMMAND "" ) -ADD_LIBRARY(eigen3 INTERFACE) -ADD_DEPENDENCIES(eigen3 extern_eigen3) +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile 
${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c) + file(WRITE ${dummyfile} "const char * dummy_eigen3 = \"${dummyfile}\";") + add_library(eigen3 STATIC ${dummyfile}) +else() + add_library(eigen3 INTERFACE) +endif() -LIST(APPEND external_project_dependencies extern_eigen3) +add_dependencies(eigen3 extern_eigen3) + +LIST(APPEND external_project_dependencies eigen3) From 817f317bef82eb2c024927e6a62b048a1ba93d4a Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 15:39:08 +0800 Subject: [PATCH 41/79] FIX: INTERFACE path --- cmake/external/any.cmake | 2 +- cmake/external/eigen.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake index edf6edc0bd..45e3764e84 100644 --- a/cmake/external/any.cmake +++ b/cmake/external/any.cmake @@ -2,7 +2,7 @@ INCLUDE(ExternalProject) SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any) -INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/linb_any) +INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any) ExternalProject_Add( extern_lib_any diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 1f2fdcac65..3e6cedbb0d 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -2,7 +2,7 @@ INCLUDE(ExternalProject) SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) -INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/eigen3) +INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3) ExternalProject_Add( extern_eigen3 From 9045063b535c400ff8ebf20d0b8534103ec6d9ab Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 4 Jul 2017 15:58:15 +0800 Subject: [PATCH 42/79] pserver etcd client (#2559) * init etcd cclient * add etcd * add etcd.go * fix compile problem * move code to etcd.go * add etcd_lister.go for pserver client * add etcd_client_test.go * merge etcd_client_test and client_test * refine client_test.go * refine code * format code * add TODO and use interface instead of struct * fix typo of initDesiredPservers * optimize dir structure of go/pserver/client * add a flag to config index for pserver * follow comment * fix path * optimize code * remove err in pserver NewEtcd * restore comment about /ps_desired --- CMakeLists.txt | 2 +- go/CMakeLists.txt | 2 +- go/cmd/pserver/pserver.go | 16 ++- go/master/etcd_client.go | 4 +- .../{cclient => client/c}/CMakeLists.txt | 2 +- go/pserver/{cclient => client/c}/cclient.go | 26 ++-- .../{cclient => client/c}/test/CMakeLists.txt | 0 .../{cclient => client/c}/test/test_cclient.c | 0 .../{cclient => client/c}/test/test_mnist.py | 0 .../{cclient => client/c}/test/test_train.py | 0 .../c}/test/testdata/optimizer.pb | Bin go/pserver/{ => client}/client.go | 17 +-- go/pserver/{ => client}/client_test.go | 77 +++++++++-- go/pserver/client/etcd_client.go | 125 ++++++++++++++++++ go/pserver/etcd_client.go | 13 +- go/pserver/optimizer.go | 2 +- go/pserver/optimizer_test.go | 2 +- go/pserver/service.go | 3 - go/pserver/service_test.go | 8 +- 19 files changed, 246 insertions(+), 53 deletions(-) rename go/pserver/{cclient => client/c}/CMakeLists.txt (67%) rename go/pserver/{cclient => client/c}/cclient.go (88%) rename go/pserver/{cclient => client/c}/test/CMakeLists.txt (100%) rename go/pserver/{cclient => client/c}/test/test_cclient.c (100%) rename go/pserver/{cclient => client/c}/test/test_mnist.py (100%) rename go/pserver/{cclient => client/c}/test/test_train.py (100%) rename go/pserver/{cclient => client/c}/test/testdata/optimizer.pb (100%) rename go/pserver/{ => client}/client.go (92%) rename go/pserver/{ => client}/client_test.go (54%) create 
mode 100644 go/pserver/client/etcd_client.go diff --git a/CMakeLists.txt b/CMakeLists.txt index 5349f59805..5bedbbefa8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -113,7 +113,7 @@ include(coveralls) # set code coverage include_directories("${PROJ_ROOT}") include_directories("${PROJ_ROOT}/paddle/cuda/include") include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") -include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/cclient") +include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c") include_directories(${Boost_INCLUDE_DIRS}) set(EXTERNAL_LIBS diff --git a/go/CMakeLists.txt b/go/CMakeLists.txt index 014697d155..f00c70a058 100644 --- a/go/CMakeLists.txt +++ b/go/CMakeLists.txt @@ -13,7 +13,7 @@ # limitations under the License. # -add_subdirectory(pserver/cclient) +add_subdirectory(pserver/client/c) add_subdirectory(cmd/pserver) add_subdirectory(cmd/master) add_subdirectory(master/c) diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go index 8a42d4f8af..31ef450f03 100644 --- a/go/cmd/pserver/pserver.go +++ b/go/cmd/pserver/pserver.go @@ -15,6 +15,7 @@ import ( func main() { port := flag.Int("port", 0, "port of the pserver") + index := flag.Int("index", -1, "index of this pserver, should be larger or equal than 0") etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379", "comma separated endpoint string for pserver to connect to etcd") etcdTimeout := flag.Int("etcd-timeout", 5, "timeout for etcd calls") @@ -29,11 +30,16 @@ func main() { } log.SetLevel(level) - timeout := time.Second * time.Duration((*etcdTimeout)) - e := pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout) - idx, err := e.Register() - if err != nil { - panic(err) + var idx int + if *index >= 0 { + idx = *index + } else { + timeout := time.Second * time.Duration((*etcdTimeout)) + e := pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout) + idx, err = e.Register() + if err != nil { + panic(err) + } } s, err := pserver.NewService(idx) diff --git a/go/master/etcd_client.go b/go/master/etcd_client.go index e27c014792..04c1394e96 100644 --- a/go/master/etcd_client.go +++ b/go/master/etcd_client.go @@ -50,7 +50,7 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat lock := concurrency.NewMutex(sess, lockPath) // It's fine for the lock to get stuck, in this case we have // multiple master servers running (only configured to have - // one master running, but split-brain problem may cuase + // one master running, but split-brain problem may cause // multiple master servers running), and the cluster management // software will kill one of them. log.Debugf("Trying to acquire lock at %s.", lockPath) @@ -98,7 +98,7 @@ func (e *EtcdClient) Save(state []byte) error { // We lost the master lock and can not acquire // it back, it means some other master is // already started. We don't want cluster - // managment system to kill the master server + // management system to kill the master server // who is holding the lock and running // correctly. So the most feasible solution is // to kill current master server. 
The current diff --git a/go/pserver/cclient/CMakeLists.txt b/go/pserver/client/c/CMakeLists.txt similarity index 67% rename from go/pserver/cclient/CMakeLists.txt rename to go/pserver/client/c/CMakeLists.txt index 7fe74c62f1..a3fcaeef19 100644 --- a/go/pserver/cclient/CMakeLists.txt +++ b/go/pserver/client/c/CMakeLists.txt @@ -1,5 +1,5 @@ cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf) -go_library(paddle_pserver_cclient STATIC) +go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer) if(WITH_TESTING) add_subdirectory(test) endif() diff --git a/go/pserver/cclient/cclient.go b/go/pserver/client/c/cclient.go similarity index 88% rename from go/pserver/cclient/cclient.go rename to go/pserver/client/c/cclient.go index bbaf43d9f1..7ddaceb7ed 100644 --- a/go/pserver/cclient/cclient.go +++ b/go/pserver/client/c/cclient.go @@ -30,15 +30,16 @@ import ( "unsafe" "github.com/PaddlePaddle/Paddle/go/pserver" + "github.com/PaddlePaddle/Paddle/go/pserver/client" log "github.com/sirupsen/logrus" ) var nullPtr = unsafe.Pointer(uintptr(0)) var mu sync.Mutex -var handleMap = make(map[C.paddle_pserver_client]*pserver.Client) +var handleMap = make(map[C.paddle_pserver_client]*client.Client) var curHandle C.paddle_pserver_client -func add(c *pserver.Client) C.paddle_pserver_client { +func add(c *client.Client) C.paddle_pserver_client { mu.Lock() defer mu.Unlock() client := curHandle @@ -47,13 +48,13 @@ func add(c *pserver.Client) C.paddle_pserver_client { return client } -func get(client C.paddle_pserver_client) *pserver.Client { +func get(client C.paddle_pserver_client) *client.Client { mu.Lock() defer mu.Unlock() return handleMap[client] } -func remove(client C.paddle_pserver_client) *pserver.Client { +func remove(client C.paddle_pserver_client) *client.Client { mu.Lock() defer mu.Unlock() h := handleMap[client] @@ -80,9 +81,9 @@ func (s selector) Select() bool { return bool(s) } -type lister []pserver.Server +type lister []client.Server -func (l lister) List() []pserver.Server { +func (l lister) List() []client.Server { return l } @@ -90,19 +91,22 @@ func (l lister) List() []pserver.Server { func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_client { a := C.GoString(addrs) as := strings.Split(a, ",") - servers := make([]pserver.Server, len(as)) + servers := make([]client.Server, len(as)) for i := range as { servers[i].Index = i servers[i].Addr = as[i] } - c := pserver.NewClient(lister(servers), len(as), selector(selected != 0)) + c := client.NewClient(lister(servers), len(as), selector(selected != 0)) return add(c) } //export paddle_new_etcd_pserver_client -func paddle_new_etcd_pserver_client(etcd_addr *C.char) C.paddle_pserver_client { - // TODO(helin): fault tolerant pserver client using etcd. 
- panic("not implemented.") +func paddle_new_etcd_pserver_client(etcd_endpoints *C.char, selected int) C.paddle_pserver_client { + // TODO(Longfei: use etcd lock to decide which trainer to initialize the parameters) + addr := C.GoString(etcd_endpoints) + etcd_client := client.NewEtcd(addr) + c := client.NewClient(etcd_client, etcd_client.Desired(), selector(selected != 0)) + return add(c) } //export paddle_pserver_client_release diff --git a/go/pserver/cclient/test/CMakeLists.txt b/go/pserver/client/c/test/CMakeLists.txt similarity index 100% rename from go/pserver/cclient/test/CMakeLists.txt rename to go/pserver/client/c/test/CMakeLists.txt diff --git a/go/pserver/cclient/test/test_cclient.c b/go/pserver/client/c/test/test_cclient.c similarity index 100% rename from go/pserver/cclient/test/test_cclient.c rename to go/pserver/client/c/test/test_cclient.c diff --git a/go/pserver/cclient/test/test_mnist.py b/go/pserver/client/c/test/test_mnist.py similarity index 100% rename from go/pserver/cclient/test/test_mnist.py rename to go/pserver/client/c/test/test_mnist.py diff --git a/go/pserver/cclient/test/test_train.py b/go/pserver/client/c/test/test_train.py similarity index 100% rename from go/pserver/cclient/test/test_train.py rename to go/pserver/client/c/test/test_train.py diff --git a/go/pserver/cclient/test/testdata/optimizer.pb b/go/pserver/client/c/test/testdata/optimizer.pb similarity index 100% rename from go/pserver/cclient/test/testdata/optimizer.pb rename to go/pserver/client/c/test/testdata/optimizer.pb diff --git a/go/pserver/client.go b/go/pserver/client/client.go similarity index 92% rename from go/pserver/client.go rename to go/pserver/client/client.go index 6938b9d5ce..aa8bfe30c2 100644 --- a/go/pserver/client.go +++ b/go/pserver/client/client.go @@ -1,4 +1,4 @@ -package pserver +package client import ( "errors" @@ -7,6 +7,7 @@ import ( "time" "github.com/PaddlePaddle/Paddle/go/connection" + "github.com/PaddlePaddle/Paddle/go/pserver" log "github.com/sirupsen/logrus" ) @@ -105,7 +106,7 @@ func (c *Client) BeginInitParams() bool { } // InitParam initializes the parameter on parameter servers. -func (c *Client) InitParam(paramWithConfigs ParameterWithConfig) error { +func (c *Client) InitParam(paramWithConfigs pserver.ParameterWithConfig) error { return c.pservers[c.partition(paramWithConfigs.Param.Name)].Call("Service.InitParam", paramWithConfigs, nil) } @@ -123,13 +124,13 @@ func (c *Client) FinishInitParams() error { // SendGrads sends gradients to parameter servers for updating // parameters. -func (c *Client) SendGrads(grads []Gradient) error { +func (c *Client) SendGrads(grads []pserver.Gradient) error { if len(grads) == 0 { return errors.New("no gradient received") } errCh := make(chan error, len(grads)) for _, g := range grads { - go func(g Gradient) { + go func(g pserver.Gradient) { err := c.pservers[c.partition(g.Name)].Call("Service.SendGrad", g, nil) errCh <- err }(g) @@ -151,7 +152,7 @@ func (c *Client) SendGrads(grads []Gradient) error { type result struct { idx int - param Parameter + param pserver.Parameter err error } @@ -170,12 +171,12 @@ func (r results) Swap(i int, j int) { } // GetParams gets parameters from parameter servers. 
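+// For example (the parameter names here are placeholders; "param_a" follows
+// the naming used in the service tests below):
+//
+//	params, err := c.GetParams([]string{"param_a", "param_b"})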
-func (c *Client) GetParams(names []string) ([]Parameter, error) { +func (c *Client) GetParams(names []string) ([]pserver.Parameter, error) { rCh := make(chan result, len(names)) for idx, name := range names { go func(name string, idx int) { - var parameter Parameter + var parameter pserver.Parameter err := c.pservers[c.partition(name)].Call("Service.GetParam", name, ¶meter) rCh <- result{idx: idx, param: parameter, err: err} }(name, idx) @@ -196,7 +197,7 @@ func (c *Client) GetParams(names []string) ([]Parameter, error) { } sort.Sort(rs) - ps := make([]Parameter, len(rs)) + ps := make([]pserver.Parameter, len(rs)) for i := range rs { ps[i] = rs[i].param } diff --git a/go/pserver/client_test.go b/go/pserver/client/client_test.go similarity index 54% rename from go/pserver/client_test.go rename to go/pserver/client/client_test.go index b805efa921..29b400812c 100644 --- a/go/pserver/client_test.go +++ b/go/pserver/client/client_test.go @@ -1,6 +1,7 @@ -package pserver_test +package client_test import ( + "context" "io/ioutil" "net" "net/http" @@ -8,15 +9,25 @@ import ( "strconv" "strings" "testing" + "time" "github.com/PaddlePaddle/Paddle/go/pserver" + "github.com/PaddlePaddle/Paddle/go/pserver/client" + "github.com/coreos/etcd/clientv3" + log "github.com/sirupsen/logrus" ) -const numPserver = 10 +const ( + numPserver = 10 + etcdEndpoints = "127.0.0.1:2379" + timeout = 2 * time.Second +) -var port [numPserver]int +var pserverClientPorts [numPserver]int -func init() { +// this function init pserver client and return their ports in an array. +func initClient() [numPserver]int { + var ports [numPserver]int for i := 0; i < numPserver; i++ { l, err := net.Listen("tcp", ":0") if err != nil { @@ -28,7 +39,7 @@ func init() { if err != nil { panic(err) } - port[i] = p + ports[i] = p go func(l net.Listener) { s, err := pserver.NewService(0) @@ -49,6 +60,31 @@ func init() { } }(l) } + return ports +} + +func initNativeClient() { + pserverClientPorts = initClient() +} + +func initEtcdClient() { + client, err := clientv3.New(clientv3.Config{ + Endpoints: []string{etcdEndpoints}, + DialTimeout: time.Second * time.Duration(1), + }) + if err != nil { + log.Errorf("err %v", err) + } + ctx, cancel := context.WithTimeout(context.Background(), timeout) + client.Delete(ctx, pserver.PsDesired) + client.Delete(ctx, pserver.PsPath) + client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver)) + ports := initClient() + for i := 0; i < numPserver; i++ { + client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i])) + } + cancel() + client.Close() } type selector bool @@ -57,25 +93,20 @@ func (s selector) Select() bool { return bool(s) } -type lister []pserver.Server +type lister []client.Server -func (l lister) List() []pserver.Server { +func (l lister) List() []client.Server { return l } -func TestClientFull(t *testing.T) { - servers := make([]pserver.Server, numPserver) - for i := 0; i < numPserver; i++ { - servers[i] = pserver.Server{Index: i, Addr: ":" + strconv.Itoa(port[i])} - } - c := pserver.NewClient(lister(servers), len(servers), selector(true)) +func ClientTest(t *testing.T, c *client.Client) { selected := c.BeginInitParams() if !selected { t.Fatal("should be selected.") } const numParameter = 100 - config, err := ioutil.ReadFile("./cclient/test/testdata/optimizer.pb") + config, err := ioutil.ReadFile("./c/test/testdata/optimizer.pb") if err != nil { t.Fatalf("read optimizer proto failed") } @@ -129,3 +160,21 @@ func TestClientFull(t *testing.T) { } } } + +func TestNativeClient(t 
*testing.T) {
+	initNativeClient()
+	servers := make([]client.Server, numPserver)
+	for i := 0; i < numPserver; i++ {
+		servers[i] = client.Server{Index: i, Addr: ":" + strconv.Itoa(pserverClientPorts[i])}
+	}
+	c1 := client.NewClient(lister(servers), len(servers), selector(true))
+	ClientTest(t, c1)
+}
+
+// TODO: the etcdClient test is temporarily disabled, since it depends on a running etcd server.
+func EtcdClient(t *testing.T) {
+	initEtcdClient()
+	etcd_client := client.NewEtcd(etcdEndpoints)
+	c2 := client.NewClient(etcd_client, etcd_client.Desired(), selector(true))
+	ClientTest(t, c2)
+}
diff --git a/go/pserver/client/etcd_client.go b/go/pserver/client/etcd_client.go
new file mode 100644
index 0000000000..1fd3479aa8
--- /dev/null
+++ b/go/pserver/client/etcd_client.go
@@ -0,0 +1,125 @@
+package client
+
+import (
+	"context"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/pserver"
+	"github.com/coreos/etcd/clientv3"
+	log "github.com/sirupsen/logrus"
+)
+
+const (
+	DefaultEtcdTimeout time.Duration = 5 * time.Second
+)
+
+// EtcdClient is used by the pserver client that is a part of the trainer process.
+// TODO:
+// 1. add a watcher to watch the changed state of pservers
+// 2. add an etcd lock
+type EtcdClient struct {
+	client    *clientv3.Client
+	timeout   time.Duration
+	endpoints []string
+}
+
+// Desired reads the desired pserver count from etcd.
+func (p *EtcdClient) Desired() int {
+	var psDesired int
+	for {
+		ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
+		resp, err := p.client.Get(ctx, pserver.PsDesired)
+		cancel()
+		if err != nil {
+			log.Errorf("Get ps desired number failed! reconnecting..., %v", err)
+			time.Sleep(p.timeout)
+			continue
+		}
+
+		kvs := resp.Kvs
+		if len(kvs) == 0 {
+			log.Infoln("Waiting for ps desired to be registered ...")
+			time.Sleep(p.timeout)
+			continue
+		}
+
+		psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value))
+		if err != nil {
+			log.Errorf("psDesired %d invalid %v", psDesired, err)
+			time.Sleep(p.timeout)
+			continue
+		}
+
+		log.Debugf("Get psDesired number: %d", psDesired)
+		break
+	}
+	return psDesired
+}
+
+// List returns the pserver list read from etcd.
+func (p *EtcdClient) List() []Server {
+	psDesired := p.Desired()
+
+	servers := make([]Server, psDesired)
+	for {
+		for i := 0; i < psDesired; i++ {
+			ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
+			psKey := pserver.PsPath + strconv.Itoa(i)
+			log.Debugf("checking %s", psKey)
+			resp, err := p.client.Get(ctx, psKey)
+			cancel()
+			if err != nil {
+				log.Infof("Get psKey= %s error, %v", psKey, err)
+				time.Sleep(p.timeout)
+				continue
+			}
+			kvs := resp.Kvs
+			if len(kvs) == 0 {
+				log.Infof("Waiting for ps addr registered ...")
+				time.Sleep(p.timeout)
+				continue
+			}
+
+			psAddr := string(resp.Kvs[0].Value)
+			// TODO(Longfei) check the ps address
+			if psAddr == "" {
+				log.Infof("Get psKey = %s, psAddr is empty", psKey)
+				time.Sleep(p.timeout)
+				continue
+			}
+			log.Infof("got value (%s) for key: %s", psAddr, psKey)
+			servers[i].Index = i
+			servers[i].Addr = psAddr
+		}
+		break
+	}
+	return servers
+}
+
+// NewEtcd creates an etcd client that reads the state of pservers from etcd.
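+// A minimal use from a trainer process, mirroring the client test above
+// (the endpoint string and selector value are placeholders):
+//
+//	e := NewEtcd("127.0.0.1:2379")
+//	c := NewClient(e, e.Desired(), selector(true))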
+func NewEtcd(endpoints string) *EtcdClient { + ep := strings.Split(endpoints, ",") + var cli *clientv3.Client + var err error + for { + cli, err = clientv3.New(clientv3.Config{ + Endpoints: ep, + DialTimeout: DefaultEtcdTimeout, + }) + if err != nil { + log.Errorf("Init etcd connection failed: %v", err) + time.Sleep(DefaultEtcdTimeout) + continue + } + break + } + log.Infof("Connected to etcd: %s\n", endpoints) + client := &EtcdClient{ + client: cli, + timeout: DefaultEtcdTimeout, + endpoints: ep, + } + return client +} diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go index 4d88243edd..37b8d522c1 100644 --- a/go/pserver/etcd_client.go +++ b/go/pserver/etcd_client.go @@ -13,6 +13,13 @@ import ( log "github.com/sirupsen/logrus" ) +const ( + // PsDesired is etcd path for store desired pserver count + PsDesired = "/ps_desired" + // PsAddr is the base dir for pserver to store their addr + PsPath = "/ps/" +) + // EtcdClient is the etcd client that the pserver uses for fault // tolerance, service registry and coordination. type EtcdClient struct { @@ -68,7 +75,7 @@ func (e *EtcdClient) Register() (int, error) { // it at the same time. for { ctx, cancel := context.WithTimeout(context.Background(), time.Second) - _, err := e.initDesiredPsercers(ctx, e.numPservers) + _, err := e.initDesiredPservers(ctx, e.numPservers) cancel() if err != nil { log.Warn(err) @@ -120,7 +127,7 @@ func (e *EtcdClient) Register() (int, error) { return pserverIdx, nil } -func (e *EtcdClient) initDesiredPsercers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) { +func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) { return concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error { dsStr := c.Get(PsDesired) if dsStr == "" { @@ -136,7 +143,7 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) { _, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error { registered := false for i := 0; i < e.desired; i++ { - psKey := "/ps/" + strconv.Itoa(i) + psKey := PsPath + strconv.Itoa(i) log.Debugf("checking %s", psKey) ps := c.Get(psKey) log.Debugf("got value (%s) for key: %s", ps, psKey) diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index b4a040f46b..bca3718af3 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -2,7 +2,7 @@ package pserver // #cgo CFLAGS: -I ../../ // //FIXME: ldflags contain "build" path -// #cgo LDFLAGS: ../../build/go/pserver/cclient/libpaddle_go_optimizer.a -lstdc++ +// #cgo LDFLAGS: ../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ // #include "paddle/optimizer/optimizer.h" // #include // #include diff --git a/go/pserver/optimizer_test.go b/go/pserver/optimizer_test.go index b99b5a5f0b..0b2f4cfa41 100644 --- a/go/pserver/optimizer_test.go +++ b/go/pserver/optimizer_test.go @@ -11,7 +11,7 @@ func TestOptimizerCreateRelease(t *testing.T) { ElementType: Int32, } p.Content = []byte{1, 3} - config, err := ioutil.ReadFile("./cclient/test/testdata/optimizer.pb") + config, err := ioutil.ReadFile("./client/c/test/testdata/optimizer.pb") if err != nil { t.Fatalf("read optimizer proto failed") } diff --git a/go/pserver/service.go b/go/pserver/service.go index e15a4e5a58..7711dc027e 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -24,9 +24,6 @@ const ( Float64 ) -// PsDesired is etcd path for store desired pserver count -const PsDesired = "/ps_desired" - // Parameter is a piece of data to sync with the parameter 
server. type Parameter struct { Name string diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go index 30e3ac8ae1..b6d20d2c8b 100644 --- a/go/pserver/service_test.go +++ b/go/pserver/service_test.go @@ -10,6 +10,10 @@ import ( "github.com/PaddlePaddle/Paddle/go/pserver" ) +const ( + OptimizerConfig = "./client/c/test/testdata/optimizer.pb" +) + func TestServiceFull(t *testing.T) { s, err := pserver.NewService(0) if err != nil { @@ -19,7 +23,7 @@ func TestServiceFull(t *testing.T) { p.Name = "param_a" p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0} p.ElementType = pserver.Int32 - config, err := ioutil.ReadFile("./cclient/test/testdata/optimizer.pb") + config, err := ioutil.ReadFile(OptimizerConfig) if err != nil { t.Fatalf("read optimizer proto failed") } @@ -149,7 +153,7 @@ func TestBlockUntilInitialized(t *testing.T) { p.Name = "param_a" p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0} p.ElementType = pserver.Int32 - config, err := ioutil.ReadFile("./cclient/test/testdata/optimizer.pb") + config, err := ioutil.ReadFile(OptimizerConfig) if err != nil { t.Fatalf("read optimizer proto failed") } From 3f5e5a24c497714530e8f55f2f076fc4e3168d9c Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 4 Jul 2017 08:16:08 +0000 Subject: [PATCH 43/79] fix cmake error --- .travis.yml | 2 +- go/master/c/CMakeLists.txt | 2 +- go/pserver/optimizer.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index a53bd18094..4f72e2ca33 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,7 +42,7 @@ before_install: function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: - | - timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout + timeout 2580 paddle/scripts/travis/${JOB}.sh -e "WITH_GOLANG=ON" # 43min timeout RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi; notifications: email: diff --git a/go/master/c/CMakeLists.txt b/go/master/c/CMakeLists.txt index 94d6bb0b2e..d900850be0 100644 --- a/go/master/c/CMakeLists.txt +++ b/go/master/c/CMakeLists.txt @@ -1 +1 @@ -go_library(paddle_master SHARED) +go_library(paddle_master SHARED DEPS paddle_go_optimizer) diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index bca3718af3..d84f55b987 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -2,7 +2,7 @@ package pserver // #cgo CFLAGS: -I ../../ // //FIXME: ldflags contain "build" path -// #cgo LDFLAGS: ../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ +// #cgo LDFLAGS: ../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ -lm // #include "paddle/optimizer/optimizer.h" // #include // #include From d8941e67ec5da7333666b31264704dae7d830ca2 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 4 Jul 2017 08:24:28 +0000 Subject: [PATCH 44/79] fix bugs --- .travis.yml | 2 +- paddle/scripts/docker/build.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4f72e2ca33..16432dac0c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,7 +42,7 @@ before_install: function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: - | - timeout 2580 paddle/scripts/travis/${JOB}.sh -e "WITH_GOLANG=ON" # 43min timeout + export WITH_GOLANG=ON && timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi; notifications: email: diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index a182e5f4ae..1ccee686df 
100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -3,7 +3,7 @@ set -xe # Set BASE_IMAGE according to env variables -if [ ${WITH_GPU} == "ON" ]; then +if [[ ${WITH_GPU} == "ON" ]]; then BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04" else BASE_IMAGE="ubuntu:16.04" From 86543f7f6a8f0fc073977794abee9ae5b033f78e Mon Sep 17 00:00:00 2001 From: yangyaming Date: Tue, 4 Jul 2017 16:40:00 +0800 Subject: [PATCH 45/79] Follow comments. --- doc/api/v2/config/layer.rst | 2 +- paddle/gserver/layers/DetectionOutputLayer.h | 8 +- paddle/gserver/layers/MultiBoxLossLayer.cpp | 6 +- paddle/gserver/layers/MultiBoxLossLayer.h | 2 +- .../paddle/trainer_config_helpers/layers.py | 20 +++-- .../test_detection_output_layer.protostr | 66 ++++++++++++++++ .../test_multibox_loss_layer.protostr | 79 +++++++++++++++++++ 7 files changed, 164 insertions(+), 19 deletions(-) create mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr create mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index 0a8465919d..4f4a9187bc 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -478,6 +478,6 @@ Detection output Layer ====================== detection_output ---- +---------------- .. autoclass:: paddle.v2.layer.detection_output :noindex: diff --git a/paddle/gserver/layers/DetectionOutputLayer.h b/paddle/gserver/layers/DetectionOutputLayer.h index 9cc568219c..a232af0a69 100644 --- a/paddle/gserver/layers/DetectionOutputLayer.h +++ b/paddle/gserver/layers/DetectionOutputLayer.h @@ -22,14 +22,14 @@ limitations under the License. */ namespace paddle { /** - * The detection output layer for a SSD detection task. This layer apply the - * Non-maximum suppression to the all predicted bounding box and keep the + * The detection output layer for a SSD detection task. This layer applies the + * Non-maximum suppression to the all predicted bounding box and keeps the * Top-K bounding boxes. - * - Input: This layer needs three input layers: This first input layer + * - Input: This layer needs three input layers: The first input layer * is the priorbox layer. The rest two input layers are convolution * layers for generating bbox location offset and the classification * confidence. - * - Output: The predict bounding box location. + * - Output: The predict bounding box locations. 
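+ *   Each output row presumably has the layout
+ *   (image_id, label, confidence, xmin, ymin, xmax, ymax); the output
+ *   size of keep_top_k * 7 in the test config below is consistent with
+ *   this assumption.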
*/ class DetectionOutputLayer : public Layer { diff --git a/paddle/gserver/layers/MultiBoxLossLayer.cpp b/paddle/gserver/layers/MultiBoxLossLayer.cpp index f2d7b8eb1d..bbf1166dce 100644 --- a/paddle/gserver/layers/MultiBoxLossLayer.cpp +++ b/paddle/gserver/layers/MultiBoxLossLayer.cpp @@ -258,8 +258,7 @@ void MultiBoxLossLayer::forward(PassType passType) { } real loss = locLoss_ + confLoss_; MatrixPtr outV = getOutputValue(); - std::vector tmp(batchSize, loss); - outV->copyFrom(&tmp[0], batchSize); + outV->assign(loss); } void MultiBoxLossLayer::backward(const UpdateCallback& callback) { @@ -336,6 +335,9 @@ void MultiBoxLossLayer::backward(const UpdateCallback& callback) { const MatrixPtr inLocG = getInputGrad(*getLocInputLayer(n)); const MatrixPtr inConfG = getInputGrad(*getConfInputLayer(n)); size_t height = getInput(*getLocInputLayer(n)).getFrameHeight(); + // only for unittest, there are no width and height information + // when constructing matrix in unittest, so we should + // set the shape in configuration if (!height) height = layerConf.height(); size_t width = getInput(*getLocInputLayer(n)).getFrameWidth(); if (!width) width = layerConf.width(); diff --git a/paddle/gserver/layers/MultiBoxLossLayer.h b/paddle/gserver/layers/MultiBoxLossLayer.h index 9767fed7f1..9935da5644 100644 --- a/paddle/gserver/layers/MultiBoxLossLayer.h +++ b/paddle/gserver/layers/MultiBoxLossLayer.h @@ -30,7 +30,7 @@ namespace paddle { * The loss is composed by the location loss and the confidence loss. * The location loss is a smooth L1 loss and the confidence loss is * a softmax loss. - * - Input: This layer need four input layers: This first input layer + * - Input: This layer needs four input layers: The first input layer * is the priorbox layer and the second layer is a label layer. * The rest two input layers are convolution layers for generating * bbox location offset and the classification confidence. diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 1286ed198e..86e91e2c57 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -1072,10 +1072,10 @@ def multibox_loss_layer(input_loc, :param name: The Layer Name. :type name: basestring - :param input_loc: The input predict location. - :type input_loc: LayerOutput + :param input_loc: The input predict locations. + :type input_loc: LayerOutput | List of LayerOutput :param input_conf: The input priorbox confidence. - :type input_conf: LayerOutput + :type input_conf: LayerOutput | List of LayerOutput :param priorbox: The input priorbox location and the variance. :type priorbox: LayerOutput :param label: The input label. @@ -1146,10 +1146,10 @@ def detection_output_layer(input_loc, :param name: The Layer Name. :type name: basestring - :param input_loc: The input predict location. - :type input_loc: LayerOutput + :param input_loc: The input predict locations. + :type input_loc: LayerOutput | List of LayerOutput. :param input_conf: The input priorbox confidence. - :type input_conf: LayerOutput + :type input_conf: LayerOutput | List of LayerOutput. :param priorbox: The input priorbox location and the variance. :type priorbox: LayerOutput :param num_classes: The number of the classification. 
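The full call that these docstring parameters describe can be sketched from the
test configuration later in this series (a hypothetical usage example; shapes
and thresholds are copied from `test_detection_output_layer.protostr`):

```python
# Sketch of a config using detection_output_layer; the data-layer shapes
# and threshold values mirror the protostr test config shown below.
input_loc = data_layer(name='input_loc', size=16, height=16, width=1)
input_conf = data_layer(name='input_conf', size=8, height=1, width=8)
priorbox = data_layer(name='priorbox', size=32, height=4, width=8)

out = detection_output_layer(
    input_loc=input_loc,
    input_conf=input_conf,
    priorbox=priorbox,
    num_classes=21,
    nms_threshold=0.45,
    nms_top_k=400,
    keep_top_k=200,
    confidence_threshold=0.01,
    background_id=0,
    name='test_detection_output')
```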
@@ -1166,22 +1166,20 @@ def detection_output_layer(input_loc, :type background_id: int :return: LayerOutput """ - input_loc_num = 0 - input_conf_num = 0 - if isinstance(input_loc, LayerOutput): input_loc = [input_loc] assert isinstance(input_loc, collections.Sequence) # list or tuple for each in input_loc: assert isinstance(each, LayerOutput) - input_loc_num += 1 + input_loc_num = len(input_loc) if isinstance(input_conf, LayerOutput): input_conf = [input_conf] assert isinstance(input_conf, collections.Sequence) # list or tuple for each in input_conf: assert isinstance(each, LayerOutput) - input_conf_num += 1 + input_conf_num = len(input_conf) + # Check the input layer number. assert input_loc_num == input_conf_num diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr new file mode 100644 index 0000000000..6690f9852a --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr @@ -0,0 +1,66 @@ +type: "nn" +layers { + name: "input_loc" + type: "data" + size: 16 + active_type: "" + height: 16 + width: 1 +} +layers { + name: "input_conf" + type: "data" + size: 8 + active_type: "" + height: 1 + width: 8 +} +layers { + name: "priorbox" + type: "data" + size: 32 + active_type: "" + height: 4 + width: 8 +} +layers { + name: "test_detection_output" + type: "detection_output" + size: 1400 + active_type: "" + inputs { + input_layer_name: "priorbox" + detection_output_conf { + num_classes: 21 + nms_threshold: 0.45 + nms_top_k: 400 + background_id: 0 + input_num: 1 + keep_top_k: 200 + confidence_threshold: 0.01 + } + } + inputs { + input_layer_name: "input_loc" + } + inputs { + input_layer_name: "input_conf" + } +} +input_layer_names: "priorbox" +input_layer_names: "input_loc" +input_layer_names: "input_conf" +output_layer_names: "test_detection_output" +sub_models { + name: "root" + layer_names: "input_loc" + layer_names: "input_conf" + layer_names: "priorbox" + layer_names: "test_detection_output" + input_layer_names: "priorbox" + input_layer_names: "input_loc" + input_layer_names: "input_conf" + output_layer_names: "test_detection_output" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr new file mode 100644 index 0000000000..0ba84dcc6d --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr @@ -0,0 +1,79 @@ +type: "nn" +layers { + name: "input_loc" + type: "data" + size: 16 + active_type: "" + height: 16 + width: 1 +} +layers { + name: "input_conf" + type: "data" + size: 8 + active_type: "" + height: 1 + width: 8 +} +layers { + name: "priorbox" + type: "data" + size: 32 + active_type: "" + height: 4 + width: 8 +} +layers { + name: "label" + type: "data" + size: 24 + active_type: "" + height: 4 + width: 6 +} +layers { + name: "test_multibox_loss" + type: "multibox_loss" + size: 1 + active_type: "" + inputs { + input_layer_name: "priorbox" + multibox_loss_conf { + num_classes: 21 + overlap_threshold: 0.5 + neg_pos_ratio: 3.0 + neg_overlap: 0.5 + background_id: 0 + input_num: 1 + } + } + inputs { + input_layer_name: "label" + } + inputs { + input_layer_name: "input_loc" + } + inputs { + input_layer_name: "input_conf" + } +} +input_layer_names: 
"priorbox" +input_layer_names: "label" +input_layer_names: "input_loc" +input_layer_names: "input_conf" +output_layer_names: "test_multibox_loss" +sub_models { + name: "root" + layer_names: "input_loc" + layer_names: "input_conf" + layer_names: "priorbox" + layer_names: "label" + layer_names: "test_multibox_loss" + input_layer_names: "priorbox" + input_layer_names: "label" + input_layer_names: "input_loc" + input_layer_names: "input_conf" + output_layer_names: "test_multibox_loss" + is_recurrent_layer_group: false +} + From e6fcdd47e5c153497efb54ff5737f4fc6a13596d Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 4 Jul 2017 17:22:43 +0800 Subject: [PATCH 46/79] fix wrong including header-file in files in paddle/platform/dynload dir --- paddle/platform/dynload/cublas.h | 4 ++- paddle/platform/dynload/cudnn.h | 4 ++- paddle/platform/dynload/curand.h | 4 ++- paddle/platform/dynload/dynamic_loader.cc | 41 ++++++++++------------- 4 files changed, 26 insertions(+), 27 deletions(-) diff --git a/paddle/platform/dynload/cublas.h b/paddle/platform/dynload/cublas.h index c9150ac573..258cc88031 100644 --- a/paddle/platform/dynload/cublas.h +++ b/paddle/platform/dynload/cublas.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include -#include "paddle/platform/dynamic_loader.h" +#include +#include +#include "paddle/platform/dynload/dynamic_loader.h" namespace paddle { namespace platform { diff --git a/paddle/platform/dynload/cudnn.h b/paddle/platform/dynload/cudnn.h index c03424b375..0a9562c573 100644 --- a/paddle/platform/dynload/cudnn.h +++ b/paddle/platform/dynload/cudnn.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include -#include "paddle/platform/dynamic_loader.h" +#include +#include +#include "paddle/platform/dynload/dynamic_loader.h" namespace paddle { namespace platform { diff --git a/paddle/platform/dynload/curand.h b/paddle/platform/dynload/curand.h index 1ef7a8c833..9dc0a25c0f 100644 --- a/paddle/platform/dynload/curand.h +++ b/paddle/platform/dynload/curand.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include -#include "paddle/platform/dynamic_loader.h" +#include +#include +#include "paddle/platform/dynload/dynamic_loader.h" namespace paddle { namespace platform { diff --git a/paddle/platform/dynload/dynamic_loader.cc b/paddle/platform/dynload/dynamic_loader.cc index 8ef67bad8c..dd914e006d 100644 --- a/paddle/platform/dynload/dynamic_loader.cc +++ b/paddle/platform/dynload/dynamic_loader.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "dynamic_loader.h" +#include "paddle/platform/dynload/dynamic_loader.h" #include #include #include #include #include "gflags/gflags.h" #include "glog/logging.h" +#include "paddle/framework/enforce.h" DEFINE_string(cudnn_dir, "", "Specify path for loading libcudnn.so. 
For instance, "
@@ -72,13 +73,12 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
   *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
   if (nullptr == *dso_handle) {
     if (dso_path == "libcudnn.dylib") {
-      LOG(FATAL)
-          << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n"  // NOLINT
-          << "For instance, sudo tar -xzf "
-             "cudnn-7.5-osx-x64-v5.0-ga.tgz -C "  // NOLINT
-          << "/usr/local \n sudo chmod a+r "
-             "/usr/local/cuda/include/cudnn.h "  // NOLINT
-          << "/usr/local/cuda/lib/libcudnn*";
+      PADDLE_ENFORCE(false,
+                     "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
+                     "For instance, sudo tar -xzf "
+                     "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
+                     "chmod a+r /usr/local/cuda/include/cudnn.h "
+                     "/usr/local/cuda/lib/libcudnn*");
     }
   }
 }
@@ -106,22 +106,15 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
       GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
     }
   }
-
-  CHECK(nullptr != *dso_handle) << "Failed to find dynamic library: " << dlPath
-                                << " (" << dlerror() << ") \n"
-                                << "Please specify its path correctly using "
-                                   "following ways: \n"
-
-                                << "Method. set environment variable "
-                                   "LD_LIBRARY_PATH on Linux or "
-                                << "DYLD_LIBRARY_PATH on Mac OS. \n"
-                                << "For instance, issue command: export "
-                                   "LD_LIBRARY_PATH=... \n"
-
-                                << "Note: After Mac OS 10.11, using the "
-                                   "DYLD_LIBRARY_PATH is impossible "
-                                << "unless System Integrity Protection (SIP) "
-                                   "is disabled.";
+  PADDLE_ENFORCE(nullptr != *dso_handle,
+                 "Failed to find dynamic library: %s ( %s ) \n Please specify "
+                 "its path correctly using following ways: \n Method. set "
+                 "environment variable LD_LIBRARY_PATH on Linux or "
+                 "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: "
+                 "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
+                 "using the DYLD_LIBRARY_PATH is impossible unless System "
+                 "Integrity Protection (SIP) is disabled.",
+                 dlPath, dlerror());
 }

 void GetCublasDsoHandle(void** dso_handle) {
From 571714159aeb42903fca14d614dcb1e6942b5cc4 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Tue, 4 Jul 2017 20:04:32 +0800
Subject: [PATCH 47/79] add op_desc.proto (#2736)

* add op_desc.proto

In Operator design, we need a proto message to describe an Operator.
Third-party languages such as Python can build this proto message and use
AddOp(const OpDesc& op_desc) of Paddle core to construct an Op in the
Network.
---
 paddle/framework/CMakeLists.txt  |  3 ++
 paddle/framework/op_desc.proto   | 56 ++++++++++++++++++++++++++++++++
 paddle/framework/op_desc_test.cc | 35 ++++++++++++++++++++
 3 files changed, 94 insertions(+)
 create mode 100644 paddle/framework/op_desc.proto
 create mode 100644 paddle/framework/op_desc_test.cc

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index baad38e3c1..a016f57b3e 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -8,3 +8,6 @@ cc_test(enforce_test SRCS enforce_test.cc)
 proto_library(attr_type SRCS attr_type.proto)
 proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
 cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
+
+proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
+cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
diff --git a/paddle/framework/op_desc.proto b/paddle/framework/op_desc.proto
new file mode 100644
index 0000000000..89497f3c16
--- /dev/null
+++ b/paddle/framework/op_desc.proto
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax="proto2";
+package paddle.framework;
+
+import "attr_type.proto";
+
+// AttrDesc is used to describe Attributes of an Operator. It contains
+// the name, type, and value of an Attribute.
+//
+// e.g., for scale=3.0: name=scale, type=AttrType.FLOAT, value=3.0
+message AttrDesc {
+  required string name = 1;
+  required AttrType type = 2;
+  optional int32 i = 3;
+  optional float f = 4;
+  optional string s = 5;
+  repeated int32 ints = 6;
+  repeated float floats = 7;
+  repeated string strings = 8;
+};
+
+// Protocol Message to describe an Operator.
+//
+// In PaddlePaddle, Operator is used to do a certain computation such
+// as "add", "sub", "cosine", etc.
+// (1) Operator needs to know the input and output variable names.
+// (2) Some ops may have special attributes such as "scale" in "CosineOp".
+//
+// A third-party language can build this proto message and call
+// AddOp(const OpDesc& op_desc) of Paddle core to create an Operator.
+message OpDesc {
+  // input names of this Operator.
+  repeated string inputs = 1;
+
+  // output names of this Operator.
+  repeated string outputs = 2;
+
+  // type of this Operator, such as "add", "sub", "fc".
+  required string type = 3;
+
+  // Attributes of this Operator. e.g., scale=3.0 in cosine op.
+  repeated AttrDesc attrs = 4;
+};
\ No newline at end of file
diff --git a/paddle/framework/op_desc_test.cc b/paddle/framework/op_desc_test.cc
new file mode 100644
index 0000000000..d0c52523b6
--- /dev/null
+++ b/paddle/framework/op_desc_test.cc
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/framework/op_desc.pb.h>
+
+TEST(OpDesc, Create) {
+  paddle::framework::OpDesc op_desc;
+  op_desc.set_type("add");
+  op_desc.add_inputs("X");
+  op_desc.add_inputs("Y");
+  op_desc.add_outputs("Z");
+
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_f(3.14);
+
+  // required field name is not set, so IsInitialized should be false.
+  ASSERT_FALSE(op_desc.IsInitialized());
+
+  attr->set_name("add");
+  // after all required fields are set, IsInitialized should be true now.
+  ASSERT_TRUE(op_desc.IsInitialized());
+}
\ No newline at end of file
From 1ecddd8174fea793e70071163b7e47a750064499 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Tue, 4 Jul 2017 21:21:02 +0800
Subject: [PATCH 48/79] Remove buggy BarrierStat

The implementation of BarrierStat is buggy, and it is not necessary for
Paddle to diagnose which node in the cluster is slow.
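The op_desc patch above anticipates exactly this flow: Python builds the message and hands it to AddOp. A minimal sketch of the Python side, mirroring op_desc_test.cc — the module names op_desc_pb2 and attr_type_pb2 are assumptions about what protoc would generate for these .proto files.

# Hypothetical sketch: build an OpDesc from Python, mirroring op_desc_test.cc.
# Module names are assumptions; use whatever protoc emits for
# op_desc.proto and attr_type.proto.
import attr_type_pb2
import op_desc_pb2

op_desc = op_desc_pb2.OpDesc()
op_desc.type = "add"
op_desc.inputs.extend(["X", "Y"])
op_desc.outputs.append("Z")

attr = op_desc.attrs.add()
attr.type = attr_type_pb2.FLOAT  # AttrType enum value from attr_type.proto
attr.f = 3.14

# 'name' is a required proto2 field, so the message is incomplete until set.
assert not op_desc.IsInitialized()
attr.name = "scale"
assert op_desc.IsInitialized()

# The serialized bytes are what AddOp(const OpDesc&) would consume.
payload = op_desc.SerializeToString()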
--- paddle/parameter/tests/test_common.cpp | 50 --- paddle/pserver/ParameterServer2.cpp | 215 ------------- paddle/pserver/ParameterServer2.h | 49 --- paddle/utils/BarrierStat.cpp | 340 -------------------- paddle/utils/BarrierStat.h | 425 ------------------------- paddle/utils/Stat.cpp | 61 ---- paddle/utils/Stat.h | 17 - 7 files changed, 1157 deletions(-) delete mode 100644 paddle/utils/BarrierStat.cpp delete mode 100644 paddle/utils/BarrierStat.h diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp index 8bab5a6289..64d204aea1 100644 --- a/paddle/parameter/tests/test_common.cpp +++ b/paddle/parameter/tests/test_common.cpp @@ -172,53 +172,3 @@ TEST_F(CommonTest, syncThreadPool) { EXPECT_EQ((int)0, nums[i]); } } - -TEST_F(CommonTest, barrierStat) { - const int threadNum = 10; - - SyncThreadPool pool(threadNum); - -#define TEST_BARRIER_RANDOM(statName, numConnThreads, ...) \ - pool.exec([&](int tid, size_t numThreads) { \ - struct timeval time; \ - gettimeofday(&time, nullptr); \ - uint64_t usec = timeToMicroSecond(time); \ - std::srand(usec); \ - auto value = std::rand() % 100000; \ - usleep(value); \ - REGISTER_SLOW_NODES_PROBE( \ - globalStat, statName, numConnThreads, tid, __VA_ARGS__); \ - }); - - for (auto i = 0; i < 10; i++) { - TEST_BARRIER_RANDOM("synThreadBarrier1", threadNum); - TEST_BARRIER_RANDOM("synThreadBarrier2", threadNum); - } - - globalStat.printAllStatus(); - globalStat.reset(); - - for (auto i = 0; i < 10; i++) { - TEST_BARRIER_RANDOM("synThreadBarrier3", threadNum, "tag0"); - TEST_BARRIER_RANDOM("synThreadBarrier4", threadNum, "tag1"); - } - - globalStat.printAllStatus(); - globalStat.reset(); - -// use it to test accurate barrier gap -#define TEST_BARRIER(statName, numConnThreads, ...) \ - pool.exec([&](int tid, size_t numThreads) { \ - usleep(tid * 10000); \ - REGISTER_SLOW_NODES_PROBE( \ - globalStat, statName, numConnThreads, tid, __VA_ARGS__); \ - }); - - for (auto i = 0; i < 10; i++) { - TEST_BARRIER("synThreadBarrier3", threadNum, "tag0"); - TEST_BARRIER("synThreadBarrier4", threadNum, "tag1"); - } - - globalStat.printAllStatus(); - globalStat.reset(); -} diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp index 41ac15336d..d7c1d4f788 100644 --- a/paddle/pserver/ParameterServer2.cpp +++ b/paddle/pserver/ParameterServer2.cpp @@ -217,10 +217,6 @@ void ParameterServer2::setConfig(const SetConfigRequest& request, SetConfigResponse response; callback(response); - - /// always defined, barrier slowest node function need it. - statSet_.reset(new StatSet("ParameterServer" + - str::to_string(static_cast(serverId_)))); } real bufferSum(const std::vector& buffers) { @@ -369,50 +365,7 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, std::vector* outputBuffers) { VLOG(1) << "pserver: addGradient"; - // forwardbackward delta from all trainers - // indicate the fluctuation caused by forwardbackward. - if (!numPassFinishClients_) { - REGISTER_BARRIER_DELTA_SERVER_SET( - *statSet_, - "forwardbackwardDelta", - FLAGS_num_gradient_servers, - request.trainer_id(), - request.forwardbackward_time(), - isSparseServer_ ? 
"_sparseUpdater" : "_denseUpdater"); - } - { - /// approximately pure network overhead - REGISTER_TIMER_DYNAMIC_SET( - "pushRecv", timeToMicroSecond(*handleRequestBegin_), -1, *statSet_); - } - -#ifndef PADDLE_DISABLE_TIMER - gettimeofday(&(*addGradBegin_), nullptr); -#endif - - /// barrier fluctuation caused by network and previous forwardbackward - if (!numPassFinishClients_) { - REGISTER_BARRIER_TIMER_SERVER_SET( - *statSet_, - "handleReqBegin", - FLAGS_num_gradient_servers, - request.trainer_id(), - (*handleRequestBegin_), - isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); - } - - if (!numPassFinishClients_) { - REGISTER_BARRIER_TIMER_SERVER( - *statSet_, - "addGradBegin", - FLAGS_num_gradient_servers, - request.trainer_id(), - isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); - } - - { - REGISTER_TIMER_DYNAMIC("addGradCore", -1, *statSet_); ReadLockGuard guard(parameterMutex_); int bufferIndex = 0; for (const auto& block : request.blocks()) { @@ -444,15 +397,6 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, std::lock_guard guard(*info.lock); simd::addTo(gradientSumBuffer, gradientBuffer, size); } - - if (!numPassFinishClients_) { - REGISTER_BARRIER_TIMER_SERVER( - *statSet_, - "addGradCoreFinish", - FLAGS_num_gradient_servers, - request.trainer_id(), - isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); - } } if (request.batch_status() == BATCH_FINISH || request.batch_status() == BATCH_START_AND_FINISH) { @@ -461,47 +405,12 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, VLOG(1) << "num samples: " << numSamplesProcessed_ << ", new cost:" << cost_; - /// numPassFinishClients_ means some trainer has entered finishPass - if (!numPassFinishClients_) { - REGISTER_SLOW_NODES_PROBE( - *statSet_, - "SLOW_NODES", - FLAGS_num_gradient_servers, - request.trainer_id(), - isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); - } - /// notify doOperation gradient ready gradientReadyBarrier_.wait(); - /// if wait pass finish does not start, do check - if (!numPassFinishClients_) { - CHECK_BARRIER_TIMER(*statSet_, - "SLOW_NODES", - FLAGS_num_gradient_servers, - isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); - } - - /// barrier performance while all parameter add is finished - /// can indicate the fluctation caused by computation at pserver. - if (!numPassFinishClients_) { - REGISTER_BARRIER_TIMER_SERVER( - *statSet_, - "paraReady", - FLAGS_num_gradient_servers, - request.trainer_id(), - isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); - } /// wait doOperation finish parameterReadyBarrier_.wait(); VLOG(1) << "start send back"; - { - /// total time except overhead of network. 
- REGISTER_TIMER_DYNAMIC_SET("sendParaNoRecvNoSend", - timeToMicroSecond(*addGradBegin_), - -1, - *statSet_); - } } } @@ -543,57 +452,6 @@ bool ParameterServer2::asyncGrdientCommitCheckAndStat( return commitGradient; } -void ParameterServer2::printAsyncGradientCommitStatAndReset() { - std::stringstream statFormat; - if (asyncUpdateSteps_) { - statFormat << "async discard gradients stat: " << std::endl; - statFormat << "serverId: " << serverId_ - << " serverType: " << isSparseServer_ - << " total updates: " << asyncUpdateSteps_ - << " discard updates: " << asyncLaggedGradientsNum_ - << " discard ratio: " - << (real)asyncLaggedGradientsNum_ / (real)asyncUpdateSteps_; - statFormat << std::endl; - statFormat << std::endl; - - statFormat << "Async Gradient Update Steps distribution: " << std::endl - << "Sample: 1:1912(0.00284449) means " - << "the updates step=1 count 1912 times " - << "and account for 0.284449% of total updates" << std::endl; - size_t index = 0; - for (const auto& stat : asyncUpdateStat_) { - statFormat << index << ":" << stat << "(" - << (real)stat / (real)asyncUpdateSteps_ << ") "; - index++; - } - statFormat << std::endl; - statFormat << std::endl; - - statFormat << "Async Gradient Discard based on trainer_id: " << std::endl - << "Sample: 2:22(0.0016363) means " - << "total discarded updates from trainer_id=2 count 22 " - << "and account for 0.16363% of all updates from trainer_id=2" - << std::endl; - for (auto i = 0; i < FLAGS_num_gradient_servers; i++) { - real ratio = - (real)asyncTrainerDiscardStat_[i] / - (real)(asyncTrainerCommitStat_[i] + asyncTrainerDiscardStat_[i]); - statFormat << i << ":" << asyncTrainerDiscardStat_[i] << "(" << ratio - << ")" - << " "; - } - LOG(INFO) << statFormat.str(); - - /// reset stat - asyncUpdateSteps_ = 0; - asyncTrainerSteps_.assign(asyncTrainerSteps_.size(), 0); - asyncLaggedGradientsNum_ = 0; - asyncUpdateStat_.assign(asyncUpdateStat_.size(), 0); - asyncTrainerDiscardStat_.assign(asyncTrainerDiscardStat_.size(), 0); - asyncTrainerCommitStat_.assign(asyncTrainerCommitStat_.size(), 0); - } -} - static ThreadLocal> localBlockBitset_; void ParameterServer2::asyncSGD(const SendParameterRequest& request, @@ -695,7 +553,6 @@ void ParameterServer2::asyncSGD(const SendParameterRequest& request, if (request.trainer_id() == 0) { /// batchId_ is approximately equal to "real batchId_" batchId_++; - tuningAsyncsgdMidOutput(); } } @@ -881,34 +738,6 @@ void ParameterServer2::sendParameter(const SendParameterRequest& request, } (*requestVec_).clear(); (*callbackVec_).clear(); - - /// barrier perfromance while all data are send finished. - /// indicates network flucatuation for big message. - if (!numPassFinishClients_) { - REGISTER_BARRIER_TIMER_SERVER( - *statSet_, - "sendParamFinish", - FLAGS_num_gradient_servers, - request.trainer_id(), - isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); - } - /// all time exhausted in parameterServer for big message. - /// it contains network and computation at pserver. - { - /// total time including overhead of network. - REGISTER_TIMER_DYNAMIC_SET("sendParaTotal", - timeToMicroSecond(*handleRequestBegin_), - -1, - *statSet_); - } - /// all time exhausted in pserverServer except recieve network. 
- { - /// total time except overhead of network receive - REGISTER_TIMER_DYNAMIC_SET("sendParaNoRecv", - timeToMicroSecond(*addGradBegin_), - -1, - *statSet_); - } } break; case PSERVER_UPDATE_MODE_SET_PARAM: @@ -1088,8 +917,6 @@ void ParameterServer2::op_SGD(const Operation& operation, } { - REGISTER_TIMER_DYNAMIC("op_SGD", -1, *statSet_); - parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { BlockInfo& info = blockInfos_[blockId]; const ParameterConfig& config = getParameterConfig(blockId); @@ -1113,7 +940,6 @@ void ParameterServer2::op_SGD(const Operation& operation, } batchId_++; - tuningSgdMidOutput(); } void ParameterServer2::op_start_pass(const Operation& operation, @@ -1146,8 +972,6 @@ void ParameterServer2::op_finish_pass(const Operation& operation, /// finish pass info.optimizer->finishPass(); }); - - tuningSgdFinished(); batchId_ = 0; } @@ -1515,7 +1339,6 @@ void ParameterServer2::asyncFinishPass(const SynchronizeRequest& request, callback(SynchronizeResponse()); if (request.trainer_id() == 0) { - tuningAsyncsgdFinished(); batchId_ = 0; } } @@ -1574,42 +1397,4 @@ void ParameterServer2::releaseMatrix(const ReleaseMatrixRequest& request, callback(response); } -void ParameterServer2::tuningSgdMidOutput() { - if (batchId_ && batchId_ % FLAGS_log_period_server == 0) { - LOG(INFO) << "======== Batch=" << batchId_ << "======="; - statSet_->setThreadInfo(true); - statSet_->printAllStatus(); - /// not reset raw data for reducing the overhead of performance tuning - statSet_->reset(false); - } -} - -void ParameterServer2::tuningSgdFinished() { - LOG(INFO) << "======== Batch=" << batchId_ << " pass END" - << "======="; - statSet_->setThreadInfo(true); - statSet_->printAllStatus(); - /** - * reset raw data at end of pass since some raw data could be not - * complete. Otherwise the raw data will pollute next pass performance - * tuning - */ - statSet_->reset(); -} - -void ParameterServer2::tuningAsyncsgdMidOutput() { -#ifndef PADDLE_DISABLE_TIMER - if (batchId_ && batchId_ % FLAGS_log_period_server == 0) { - LOG(INFO) << "======== [not accurate] Batch=" << batchId_ << "======="; - printAsyncGradientCommitStatAndReset(); - } -#endif -} - -void ParameterServer2::tuningAsyncsgdFinished() { - LOG(INFO) << "======== [not accurate] Batch=" << batchId_ << " pass END" - << "======="; - printAsyncGradientCommitStatAndReset(); -} - } // namespace paddle diff --git a/paddle/pserver/ParameterServer2.h b/paddle/pserver/ParameterServer2.h index 0f5a589590..f7d3587b88 100644 --- a/paddle/pserver/ParameterServer2.h +++ b/paddle/pserver/ParameterServer2.h @@ -298,24 +298,6 @@ protected: /// barrier performance tuning sync-sgd required std::atomic batchId_; - /// the beginning of addGradient without network overhead - ThreadLocal addGradBegin_; - - /** - * tuning barrier performance - * to better control log for sparse and dense parameter, - * we use different log entities for different parameterServer - * objects. - * it will output lots of performance stats to perceive the - * overhead of network, fluctuation of computation from - * forwardbackward and network, computation from optimization - * at pserver end, barrier overhead, etc. 
to understand tuning - * data, focus on the synchronization between addGradient and - * doOperation which indirectly call op_SGD operation controlled - * by remote updater controller - */ - std::unique_ptr statSet_; - public: struct Buffer { real* base; @@ -325,7 +307,6 @@ public: protected: /// async gradient commit control bool asyncGrdientCommitCheckAndStat(const SendParameterRequest& request); - void printAsyncGradientCommitStatAndReset(); public: /// disable default parameter for overloading @@ -710,36 +691,6 @@ public: void op_load(const Operation& operation, OperationResult* result); void op_save(const Operation& operation, OperationResult* result); - - /** - * @brief output log in at the middle stage of training - * - * @note flush log histroy and state at the end for sgd - */ - void tuningSgdMidOutput(); - - /** - * @brief output log in at the end stage of training - * - * @note flush log histroy and state at the end for sgd. it will also - * flush some stateful stat for next pass. - */ - void tuningSgdFinished(); - - /** - * @brief output log in at the middle stage of training - * - * @note flush log histroy and state at the end for async-sgd. - * it will log some performance log if some lagged node are found - */ - void tuningAsyncsgdMidOutput(); - - /** - * @brief output log in at the end stage of training - * - * @note flush log histroy and state at the end for async-sgd. - */ - void tuningAsyncsgdFinished(); }; } // namespace paddle diff --git a/paddle/utils/BarrierStat.cpp b/paddle/utils/BarrierStat.cpp deleted file mode 100644 index a6dbdcae3f..0000000000 --- a/paddle/utils/BarrierStat.cpp +++ /dev/null @@ -1,340 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/utils/BarrierStat.h" -#include -#include -#include -#include -#include "paddle/utils/Flags.h" -#include "paddle/utils/Stat.h" - -DEFINE_bool(log_barrier_abstract, - true, - "if true, show abstract of barrier performance"); -DEFINE_int32(log_barrier_lowest_nodes, - 5, - "how many lowest node will be logged"); -DEFINE_bool(log_barrier_show_log, - false, // for performance tuning insight - "if true, always show barrier abstract even with little gap"); - -namespace paddle { - -std::ostream &operator<<(std::ostream &output, const BarrierStatBase &stat) { - if (FLAGS_log_barrier_abstract) { - std::lock_guard guard(stat.lock_); - stat.showAbstract(output); - } - return output; -} - -BarrierStatBase::BarrierStatBase(uint16_t numConnThreads, - const std::string &name) - : totSamples_(0), numConnThreads_(numConnThreads), name_(name) { - abstract_.resize(numConnThreads_); - if (FLAGS_log_barrier_show_log) { - rateThreshold_ = 0.0; - } else { - /* probablity of abnormal node - * p = 1/n + (n/8)/(n+1), n = nodes, n > 1 - * if the freq of lowest trainerId larger than p, - * output FLAGS_log_barrier_lowest_nodes lastTrainerId. 
- * numConnThreads_ indicates nodes - */ - float n = (float)numConnThreads; - rateThreshold_ = 1.0 / n + (n / 8.0) / (n + 1.0); - } -} - -BarrierEndStat::BarrierEndStat(uint16_t numConnThreads, const std::string &name) - : BarrierStatBase(numConnThreads, name) { - timeVector_.reset(new TimeVectorEnd(numConnThreads_)); - reset(true); - LOG(INFO) << " create barrierEndStat: " << name - << " endBarrier warning rate: " << rateThreshold_; -} - -/* - * Note: - * the design different pserver entity owns different statSet to obey - * the background that different pserver runs separately. - */ -void BarrierEndStat::updateStat(struct timeval &cur, int32_t trainerId) { - CHECK_LT(trainerId, numConnThreads_) << "trainerId is invalid in barrier"; - - std::lock_guard guard(lock_); - timeVector_->addTimeval(cur, trainerId); - - if (timeVector_->full()) { - std::lock_guard abstractGuard(abstractLock_); - auto id = timeVector_->getLastTrainerId(); - auto delta = timeToMicroSecond(timeVector_->getDelta()); - auto secondDelta = timeToMicroSecond(timeVector_->get1NDelta()); - auto lastTwoDelta = timeToMicroSecond(timeVector_->getMinus1NDelta()); - auto midDelta = timeToMicroSecond(timeVector_->getMidNDelta()); - // discard first sample, since first sample probably is abnormal. - if (totSamples_) { - abstract_[id].freq++; - - if (delta < abstract_[id].minDelta) { - abstract_[id].minDelta = delta; - } - if (delta > abstract_[id].maxDelta) { - abstract_[id].maxDelta = delta; - } - abstract_[id].totDelta += delta; - abstract_[id].totSecondDelta += secondDelta; - abstract_[id].totLastTwoDelta += lastTwoDelta; - abstract_[id].totMidDelta += midDelta; - - // update totAbstract_ - totAbstract_.freq++; - if (delta < totAbstract_.minDelta) { - totAbstract_.minDelta = delta; - } - if (delta > totAbstract_.maxDelta) { - totAbstract_.maxDelta = delta; - } - totAbstract_.totDelta += delta; - totAbstract_.totSecondDelta += secondDelta; - totAbstract_.totLastTwoDelta += lastTwoDelta; - totAbstract_.totMidDelta += midDelta; - } - - totSamples_++; - timeVector_->reset(); - } -} - -void BarrierEndStat::reset(bool clearRawData) { - int32_t i = 0; - - totSamples_ = 0; - - std::lock_guard guard(abstractLock_); - - if (clearRawData) { - timeVector_->reset(); - } - - for (auto &abstract : abstract_) { - memset((void *)&abstract, 0, sizeof(abstract)); - abstract.minDelta = UINT64_MAX; - abstract.trainerId = i++; - } - memset((void *)&totAbstract_, 0, sizeof(Abstract)); - totAbstract_.minDelta = UINT64_MAX; -} - -void BarrierEndStat::showAbstract(std::ostream &output) const { - // do not support the case "<=2 pserver" - if (numConnThreads_ <= 2 || !totSamples_) { - return; - } - - // duplicate freq info - std::vector outputAbstract = abstract_; - std::sort(outputAbstract.begin(), - outputAbstract.end(), - [](const struct Abstract &a, const struct Abstract &b) { - return a.freq > b.freq; - }); - - auto rate = (float)outputAbstract[0].freq / (float)totSamples_; - if (rate < rateThreshold_) { - return; - } - - output << std::setw(20) << name_ << std::endl; - - /* - * Note: - * avgGap: the average delta between 1 -- n arriving trainers - * avgSecondGap: the average delta between 2 -- n arriving trainers - * avgLastTwoGap: the average delta between n-1 -- n arriving trainers - * avgMidGap: the average delta between n/2 -- n arriving trainers - * rato: samples / totSamples - * - * the stat is based on per trainer if trainer_id is set, totAbstract is - * stat based on all trainers scope. 
- */ - output << std::setw(42) << " " << std::setw(15) << "trainerId" - << std::setw(15) << "avgGap" << std::setw(15) << "avgSecondGap" - << std::setw(15) << "avgLastTwoGap" << std::setw(15) << "avgMidGap" - << std::setw(10) << "rate" << std::setw(10) << "samples" - << std::setw(10) << "totSamples" << std::endl; - // show totAbstract, it's valuable when lastTrainerId is even-distributed' - if (!totAbstract_.freq) return; - output << std::setw(42) << " " << std::setw(15) << "totAbstract" - << std::setw(15) << (totAbstract_.totDelta / totAbstract_.freq) * 0.001 - << std::setw(15) - << (totAbstract_.totSecondDelta / totAbstract_.freq) * 0.001 - << std::setw(15) - << (totAbstract_.totLastTwoDelta / totAbstract_.freq) * 0.001 - << std::setw(15) - << (totAbstract_.totMidDelta / totAbstract_.freq) * 0.001 - << std::setw(10) << (float)totAbstract_.freq / (float)totSamples_ - << std::setw(10) << (float)totAbstract_.freq << std::setw(10) - << (float)totSamples_ << std::endl; - - // show lastTrainerId abstract - int count = 0; - for (auto &abstract : outputAbstract) { - if (!abstract.freq || count++ >= FLAGS_log_barrier_lowest_nodes) { - break; - } - // output format control - output << std::setw(42) << " " << std::setw(15) << abstract.trainerId - << std::setw(15) << (abstract.totDelta / abstract.freq) * 0.001 - << std::setw(15) << (abstract.totSecondDelta / abstract.freq) * 0.001 - << std::setw(15) - << (abstract.totLastTwoDelta / abstract.freq) * 0.001 - << std::setw(15) << (abstract.totMidDelta / abstract.freq) * 0.001 - << std::setw(10) << (float)abstract.freq / (float)totSamples_ - << std::setw(10) << (float)abstract.freq << std::setw(10) - << (float)totSamples_ << std::endl; - } -} - -BarrierDeltaStat::BarrierDeltaStat(uint16_t numConnThreads, - const std::string &name) - : BarrierStatBase(numConnThreads, name) { - timeVector_.reset(new TimeVectorDelta(numConnThreads_)); - reset(true); - LOG(INFO) << " create barrierDeltaStat: " << name - << " barrierDelta warning rate: " << rateThreshold_; -} - -void BarrierDeltaStat::updateStat(uint64_t delta, int32_t trainerId) { - CHECK_LT(trainerId, numConnThreads_) << "trainerId is invalid in barrier"; - - std::lock_guard guard(lock_); - timeVector_->addTimeval(delta, trainerId); - - if (timeVector_->full()) { - std::lock_guard abstractGuard(abstractLock_); - auto id = timeVector_->getMaxTrainerId(); - auto delta = timeVector_->getDelta(); - // discard first sample, since first sample probably is abnormal. 
- if (totSamples_) { - abstract_[id].freq++; - - if (delta < abstract_[id].minDelta) { - abstract_[id].minDelta = delta; - } - if (delta > abstract_[id].maxDelta) { - abstract_[id].maxDelta = delta; - } - abstract_[id].totDelta += delta; - - // update totAbstract_ - totAbstract_.freq++; - if (delta < totAbstract_.minDelta) { - totAbstract_.minDelta = delta; - } - if (delta > totAbstract_.maxDelta) { - totAbstract_.maxDelta = delta; - } - totAbstract_.totDelta += delta; - } - - totSamples_++; - timeVector_->reset(); - } -} - -void BarrierDeltaStat::reset(bool clearRawData) { - int32_t i = 0; - - totSamples_ = 0; - - std::lock_guard guard(abstractLock_); - - if (clearRawData) { - timeVector_->reset(); - } - - for (auto &abstract : abstract_) { - memset((void *)&abstract, 0, sizeof(abstract)); - abstract.minDelta = UINT64_MAX; - abstract.trainerId = i++; - } - memset((void *)&totAbstract_, 0, sizeof(Abstract)); - totAbstract_.minDelta = UINT64_MAX; -} - -void BarrierDeltaStat::showAbstract(std::ostream &output) const { - // do not support the case "<=2 pserver" - if (numConnThreads_ <= 2 || !totSamples_) { - return; - } - - // duplicate freq info - std::vector outputAbstract = abstract_; - std::sort(outputAbstract.begin(), - outputAbstract.end(), - [](const struct Abstract &a, const struct Abstract &b) { - return a.freq > b.freq; - }); - - auto rate = (float)outputAbstract[0].freq / (float)totSamples_; - if (rate < rateThreshold_) { - return; - } - - output << std::setw(20) << name_ << std::endl; - - /* Note: - * Gap means the delta from all trainers' forwardbackward - * avgGap: average Gap in log_period batches - * minGap: min Gap in log_period batches - * maxGap: max Gap in log_period batches - * trainerId: the slowest trainer_id - * - * the stat is based on per trainer if trainer_id is set, totAbstract is - * stat based on all trainers scope. - */ - output << std::setw(42) << " " << std::setw(15) << "trainerId" - << std::setw(15) << "avgGap" << std::setw(10) << "minGap" - << std::setw(10) << "maxGap" << std::setw(10) << "rate" - << std::setw(10) << "samples" << std::setw(10) << "totSamples" - << std::endl; - // show totAbstract, it's valuable when lastTrainerId is even-distributed' - if (!totAbstract_.freq) return; - output << std::setw(42) << " " << std::setw(15) << "totAbstract" - << std::setw(15) << (totAbstract_.totDelta / totAbstract_.freq) * 0.001 - << std::setw(10) << totAbstract_.minDelta * 0.001 << std::setw(10) - << totAbstract_.maxDelta * 0.001 << std::setw(10) - << (float)totAbstract_.freq / (float)totSamples_ << std::setw(10) - << (float)totAbstract_.freq << std::setw(10) << (float)totSamples_ - << std::endl; - - // show lastTrainerId abstract - int count = 0; - for (auto &abstract : outputAbstract) { - if (!abstract.freq || count++ >= FLAGS_log_barrier_lowest_nodes) { - break; - } - // output format control - output << std::setw(42) << " " << std::setw(15) << abstract.trainerId - << std::setw(15) << (abstract.totDelta / abstract.freq) * 0.001 - << std::setw(10) << abstract.minDelta * 0.001 << std::setw(10) - << abstract.maxDelta * 0.001 << std::setw(10) - << (float)abstract.freq / (float)totSamples_ << std::setw(10) - << (float)abstract.freq << std::setw(10) << (float)totSamples_ - << std::endl; - } -} -} // namespace paddle diff --git a/paddle/utils/BarrierStat.h b/paddle/utils/BarrierStat.h deleted file mode 100644 index a9c925eff6..0000000000 --- a/paddle/utils/BarrierStat.h +++ /dev/null @@ -1,425 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "Locks.h" -#include "Logging.h" -#include "ThreadLocal.h" - -namespace paddle { - -inline uint64_t timeToMicroSecond(struct timeval time) { - return time.tv_sec * 1000000LU + time.tv_usec; -} - -class TimeVectorEnd { - /* - * help class for gathering all barrier performance data - * which shows time point property. - * freqently used in barrier performance tuning API, such - * as tuning which is slowest node in sync-sgd mode training. - */ -public: - explicit TimeVectorEnd(uint16_t size) : size_(size) { - index_ = 0; - timeArray_.resize(size); - trainerIds_.resize(size); - } - ~TimeVectorEnd() {} - - uint16_t size() { return size_; } - - bool full() { return index_ == size_; } - - bool empty() { return index_ == 0; } - - void reset() { index_ = 0; } - - void addTimeval(struct timeval time, int32_t trainerId) { - timeArray_[index_] = time; - trainerIds_[index_] = trainerId; - index_++; - } - - struct timeval getDelta() const { - struct timeval delta; - CHECK_GT(size_, 1) << "not support with 1 pserver"; - timersub(&timeArray_[size_ - 1], &timeArray_[0], &delta); - return delta; - } - - /* 2, n delta */ - struct timeval get1NDelta() const { - CHECK_GT(size_, 2) << "not support with less than 2 pservers"; - struct timeval delta; - timersub(&timeArray_[size_ - 1], &timeArray_[1], &delta); - return delta; - } - - /* n-1, n delta */ - struct timeval getMinus1NDelta() const { - CHECK_GT(size_, 2) << "not support with less than 2 pservers"; - struct timeval delta; - timersub(&timeArray_[size_ - 1], &timeArray_[size_ - 2], &delta); - return delta; - } - - /* n/2, n delta */ - struct timeval getMidNDelta() const { - CHECK_GT(size_, 2) << "not support with less than 2 pservers"; - struct timeval delta; - timersub(&timeArray_[size_ - 1], &timeArray_[size_ / 2], &delta); - return delta; - } - - int32_t getLastTrainerId() const { return trainerIds_[index_ - 1]; } - -private: - uint16_t size_; - uint16_t index_; - std::vector timeArray_; - std::vector trainerIds_; -}; - -class TimeVectorDelta { - /* - * help class for gathering performance data which shows time - * delta property, such as tuning the time distribution of - * forwardBackward time from all cluster nodes. 
- */ -public: - explicit TimeVectorDelta(uint16_t size) - : size_(size), min_(UINT64_MAX), max_(0) { - index_ = 0; - timeArray_.resize(size); - } - ~TimeVectorDelta() {} - - uint16_t size() { return size_; } - - bool full() { return index_ == size_; } - - bool empty() { return index_ == 0; } - - void reset() { - index_ = 0; - min_ = UINT64_MAX; - max_ = 0; - } - - void addTimeval(uint64_t delta, int32_t trainerId) { - timeArray_[index_] = delta; - index_++; - if (delta < min_) { - min_ = delta; - } - if (delta > max_) { - max_ = delta; - maxTrainerId_ = trainerId; - } - } - - uint64_t getDelta() const { - CHECK_GT(size_, 1) << "not support with 1 pserver"; - return max_ - min_; - } - - /* 2, n delta */ - uint64_t get1NDelta() const { - CHECK_GT(size_, 2) << "not support with less than 2 pservers"; - LOG(FATAL) << "Not implemented"; - } - - /* n-1, n delta */ - uint64_t getMinus1NDelta() const { - CHECK_GT(size_, 2) << "not support with less than 2 pservers"; - LOG(FATAL) << "Not implemented"; - } - - /* n/2, n delta */ - uint64_t getMidNDelta() const { - CHECK_GT(size_, 2) << "not support with less than 2 pservers"; - LOG(FATAL) << "Not implemented"; - } - - int32_t getMaxTrainerId() const { return maxTrainerId_; } - -private: - uint16_t size_; - uint16_t index_; - std::vector timeArray_; - -private: - uint64_t min_; - uint64_t max_; - int32_t maxTrainerId_; -}; - -// total samples stats, us -struct Abstract { - // last trainerId for barrier end, maxDelta trainerId for barrier delta - int32_t trainerId; - uint64_t minDelta; - uint64_t maxDelta; - uint64_t totDelta; - // first one is probably itself, so discard it. - uint64_t totSecondDelta; - // to confirm if last node destroy barrier performance. - uint64_t totLastTwoDelta; - // n/2-n delta - uint64_t totMidDelta; - uint64_t freq; -}; - -// barrier performance tunning stats -class BarrierStatBase { -public: - BarrierStatBase(uint16_t numConnThreads, const std::string &name); - - virtual ~BarrierStatBase() {} - - // if called at pserver end, then trainId means trainer's id. - // by default trainer does not use trainerId, so set it to -1 - virtual void updateStat(struct timeval &cur, int32_t trainerId = -1) = 0; - virtual void updateStat(uint64_t delta, int32_t trainerId = -1) = 0; - - const std::string &getName() { return name_; } - - virtual void reset(bool clearRawData = true) {} - // since the timeVector_ is not stateful, so it's not clear whether the - // the barrier delta is correct. if one timestamp was lost, the all data - // from barrier stat becomes rubbish. 
-_- - virtual bool checkPassBarrier() { - LOG(INFO) << "bug implementation found"; - return false; - } - -protected: - virtual void showAbstract(std::ostream &output) const {} - friend std::ostream &operator<<(std::ostream &output, - const BarrierStatBase &stat); - -protected: - mutable std::mutex lock_; - std::mutex abstractLock_; // see note on updaterStat - // each freqency for each barrier trainer - std::vector abstract_; - // it is valuable when do perf-tuining, if lastTrainerId acts uniform - // distribution - struct Abstract totAbstract_; - uint64_t totSamples_; - -protected: - uint16_t numConnThreads_; // total updates needed - float rateThreshold_; - std::string name_; -}; - -// the end-time of arriving real/forged barrier position -class BarrierEndStat : public BarrierStatBase { -public: - BarrierEndStat(uint16_t numConnThreads, const std::string &name); - ~BarrierEndStat() {} - - virtual void updateStat(struct timeval &cur, int32_t trainerId = -1); - virtual void updateStat(uint64_t delta, int32_t trainerId = -1) { - LOG(INFO) << "have no delta updateStat in BarrierEndStat"; - } - virtual void reset(bool clearRawData = true); - virtual bool checkPassBarrier() { return timeVector_->empty(); } - -protected: - /* - * LOG: - * readAllBlocks_denseUpdater - * trainerId avgGap avgSecondGap avgLastTwoGap avgMidGap rate - * 44 86.702 81.022 9.984 50.472 0.144737 - * 46 87.723 82.939 8.737 50.019 0.118421 - * 35 100.923 96.752 14.305 61.979 - * 0.0657895 - * log_barrier_abstract, log_barrier_lowest_nodes, log_barrier_threshold - * control details. - */ - virtual void showAbstract(std::ostream &output) const; - -private: - std::unique_ptr timeVector_; -}; - -// the delta-time from different trainers, -// eg, find the degree of imbalance of BP time at pserver end -// the entry value in timerVector_ is BP delta, do evaluation to BP delta. -class BarrierDeltaStat : public BarrierStatBase { -public: - BarrierDeltaStat(uint16_t numConnThreads, const std::string &name); - ~BarrierDeltaStat() {} - - virtual void updateStat(uint64_t delta, int32_t trainerId = -1); - virtual void updateStat(struct timeval &cur, int32_t trainerId = -1) { - LOG(INFO) << "have no timeval updateStat in BarrierDeltaStat"; - } - - virtual void reset(bool clearRawData = true); - - virtual bool checkPassBarrier() { return timeVector_->empty(); } - -protected: - virtual void showAbstract(std::ostream &outPut) const; - -private: - // store delta time in uint64_t, eg BP time of all trainers - std::unique_ptr timeVector_; -}; - -// to distinguish different contexts for same parallel threads, and different -// threads with same code-sgement, just use tagName to tag the run-time -// position. -// in Sparse, sendParallel threads can not only run in the stage of push&pull -// with same thread group, but also run in the stage of pull&push with different -// thread group, tag will be used to distinguish different run-time barrier -// position. -// trainerId in REGISTER_BARRIER_TIMER_SERVER is used to retreive lowest trainer -// nodes. - -// end barrier -#define __REGISTER_BARRIER_TIMER_SERVER( \ - set, statName, numConnThreads, trainerId, ...) 
\ - do { \ - if (numConnThreads > 2) { \ - std::string internalName = \ - std::string(statName) + std::string(__VA_ARGS__); \ - BarrierStatPtr __stat = \ - (set).getStat(numConnThreads, internalName, BARRIER_END); \ - struct timeval cur; \ - gettimeofday(&cur, nullptr); \ - __stat->updateStat(cur, trainerId); \ - } \ - } while (0); - -// end barrier with user-defined timer -#define __REGISTER_BARRIER_TIMER_SERVER_SET( \ - set, statName, numConnThreads, trainerId, cur, ...) \ - do { \ - if (numConnThreads > 2) { \ - std::string internalName = \ - std::string(statName) + std::string(__VA_ARGS__); \ - BarrierStatPtr __stat = \ - (set).getStat(numConnThreads, internalName, BARRIER_END); \ - __stat->updateStat(cur, trainerId); \ - } \ - } while (0); - -// delta barrier -#define __REGISTER_BARRIER_DELTA_SERVER_SET( \ - set, statName, numConnThreads, trainerId, delta, ...) \ - do { \ - if (numConnThreads > 2) { \ - std::string internalName = \ - std::string(statName) + std::string(__VA_ARGS__); \ - BarrierStatPtr __stat = \ - (set).getStat(numConnThreads, internalName, BARRIER_DELTA); \ - __stat->updateStat(delta, trainerId); \ - } \ - } while (0); - -// check end barrier -#define __CHECK_BARRIER_TIMER(set, statName, numConnThreads, ...) \ - do { \ - std::string internalName = \ - std::string(statName) + std::string(__VA_ARGS__); \ - BarrierStatPtr __stat = \ - (set).getStat(numConnThreads, internalName, BARRIER_END); \ - PCHECK(__stat->checkPassBarrier()) << internalName \ - << ": invalid barrier data"; \ - } while (0); - -/* - * Note: - * with sync-sgd algriothm in cluster mode, lots of synchronize action exsit at - * pserve end. these synchronizaton actions have impact on the efficiency of - * parameter exchange. the synchronizaton(barrier) GAP is composed of lots of - * factors, such as the forwardBackward variance, network fluncation. we try - * to have a quantitative analysis on these factor, so we design lots of barrier - * time to capture these performance. these barrier also can be placed at - * implict barrier position. - * - * example: - * in sync-sgd algorithm, each parameter server waits for all gradients from - * all trainers, thus, an explict barrier point exsit before doing optimization. - * the barrier timer located before the point can sense the barrier condition. - * - */ - -// try to capture which trainer is slowest node in sync-sgd at pserver. -#define REGISTER_SLOW_NODES_PROBE( \ - set, statName, numConnThreads, trainerId, ...) \ - __REGISTER_BARRIER_TIMER_SERVER( \ - (set), statName, numConnThreads, trainerId, __VA_ARGS__) -// try to check if all threads or trainers have passed barriers for data -// accuracy. -#define CHECK_BARRIER_TIMER(set, statName, numConnThreads, ...) \ - __CHECK_BARRIER_TIMER((set), statName, numConnThreads, __VA_ARGS__) - -#ifdef PADDLE_DISABLE_TIMER - -#define REGISTER_BARRIER_TIMER_SERVER( \ - set, statName, numConnThreads, trainerId, ...) -#define REGISTER_BARRIER_TIMER_SERVER_SET( \ - set, statName, numConnThreads, trainerId, cur, ...) -#define REGISTER_BARRIER_DELTA_SERVER_SET( \ - set, statName, numConnThreads, trainerId, cur, ...) - -#else - -/* - * sensing barrier time distribution for all parallelization threads. - * it provides low API for slow node check(REGISTER_SLOW_NODES_PROBE) - */ -#define REGISTER_BARRIER_TIMER_SERVER( \ - set, statName, numConnThreads, trainerId, ...) 
\ - __REGISTER_BARRIER_TIMER_SERVER( \ - (set), statName, numConnThreads, trainerId, __VA_ARGS__) - -/* - * sensing barrier time distribution for all parallelization threads. - * but time point for barrier performance is set by user. - * eg, with this api, you can get implict barrier point such as the beginning - * time distribution - * for receiving data. - */ -#define REGISTER_BARRIER_TIMER_SERVER_SET( \ - set, statName, numConnThreads, trainerId, cur, ...) \ - __REGISTER_BARRIER_TIMER_SERVER_SET( \ - (set), statName, numConnThreads, trainerId, cur, __VA_ARGS__) - -// try to capture time delta from all trainers, such as forwardBackward time -// which implies -// computation fluctuation -#define REGISTER_BARRIER_DELTA_SERVER_SET( \ - set, statName, numConnThreads, trainerId, delta, ...) \ - __REGISTER_BARRIER_DELTA_SERVER_SET( \ - (set), statName, numConnThreads, trainerId, delta, __VA_ARGS__) - -#endif // DISABLE_TIMER -} // namespace paddle diff --git a/paddle/utils/Stat.cpp b/paddle/utils/Stat.cpp index c7194d3bf1..ff1b1bf888 100644 --- a/paddle/utils/Stat.cpp +++ b/paddle/utils/Stat.cpp @@ -97,34 +97,6 @@ std::ostream& operator<<(std::ostream& outPut, const Stat& stat) { return outPut; } -BarrierStatPtr StatSet::getStat(uint16_t numConnThreads, - const std::string& name, - BarrierStatType bType) { - { - ReadLockGuard guard(lock_); - auto it = barrierStatSet_.find(name); - if (it != barrierStatSet_.end()) { - return it->second; - } - } - - std::lock_guard guard(lock_); - // test again with lock_guard - auto it = barrierStatSet_.find(name); - if (it != barrierStatSet_.end()) { - return it->second; - } - - BarrierStatPtr stat; - if (bType == BARRIER_END) { - stat = std::make_shared(numConnThreads, name); - } else if (bType == BARRIER_DELTA) { - stat = std::make_shared(numConnThreads, name); - } - auto ret = barrierStatSet_.insert(std::make_pair(name, stat)); - return ret.first->second; -} - void StatSet::printSegTimerStatus() { ReadLockGuard guard(lock_); LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') @@ -135,46 +107,20 @@ void StatSet::printSegTimerStatus() { } } -void StatSet::printBarrierTimerStatus() { - ReadLockGuard guard(lock_); - if (barrierStatSet_.empty()) { - return; - } - // control barrierAbstact in runtime, so enable compliation - LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') - << "======= BarrierStatSet status ======" << std::endl; - for (auto& stat : barrierStatSet_) { - LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') - << *(stat.second); - } -} - void StatSet::printAllStatus() { #ifndef PADDLE_DISABLE_TIMER printSegTimerStatus(); #endif - printBarrierTimerStatus(); LOG(INFO) << std::setiosflags(std::ios::left) << "--------------------------------------------------" << std::endl; } -void StatSet::printStatus(const std::string& name) { - ReadLockGuard guard(lock_); - auto iter = statSet_.find(name); - CHECK(iter != statSet_.end()) << name << " is not registed in " << name_; - LOG(INFO) << *(iter->second); -} - void StatSet::reset(bool clearRawData) { ReadLockGuard guard(lock_); for (auto& stat : statSet_) { stat.second->reset(); } - // reset barrierStat - for (auto& stat : barrierStatSet_) { - stat.second->reset(clearRawData); - } } void StatSet::setThreadInfo(const std::string& name, bool flag) { @@ -184,13 +130,6 @@ void StatSet::setThreadInfo(const std::string& name, bool flag) { iter->second->setThreadInfo(flag); } -void StatSet::deleteStat(const std::string& name) { - std::lock_guard guard(lock_); - auto iter 
= statSet_.find(name); - CHECK(iter != statSet_.end()) << name << " is not registed in " << name_; - statSet_.erase(iter); -} - StatInfo::~StatInfo() { if (stat_) { std::lock_guard guard(stat_->lock_); diff --git a/paddle/utils/Stat.h b/paddle/utils/Stat.h index d9cc6e413a..79fd3b8cf0 100644 --- a/paddle/utils/Stat.h +++ b/paddle/utils/Stat.h @@ -23,7 +23,6 @@ limitations under the License. */ #include #include -#include "BarrierStat.h" #include "Locks.h" #include "Logging.h" #include "ThreadLocal.h" @@ -60,12 +59,6 @@ public: class Stat; typedef std::shared_ptr StatPtr; -typedef std::shared_ptr BarrierStatPtr; - -enum BarrierStatType { - BARRIER_END = 0, - BARRIER_DELTA = 1, -}; class StatSet { public: @@ -74,11 +67,8 @@ public: // print to LOG(INFO) void printSegTimerStatus(); - void printBarrierTimerStatus(); void printAllStatus(); - void printStatus(const std::string& name); - StatPtr getStat(const std::string& name) { { ReadLockGuard guard(lock_); @@ -93,12 +83,6 @@ public: return ret.first->second; } - BarrierStatPtr getStat(uint16_t numConnThreads, - const std::string& name, - BarrierStatType bType); - - void deleteStat(const std::string& name); - // true for showing stats for each thread // false for showing stats aggragated over threads void setThreadInfo(const std::string& name, bool flag); @@ -120,7 +104,6 @@ public: private: std::unordered_map statSet_; - std::unordered_map barrierStatSet_; const std::string name_; RWLock lock_; }; From 166dfbb085ef4ebbccea190abc436524fb80ed57 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 5 Jul 2017 02:36:10 +0000 Subject: [PATCH 49/79] fix cmake errors --- cmake/generic.cmake | 7 ++++--- go/cmd/master/CMakeLists.txt | 2 +- go/cmd/pserver/CMakeLists.txt | 2 +- go/pserver/optimizer.go | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index cae9524b2f..97196114ff 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -284,7 +284,7 @@ function(go_library TARGET_NAME) add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND rm "${${TARGET_NAME}_LIB_PATH}" # Golang build source code - COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} + COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH} GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} -o "${${TARGET_NAME}_LIB_PATH}" "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}" # must run under GOPATH @@ -300,10 +300,11 @@ function(go_binary TARGET_NAME) string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) add_custom_command(OUTPUT ${TARGET_NAME}_timestamp - COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build + COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH} + GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}" - WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") + WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") # TODO: don't know what ${TARGET_NAME}_link does add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS}) install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin) diff --git a/go/cmd/master/CMakeLists.txt b/go/cmd/master/CMakeLists.txt index 9e149967e7..1058ffa86b 100644 --- a/go/cmd/master/CMakeLists.txt +++ b/go/cmd/master/CMakeLists.txt @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations 
under the License. -go_binary(master SRC master.go) +go_binary(master SRC master.go DEPS paddle_go_optimizer) diff --git a/go/cmd/pserver/CMakeLists.txt b/go/cmd/pserver/CMakeLists.txt index bc1da3348c..51db6dff04 100644 --- a/go/cmd/pserver/CMakeLists.txt +++ b/go/cmd/pserver/CMakeLists.txt @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -go_binary(pserver SRCS pserver.go) +go_binary(pserver SRCS pserver.go DEPS paddle_go_optimizer) diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index d84f55b987..2c9b0d5652 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -2,7 +2,7 @@ package pserver // #cgo CFLAGS: -I ../../ // //FIXME: ldflags contain "build" path -// #cgo LDFLAGS: ../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ -lm +// #cgo LDFLAGS: -lpaddle_go_optimizer -lstdc++ -lm // #include "paddle/optimizer/optimizer.h" // #include // #include From cd437f5072b0482685d107c386e587bc1fe59044 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 5 Jul 2017 05:16:41 +0000 Subject: [PATCH 50/79] fix bugs --- go/pserver/client/c/test/CMakeLists.txt | 4 +++- go/pserver/optimizer.go | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/go/pserver/client/c/test/CMakeLists.txt b/go/pserver/client/c/test/CMakeLists.txt index f287f85071..44bc183738 100644 --- a/go/pserver/client/c/test/CMakeLists.txt +++ b/go/pserver/client/c/test/CMakeLists.txt @@ -1,2 +1,4 @@ -cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient) +# FIXME:It's ugly +#cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer) +cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_optimizer paddle_proto glog gflags protobuf) add_style_check_target(test_cclient test_cclient.c) diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index 2c9b0d5652..93389b93a7 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -1,7 +1,6 @@ package pserver // #cgo CFLAGS: -I ../../ -// //FIXME: ldflags contain "build" path // #cgo LDFLAGS: -lpaddle_go_optimizer -lstdc++ -lm // #include "paddle/optimizer/optimizer.h" // #include From 1409b17e4f20afdd922b8566be324581ed3f0e54 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 5 Jul 2017 06:06:13 +0000 Subject: [PATCH 51/79] add fixme --- cmake/generic.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 97196114ff..74396abdbb 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -281,6 +281,7 @@ function(go_library TARGET_NAME) file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go") string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + # FIXME: link path add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND rm "${${TARGET_NAME}_LIB_PATH}" # Golang build source code @@ -299,6 +300,7 @@ function(go_binary TARGET_NAME) cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + # FIXME: link path add_custom_command(OUTPUT ${TARGET_NAME}_timestamp COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH} GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build From 7364fcd4c3c6b08b569ed2bb809bed9904b55030 Mon Sep 17 00:00:00 2001 From: wuyi05 Date: Wed, 5 Jul 2017 15:42:17 +0800 Subject: [PATCH 52/79] add golang precommit 
--- .pre-commit-config.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4cd8eb12f6..a7c450176d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,3 +21,10 @@ sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29 hooks: - id: clang-formater +- repo: https://github.com/dnephin/pre-commit-golang + sha: e4693a4c282b4fc878eda172a929f7a6508e7d16 + hooks: + - id: go-fmt + - id: go-vet + - id: go-lint + - id: gometalinter From e7b071f33a2af3168586ef2710835b694f61e958 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Wed, 5 Jul 2017 15:55:26 +0800 Subject: [PATCH 53/79] update annotation with comments --- paddle/gserver/layers/AverageLayer.h | 4 ++++ paddle/gserver/layers/MaxLayer.h | 7 +++---- paddle/gserver/layers/SequenceLastInstanceLayer.cpp | 7 +++---- paddle/gserver/layers/SequencePoolLayer.h | 5 +++-- python/paddle/trainer_config_helpers/layers.py | 11 +++++++---- 5 files changed, 20 insertions(+), 14 deletions(-) diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h index 332552a304..db4a17bfb0 100644 --- a/paddle/gserver/layers/AverageLayer.h +++ b/paddle/gserver/layers/AverageLayer.h @@ -25,6 +25,10 @@ namespace paddle { * If SequenceLevel = kNonSeq: * Output: output size is the number of input sequences (NOT input instances) * output[i] = average_{for each instance in this sequence}{input[i]} + * If stride_ > 0: + * Output: a shorten sequence. Stride is the step size by which we slide a + * window upon the input sequence, and the average pooling + * operation is then applied to each interval independently. * If SequenceLevel = kSeq: * Check input sequence must has sub-sequence * Output: output size is the number of input sub-sequences diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h index adf7ab4ae4..fa536fce2b 100644 --- a/paddle/gserver/layers/MaxLayer.h +++ b/paddle/gserver/layers/MaxLayer.h @@ -27,10 +27,9 @@ namespace paddle { * Output: output size is the number of input sequences (NOT input instances) * output[i] = max_{for each instance in this sequence}{input[i]} * If stride_ > 0: - * Output: a shorten sequence. The operation of getting max instance of a - * sequence is independently performed on every slice of the input - * sequence, which is obtained by sliding a window with the window - * size set to stride_. + * Output: a shorten sequence. Stride is the step size by which we slide a + * window upon the input sequence, and the max pooling operation is + * then applied to each interval independently. * If SequenceLevel = kSeq: * Check input sequence must has sub-sequence * Output: output size is the number of input sub-sequences diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp index 8127cbf09c..323cc47df1 100644 --- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp +++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp @@ -26,10 +26,9 @@ namespace paddle { * If SequenceLevel = kNonseq: * Output: a sequence containing only the last instance of the input sequence * If stride_ > 0: - * Output: a shorten sequence. The operation of getting last instance of a - * sequence is independently performed on every slice of the input - * sequence, which is obtained by sliding a window with the window - * size set to stride_. + * Output: a shorten sequence. 
Stride is the step size by which we slide a + * window upon the input sequence, and getting last instance + * operation is then applied to each interval independently. * If SequenceLevel = kSeq: * Check input sequence must has sub-sequence * Output: a sequence containing only the last instance of each sub-sequence diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h index 058627def8..e207afd1dc 100644 --- a/paddle/gserver/layers/SequencePoolLayer.h +++ b/paddle/gserver/layers/SequencePoolLayer.h @@ -28,8 +28,9 @@ namespace paddle { * sequence}{input[i]} * If stride_ > 0: * Check input sequence must not have sub-sequence - * Output: a shorten sequence, pooling is performed upon a small local - * area + * Output: a shorten sequence. Stride is the step size by which we slide + * a window upon the input sequence, and the pooling operation + * is then applied to each interval independently. * If SequenceLevel = kSeq: * Check input sequence must has sub-sequence * Output: output size is the number of input sub-sequences diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 5e8bf4b203..2f52a27e60 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -1097,7 +1097,10 @@ def pooling_layer(input, If stride > 0, this layer slides a window whose size is determined by stride, and return the pooling value of the window as the output. Thus, a long sequence - will be shorten. Note that for sequence with sub-sequence, the default value + will be shorten. + + The parameter stride specifies the intervals at which to apply the pooling + operation. Note that for sequence with sub-sequence, the default value of stride is -1. The example usage is: @@ -1118,7 +1121,7 @@ def pooling_layer(input, :param pooling_type: Type of pooling, MaxPooling(default), AvgPooling, SumPooling, SquareRootNPooling. :type pooling_type: BasePoolingType|None - :param stride: window size. + :param stride: The step size between successive pooling regions. :type stride: Int :param bias_attr: Bias parameter attribute. False if no bias. :type bias_attr: ParameterAttribute|None|False @@ -1408,7 +1411,7 @@ def last_seq(input, :type name: basestring :param input: Input layer name. :type input: LayerOutput - :param stride: window size. + :param stride: The step size between successive pooling regions. :type stride: Int :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. @@ -1464,7 +1467,7 @@ def first_seq(input, :type name: basestring :param input: Input layer name. :type input: LayerOutput - :param stride: window size. + :param stride: The step size between successive pooling regions. :type stride: Int :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. 
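The stride semantics documented in the patch above are identical for max,
average, last- and first-instance pooling: the input sequence is cut into
consecutive intervals of length stride (the tail interval may be shorter),
the pooling operation is applied to each interval independently, and the
results form the shortened output sequence. A minimal sketch of that window
arithmetic, in Go for illustration only (PaddlePaddle implements it in the
C++ layers patched above):

    package main

    import "fmt"

    // seqPoolMax slides a non-overlapping window of size stride over the
    // sequence and takes the max of each interval, so a sequence of length
    // n shrinks to ceil(n/stride) elements.
    func seqPoolMax(seq []float64, stride int) []float64 {
        var out []float64
        for start := 0; start < len(seq); start += stride {
            end := start + stride
            if end > len(seq) {
                end = len(seq) // the tail interval may be shorter
            }
            m := seq[start]
            for _, v := range seq[start+1 : end] {
                if v > m {
                    m = v
                }
            }
            out = append(out, m)
        }
        return out
    }

    func main() {
        fmt.Println(seqPoolMax([]float64{1, 5, 2, 4, 3}, 2)) // [5 4 3]
    }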
From 7ed6463ee91e0b71e7beca313554eae36da1c4e4 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 24 May 2017 13:55:58 +0800 Subject: [PATCH 54/79] fix bugs for CrossChannelNormLayer --- .../gserver/layers/CrossChannelNormLayer.cpp | 32 ++++++++++++++----- paddle/gserver/layers/NormLayer.cpp | 10 ------ paddle/gserver/tests/LayerGradUtil.cpp | 7 +++- paddle/gserver/tests/LayerGradUtil.h | 6 ++++ paddle/gserver/tests/test_LayerGrad.cpp | 5 ++- 5 files changed, 40 insertions(+), 20 deletions(-) diff --git a/paddle/gserver/layers/CrossChannelNormLayer.cpp b/paddle/gserver/layers/CrossChannelNormLayer.cpp index 3fbccc1103..4dfe460561 100644 --- a/paddle/gserver/layers/CrossChannelNormLayer.cpp +++ b/paddle/gserver/layers/CrossChannelNormLayer.cpp @@ -36,6 +36,16 @@ MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data, data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_); } +bool CrossChannelNormLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + CHECK(parameters_[0]); + const NormConfig& conf = config_.inputs(0).norm_conf(); + channels_ = conf.channels(); + scale_.reset(new Weight(channels_, 1, parameters_[0])); + return true; +} + void CrossChannelNormLayer::forward(PassType passType) { Layer::forward(passType); MatrixPtr inV = getInputValue(0); @@ -63,6 +73,7 @@ void CrossChannelNormLayer::forward(PassType passType) { // compute norm. spatialBuffer_->sumCols(*dataTmp, 1, 0); + spatialBuffer_->add(*normTmp); spatialBuffer_->sqrt2(*spatialBuffer_); normTmp->copyFrom(*spatialBuffer_); outVTmp->copyFrom(*inVTmp); @@ -82,6 +93,9 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) { size_t dataDim = inG->getWidth(); size_t spatialDim = dataDim / channels_; + MatrixPtr inGBuffer; + Matrix::resizeOrCreate(inGBuffer, channels_, spatialDim, false, useGpu_); + dataBuffer_->dotMul(*outG, *outV); Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_); Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_); @@ -100,22 +114,24 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) { scaleDiff_->add(*channelBuffer_, 1.); sampleBuffer_->dotMul(*inVTmp, *outGTmp); - spatialBuffer_->sumCols(*sampleBuffer_, 1., 1.); + spatialBuffer_->sumCols(*sampleBuffer_, 1., 0.); // scale the grad - inGTmp->copyFrom(*inVTmp); - inGTmp->mulRowVector(*spatialBuffer_); + inGBuffer->copyFrom(*inVTmp); + inGBuffer->mulRowVector(*spatialBuffer_); // divide by square of norm spatialBuffer_->dotMul(*normTmp, *normTmp); - inGTmp->divRowVector(*spatialBuffer_); + inGBuffer->divRowVector(*spatialBuffer_); // subtract - inGTmp->add(*outGTmp, -1, 1); + inGBuffer->add(*outGTmp, -1, 1); // divide by norm - inGTmp->divRowVector(*normTmp); + inGBuffer->divRowVector(*normTmp); // scale the diff - inGTmp->mulColVector(*scale_->getW()); + inGBuffer->mulColVector(*scale_->getW()); + + inGTmp->add(*inGBuffer); } // updata scale - if (scale_->getWGrad()) scale_->getWGrad()->copyFrom(*scaleDiff_); + if (scale_->getWGrad()) scale_->getWGrad()->add(*scaleDiff_); scale_->getParameterPtr()->incUpdate(callback); } diff --git a/paddle/gserver/layers/NormLayer.cpp b/paddle/gserver/layers/NormLayer.cpp index e094078bfe..caef710092 100644 --- a/paddle/gserver/layers/NormLayer.cpp +++ b/paddle/gserver/layers/NormLayer.cpp @@ -56,14 +56,4 @@ bool ResponseNormLayer::init(const LayerMap& layerMap, return true; } -bool CrossChannelNormLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) 
{ - Layer::init(layerMap, parameterMap); - CHECK(parameters_[0]); - const NormConfig& conf = config_.inputs(0).norm_conf(); - channels_ = conf.channels(); - scale_.reset(new Weight(channels_, 1, parameters_[0])); - return true; -} - } // namespace paddle diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index e3591ba4df..66aafba844 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -465,7 +465,6 @@ void initTestLayer(TestConfig testConf, ParameterConfig paraConfig) { paraConfig.set_name(paraName); paraConfig.set_size(paraSize); - paraConfig.set_initial_std(1); paraConfig.set_is_static(isStatic); auto para = std::make_shared(paraConfig, FLAGS_use_gpu, initialize); @@ -499,6 +498,12 @@ void initTestLayer(TestConfig testConf, paraConfig.add_dims((*layerMap)[input.input_layer_name()]->getSize()); paraConfig.add_dims(testConf.layerConfig.size()); } + if (testConf.hasParamInitialValue) { + paraConfig.set_initial_mean(testConf.paramInitialMean); + paraConfig.set_initial_std(testConf.paramInitialStd); + } else { + paraConfig.set_initial_std(1); + } initParameter(paraName, paraSize, inputDef.isStatic, false, paraConfig); } } diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h index 18a6525a14..5ea7ca0f24 100644 --- a/paddle/gserver/tests/LayerGradUtil.h +++ b/paddle/gserver/tests/LayerGradUtil.h @@ -125,12 +125,18 @@ struct TestConfig { LayerConfig layerConfig; std::vector inputDefs; size_t biasSize; + real paramInitialMean; + real paramInitialStd; + bool hasParamInitialValue; bool testAccumulate; bool testState; bool staticBias; bool testBatchState; TestConfig() : biasSize(0), + paramInitialMean(0), + paramInitialStd(1), + hasParamInitialValue(false), testAccumulate(true), testState(false), staticBias(false), diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 59d1e9273d..6441e08b48 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -1661,6 +1661,9 @@ TEST(Layer, PadLayer) { TEST(Layer, CrossChannelNormLayer) { TestConfig config; + config.hasParamInitialValue = true; + config.paramInitialMean = 1.; + config.paramInitialStd = 0.; config.layerConfig.set_type("norm"); config.layerConfig.set_size(100); LayerInputConfig* input = config.layerConfig.add_inputs(); @@ -1674,7 +1677,7 @@ TEST(Layer, CrossChannelNormLayer) { config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10}); for (auto useGpu : {false, true}) { - testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false, 5); + testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false); } } From 2bf4f1bbc1e4abc9c173b89aeb96c40b404e94f4 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 24 May 2017 14:22:41 +0800 Subject: [PATCH 55/79] make adding eps more clear --- paddle/gserver/layers/CrossChannelNormLayer.cpp | 7 +++---- paddle/gserver/tests/LayerGradUtil.h | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/paddle/gserver/layers/CrossChannelNormLayer.cpp b/paddle/gserver/layers/CrossChannelNormLayer.cpp index 4dfe460561..d72503217f 100644 --- a/paddle/gserver/layers/CrossChannelNormLayer.cpp +++ b/paddle/gserver/layers/CrossChannelNormLayer.cpp @@ -61,9 +61,7 @@ void CrossChannelNormLayer::forward(PassType passType) { Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_); Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_); 
Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_); - normBuffer_->zeroMem(); - // add eps to avoid overflow - normBuffer_->addScalar(*normBuffer_, 1e-6); + inV->square2(*dataBuffer_); for (size_t i = 0; i < batchSize; i++) { const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim); @@ -73,7 +71,8 @@ void CrossChannelNormLayer::forward(PassType passType) { // compute norm. spatialBuffer_->sumCols(*dataTmp, 1, 0); - spatialBuffer_->add(*normTmp); + // add eps to avoid overflow + spatialBuffer_->add(1e-6); spatialBuffer_->sqrt2(*spatialBuffer_); normTmp->copyFrom(*spatialBuffer_); outVTmp->copyFrom(*inVTmp); diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h index 5ea7ca0f24..9dbd202757 100644 --- a/paddle/gserver/tests/LayerGradUtil.h +++ b/paddle/gserver/tests/LayerGradUtil.h @@ -134,8 +134,8 @@ struct TestConfig { bool testBatchState; TestConfig() : biasSize(0), - paramInitialMean(0), - paramInitialStd(1), + paramInitialMean(0.0), + paramInitialStd(1.0), hasParamInitialValue(false), testAccumulate(true), testState(false), From 7c6aa04f6185e92082b9a742d5c746b335406711 Mon Sep 17 00:00:00 2001 From: wuyi05 Date: Wed, 5 Jul 2017 16:24:53 +0800 Subject: [PATCH 56/79] add go pre-commit and travis build --- .pre-commit-config.yaml | 4 ++-- .travis.yml | 4 ++-- go/pserver/service.go | 6 ++++-- paddle/scripts/travis/build_doc.sh | 11 ++++++----- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a7c450176d..61b989dc69 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,6 +25,6 @@ sha: e4693a4c282b4fc878eda172a929f7a6508e7d16 hooks: - id: go-fmt - - id: go-vet + files: (.*\.go) - id: go-lint - - id: gometalinter + files: (.*\.go) diff --git a/.travis.yml b/.travis.yml index 16432dac0c..aafeeba027 100644 --- a/.travis.yml +++ b/.travis.yml @@ -33,7 +33,7 @@ addons: - ccache before_install: - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi - # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python + # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # protobuf version. 
- pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker - pip install rarfile @@ -42,7 +42,7 @@ before_install: function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: - | - export WITH_GOLANG=ON && timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout + timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi; notifications: email: diff --git a/go/pserver/service.go b/go/pserver/service.go index 7711dc027e..ad16a5708d 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -10,8 +10,10 @@ import ( type ElementType int const ( + // AlreadyInitialized is true if pserver is initialized AlreadyInitialized = "pserver already initialized" - Uninitialized = "pserver not fully initialized" + // Uninitialized is true if pserver not fully initialized + Uninitialized = "pserver not fully initialized" ) // Supported element types @@ -55,7 +57,7 @@ func NewService(idx int) (*Service, error) { s := &Service{ idx: idx, } - s.optMap = make(map[string]*optimizer) + s.optMap = make(map[string]*optimizer) s.initialized = make(chan struct{}) return s, nil } diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index a44bd35357..a443851580 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -5,13 +5,14 @@ set -e mkdir -p $TRAVIS_BUILD_DIR/build cd $TRAVIS_BUILD_DIR/build -# Compile Documentation only. -cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF +# Compile paddle binaries first +cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_GOLANG=ON -DWITH_STYLE_CHECK=OFF mkdir output make -j `nproc` find .. -name '*whl' | xargs pip install # install all wheels. rm -rf * +# Compile Documentation only. cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON make -j `nproc` paddle_docs paddle_docs_cn @@ -25,7 +26,7 @@ SSH_REPO=${REPO/https:\/\/github.com\//git@github.com:} SHA=`git rev-parse --verify HEAD` # Documentation branch name -# gh-pages branch is used for PaddlePaddle.org. The English version of +# gh-pages branch is used for PaddlePaddle.org. The English version of # documentation in `doc` directory, and the chinese version in `doc_cn` # directory. TARGET_BRANCH="gh-pages" @@ -51,7 +52,7 @@ function deploy_docs() { # checkout github page branch git checkout $TARGET_BRANCH || git checkout --orphan $TARGET_BRANCH - + mkdir -p ${DIR} # remove old docs. mv new docs. set +e @@ -62,7 +63,7 @@ function deploy_docs() { git add . } -deploy_docs "master" "." +deploy_docs "master" "." deploy_docs "develop" "./develop/" # Check is there anything changed. From 81bfd47eb3fdbf7a0c398f6ad7e62f1d6e7350c1 Mon Sep 17 00:00:00 2001 From: wuyi05 Date: Wed, 5 Jul 2017 16:32:14 +0800 Subject: [PATCH 57/79] add glide in travis --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index aafeeba027..498674469b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -37,6 +37,7 @@ before_install: # protobuf version. 
- pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker - pip install rarfile + - curl https://glide.sh/get | bash - eval "$(GIMME_GO_VERSION=1.8.3 gimme)" - | function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } From 2f085a7bcf11f5501bded27862988022e32299a0 Mon Sep 17 00:00:00 2001 From: wuyi05 Date: Wed, 5 Jul 2017 17:08:19 +0800 Subject: [PATCH 58/79] add go pserver deps --- go/cmd/pserver/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/cmd/pserver/CMakeLists.txt b/go/cmd/pserver/CMakeLists.txt index bc1da3348c..51db6dff04 100644 --- a/go/cmd/pserver/CMakeLists.txt +++ b/go/cmd/pserver/CMakeLists.txt @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -go_binary(pserver SRCS pserver.go) +go_binary(pserver SRCS pserver.go DEPS paddle_go_optimizer) From 5eb8bf0324ba7de923760dc05aa7e850a9ae103f Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 5 Jul 2017 17:23:41 +0800 Subject: [PATCH 59/79] Correct GLOG CHECK in Paddle Use CHECK instead of PCHECK, because PCHECK is used for errno. --- paddle/pserver/LightNetwork.cpp | 28 ++++++++++++++-------------- paddle/pserver/SocketChannel.cpp | 22 +++++++++++----------- paddle/pserver/test/SocketTest.cpp | 28 ++++++++++++++-------------- paddle/trainer/Tester.cpp | 2 +- paddle/utils/ThreadLocal.h | 12 ++++++------ 5 files changed, 46 insertions(+), 46 deletions(-) diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp index 922f25734d..8616fd2d5a 100644 --- a/paddle/pserver/LightNetwork.cpp +++ b/paddle/pserver/LightNetwork.cpp @@ -142,7 +142,7 @@ SocketServer::SocketServer(const std::string &addr, int port, int rdmaCpu) } /// trigger to initialize RDMA lib - PCHECK(RdmaClientDaemons::get()) << "initilizate RDMA failed\n"; + CHECK(RdmaClientDaemons::get()) << "initilizate RDMA failed\n"; } SocketServer::~SocketServer() { @@ -168,7 +168,7 @@ void SocketServer::tcpServer() { /// First call to socket() function socket_ = socket(AF_INET, SOCK_STREAM, 0); - PCHECK(socket_ >= 0) << "ERROR opening socket"; + CHECK(socket_ >= 0) << "ERROR opening socket"; /// Initialize socket structure bzero((char *)&serv_addr, sizeof(serv_addr)); @@ -176,7 +176,7 @@ void SocketServer::tcpServer() { serv_addr.sin_port = htons(port_); if (!addr_.empty()) { server = gethostbyname(addr_.c_str()); - PCHECK(server) << "ERROR, no such host: " << addr_; + CHECK(server) << "ERROR, no such host: " << addr_; bcopy((char *)server->h_addr, (char *)&serv_addr.sin_addr.s_addr, server->h_length); @@ -187,7 +187,7 @@ void SocketServer::tcpServer() { setOption(socket_); /// Now bind the host address using bind() call. 
- PCHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0) + CHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0) << "ERROR on binding " << addr_; /// Now start listening for the clients, here process will @@ -201,7 +201,7 @@ void SocketServer::tcpServer() { if (stopping_) { break; } - PCHECK(newsockfd >= 0) << "ERROR on accept"; + CHECK(newsockfd >= 0) << "ERROR on accept"; constexpr int kPeerNameLen = 128; char peerName[kPeerNameLen]; CHECK(inet_ntop(AF_INET, &cli_addr.sin_addr, peerName, kPeerNameLen)); @@ -227,14 +227,14 @@ void SocketServer::rdmaServer() { /// First call to socket() function rdmaSocket_ = rdma::ssocket(rdmaCpu_); - PCHECK(rdmaSocket_) << "ERROR opening RDMA socket"; + CHECK(rdmaSocket_) << "ERROR opening RDMA socket"; - PCHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0) + CHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0) << "ERROR bind RDMA socket"; /// Now start listening for the clients, here process will /// go in sleep mode and will wait for the incoming connection - PCHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket"; + CHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket"; while (true) { /// Accept actual connection from the client @@ -242,7 +242,7 @@ void SocketServer::rdmaServer() { if (stopping_) { break; } - PCHECK(newsock) << "ERROR on accept"; + CHECK(newsock) << "ERROR on accept"; constexpr int kPeerNameLen = 128; char peerName[kPeerNameLen]; @@ -290,7 +290,7 @@ RdmaClientDaemons::RdmaClientDaemons() { onlineCpus_ = rdma::numCpus(); for (auto i = 0; i < onlineCpus_; i++) { socket = rdma::csocket(i); - PCHECK(socket) << "ERROR open client socket daemon"; + CHECK(socket) << "ERROR open client socket daemon"; rdmaClientSocket_.push_back(socket); } @@ -355,7 +355,7 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) { /// Create a socket point int sockfd = socket(AF_INET, SOCK_STREAM, 0); - PCHECK(sockfd >= 0) << "ERROR opening socket"; + CHECK(sockfd >= 0) << "ERROR opening socket"; #if defined(__OSX__) || defined(__APPLE__) server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet); @@ -396,8 +396,8 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) { } std::this_thread::sleep_for(std::chrono::seconds(1)); } else { - PCHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":" - << serverPort << "errorno: " << errno; + CHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":" + << serverPort << "errorno: " << errno; } } while (errno == ECONNREFUSED); @@ -426,7 +426,7 @@ void SocketClient::RdmaClient(const std::string &serverAddr, int serverPort) { /// connect to server with socket daemon sock = rdma::connect(socketDaemon_, rdmaUri.c_str()); - PCHECK(sock) << "ERROR connect to server" << rdmaUri; + CHECK(sock) << "ERROR connect to server" << rdmaUri; std::vector seg; str::split(rdmaUri, '/', &seg); diff --git a/paddle/pserver/SocketChannel.cpp b/paddle/pserver/SocketChannel.cpp index 0599889164..12e3bc6552 100644 --- a/paddle/pserver/SocketChannel.cpp +++ b/paddle/pserver/SocketChannel.cpp @@ -51,7 +51,7 @@ size_t SocketChannel::read(void* buf, size_t size) { else len = rdma::read(rdmaSocket_, (char*)buf + total, size - total); - PCHECK(len >= 0) << " peer=" << peerName_; + CHECK(len >= 0) << " peer=" << peerName_; if (len <= 0) { return total; } @@ -69,7 +69,7 @@ size_t SocketChannel::write(const void* buf, size_t size) { else len = rdma::write(rdmaSocket_, (char*)buf + total, size - total); - 
PCHECK(len >= 0) << " peer=" << peerName_; + CHECK(len >= 0) << " peer=" << peerName_; if (len <= 0) { return total; } @@ -98,10 +98,10 @@ static size_t readwritev(IOFunc iofunc, while (size < total) { ssize_t len = iofunc(socket, &iovs[curIov], std::min(iovcnt - curIov, maxiovs)); - PCHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov - << " iovCnt=" << iovcnt - << " iovs[curIov].base=" << iovs[curIov].iov_base - << " iovs[curIov].iov_len=" << iovs[curIov].iov_len; + CHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov + << " iovCnt=" << iovcnt + << " iovs[curIov].base=" << iovs[curIov].iov_base + << " iovs[curIov].iov_len=" << iovs[curIov].iov_len; size += len; /// restore iovs[curIov] to the original value @@ -183,7 +183,7 @@ void SocketChannel::writeMessage(const std::vector& userIovs) { header.totalLength += iov.iov_len; } - PCHECK(writev(iovs) == (size_t)header.totalLength); + CHECK(writev(iovs) == (size_t)header.totalLength); } std::unique_ptr SocketChannel::readMessage() { @@ -194,7 +194,7 @@ std::unique_ptr SocketChannel::readMessage() { return nullptr; } - PCHECK(len == sizeof(header)); + CHECK(len == sizeof(header)); std::unique_ptr msgReader(new MsgReader(this, header.numIovs)); @@ -209,7 +209,7 @@ std::unique_ptr SocketChannel::readMessage() { MsgReader::MsgReader(SocketChannel* channel, size_t numBlocks) : channel_(channel), blockLengths_(numBlocks), currentBlockIndex_(0) { size_t size = numBlocks * sizeof(blockLengths_[0]); - PCHECK(channel_->read(&blockLengths_[0], size) == size); + CHECK(channel_->read(&blockLengths_[0], size) == size); } void MsgReader::readBlocks(const std::vector& bufs) { @@ -223,12 +223,12 @@ void MsgReader::readBlocks(const std::vector& bufs) { ++currentBlockIndex_; } - PCHECK(channel_->readv(&iovs) == totalLength); + CHECK(channel_->readv(&iovs) == totalLength); } void MsgReader::readNextBlock(void* buf) { CHECK_LT(currentBlockIndex_, blockLengths_.size()); - PCHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength()); + CHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength()); ++currentBlockIndex_; } diff --git a/paddle/pserver/test/SocketTest.cpp b/paddle/pserver/test/SocketTest.cpp index 066a6c0293..6f6c9e596c 100644 --- a/paddle/pserver/test/SocketTest.cpp +++ b/paddle/pserver/test/SocketTest.cpp @@ -113,7 +113,7 @@ void SocketServer::run() { /* First call to socket() function */ socket_ = socket(AF_INET, SOCK_STREAM, 0); - PCHECK(socket_ >= 0) << "ERROR opening socket"; + CHECK(socket_ >= 0) << "ERROR opening socket"; /* Initialize socket structure */ bzero((char*)&serv_addr, sizeof(serv_addr)); @@ -122,7 +122,7 @@ void SocketServer::run() { serv_addr.sin_port = htons(port_); /* Now bind the host address using bind() call.*/ - PCHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0) + CHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0) << "ERROR on binding"; /* Now start listening for the clients, here process will @@ -134,7 +134,7 @@ void SocketServer::run() { while (true) { /* Accept actual connection from the client */ newsockfd = accept(socket_, (struct sockaddr*)&cli_addr, &clilen); - PCHECK(newsockfd >= 0) << "ERROR on accept"; + CHECK(newsockfd >= 0) << "ERROR on accept"; SocketWorker* worker = new SocketWorker(newsockfd); worker->start(); @@ -146,17 +146,17 @@ void SocketWorker::run() { while (true) { int64_t n = channel_.readAll(&header, sizeof(header)); - PCHECK(n == sizeof(header)) << "ERROR reading from socket"; + CHECK(n == sizeof(header)) 
<< "ERROR reading from socket"; buffer_.resize(header.dataLength); n = channel_.readAll(&buffer_[0], header.dataLength); - PCHECK(n == header.dataLength) << "ERROR reading from socket"; + CHECK(n == header.dataLength) << "ERROR reading from socket"; /* Write a response to the client */ n = channel_.writeAll(&header, sizeof(header)); - PCHECK(n == sizeof(header)) << "ERROR reading from socket"; + CHECK(n == sizeof(header)) << "ERROR reading from socket"; n = channel_.writeAll(buffer_.data(), buffer_.size()); - PCHECK(n == header.dataLength) << "ERROR writing to socket"; + CHECK(n == header.dataLength) << "ERROR writing to socket"; } } @@ -177,9 +177,9 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) { /* Create a socket point */ int sockfd = socket(AF_INET, SOCK_STREAM, 0); - PCHECK(sockfd >= 0) << "ERROR opening socket"; + CHECK(sockfd >= 0) << "ERROR opening socket"; server = gethostbyname(serverAddr.c_str()); - PCHECK(server) << "ERROR, no such host: " << serverAddr; + CHECK(server) << "ERROR, no such host: " << serverAddr; bzero((char*)&serv_addr, sizeof(serv_addr)); serv_addr.sin_family = AF_INET; @@ -189,7 +189,7 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) { serv_addr.sin_port = htons(serverPort); /* Now connect to the server */ - PCHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0) + CHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0) << "ERROR connecting"; channel_.reset(new SocketChannel(sockfd)); @@ -234,18 +234,18 @@ int main(int argc, char** argv) { cpuGrad.copyFrom(gpuGrad); header.dataLength = dataSize; - PCHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header)) + CHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header)) << "Client write header error"; - PCHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize) + CHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize) << "Client write data error"; /* Now read server response */ - PCHECK(channel->readAll(&header, sizeof(header)) == sizeof(header)) + CHECK(channel->readAll(&header, sizeof(header)) == sizeof(header)) << "Client read header error"; CHECK_EQ((uint64_t)header.dataLength, dataSize); - PCHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize) + CHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize) << "Client read data error"; gpuParam.copyFrom(cpuParam); diff --git a/paddle/trainer/Tester.cpp b/paddle/trainer/Tester.cpp index 80664fa877..16e676d602 100644 --- a/paddle/trainer/Tester.cpp +++ b/paddle/trainer/Tester.cpp @@ -175,7 +175,7 @@ real Tester::forwardOneBatch(const DataBatch& dataBatch, } hl_stream_synchronize(HPPL_STREAM_DEFAULT); FILE* fp = fopen(featFile.c_str(), "ab+"); - PCHECK(!ferror(fp)) << "Fail to open " << featFile; + CHECK(!ferror(fp)) << "Fail to open " << featFile; size_t sampleNum = featMatrices[0]->getHeight(); for (size_t i = 0; i < sampleNum; ++i) { diff --git a/paddle/utils/ThreadLocal.h b/paddle/utils/ThreadLocal.h index a4987c9ec2..b5e2862546 100644 --- a/paddle/utils/ThreadLocal.h +++ b/paddle/utils/ThreadLocal.h @@ -51,7 +51,7 @@ template class ThreadLocal { public: ThreadLocal() { - PCHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0); + CHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0); } ~ThreadLocal() { pthread_key_delete(threadSpecificKey_); } @@ -65,7 +65,7 @@ public: if (!p && createLocal) { p = new T(); int ret = pthread_setspecific(threadSpecificKey_, p); - PCHECK(ret == 0); + CHECK(ret == 
0); } return p; } @@ -79,7 +79,7 @@ public: if (T* q = get(false)) { dataDestructor(q); } - PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0); + CHECK(pthread_setspecific(threadSpecificKey_, p) == 0); } /** @@ -112,7 +112,7 @@ private: template class ThreadLocalD { public: - ThreadLocalD() { PCHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); } + ThreadLocalD() { CHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); } ~ThreadLocalD() { pthread_key_delete(threadSpecificKey_); for (auto t : threadMap_) { @@ -127,7 +127,7 @@ public: T* p = (T*)pthread_getspecific(threadSpecificKey_); if (!p) { p = new T(); - PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0); + CHECK(pthread_setspecific(threadSpecificKey_, p) == 0); updateMap(p); } return p; @@ -141,7 +141,7 @@ public: if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) { dataDestructor(q); } - PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0); + CHECK(pthread_setspecific(threadSpecificKey_, p) == 0); updateMap(p); } From 2e302085d7c0a79a8516533b29450a1febc25d79 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 5 Jul 2017 09:49:30 +0000 Subject: [PATCH 60/79] fix bugs --- cmake/generic.cmake | 4 ++-- go/pserver/client/c/CMakeLists.txt | 3 ++- go/pserver/client/c/test/CMakeLists.txt | 4 +--- paddle/api/CMakeLists.txt | 1 + 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 74396abdbb..d51b95a5d7 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -192,9 +192,9 @@ function(cc_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main -lstdc++ -lm) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) - add_test(${TARGET_NAME} ${TARGET_NAME}) + add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() endfunction(cc_test) diff --git a/go/pserver/client/c/CMakeLists.txt b/go/pserver/client/c/CMakeLists.txt index a3fcaeef19..d2ac20e25c 100644 --- a/go/pserver/client/c/CMakeLists.txt +++ b/go/pserver/client/c/CMakeLists.txt @@ -1,5 +1,6 @@ cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf) go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer) if(WITH_TESTING) - add_subdirectory(test) + # TODO: add unit test + #add_subdirectory(test) endif() diff --git a/go/pserver/client/c/test/CMakeLists.txt b/go/pserver/client/c/test/CMakeLists.txt index 44bc183738..dce8645ce7 100644 --- a/go/pserver/client/c/test/CMakeLists.txt +++ b/go/pserver/client/c/test/CMakeLists.txt @@ -1,4 +1,2 @@ -# FIXME:It's ugly -#cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer) -cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_optimizer paddle_proto glog gflags protobuf) +cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer) add_style_check_target(test_cclient test_cclient.c) diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index 39d8aa075b..84da89a142 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -66,6 +66,7 @@ SWIG_LINK_LIBRARIES(swig_paddle paddle_trainer_lib paddle_network paddle_parameter + paddle_optimizer paddle_math paddle_utils paddle_proto From 
204869c2dae9b03b1155be106484ef328e942132 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 5 Jul 2017 10:10:18 +0000 Subject: [PATCH 61/79] fix bugs --- paddle/scripts/docker/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 1ccee686df..ab60f1a38d 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -78,7 +78,7 @@ paddle version # PaddlePaddle. This awkwardness is due to # https://github.com/PaddlePaddle/Paddle/issues/1854. It also # describes a solution. -if [ ${WITH_DOC} == "ON" ]; then +if [[ ${WITH_DOC} == "ON" ]]; then cat < Date: Wed, 5 Jul 2017 18:18:32 +0800 Subject: [PATCH 62/79] fix auto cgo LDFLAGS --- go/pserver/optimizer.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index d84f55b987..54d1082094 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -2,7 +2,7 @@ package pserver // #cgo CFLAGS: -I ../../ // //FIXME: ldflags contain "build" path -// #cgo LDFLAGS: ../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ -lm +// #cgo LDFLAGS: ${SRCDIR}/../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ -lm // #include "paddle/optimizer/optimizer.h" // #include // #include @@ -56,8 +56,8 @@ func newOptimizer(paramWithConfigs ParameterWithConfig) *optimizer { func (o *optimizer) GetWeights() []byte { var buffer unsafe.Pointer - buffer_len := C.paddle_optimizer_get_weights(o.opt, &buffer) - return cArrayToSlice(buffer, int(buffer_len)*C.sizeof_float) + bufferLen := C.paddle_optimizer_get_weights(o.opt, &buffer) + return cArrayToSlice(buffer, int(bufferLen)*C.sizeof_float) } func (o *optimizer) UpdateParameter(g Gradient) error { From c37da0bd3ba14318198bfc6dd8f8ba5e13c1a269 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 5 Jul 2017 18:36:47 +0800 Subject: [PATCH 63/79] Remove hasParamInitialValue flag. 
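The initial mean/std pair in TestConfig is now always applied (the defaults
of 0.0 and 1.0 reproduce the old behavior), with a CHECK_GE guarding against
a negative std. The cross-channel-norm test keeps mean = 1 and std = 0,
which pins the scale parameter to exactly one. A rough sketch of why, in Go
for illustration only (not Paddle code): a normal draw with zero standard
deviation degenerates to the constant mean.

    package main

    import (
        "fmt"
        "math/rand"
    )

    // initParam mimics drawing an n-element parameter from N(mean, std^2).
    func initParam(n int, mean, std float64) []float64 {
        p := make([]float64, n)
        for i := range p {
            p[i] = mean + std*rand.NormFloat64() // std == 0 -> p[i] == mean
        }
        return p
    }

    func main() {
        fmt.Println(initParam(4, 1.0, 0.0)) // [1 1 1 1]
    }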
--- paddle/gserver/tests/LayerGradUtil.cpp | 9 +++------ paddle/gserver/tests/LayerGradUtil.h | 2 -- paddle/gserver/tests/test_LayerGrad.cpp | 1 - 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index 66aafba844..15b8cedeb8 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -498,12 +498,9 @@ void initTestLayer(TestConfig testConf, paraConfig.add_dims((*layerMap)[input.input_layer_name()]->getSize()); paraConfig.add_dims(testConf.layerConfig.size()); } - if (testConf.hasParamInitialValue) { - paraConfig.set_initial_mean(testConf.paramInitialMean); - paraConfig.set_initial_std(testConf.paramInitialStd); - } else { - paraConfig.set_initial_std(1); - } + CHECK_GE(testConf.paramInitialStd, 0); + paraConfig.set_initial_mean(testConf.paramInitialMean); + paraConfig.set_initial_std(testConf.paramInitialStd); initParameter(paraName, paraSize, inputDef.isStatic, false, paraConfig); } } diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h index 9dbd202757..d299b4dd09 100644 --- a/paddle/gserver/tests/LayerGradUtil.h +++ b/paddle/gserver/tests/LayerGradUtil.h @@ -127,7 +127,6 @@ struct TestConfig { size_t biasSize; real paramInitialMean; real paramInitialStd; - bool hasParamInitialValue; bool testAccumulate; bool testState; bool staticBias; @@ -136,7 +135,6 @@ struct TestConfig { : biasSize(0), paramInitialMean(0.0), paramInitialStd(1.0), - hasParamInitialValue(false), testAccumulate(true), testState(false), staticBias(false), diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 6441e08b48..bf0136a10f 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -1661,7 +1661,6 @@ TEST(Layer, PadLayer) { TEST(Layer, CrossChannelNormLayer) { TestConfig config; - config.hasParamInitialValue = true; config.paramInitialMean = 1.; config.paramInitialStd = 0.; config.layerConfig.set_type("norm"); From b68e90be820f7a925e114f76f27156e728fc9e79 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Wed, 5 Jul 2017 21:30:28 +0800 Subject: [PATCH 64/79] fix go test building --- go/pserver/client/c/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/go/pserver/client/c/CMakeLists.txt b/go/pserver/client/c/CMakeLists.txt index a3fcaeef19..34aa7ca5ff 100644 --- a/go/pserver/client/c/CMakeLists.txt +++ b/go/pserver/client/c/CMakeLists.txt @@ -1,4 +1,5 @@ cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf) +target_link_libraries(paddle_go_optimizer stdc++ m) go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer) if(WITH_TESTING) add_subdirectory(test) From 78f1274d6e2c75d0036ae2a7da6cbccfc844b8f0 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Wed, 5 Jul 2017 21:40:12 +0800 Subject: [PATCH 65/79] remove unnessesary cc_test link --- cmake/generic.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index d51b95a5d7..c2962e35ef 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -192,7 +192,7 @@ function(cc_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main -lstdc++ -lm) + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest 
gtest_main) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() @@ -285,7 +285,7 @@ function(go_library TARGET_NAME) add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND rm "${${TARGET_NAME}_LIB_PATH}" # Golang build source code - COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH} GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} + COMMAND GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} -o "${${TARGET_NAME}_LIB_PATH}" "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}" # must run under GOPATH From 4d2a83c750c6168d16a4ee302b0c69e553bd0b34 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Wed, 5 Jul 2017 21:58:46 +0800 Subject: [PATCH 66/79] update again --- go/pserver/client/c/test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/pserver/client/c/test/CMakeLists.txt b/go/pserver/client/c/test/CMakeLists.txt index f287f85071..dce8645ce7 100644 --- a/go/pserver/client/c/test/CMakeLists.txt +++ b/go/pserver/client/c/test/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient) +cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer) add_style_check_target(test_cclient test_cclient.c) From 7848a3fb5c6de5c21a6c1c34a9d12e8e866c760c Mon Sep 17 00:00:00 2001 From: wuyi05 Date: Thu, 6 Jul 2017 09:45:01 +0800 Subject: [PATCH 67/79] remove cclient test --- go/pserver/client/c/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/go/pserver/client/c/CMakeLists.txt b/go/pserver/client/c/CMakeLists.txt index a3fcaeef19..d5c1ed38e5 100644 --- a/go/pserver/client/c/CMakeLists.txt +++ b/go/pserver/client/c/CMakeLists.txt @@ -1,5 +1,7 @@ cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf) go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer) if(WITH_TESTING) - add_subdirectory(test) + # FIXME: this test requires pserver which is not managed by the test + # we need some kind of e2e testing machanism. + # add_subdirectory(test) endif() From d6ecae779a28d51e669a4c029d00ec57a98f2bc8 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 11:25:28 +0800 Subject: [PATCH 68/79] FIX: propagation dependencies and out of date rebuild --- cmake/generic.cmake | 51 ++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index cae9524b2f..87d8caaec4 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -99,15 +99,37 @@ function(merge_static_libs TARGET_NAME) set(libs ${ARGN}) list(REMOVE_DUPLICATES libs) - # First get the file names of the libraries to be merged + # Get all propagation dependencies from the merged libraries foreach(lib ${libs}) + list(APPEND libs_deps ${${lib}_LIB_DEPENDS}) + endforeach() + + # To produce a library we need at least one source file. + # It is created by add_custom_command below and will helps + # also help to track dependencies. + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) + + # Make the generated dummy source file depended on all static input + # libs. If input lib changes,the source file is touched + # which causes the desired effect (relink). 
+ add_custom_command(OUTPUT ${dummyfile} + COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile} + DEPENDS ${libs}) + + # Generate dummy staic lib + file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") + add_library(${TARGET_NAME} STATIC ${dummyfile}) + target_link_libraries(${TARGET_NAME} ${libs_deps}) + + foreach(lib ${libs}) + # Get the file names of the libraries to be merged set(libfiles ${libfiles} $) endforeach() + # Get the file name of the generated library + set(outlibfile "$") + if(APPLE) # Use OSX's libtool to merge archives - set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) - file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") - add_library(${TARGET_NAME} STATIC ${dummyfile}) add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}) @@ -117,7 +139,8 @@ function(merge_static_libs TARGET_NAME) set(objdir ${lib}.objdir) add_custom_command(OUTPUT ${objdir} - COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}) + COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir} + DEPENDS ${lib}) add_custom_command(OUTPUT ${objlistfile} COMMAND ${CMAKE_AR} -x "$" @@ -125,23 +148,9 @@ function(merge_static_libs TARGET_NAME) DEPENDS ${lib} ${objdir} WORKING_DIRECTORY ${objdir}) - # Empty dummy source file that goes into merged library - set(mergebase ${lib}.mergebase.c) - add_custom_command(OUTPUT ${mergebase} - COMMAND ${CMAKE_COMMAND} -E touch ${mergebase} - DEPENDS ${objlistfile}) - - list(APPEND mergebases "${mergebase}") - endforeach() - - # We need a target for the output merged library - add_library(${TARGET_NAME} STATIC ${mergebases}) - set(outlibfile "$") - - foreach(lib ${libs}) add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND ${CMAKE_AR} ru ${outlibfile} @"../${lib}.objlist" - WORKING_DIRECTORY ${lib}.objdir) + COMMAND ${CMAKE_AR} ru ${outlibfile} *.o + WORKING_DIRECTORY ${objdir}) endforeach() add_custom_command(TARGET ${TARGET_NAME} POST_BUILD From 3e4ba647eec7bc16511e1146d5a696cd124c6a27 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 11:28:52 +0800 Subject: [PATCH 69/79] FIX: remove duplicate --- cmake/generic.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 87d8caaec4..1a4600ef4b 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -103,6 +103,7 @@ function(merge_static_libs TARGET_NAME) foreach(lib ${libs}) list(APPEND libs_deps ${${lib}_LIB_DEPENDS}) endforeach() + list(REMOVE_DUPLICATES libs_deps) # To produce a library we need at least one source file. # It is created by add_custom_command below and will helps From 1b366dc2fff2b896fc92c1aa161183e6c88f6b7e Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 6 Jul 2017 14:44:40 +0800 Subject: [PATCH 70/79] Fix CI error on test_LayerGrad.LSTM * We should not EXPECT_EQ between a float value and a int value. Use ASSERT_NEAR instead. 
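The pitfall is generic to floating point, not specific to this test: an
accumulated sum picks up rounding error, so an exact-equality assertion can
fail even when the result is "obviously" right. A standalone illustration
(in Go for brevity; gtest's ASSERT_NEAR performs the same
|a - b| <= tolerance check):

    package main

    import (
        "fmt"
        "math"
    )

    func main() {
        var sum float64
        for i := 0; i < 10; i++ {
            sum += 0.1
        }
        fmt.Println(sum == 1.0)               // false: rounding error accumulates
        fmt.Println(math.Abs(sum-1.0) < 1e-5) // true: tolerance-based comparison
    }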
---
 paddle/gserver/tests/LayerGradUtil.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp
index 15b8cedeb8..9eca58f1a1 100644
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -241,7 +241,7 @@ void testBatchState(LayerPtr testLayer,
   std::vector<Argument> args;
   args.push_back(out);
-  EXPECT_EQ(0, Argument::sum(args)) << "testBatchState failed";
+  ASSERT_NEAR(0, Argument::sum(args), 1e-5) << "testBatchState failed";
   for (size_t seqId = 0; seqId < numSequences; ++seqId) {
     start[seqId] += seqLens[seqId];
   }

From e2ea1f42e9202e5591e2de1ce5f96c573dcc6484 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Thu, 6 Jul 2017 14:12:45 +0800
Subject: [PATCH 71/79] Generate python protobufs for paddle.v2.framework

Python should be able to manipulate Protobuf messages because:

1. Python's `create_op_creation_methods` takes the `OpProto` array to
   generate all `op_creation_methods` at runtime.
2. Each `op_creation_method` will create an `OpDesc`, pass it to the
   Paddle C++ method `CreateOp`, and return the Op handle.

Here is the list of what is added in this commit:

* Add `protobuf_generate_python` if it is not defined.
  * Before cmake 3.4, `protobuf_generate_python` is not defined, so we
    copy the implementation of that function into `protobuf.cmake`.
* Add a `py_proto_compile` function in `cmake/generic.cmake`.
  * It follows bazel's API interface.
  * https://github.com/pubref/rules_protobuf#rules
* Add an empty package named `paddle.v2.framework`; all Python code of
  `paddle::framework` will live in that package.
* Generate the `__init__.py` of the protobuf Python module by `touch`
  while compiling.
* Change setup.py.in so that `paddle.v2.framework.proto` uses the
  generated protobuf Python modules.
---
 cmake/external/protobuf.cmake                 | 59 +++++++++++++++++++
 cmake/generic.cmake                           |  9 +++
 paddle/framework/CMakeLists.txt               |  5 +-
 python/CMakeLists.txt                         |  3 +-
 python/paddle/v2/framework/__init__.py        |  1 +
 .../paddle/v2/framework/tests/CMakeLists.txt  |  1 +
 .../v2/framework/tests/test_protobuf.py       | 26 ++++++++
 python/setup.py.in                            |  9 ++-
 8 files changed, 109 insertions(+), 4 deletions(-)
 create mode 100644 python/paddle/v2/framework/__init__.py
 create mode 100644 python/paddle/v2/framework/tests/CMakeLists.txt
 create mode 100644 python/paddle/v2/framework/tests/test_protobuf.py

diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 3c74944bc2..e629d61585 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -17,6 +17,65 @@ INCLUDE(ExternalProject)
 FIND_PACKAGE(Protobuf QUIET)
 SET(PROTOBUF_FOUND "OFF")
+if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined.
+ function(protobuf_generate_python SRCS) + # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") + return() + endif() + + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + # Create an include path for each file specified + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(ABS_PATH ${ABS_FIL} PATH) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + else() + set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + + if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) + set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") + endif() + + if(DEFINED Protobuf_IMPORT_DIRS) + foreach(DIR ${Protobuf_IMPORT_DIRS}) + get_filename_component(ABS_PATH ${DIR} ABSOLUTE) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + endif() + + set(${SRCS}) + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) + get_filename_component(FIL_DIR ${FIL} DIRECTORY) + if(FIL_DIR) + set(FIL_WE "${FIL_DIR}/${FIL_WE}") + endif() + endif() + + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py" + COMMAND ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} + DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE} + COMMENT "Running Python protocol buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + endfunction() +endif() # Print and set the protobuf library information, # finish this cmake process and exit from this file. diff --git a/cmake/generic.cmake b/cmake/generic.cmake index d51b95a5d7..a92671ae62 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -335,3 +335,12 @@ function(proto_library TARGET_NAME) protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS}) cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf) endfunction() + +function(py_proto_compile TARGET_NAME) + set(oneValueArgs "") + set(multiValueArgs SRCS) + cmake_parse_arguments(py_proto_compile "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(py_srcs) + protobuf_generate_python(py_srcs ${py_proto_compile_SRCS}) + add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs}) +endfunction() \ No newline at end of file diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index dcd70d2851..970b2b9abd 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -9,6 +9,9 @@ cc_test(enforce_test SRCS enforce_test.cc) proto_library(attr_type SRCS attr_type.proto) proto_library(op_proto SRCS op_proto.proto DEPS attr_type) cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf) - proto_library(op_desc SRCS op_desc.proto DEPS attr_type) cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf) +py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto) +# Generate an empty __init__.py to make framework_py_proto as a valid python module. 
+add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) +add_dependencies(framework_py_proto framework_py_proto_init) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 361e764e25..13a1802ee3 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -29,7 +29,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp - DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) + DEPENDS gen_proto_py framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) add_custom_target(paddle_python ALL DEPENDS ${OUTPUT_DIR}/.timestamp) @@ -43,6 +43,7 @@ if (WITH_TESTING) add_subdirectory(paddle/v2/tests) add_subdirectory(paddle/v2/reader/tests) add_subdirectory(paddle/v2/plot/tests) + add_subdirectory(paddle/v2/framework/tests) endif() endif() install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR} diff --git a/python/paddle/v2/framework/__init__.py b/python/paddle/v2/framework/__init__.py new file mode 100644 index 0000000000..c942373c66 --- /dev/null +++ b/python/paddle/v2/framework/__init__.py @@ -0,0 +1 @@ +__all__ = ['proto'] diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt new file mode 100644 index 0000000000..8cb0c5c376 --- /dev/null +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -0,0 +1 @@ +add_python_test(test_framework test_protobuf.py) diff --git a/python/paddle/v2/framework/tests/test_protobuf.py b/python/paddle/v2/framework/tests/test_protobuf.py new file mode 100644 index 0000000000..f0e6019199 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_protobuf.py @@ -0,0 +1,26 @@ +import paddle.v2.framework.proto.op_proto_pb2 +import paddle.v2.framework.proto.attr_type_pb2 +import unittest + + +class TestFrameworkProto(unittest.TestCase): + def test_all(self): + op_proto_lib = paddle.v2.framework.proto.op_proto_pb2 + attr_type_lib = paddle.v2.framework.proto.attr_type_pb2 + op_proto = op_proto_lib.OpProto() + ipt0 = op_proto.inputs.add() + ipt0.name = "a" + ipt0.comment = "the input of cosine op" + ipt1 = op_proto.inputs.add() + ipt1.name = "b" + ipt1.comment = "the other input of cosine op" + opt = op_proto.outputs.add() + opt.name = "output" + opt.comment = "the output of cosine op" + op_proto.comment = "cosine op, output = scale*cos(a, b)" + attr = op_proto.attrs.add() + attr.name = "scale" + attr.comment = "scale of cosine op" + attr.type = attr_type_lib.FLOAT + op_proto.type = "cos" + self.assertTrue(op_proto.IsInitialized()) diff --git a/python/setup.py.in b/python/setup.py.in index dae0166487..78423614a6 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -9,7 +9,9 @@ packages=['paddle', 'paddle.v2.dataset', 'paddle.v2.reader', 'paddle.v2.master', - 'paddle.v2.plot'] + 'paddle.v2.plot', + 'paddle.v2.framework', + 'paddle.v2.framework.proto'] setup_requires=["requests", "numpy", @@ -29,6 +31,9 @@ setup(name='paddle', packages=packages, package_data={'paddle.v2.master': ['${paddle_master_LIB_NAME}'], }, package_dir={ - '': '${CMAKE_CURRENT_SOURCE_DIR}' + '': '${CMAKE_CURRENT_SOURCE_DIR}', + # The paddle.v2.framework.proto will be generated while compiling. + # So that package points to other directory. 
+          'paddle.v2.framework.proto': '${CMAKE_BINARY_DIR}/paddle/framework'
       },
 )

From 847535f4fe6cea0b954a67fffea4c7b9ed96bd77 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 6 Jul 2017 15:42:29 +0800
Subject: [PATCH 72/79] FIX: propagate dependencies under Linux

---
 cmake/generic.cmake | 69 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 39 insertions(+), 30 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 1a4600ef4b..3900ea2604 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -103,38 +103,33 @@ function(merge_static_libs TARGET_NAME)
   foreach(lib ${libs})
     list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
   endforeach()
-  list(REMOVE_DUPLICATES libs_deps)

-  # To produce a library we need at least one source file.
-  # It is created by add_custom_command below and will helps
-  # also help to track dependencies.
-  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
-
-  # Make the generated dummy source file depended on all static input
-  # libs. If input lib changes,the source file is touched
-  # which causes the desired effect (relink).
-  add_custom_command(OUTPUT ${dummyfile}
-    COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile}
-    DEPENDS ${libs})
-
-  # Generate dummy staic lib
-  file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-  add_library(${TARGET_NAME} STATIC ${dummyfile})
-  target_link_libraries(${TARGET_NAME} ${libs_deps})
+  if(APPLE) # Use OSX's libtool to merge archives
+    # To produce a library we need at least one source file.
+    # It is created by add_custom_command below and will
+    # also help to track dependencies.
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)

-  foreach(lib ${libs})
-    # Get the file names of the libraries to be merged
-    set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
-  endforeach()
+    # Make the generated dummy source file depend on all static input
+    # libs. If an input lib changes, the source file is touched,
+    # which causes the desired effect (relink).
+    add_custom_command(OUTPUT ${dummyfile}
+      COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile}
+      DEPENDS ${libs})

-  # Get the file name of the generated library
-  set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
+    # Generate dummy static lib
+    file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+    add_library(${TARGET_NAME} STATIC ${dummyfile})
+    target_link_libraries(${TARGET_NAME} ${libs_deps})

-  if(APPLE) # Use OSX's libtool to merge archives
+    foreach(lib ${libs})
+      # Get the file names of the libraries to be merged
+      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
+    endforeach()
     add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
       COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
       COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
-  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
+  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
     foreach(lib ${libs})
       set(objlistfile ${lib}.objlist) # list of objects in the input library
       set(objdir ${lib}.objdir)
@@ -149,13 +144,27 @@ function(merge_static_libs TARGET_NAME)
       DEPENDS ${lib} ${objdir}
       WORKING_DIRECTORY ${objdir})

-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND ${CMAKE_AR} ru ${outlibfile} *.o
-      WORKING_DIRECTORY ${objdir})
+    # Empty dummy source file that goes into merged library
+    set(mergebase ${lib}.mergebase.c)
+    add_custom_command(OUTPUT ${mergebase}
+      COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
+      DEPENDS ${objlistfile})
+
+    list(APPEND mergebases "${mergebase}")
   endforeach()

-  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-    COMMAND ${CMAKE_RANLIB} ${outlibfile})
+  add_library(${TARGET_NAME} STATIC ${mergebases})
+  target_link_libraries(${TARGET_NAME} ${libs_deps})
+
+  # Get the file name of the generated library
+  set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
+
+  foreach(lib ${libs})
+    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+      COMMAND ${CMAKE_AR} cr ${outlibfile} *.o
+      COMMAND ${CMAKE_RANLIB} ${outlibfile}
+      WORKING_DIRECTORY ${lib}.objdir)
+  endforeach()
 endif()
 endfunction(merge_static_libs)

From 203364281ed8b86c53c520142b881f00aca5485e Mon Sep 17 00:00:00 2001
From: caoying03
Date: Thu, 6 Jul 2017 16:44:54 +0800
Subject: [PATCH 73/79] enable error clipping in FC layer.

---
 python/paddle/trainer/config_parser.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 370529ed97..e020be9378 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 from __future__ import print_function
+import pdb
 '''
 The following functions are available in the config file:
@@ -761,8 +762,8 @@ class DotMulOperator(Operator):

     def check_dims(self):
         for i in range(2):
-            config_assert(self.operator_conf.input_sizes[i] ==
-                          self.operator_conf.output_size,
+            config_assert(self.operator_conf.input_sizes[
+                i] == self.operator_conf.output_size,
                           "DotMul input_size != output_size")

     def calc_output_size(self, input_sizes):
@@ -1193,8 +1194,7 @@ def parse_image(image, input_layer_name, image_conf):
 def parse_norm(norm, input_layer_name, norm_conf):
     norm_conf.norm_type = norm.norm_type
     config_assert(
-        norm.norm_type in
-        ['rnorm', 'cmrnorm-projection', 'cross-channel-norm'],
+        norm.norm_type in ['rnorm', 'cmrnorm-projection', 'cross-channel-norm'],
         "norm-type %s is not in [rnorm, cmrnorm-projection, cross-channel-norm]"
         % norm.norm_type)
     norm_conf.channels = norm.channels
@@ -1571,7 +1571,13 @@ class MultiClassCrossEntropySelfNormCostLayer(LayerBase):

 @config_layer('fc')
 class FCLayer(LayerBase):
-    def __init__(self, name, size, inputs, bias=True, **xargs):
+    def __init__(self,
+                 name,
+                 size,
+                 inputs,
+                 bias=True,
+                 error_clipping_threshold=None,
+                 **xargs):
         super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
@@ -1589,6 +1595,9 @@ class FCLayer(LayerBase):
                 format)
         self.create_bias_parameter(bias, self.config.size)

+        if error_clipping_threshold is not None:
+            self.config.error_clipping_threshold = error_clipping_threshold
+

 @config_layer('selective_fc')
 class SelectiveFCLayer(LayerBase):
@@ -3425,7 +3434,8 @@ DEFAULT_SETTING = dict(

 settings = copy.deepcopy(DEFAULT_SETTING)

-settings_deprecated = dict(usage_ratio=1., )
+settings_deprecated = dict(
+    usage_ratio=1., )

 trainer_settings = dict(
     save_dir="./output/model",

From 075954c17ceaf422478961d9a5d6aaa364458415 Mon Sep 17 00:00:00 2001
From: caoying03
Date: Thu, 6 Jul 2017 17:40:58 +0800
Subject: [PATCH 74/79] follow comment.
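[Editor's note: the diff below moves error_clipping_threshold out of the FCLayer constructor (where the previous patch added it) and out of MixedLayer, and into LayerBase, so that every layer config accepts it through the common constructor arguments.]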
---
 python/paddle/trainer/config_parser.py | 28 +++++++---------------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 1fed6db33c..826ba2834a 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1353,7 +1353,8 @@ class LayerBase(object):
                  device=None,
                  active_type="",
                  drop_rate=0.,
-                 coeff=None):
+                 coeff=None,
+                 error_clipping_threshold=None):
         config_assert('@' not in name,
                       "layer name: %s contain special character @" % name)
         global g_current_submodel
@@ -1387,6 +1388,9 @@ class LayerBase(object):
         elif g_default_device is not None:
             self.config.device = g_default_device

+        if error_clipping_threshold is not None:
+            self.config.error_clipping_threshold = error_clipping_threshold
+
         for input_index in xrange(len(self.inputs)):
             input = self.inputs[input_index]
             input_config = None
@@ -1571,13 +1575,7 @@ class MultiClassCrossEntropySelfNormCostLayer(LayerBase):

 @config_layer('fc')
 class FCLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 size,
-                 inputs,
-                 bias=True,
-                 error_clipping_threshold=None,
-                 **xargs):
+    def __init__(self, name, size, inputs, bias=True, **xargs):
         super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
@@ -1589,9 +1593,6 @@ class FCLayer(LayerBase):
                 format)
         self.create_bias_parameter(bias, self.config.size)

-        if error_clipping_threshold is not None:
-            self.config.error_clipping_threshold = error_clipping_threshold
-

 @config_layer('selective_fc')
 class SelectiveFCLayer(LayerBase):
@@ -2791,13 +2786,7 @@ class TensorLayer(LayerBase):

 @config_layer('mixed')
 class MixedLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 size=0,
-                 bias=True,
-                 error_clipping_threshold=None,
-                 **xargs):
+    def __init__(self, name, inputs, size=0, bias=True, **xargs):
         config_assert(inputs, 'inputs cannot be empty')
         super(MixedLayer, self).__init__(
             name, 'mixed', size, inputs=inputs, **xargs)
@@ -2879,9 +2868,6 @@ class MixedLayer(LayerBase):
             self.config.bias_size = psize
             self.create_bias_parameter(bias, psize)

-        if error_clipping_threshold is not None:
-            self.config.error_clipping_threshold = error_clipping_threshold
-

 # like MixedLayer, but no bias parameter
 @config_func

From f2a82b16a25c2eb825ddb0a46b4966b01f248f22 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Thu, 6 Jul 2017 11:58:43 +0000
Subject: [PATCH 75/79] add print messages

---
 python/CMakeLists.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 361e764e25..7a57d922ef 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -17,15 +17,21 @@ add_custom_target(copy_paddle_master)
 SET(COPY_PADDLE_MASTER "")
 if(WITH_GOLANG)
     SET(COPY_PADDLE_MASTER "copy_paddle_master")
+    message("paddle_master_lib_path:" ${paddle_master_LIB_PATH})
+    message("PROJ_ROOT:" ${PROJ_ROOT})
     add_custom_command(TARGET ${COPY_PADDLE_MASTER}
         COMMAND cp ${paddle_master_LIB_PATH} ${PROJ_ROOT}/python/paddle/v2/master/
     )
     add_dependencies(copy_paddle_master paddle_master)
 endif(WITH_GOLANG)

+message("paddle_master_LIB_NAME:" ${paddle_master_LIB_NAME})
+message("CMAKE_CURRENT_BINARY_DIR:" ${CMAKE_CURRENT_BINARY_DIR})
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)

+message("OUTPUT_DIR:" ${OUTPUT_DIR})
+message("py_env:" ${py_env})
 add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp

From 660475b5ab1c6cc295420a527d549dc1f38ba03a Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Thu, 6 Jul 2017 12:14:30 +0000
Subject: [PATCH 76/79] modify to add paddle_master name

---
 python/CMakeLists.txt | 1 +
 python/setup.py.in    | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 7a57d922ef..633d2b3786 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -27,6 +27,7 @@ endif(WITH_GOLANG)

 message("paddle_master_LIB_NAME:" ${paddle_master_LIB_NAME})
 message("CMAKE_CURRENT_BINARY_DIR:" ${CMAKE_CURRENT_BINARY_DIR})
+message("CMAKE_CURRENT_SOURCE_DIR:" ${CMAKE_CURRENT_SOURCE_DIR})
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)

diff --git a/python/setup.py.in b/python/setup.py.in
index dae0166487..9c77bed15f 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -27,7 +27,7 @@ setup(name='paddle',
       description='Parallel Distributed Deep Learning',
       install_requires=setup_requires,
       packages=packages,
-      package_data={'paddle.v2.master': ['${paddle_master_LIB_NAME}'], },
+      package_data={'paddle.v2.master': ['libpaddle_master.so'], },
       package_dir={
           '': '${CMAKE_CURRENT_SOURCE_DIR}'
       },

From b396055499c5bd34bea5753e7ca19e18e2f7044b Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Thu, 6 Jul 2017 13:34:40 +0000
Subject: [PATCH 77/79] add -V

---
 paddle/scripts/docker/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index ab60f1a38d..0579bfcc7a 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -60,7 +60,7 @@ EOF
 make -j `nproc`
 if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
     pip uninstall -y py-paddle paddle || true
-    ctest --output-on-failure
+    ctest -V --output-on-failure
 fi

From 4daa247d80a3f94b8f60fe084bd3887b4b5c698e Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Fri, 7 Jul 2017 01:12:48 +0000
Subject: [PATCH 78/79] rm -v

---
 paddle/scripts/docker/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 0579bfcc7a..ab60f1a38d 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -60,7 +60,7 @@ EOF
 make -j `nproc`
 if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
     pip uninstall -y py-paddle paddle || true
-    ctest -V --output-on-failure
+    ctest --output-on-failure
 fi

From 126e64fc830ba5b787a787fdd2e2b7f7e2ef1939 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Fri, 7 Jul 2017 01:35:16 +0000
Subject: [PATCH 79/79] add cmake

---
 python/CMakeLists.txt | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 633d2b3786..361e764e25 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -17,22 +17,15 @@ add_custom_target(copy_paddle_master)
 SET(COPY_PADDLE_MASTER "")
 if(WITH_GOLANG)
     SET(COPY_PADDLE_MASTER "copy_paddle_master")
-    message("paddle_master_lib_path:" ${paddle_master_LIB_PATH})
-    message("PROJ_ROOT:" ${PROJ_ROOT})
     add_custom_command(TARGET ${COPY_PADDLE_MASTER}
         COMMAND cp ${paddle_master_LIB_PATH} ${PROJ_ROOT}/python/paddle/v2/master/
     )
     add_dependencies(copy_paddle_master paddle_master)
 endif(WITH_GOLANG)

-message("paddle_master_LIB_NAME:" ${paddle_master_LIB_NAME})
-message("CMAKE_CURRENT_BINARY_DIR:" ${CMAKE_CURRENT_BINARY_DIR})
-message("CMAKE_CURRENT_SOURCE_DIR:" ${CMAKE_CURRENT_SOURCE_DIR})
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)

-message("OUTPUT_DIR:" ${OUTPUT_DIR})
-message("py_env:" ${py_env})
 add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
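[Editor's note: to close out patch 72 above, a hedged sketch of how merge_static_libs is typically invoked. The target and input names below are hypothetical, and the helper is assumed to take the libraries to merge as trailing arguments (its body iterates over a ${libs} list built from them); the inputs must be existing CMake static-library targets, since the helper reads $<TARGET_FILE:...> and ${lib}_LIB_DEPENDS for each of them:

    # Bundle three static-library targets into one archive, libpaddle_bundle.a.
    # On macOS the inputs are merged with `libtool -static`; on other UNIX
    # systems each input archive is unpacked with ar and its objects are
    # re-added to the merged archive, followed by ranlib.
    merge_static_libs(paddle_bundle paddle_utils paddle_math paddle_function)

The merged target then links transitively against the dependencies collected in libs_deps, which is the dependency propagation this patch fixes on Linux.]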