From 1644c72accb59c325c7e17bb1bb46e03391a4c27 Mon Sep 17 00:00:00 2001
From: wangmeng28 <wangmeng28@baidu.com>
Date: Wed, 11 Oct 2017 16:07:30 +0800
Subject: [PATCH 01/52] Add framework of the factorization machine layer

---
 doc/api/v2/config/layer.rst                   | 15 +++--
 .../layers/FactorizationMachineLayer.cpp      | 65 +++++++++++++++++++
 .../layers/FactorizationMachineLayer.h        | 59 +++++++++++++++++
 paddle/gserver/tests/test_LayerGrad.cpp       | 19 ++++++
 proto/ModelConfig.proto                       |  3 +
 python/paddle/trainer/config_parser.py        | 15 +++++
 .../paddle/trainer_config_helpers/layers.py   | 65 +++++++++++++++++++
 .../tests/configs/file_list.sh                |  3 +-
 .../test_factorization_machine.protostr       | 39 +++++++++++
 .../configs/test_factorization_machine.py     |  9 +++
 10 files changed, 287 insertions(+), 5 deletions(-)
 create mode 100644 paddle/gserver/layers/FactorizationMachineLayer.cpp
 create mode 100644 paddle/gserver/layers/FactorizationMachineLayer.h
 create mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
 create mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index d4e9d53e5c..89d6953c33 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -54,7 +54,7 @@ img_conv
 
 ..  _api_v2.layer_context_projection:
 
-context_projection 
+context_projection
 ------------------
 ..  autoclass:: paddle.v2.layer.context_projection
     :noindex:
@@ -70,7 +70,7 @@ Image Pooling Layer
 img_pool
 --------
 ..  autoclass:: paddle.v2.layer.img_pool
-    :noindex:   
+    :noindex:
 
 spp
 ---
@@ -99,7 +99,7 @@ sum_to_one_norm
 ---------------
 ..  autoclass:: paddle.v2.layer.sum_to_one_norm
     :noindex:
-    
+
 cross_channel_norm
 ------------------
 ..  autoclass:: paddle.v2.layer.cross_channel_norm
@@ -109,7 +109,7 @@ row_l2_norm
 -----------
 ..  autoclass:: paddle.v2.layer.row_l2_norm
     :noindex:
-    
+
 Recurrent Layers
 ================
 
@@ -395,6 +395,13 @@ multiplex
 ..  autoclass:: paddle.v2.layer.multiplex
     :noindex:
 
+Factorization Machine Layer
+============================
+
+factorization_machine
+---------------------
+..  autoclass:: paddle.v2.layer.factorization_machine
+    :noindex:
 
 Slicing and Joining Layers
 ==========================
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp
new file mode 100644
index 0000000000..5456bf2601
--- /dev/null
+++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "FactorizationMachineLayer.h"
+#include <algorithm>
+#include <vector>
+#include "paddle/math/SparseMatrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(factorization_machine, FactorizationMachineLayer);
+
+bool FactorizationMachineLayer::init(const LayerMap& layerMap,
+                                     const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  factorSize_ = config_.factor_size();
+
+  /* initialize the latentVectors_ */
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  size_t height = inputLayers_[0]->getSize();
+  latentVectors_.reset(new Weight(height, factorSize_, parameters_[0]));
+
+  return true;
+}
+
+void FactorizationMachineLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  auto input = getInput(0);
+
+  int batchSize = input.getBatchSize();
+  int size = getSize();
+  reserveOutput(batchSize, size);
+
+  MatrixPtr outV = getOutputValue();
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h
new file mode 100644
index 0000000000..e7807c8986
--- /dev/null
+++ b/paddle/gserver/layers/FactorizationMachineLayer.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/ThreadLocal.h"
+
+namespace paddle {
+/**
+ * @brief The Factorization Machine models pairwise (order-2) feature
+ * interactions as inner product of the learned latent vectors corresponding
+ * to each input feature.
+ *
+ * The Factorization Machine can effectively capture feature interactions
+ * especially when the input is sparse. While in principle FM can model higher
+ * order feature interaction, in practice usually only order-2 feature
+ * interactions are considered. The Factorization Machine Layer here only
+ * computes the order-2 interactions with the formula:
+ *
+ * \f[
+ *     y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+ * \f]
+ *
+ * The config file api is factorization_machine.
+ */
+
+class FactorizationMachineLayer : public Layer {
+protected:
+  /// The latent vectors, shape: (size, factorSize_)
+  std::unique_ptr<Weight> latentVectors_;
+  /// The hyperparameter that defines the dimensionality of the factorization
+  size_t factorSize_;
+
+public:
+  explicit FactorizationMachineLayer(const LayerConfig& config)
+      : Layer(config) {}
+  ~FactorizationMachineLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 90a3352898..542db5ee5b 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -2359,6 +2359,25 @@ TEST(Layer, ScaleShiftLayer) {
   }
 }
 
+void testFactorizationMachineLayer(InputType type, bool useGpu) {
+  const int FACTOR_SIZE = 10;
+  TestConfig config;
+  config.layerConfig.set_type("factorization_machine");
+  config.layerConfig.set_factor_size(FACTOR_SIZE);
+  config.biasSize = 1;
+  config.inputDefs.push_back({type, "layer_0", 8192, 0});
+  config.layerConfig.add_inputs();
+  testLayerGrad(config, "factorization_machine", 16, false, useGpu, false);
+}
+
+TEST(Layer, FactorizationMachineLayer) {
+  testFactorizationMachineLayer(INPUT_DATA, false);
+  testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false);
+#ifdef PADDLE_WITH_CUDA
+  testFactorizationMachineLayer(INPUT_DATA, true);
+#endif
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index ebf0911d6e..0d2140ccf9 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -525,6 +525,9 @@ message LayerConfig {
 
   // for switch order layer
   optional ReshapeConfig reshape_conf = 59;
+
+  // for factorization machine layer
+  optional uint32 factor_size = 60;
 }
 
 message EvaluatorConfig {
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 098a51ab87..07b3ff66dc 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -3780,6 +3780,21 @@ class SwitchOrderLayer(LayerBase):
         self.config.reshape_conf.width_axis.extend(reshape['width'])
 
 
+@config_layer('factorization_machine')
+class FactorizationMachineLayer(LayerBase):
+    def __init__(self, name, inputs, factor_size, **xargs):
+        super(FactorizationMachineLayer, self).__init__(
+            name, 'factorization_machine', size=1, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1,
+            'factorization machine layer must have one and only one input.')
+        self.config.factor_size = factor_size
+        input_layer = self.get_input_layer(0)
+        psize = input_layer.size * factor_size
+        dims = [input_layer.size, 1]
+        self.create_input_parameter(0, psize, dims)
+
+
 # Deprecated, use a new layer specific class instead
 @config_func
 def Layer(name, type, **xargs):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index d37f29d2c4..e6348dca2a 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -143,6 +143,7 @@ __all__ = [
     'scale_shift_layer',
     'img_conv3d_layer',
     'resize_layer',
+    'factorization_machine',
 ]
 
 
@@ -253,6 +254,8 @@ class LayerType(object):
 
     RESIZE = 'resize'
 
+    FACTORIZATION_MACHINE = 'factorization_machine'
+
     @staticmethod
     def is_layer_type(type_name):
         """
@@ -6955,3 +6958,65 @@ def resize_layer(input, size, name=None):
     """
     Layer(name=name, type=LayerType.RESIZE, inputs=Input(input.name), size=size)
     return LayerOutput(name, LayerType.RESIZE, parents=[input], size=input.size)
+
+
+@wrap_name_default()
+@wrap_act_default(act=LinearActivation())
+@wrap_param_attr_default()
+@layer_support()
+def factorization_machine(input,
+                          factor_size,
+                          act=None,
+                          name=None,
+                          param_attr=None,
+                          layer_attr=None):
+    """
+    The Factorization Machine models pairwise feature interactions as inner
+    product of the learned latent vectors corresponding to each input feature.
+
+    The Factorization Machine can effectively capture feature interactions
+    especially when the input is sparse. In practice, usually order 2 feature
+    interactions are considered using Factorization Machine with the formula:
+
+    .. math::
+
+        y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+
+    Note:
+        X is the input vector with size n. V is the factor matrix. Each row of V
+        is the latent vector corresponding to each input dimension. The size
+        of each latent vector is k.
+
+    .. code-block:: python
+
+       factor_machine = factorization_machine(input=input_layer, factor_size=10)
+
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param factor_size: The hyperparameter that defines the dimensionality of
+                        the latent vectors.
+    :type factor_size: int
+    :param act: Activation Type. Default is linear activation.
+    :type act: BaseActivation
+    :param param_attr: The Parameter Attribute. If None, the latent vectors will
+                       be initialized smartly. It's better to set it by
+                       yourself.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+
+    """
+    assert isinstance(input, LayerOutput)
+    assert factor_size > 0, "the factor_size must be greater than 0."
+
+    Layer(
+        inputs=[Input(input.name, **param_attr.attr)],
+        name=name,
+        factor_size=factor_size,
+        type=LayerType.FACTORIZATION_MACHINE,
+        active_type=act.name,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.FACTORIZATION_MACHINE, input, activation=act, size=1)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 6a4550c209..40bbb04bd4 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -10,6 +10,7 @@ test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_la
 test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
 test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
 test_seq_slice_layer test_cross_entropy_over_beam test_pooling3D_layer
-test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer)
+test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer
+test_factorization_machine)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
new file mode 100644
index 0000000000..585a5c7b23
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
@@ -0,0 +1,39 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 1024
+  active_type: ""
+}
+layers {
+  name: "__factorization_machine_0__"
+  type: "factorization_machine"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___factorization_machine_0__.w0"
+  }
+  factor_size: 10
+}
+parameters {
+  name: "___factorization_machine_0__.w0"
+  size: 10240
+  initial_mean: 0.0
+  initial_std: 0.03125
+  dims: 1024
+  dims: 1
+  initial_strategy: 0
+  initial_smart: true
+}
+input_layer_names: "data"
+output_layer_names: "__factorization_machine_0__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__factorization_machine_0__"
+  input_layer_names: "data"
+  output_layer_names: "__factorization_machine_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
new file mode 100644
index 0000000000..62ceb359cf
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
@@ -0,0 +1,9 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+data = data_layer(name='data', size=1024)
+
+fm = factorization_machine(input=data, factor_size=10)
+
+outputs(fm)

From f504c8a83d641b573ef0765227246460dea2f764 Mon Sep 17 00:00:00 2001
From: wangmeng28 <wangmeng28@baidu.com>
Date: Wed, 11 Oct 2017 21:47:27 +0800
Subject: [PATCH 02/52] Remove unnecessary configs

---
 paddle/gserver/tests/test_LayerGrad.cpp                       | 4 +---
 .../tests/configs/test_factorization_machine.py               | 2 --
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index f63c93c943..eea884cb50 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -2371,10 +2371,8 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) {
 
 TEST(Layer, FactorizationMachineLayer) {
   testFactorizationMachineLayer(INPUT_DATA, false);
-  testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false);
-#ifdef PADDLE_WITH_CUDA
   testFactorizationMachineLayer(INPUT_DATA, true);
-#endif
+  testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false);
 }
 
 int main(int argc, char** argv) {
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
index 62ceb359cf..b249de0fee 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
@@ -1,7 +1,5 @@
 from paddle.trainer_config_helpers import *
 
-settings(batch_size=1000, learning_rate=1e-5)
-
 data = data_layer(name='data', size=1024)
 
 fm = factorization_machine(input=data, factor_size=10)

From 947b6a77ce08c1ca2dc386514f0e97eb75ade91a Mon Sep 17 00:00:00 2001
From: wangmeng28 <wangmeng28@baidu.com>
Date: Tue, 17 Oct 2017 00:26:53 +0800
Subject: [PATCH 03/52] Implement factorization machine layer

---
 .../layers/FactorizationMachineLayer.cpp      | 62 +++++++++++++++++--
 .../layers/FactorizationMachineLayer.h        | 12 ++++
 paddle/gserver/tests/test_LayerGrad.cpp       |  5 +-
 3 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp
index 5456bf2601..09128eeeef 100644
--- a/paddle/gserver/layers/FactorizationMachineLayer.cpp
+++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp
@@ -33,7 +33,10 @@ bool FactorizationMachineLayer::init(const LayerMap& layerMap,
   /* initialize the latentVectors_ */
   CHECK_EQ(inputLayers_.size(), 1UL);
   size_t height = inputLayers_[0]->getSize();
-  latentVectors_.reset(new Weight(height, factorSize_, parameters_[0]));
+  latentVectors_ =
+      std::unique_ptr<Weight>(new Weight(height, factorSize_, parameters_[0]));
+
+  v2_ = latentVectors_->getW()->clone(0, 0, useGpu_);
 
   return true;
 }
@@ -41,14 +44,28 @@ bool FactorizationMachineLayer::init(const LayerMap& layerMap,
 void FactorizationMachineLayer::forward(PassType passType) {
   Layer::forward(passType);
 
-  auto input = getInput(0);
+  const MatrixPtr& inputV = getInputValue(0);
 
-  int batchSize = input.getBatchSize();
-  int size = getSize();
+  size_t batchSize = inputV->getHeight();
+  size_t size = getSize();
   reserveOutput(batchSize, size);
 
   MatrixPtr outV = getOutputValue();
 
+  Matrix::resizeOrCreate(tmpMul_, batchSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
+
+  REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
+  tmpMul_->mul(*inputV, *latentVectors_->getW());
+  tmpOut_->pow2(*tmpMul_, 2);
+  outV->sumRows(*tmpOut_, 0.5, 0);
+
+  x2_ = inputV->clone(0, 0, useGpu_);
+  x2_->pow2(*inputV, 2);
+  v2_->pow2(*latentVectors_->getW(), 2);
+  tmpOut_->mul(*x2_, *v2_);
+  outV->sumRows(*tmpOut_, -0.5, 1.0);
+
   /* activation */ {
     REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
     forwardActivation();
@@ -60,6 +77,43 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
     REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
     backwardActivation();
   }
+
+  const MatrixPtr& inputV = getInputValue(0);
+  const MatrixPtr& oGrad = getOutputGrad();
+
+  MatrixPtr tmpSum =
+      Matrix::create(1, latentVectors_->getW()->getHeight(), false, useGpu_);
+  MatrixPtr tmpSum_T = Matrix::create(tmpSum->getRowBuf(0),
+                                      latentVectors_->getW()->getHeight(),
+                                      1,
+                                      false,
+                                      useGpu_);
+
+  /* Calculate the gradients of the latentVectors_ matrix */
+  if (latentVectors_->getWGrad()) {
+    MatrixPtr tmpIn = inputV->clone(0, 0, useGpu_);
+    tmpIn->rowScale(0, *inputV, *oGrad);
+
+    latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1);
+
+    tmpIn->rowScale(0, *x2_, *oGrad);
+    tmpSum->sumCols(*tmpIn, -1, 0);
+    latentVectors_->getWGrad()->addRowScale(
+        0, *latentVectors_->getW(), *tmpSum_T);
+
+    /* Increasing the number of gradient */
+    latentVectors_->getParameterPtr()->incUpdate(callback);
+  }
+
+  /* Calculate the input layers gradient */
+  MatrixPtr inGrad = getInputGrad(0);
+  if (inGrad != NULL) {
+    MatrixPtr latentVectors_T = latentVectors_->getW()->getTranspose();
+    inGrad->mul(*tmpMul_, *latentVectors_T, 1, 1);
+    tmpSum_T->sumRows(*v2_, -1, 0);
+    inGrad->addColScale(0, *inputV, *tmpSum);
+    inGrad->rowScale(0, *inGrad, *oGrad);
+  }
 }
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h
index e7807c8986..7cf064690f 100644
--- a/paddle/gserver/layers/FactorizationMachineLayer.h
+++ b/paddle/gserver/layers/FactorizationMachineLayer.h
@@ -40,10 +40,22 @@ namespace paddle {
 class FactorizationMachineLayer : public Layer {
 protected:
   /// The latent vectors, shape: (size, factorSize_)
+  /// Each row of the latentVectors_ matrix is the latent vector
+  /// corresponding to one input feature dimension
   std::unique_ptr<Weight> latentVectors_;
   /// The hyperparameter that defines the dimensionality of the factorization
   size_t factorSize_;
 
+private:
+  /// The result of input matrix * latent vector matrix that will be used in
+  /// both forward and backward step
+  MatrixPtr tmpMul_;
+  MatrixPtr tmpOut_;
+  /// Store the square values of the latent vectors matrix
+  MatrixPtr v2_;
+  /// Store the square values of input matrix
+  MatrixPtr x2_;
+
 public:
   explicit FactorizationMachineLayer(const LayerConfig& config)
       : Layer(config) {}
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index eea884cb50..21e8fb7eed 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -2363,8 +2363,9 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) {
   TestConfig config;
   config.layerConfig.set_type("factorization_machine");
   config.layerConfig.set_factor_size(FACTOR_SIZE);
-  config.biasSize = 1;
-  config.inputDefs.push_back({type, "layer_0", 8192, 0});
+  config.layerConfig.set_size(1);
+  config.biasSize = 0;
+  config.inputDefs.push_back({type, "layer_0", 1024, 10240});
   config.layerConfig.add_inputs();
   testLayerGrad(config, "factorization_machine", 16, false, useGpu, false);
 }

From 2ce8f1875bb6f69bdc48eb16e78a2c163316ca2b Mon Sep 17 00:00:00 2001
From: wangmeng28 <wangmeng28@baidu.com>
Date: Tue, 17 Oct 2017 11:09:41 +0800
Subject: [PATCH 04/52] Fix tests for factorization machine layer

---
 paddle/gserver/tests/test_LayerGrad.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 21e8fb7eed..54053b751b 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -2373,7 +2373,6 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) {
 TEST(Layer, FactorizationMachineLayer) {
   testFactorizationMachineLayer(INPUT_DATA, false);
   testFactorizationMachineLayer(INPUT_DATA, true);
-  testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false);
 }
 
 int main(int argc, char** argv) {

From 86053e7766a93ee0130131c20f262c58a4cbc86d Mon Sep 17 00:00:00 2001
From: wangmeng28 <wangmeng28@baidu.com>
Date: Tue, 17 Oct 2017 12:20:43 +0800
Subject: [PATCH 05/52] Reduce the input size in testing factorization machine

---
 paddle/gserver/tests/test_LayerGrad.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 54053b751b..6c604b1e67 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -2365,14 +2365,15 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) {
   config.layerConfig.set_factor_size(FACTOR_SIZE);
   config.layerConfig.set_size(1);
   config.biasSize = 0;
-  config.inputDefs.push_back({type, "layer_0", 1024, 10240});
+  config.inputDefs.push_back({type, "layer_0", 128, 1280});
   config.layerConfig.add_inputs();
   testLayerGrad(config, "factorization_machine", 16, false, useGpu, false);
 }
 
 TEST(Layer, FactorizationMachineLayer) {
-  testFactorizationMachineLayer(INPUT_DATA, false);
-  testFactorizationMachineLayer(INPUT_DATA, true);
+  for (auto useGpu : {false, true}) {
+    testFactorizationMachineLayer(INPUT_DATA, useGpu);
+  }
 }
 
 int main(int argc, char** argv) {

From 9741ade8ee761f78291e249ea17ad5e3e2c904d2 Mon Sep 17 00:00:00 2001
From: wangmeng28 <wangmeng28@baidu.com>
Date: Tue, 17 Oct 2017 16:53:54 +0800
Subject: [PATCH 06/52] Change pow to square in factorization machine layer

---
 paddle/gserver/layers/FactorizationMachineLayer.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp
index 09128eeeef..8d9dcbaea7 100644
--- a/paddle/gserver/layers/FactorizationMachineLayer.cpp
+++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp
@@ -57,12 +57,12 @@ void FactorizationMachineLayer::forward(PassType passType) {
 
   REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
   tmpMul_->mul(*inputV, *latentVectors_->getW());
-  tmpOut_->pow2(*tmpMul_, 2);
+  tmpMul_->square2(*tmpOut_);
   outV->sumRows(*tmpOut_, 0.5, 0);
 
   x2_ = inputV->clone(0, 0, useGpu_);
-  x2_->pow2(*inputV, 2);
-  v2_->pow2(*latentVectors_->getW(), 2);
+  inputV->square2(*x2_);
+  latentVectors_->getW()->square2(*v2_);
   tmpOut_->mul(*x2_, *v2_);
   outV->sumRows(*tmpOut_, -0.5, 1.0);
 

From 8654e8a5203c62ca7b69c1778ff0b71f7c5f8223 Mon Sep 17 00:00:00 2001
From: wangmeng28 <wangmeng28@baidu.com>
Date: Tue, 17 Oct 2017 23:42:51 +0800
Subject: [PATCH 07/52] Fix dims in config parser for factorization machine
 layer

---
 python/paddle/trainer/config_parser.py                          | 2 +-
 .../tests/configs/protostr/test_factorization_machine.protostr  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 9aba0b49ad..557a91ca7b 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -3794,7 +3794,7 @@ class FactorizationMachineLayer(LayerBase):
         self.config.factor_size = factor_size
         input_layer = self.get_input_layer(0)
         psize = input_layer.size * factor_size
-        dims = [input_layer.size, 1]
+        dims = [input_layer.size, factor_size]
         self.create_input_parameter(0, psize, dims)
 
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
index 585a5c7b23..4f3002b199 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
@@ -22,7 +22,7 @@ parameters {
   initial_mean: 0.0
   initial_std: 0.03125
   dims: 1024
-  dims: 1
+  dims: 10
   initial_strategy: 0
   initial_smart: true
 }

From 4c72b0634cc2c280f0edcc84a0ece00511fdd6cd Mon Sep 17 00:00:00 2001
From: wangmeng28 <wangmeng28@baidu.com>
Date: Wed, 18 Oct 2017 15:36:36 +0800
Subject: [PATCH 08/52] Fix creation of tmp variable in factorization machine
 layer

---
 paddle/gserver/layers/FactorizationMachineLayer.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp
index 8d9dcbaea7..e5c9d1a90d 100644
--- a/paddle/gserver/layers/FactorizationMachineLayer.cpp
+++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp
@@ -33,10 +33,11 @@ bool FactorizationMachineLayer::init(const LayerMap& layerMap,
   /* initialize the latentVectors_ */
   CHECK_EQ(inputLayers_.size(), 1UL);
   size_t height = inputLayers_[0]->getSize();
+  CHECK_EQ(parameters_[0]->getSize(), height * factorSize_);
   latentVectors_ =
       std::unique_ptr<Weight>(new Weight(height, factorSize_, parameters_[0]));
 
-  v2_ = latentVectors_->getW()->clone(0, 0, useGpu_);
+  v2_ = Matrix::create(height, factorSize_, false, useGpu_);
 
   return true;
 }

From d9062cd9ee1297547c16d57c0d5024ceb3555d2f Mon Sep 17 00:00:00 2001
From: wangmeng28 <wangmeng28@baidu.com>
Date: Thu, 26 Oct 2017 00:43:47 +0800
Subject: [PATCH 09/52] Add sparse matrix support in factorization machine
 layer

---
 .../layers/FactorizationMachineLayer.cpp      | 24 +++++++++++++++----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp
index e5c9d1a90d..06658a2841 100644
--- a/paddle/gserver/layers/FactorizationMachineLayer.cpp
+++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp
@@ -62,7 +62,12 @@ void FactorizationMachineLayer::forward(PassType passType) {
   outV->sumRows(*tmpOut_, 0.5, 0);
 
   x2_ = inputV->clone(0, 0, useGpu_);
-  inputV->square2(*x2_);
+  if (dynamic_cast<CpuSparseMatrix*>(x2_.get())) {
+    x2_->copyFrom(*inputV);
+    (dynamic_cast<CpuSparseMatrix*>(x2_.get()))->square2();
+  } else {
+    inputV->square2(*x2_);
+  }
   latentVectors_->getW()->square2(*v2_);
   tmpOut_->mul(*x2_, *v2_);
   outV->sumRows(*tmpOut_, -0.5, 1.0);
@@ -93,11 +98,20 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
   /* Calculate the gradients of the latentVectors_ matrix */
   if (latentVectors_->getWGrad()) {
     MatrixPtr tmpIn = inputV->clone(0, 0, useGpu_);
-    tmpIn->rowScale(0, *inputV, *oGrad);
-
-    latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1);
+    if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+      CpuSparseMatrix* inputV_s = dynamic_cast<CpuSparseMatrix*>(inputV.get());
+      CpuSparseMatrix* x2_s = dynamic_cast<CpuSparseMatrix*>(x2_.get());
+      CpuSparseMatrix* tmpIn_s = dynamic_cast<CpuSparseMatrix*>(tmpIn.get());
+      tmpIn_s->copyFrom(*inputV_s);
+      tmpIn_s->rowScale(0, *inputV_s, *oGrad);
+      latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1);
+      tmpIn_s->rowScale(0, *x2_s, *oGrad);
+    } else {
+      tmpIn->rowScale(0, *inputV, *oGrad);
+      latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1);
+      tmpIn->rowScale(0, *x2_, *oGrad);
+    }
 
-    tmpIn->rowScale(0, *x2_, *oGrad);
     tmpSum->sumCols(*tmpIn, -1, 0);
     latentVectors_->getWGrad()->addRowScale(
         0, *latentVectors_->getW(), *tmpSum_T);

From 509ae79a5de846dfd38bd85618b2467066413a97 Mon Sep 17 00:00:00 2001
From: wangmeng28 <wangmeng28@baidu.com>
Date: Thu, 26 Oct 2017 00:47:06 +0800
Subject: [PATCH 10/52] Add rowScale for CpuSparseMatrix

---
 paddle/math/CpuSparseMatrix.cpp | 17 +++++++++++++++++
 paddle/math/CpuSparseMatrix.h   |  9 +++++++++
 2 files changed, 26 insertions(+)

diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
index bf62229c03..e211c23a7e 100644
--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -260,6 +260,23 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const {
   os << ";";
 }
 
+void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
+  CHECK(getFormat() != SPARSE_CSC) << "Not supported";
+  CHECK(height_ == b.getHeight());
+  CHECK(width_ == b.getWidth());
+  real* A = getValue();
+  real* B = b.getValue();
+  for (size_t i = 0; i < height_; i++) {
+    size_t start = getRowStartIdx(i);
+    size_t end = getRowStartIdx(i + 1);
+    CHECK(start == b.getRowStartIdx(i));
+    CHECK(end == b.getRowStartIdx(i + 1));
+    for (size_t j = start; j < end; j++) {
+      A[j] = B[j] * c.getElement(i, cCol);
+    }
+  }
+}
+
 void CpuSparseMatrix::randomizeUniform() {
   CHECK_LE(elementCnt_, height_ * width_);
   if (valueType_ == FLOAT_VALUE) {
diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h
index 36d57bbb65..8f9ad67215 100644
--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -236,6 +236,15 @@ public:
               const unsigned int* cols,
               const real* values);
 
+  /**
+   * @brief this_row = b_row * c_row[cCol]
+   *
+   * @param[in]  cCol   the column of matrix c used to scale each row of b
+   * @param[in]  b      CpuSparseMatrix
+   * @param[in]  c      Matrix
+   */
+  void rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c);
+
   void randomizeUniform();
 
   void copyFrom(const GpuSparseMatrix& src, hl_stream_t stream);

From 4172fc09c39b61c3cb1933687680bab15153b59f Mon Sep 17 00:00:00 2001
From: wangmeng28 <wangmeng28@baidu.com>
Date: Wed, 1 Nov 2017 21:51:23 +0800
Subject: [PATCH 11/52] Add sparse input support for factorization machine
 layer

---
 paddle/gserver/layers/FactorizationMachineLayer.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp
index 06658a2841..3bd8d7cb4c 100644
--- a/paddle/gserver/layers/FactorizationMachineLayer.cpp
+++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp
@@ -104,15 +104,21 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
       CpuSparseMatrix* tmpIn_s = dynamic_cast<CpuSparseMatrix*>(tmpIn.get());
       tmpIn_s->copyFrom(*inputV_s);
       tmpIn_s->rowScale(0, *inputV_s, *oGrad);
-      latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1);
+      latentVectors_->getWGrad()->mul(*tmpIn_s->getTranspose(), *tmpMul_, 1, 1);
       tmpIn_s->rowScale(0, *x2_s, *oGrad);
+
+      MatrixPtr ones = Matrix::create(1, inputV->getHeight(), false, useGpu_);
+      ones->zeroMem();
+      ones->add(-1);
+      tmpSum->mul(*ones, *tmpIn_s, 1, 0);
     } else {
       tmpIn->rowScale(0, *inputV, *oGrad);
       latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1);
       tmpIn->rowScale(0, *x2_, *oGrad);
+
+      tmpSum->sumCols(*tmpIn, -1, 0);
     }
 
-    tmpSum->sumCols(*tmpIn, -1, 0);
     latentVectors_->getWGrad()->addRowScale(
         0, *latentVectors_->getW(), *tmpSum_T);
 

From f3631a42dff4e1ad54b1c1fc8e5549a488158e02 Mon Sep 17 00:00:00 2001
From: Kavya Srinet <kavyasrinet@baidu.com>
Date: Mon, 13 Nov 2017 12:03:03 -0800
Subject: [PATCH 12/52] Updating the writeup of RNN doc

---
 doc/design/ops/rnn.md | 66 +++++++++++++++++++++----------------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/doc/design/ops/rnn.md b/doc/design/ops/rnn.md
index a78eea7d45..2f4854793f 100644
--- a/doc/design/ops/rnn.md
+++ b/doc/design/ops/rnn.md
@@ -1,62 +1,62 @@
 # RNNOp design
 
-This document is about an RNN operator which requires that instances in a mini-batch have the same length.  We will have a more flexible RNN operator.
+This document describes the RNN (Recurrent Neural Network) operator and how it is implemented in PaddlePaddle. The RNN op requires that all instances in a mini-batch have the same length. We will have a more flexible dynamic RNN operator in the future.
 
 ## RNN Algorithm Implementation
 
-<p aligh="center">
+<p align="center">
 <img src="./images/rnn.jpg"/>
 </p>
 
 The above diagram shows an RNN unrolled into a full network.
 
-There are several important concepts:
+There are several important concepts here:
 
-- *step-net*: the sub-graph to run at each step,
-- *memory*, $h_t$, the state of the current step,
-- *ex-memory*, $h_{t-1}$, the state of the previous step,
-- *initial memory value*, the ex-memory of the first step.
+- *step-net*: the sub-graph that runs at each step.
+- *memory*, $h_t$, the state of the current step.
+- *ex-memory*, $h_{t-1}$, the state of the previous step.
+- *initial memory value*, the memory of the first (initial) step.
 
 ### Step-scope
 
-There could be local variables defined in step-nets.  PaddlePaddle runtime realizes these variables in *step-scopes* -- scopes created for each step.
+There could be local variables defined in each step-net.  PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
 
-<p aligh="center">
+<p align="center">
 <img src="./images/rnn.png"/><br/>
-Figure 2 the RNN's data flow
+Figure 2 illustrates the RNN's data flow
 </p>
 
-Please be aware that all steps run the same step-net.  Each step
+Please be aware that every step runs the same step-net.  Each step does the following:
 
-1. creates the step-scope,
-2. realizes local variables, including step-outputs, in the step-scope, and
-3. runs the step-net, which could use these variables.
+1. Creates the step-scope.
+2. Initializes the local variables including step-outputs, in the step-scope.
+3. Runs the step-net, which uses the above mentioned variables.
 
-The RNN operator will compose its output from step outputs in step scopes.
+The RNN operator will compose its output from step outputs in each of the step scopes.
 
 ### Memory and Ex-memory
 
-Let's give more details about memory and ex-memory via a simply example:
+Let's give more details about memory and ex-memory using a simple example:
 
 $$
 h_t = U h_{t-1} + W x_t
 $$,
 
-where $h_t$ and $h_{t-1}$ are the memory and ex-memory of step $t$'s respectively.
+where $h_t$ and $h_{t-1}$ are the memory and ex-memory (previous memory) of step $t$ respectively.
 
-In the implementation, we can make an ex-memory variable either "refers to" the memory variable of the previous step,
-or copy the value of the previous memory value to the current ex-memory variable.
+In the implementation, we can make an ex-memory variable either "refer to" the memory variable of the previous step,
+or copy the memory value of the previous step to the current ex-memory variable.
 
 ### Usage in Python
 
 For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
 
-We can define an RNN's step-net using Block:
+We can define an RNN's step-net using a Block:
 
 ```python
 import paddle as pd
 
-X = some_op() # x is some operator's output, and is a LoDTensor
+X = some_op() # x is some operator's output and is a LoDTensor
 a = some_op()
 
 # declare parameters
@@ -68,7 +68,7 @@ with rnn.stepnet():
     x = rnn.add_input(X)
     # declare a memory (rnn's step)
     h = rnn.add_memory(init=a)
-    # h.pre_state() means previous memory of rnn
+    # h.pre_state(), the previous memory of rnn
     new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state()))
     # update current memory
     h.update(new_state)
@@ -80,19 +80,19 @@ out = rnn()
 
 Python API functions in above example:
 
-- `rnn.add_input` indicates the parameter is a variable that will be segmented into step-inputs.
-- `rnn.add_memory` creates a variable used as the memory.
-- `rnn.add_outputs` mark the variables that will be concatenated across steps into the RNN output.
+- `rnn.add_input`: indicates that the parameter is a variable that will be segmented into step-inputs.
+- `rnn.add_memory`: creates a variable used as the memory.
+- `rnn.add_outputs`: marks the variables that will be concatenated across steps into the RNN output.
 
 ### Nested RNN and LoDTensor
 
 An RNN whose step-net includes other RNN operators is known as an *nested RNN*.
 
-For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences.
+For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. Each step of the higher level RNN also receives an input from the corresponding step of the lower level, and additionally the output from the previous time step at the same level.
 
-The following figure illustrates the feeding of text into the lower level, one sentence each step, and the feeding of step outputs to the top level. The final top level output is about the whole text.
+The following figure illustrates feeding text into the lower level, one sentence per step, and feeding the step outputs to the top level. The final top level output is about the whole text.
 
-<p aligh="center">
+<p align="center">
 <img src="./images/2_level_rnn.png"/>
 </p>
 
@@ -110,7 +110,7 @@ a = some_op()
 
 # chapter_data is a set of 128-dim word vectors
 # the first level of LoD is sentence
-# the second level of LoD is chapter
+# the second level of LoD is a chapter
 chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2)
 
 def lower_level_rnn(paragraph):
@@ -138,14 +138,14 @@ with top_level_rnn.stepnet():
         pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state()))
     top_level_rnn.add_outputs(h)
 
-# just output the last step
+# output the last step
 chapter_out = top_level_rnn(output_all_steps=False)
 ```
 
-in above example, the construction of the `top_level_rnn` calls  `lower_level_rnn`.  The input is a LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences.
+In the above example, the construction of the `top_level_rnn` calls  `lower_level_rnn`.  The input is an LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences.
 
-By default, the `RNNOp` will concatenate the outputs from all the time steps,
-if the `output_all_steps` set to False, it will only output the final time step.
+By default, the `RNNOp` will concatenate the outputs from all the time steps.
+If the `output_all_steps` is set to False, it will only output the final time step.
 
 
 <p align="center">

From 4eb5b39cb2453c77a156f4f76f8436b574772afa Mon Sep 17 00:00:00 2001
From: Kavya Srinet <kavyasrinet@baidu.com>
Date: Mon, 13 Nov 2017 14:49:15 -0800
Subject: [PATCH 13/52] Editing the documentation for seq_decoder, and fixing
 typos

---
 doc/design/ops/sequence_decoder.md | 112 +++++++++++++----------------
 1 file changed, 48 insertions(+), 64 deletions(-)

diff --git a/doc/design/ops/sequence_decoder.md b/doc/design/ops/sequence_decoder.md
index 9007aae7a8..bb945ae48b 100644
--- a/doc/design/ops/sequence_decoder.md
+++ b/doc/design/ops/sequence_decoder.md
@@ -1,35 +1,28 @@
 # Design: Sequence Decoder Generating LoDTensors
-In tasks such as machine translation and image to text, 
-a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences.
+In tasks such as machine translation and visual captioning,
+a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences, one word at a time.
 
 This documentation describes how to implement the sequence decoder as an operator.
 
 ## Beam Search based Decoder
-The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences, 
-it is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set.
+The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences. It is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set.
 
-In the old version of PaddlePaddle, a C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search, 
-due to the complexity, the implementation relays on a lot of special data structures, 
-quite trivial and hard to be customized by users.
+In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search. Due to the complexity involved, the implementation relies on a lot of special data structures that are quite trivial and hard for users to customize.
 
-There are a lot of heuristic tricks in the sequence generation tasks, 
-so the flexibility of sequence decoder is very important to users.
+There are a lot of heuristic tricks in the sequence generation tasks, so the flexibility of sequence decoder is very important to users.
 
-During PaddlePaddle's refactoring work,
-some new concept is proposed such as [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) that can better support sequence usage,
-and they can help to make the implementation of beam search based sequence decoder **more transparent and modular** .
+During the refactoring of PaddlePaddle, some new concepts are proposed such as:  [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** .
 
-For example, the RNN sates, candidates IDs and probabilities of beam search can be represented as `LoDTensors`;
+For example, the RNN states, candidate IDs and probabilities of beam search can all be represented as `LoDTensors`;
 the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated.
 
 ## Changing LoD's absolute offset to relative offsets
-The current `LoDTensor` is designed to store levels of variable-length sequences,
-it stores several arrays of integers each represents a level.
+The current `LoDTensor` is designed to store levels of variable-length sequences. It stores several arrays of integers where each represents a level.
 
-The integers in each level represents the begin and end (not inclusive) offset of a sequence **in the underlying tensor**, 
-let's call this format the **absolute-offset LoD** for clear.
+The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**,
+let's call this format the **absolute-offset LoD** for clarity.
 
-The relative-offset LoD can fast retrieve any sequence but fails to represent empty sequences, for example, a two-level LoD is as follows
+The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences. For example, a two-level LoD is as follows:
 ```python
 [[0, 3, 9]
  [0, 2, 3, 3, 3, 9]]
@@ -41,10 +34,9 @@ The first level tells that there are two sequences:
 while on the second level, there are several empty sequences that both begin and end at `3`.
 It is impossible to tell how many empty second-level sequences exist in the first-level sequences.
 
-There are many scenarios that relay on empty sequence representation,
-such as machine translation or image to text, one instance has no translations or the empty candidate set for a prefix.
+There are many scenarios that rely on empty sequence representation; for example, in machine translation or visual captioning, an instance may have no translation, or the candidate set for a prefix may be empty.
 
-So let's introduce another format of LoD, 
+So let's introduce another format of LoD,
 it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD.
 
 For example, to represent the same sequences of the above data
@@ -54,19 +46,18 @@ For example, to represent the same sequences of the above data
  [0, 2, 3, 3, 3, 9]]
 ```
 
-the first level represents that there are two sequences, 
+the first level represents that there are two sequences,
 their offsets in the second-level LoD is `[0, 3)` and `[3, 5)`.
 
 The second level is the same with the relative offset example because the lower level is a tensor.
 It is easy to find out the second sequence in the first-level LoD has two empty sequences.
 
-The following demos are based on relative-offset LoD.
+The following examples are based on relative-offset LoD.
 
 ## Usage in a simple machine translation model
-Let's start from a simple machine translation model that is simplified from [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a simple blueprint of what a sequence decoder can do and how to use it.
+Let's start from a simple machine translation model that is simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a blueprint of what a sequence decoder can do and how to use it.
 
-The model has an encoder that learns the semantic vector from a sequence,
-and a decoder which uses the sequence decoder to generate new sentences.
+The model has an encoder that learns the semantic vector from a sequence, and a decoder which uses the sequence decoder to generate new sentences.
 
 **Encoder**
 ```python
@@ -117,7 +108,7 @@ def generate():
         # which means there are 2 sentences to translate
         #   - the first sentence has 1 translation prefixes, the offsets are [0, 1)
         #   - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6)
-        # the target_word.lod is 
+        # the target_word.lod is
         # [[0, 1, 6]
         #  [0, 2, 4, 7, 9 12]]
         # which means 2 sentences to translate, each has 1 and 5 prefixes
@@ -154,37 +145,36 @@ def generate():
 
 translation_ids, translation_scores = decoder()
 ```
-The `decoder.beam_search` is a operator that given the candidates and the scores of translations including the candidates,
-return the result of the beam search algorithm.
+The `decoder.beam_search` is an operator that, given the candidates and the scores of translations including the candidates,
+returns the result of the beam search algorithm.
 
-In this way, users can customize anything on the inputs or outputs of beam search, for example, two ways to prune some translation prefixes
+In this way, users can customize anything on the input or output of beam search, for example:
 
-1. meke the correspondind elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate.
-2. remove some specific candidate in `selected_ids`
-3. get the final `translation_ids`, remove the translation sequence in it.
+1. Make the corresponding elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate.
+2. Remove some specific candidate in `selected_ids`.
+3. Get the final `translation_ids`, remove the translation sequence in it.
 
 The implementation of sequence decoder can reuse the C++ class [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
-so the python syntax is quite similar to a [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).
+so the python syntax is quite similar to that of an  [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).
 
-Both of them are two-level `LoDTensors`
+Both of them are two-level `LoDTensors`:
 
-- the first level represents `batch_size` of (source) sentences;
-- the second level represents the candidate ID sets for translation prefix.
+- The first level represents `batch_size` of (source) sentences.
+- The second level represents the candidate ID sets for translation prefix.
 
-for example, 3 source sentences to translate, and has 2, 3, 1 candidates.
+For example, there are 3 source sentences to translate, and they have 2, 3, and 1 candidates respectively.
 
-Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape,
-a `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
+Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape, and an `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
 
-For example, the previous state
+For example, the previous state:
 
 * LoD is `[0, 1, 3][0, 2, 5, 6]`
 * content of tensor is `a1 a2 b1 b2 b3 c1`
 
-the current state stored in `encoder_ctx_expanded`
+the current state is stored in `encoder_ctx_expanded`:
 
 * LoD is `[0, 2, 7][0 3 5 8 9 11 11]`
-* the content is 
+* the content is
   - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times for each candidates)
   - a2 a2
   - b1 b1 b1
@@ -192,54 +182,48 @@ the current state stored in `encoder_ctx_expanded`
   - b3 b3
   - None (c1 has 0 candidates, so c1 is dropped)
 
-Benefit from the relative offset LoD, empty candidate set can be represented naturally.
+The benefit from the relative offset LoD is that the empty candidate set can be represented naturally.
 
-the status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor, the corresponding syntax is 
+The status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor. The corresponding syntax is:
 
 ```python
 decoder.output(selected_ids)
 decoder.output(selected_generation_scores)
 ```
 
-the `selected_ids` is the candidate ids for the prefixes, 
-it will be `Packed` by `TensorArray` to a two-level `LoDTensor`,
-the first level represents the source sequences,
-the second level represents generated sequences.
+The `selected_ids` are the candidate ids for the prefixes, and will be `Packed` by `TensorArray` to a two-level `LoDTensor`, where the first level represents the source sequences and the second level represents generated sequences.
 
-Pack the `selected_scores` will get a `LoDTensor` that stores scores of each candidate of translations.
+Packing the `selected_scores` will get a `LoDTensor` that stores scores of each translation candidate.
 
-Pack the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation.
+Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation.
 
 ## LoD and shape changes during decoding
 <p align="center">
   <img src="./images/LOD-and-shape-changes-during-decoding.jpg"/>
 </p>
 
-According the image above, the only phrase to change LoD is beam search.
+According to the image above, the only phase that changes the LoD is beam search.
 
 ## Beam search design
-The beam search algorthm will be implemented as one method of the sequence decoder, it has 3 inputs
+The beam search algorithm will be implemented as one method of the sequence decoder and has 3 inputs:
 
-1. `topk_ids`, top K candidate ids for each prefix.
+1. `topk_ids`, the top K candidate ids for each prefix.
 2. `topk_scores`, the corresponding scores for `topk_ids`
 3. `generated_scores`, the score of the prefixes.
 
-All of the are LoDTensors, so that the sequence affilication is clear.
-Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix.
+All of these are LoDTensors, so that the sequence affiliation is clear. Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix.
 
-It will return three variables
+It will return three variables:
 
 1. `selected_ids`, the final candidate beam search function selected for the next step.
 2. `selected_scores`, the scores for the candidates.
-3. `generated_scores`, the updated scores for each prefixes (with the new candidates appended).
+3. `generated_scores`, the updated scores for each prefix (with the new candidates appended).
 
 ## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray`
-The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors,
-and they exist in each time step,
+The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors that exist at each time step,
 so it is natural to store them in arrays.
 
-Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors,
-the results of beam search are better to store in a `TensorArray`.
+Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors. It is better to store the results of beam search in a `TensorArray`.
 
-The `Pack` and `UnPack` in `TensorArray` are used to package tensors in the array to a `LoDTensor` or split the `LoDTensor` to an array of tensors. 
-It needs some extensions to support pack or unpack an array of `LoDTensors`.
+The `Pack` and `UnPack` in `TensorArray` are used to pack tensors in the array to an `LoDTensor` or split the `LoDTensor` to an array of tensors.
+It needs some extensions to support packing or unpacking an array of `LoDTensors`.

From b341636f7e3ac8a8d2062e63c86c63063bd2f206 Mon Sep 17 00:00:00 2001
From: Kavya Srinet <kavyasrinet@baidu.com>
Date: Tue, 14 Nov 2017 10:02:18 -0800
Subject: [PATCH 14/52] Fixing the captioning on 2 level RNN

---
 doc/design/ops/images/2_level_rnn.dot | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/design/ops/images/2_level_rnn.dot b/doc/design/ops/images/2_level_rnn.dot
index a498e882a3..5d77865061 100644
--- a/doc/design/ops/images/2_level_rnn.dot
+++ b/doc/design/ops/images/2_level_rnn.dot
@@ -1,6 +1,6 @@
 digraph G {
 
-  rnn [label="1-th level RNN" shape=box]
+  rnn [label="1st level RNN" shape=box]
 
   subgraph cluster0 {
     label = "time step 0"
@@ -8,7 +8,7 @@ digraph G {
     sent0 [label="sentence"]
     sent1 [label="sentence"]
 
-    rnn1 [label="2-th level RNN" shape=box]
+    rnn1 [label="2nd level RNN" shape=box]
 
     sent0 -> rnn1
     sent1 -> rnn1
@@ -20,7 +20,7 @@ digraph G {
     sent2 [label="sentence"]
     sent3 [label="sentence"]
 
-    rnn2 [label="2-th level RNN" shape=box]
+    rnn2 [label="2nd level RNN" shape=box]
 
     sent2 -> rnn2
     sent3 -> rnn2
@@ -32,7 +32,7 @@ digraph G {
     sent4 [label="sentence"]
     sent5 [label="sentence"]
 
-    rnn3 [label="2-th level RNN" shape=box]
+    rnn3 [label="2nd level RNN" shape=box]
 
     sent4 -> rnn3
     sent5 -> rnn3

From 9f2dbc4b5ab45eff990a3c3a6a21664798fe3680 Mon Sep 17 00:00:00 2001
From: Kavya Srinet <kavyasrinet@baidu.com>
Date: Tue, 14 Nov 2017 10:11:18 -0800
Subject: [PATCH 15/52] pushing after a pull

---
 doc/design/ops/sequence_decoder.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/design/ops/sequence_decoder.md b/doc/design/ops/sequence_decoder.md
index bb945ae48b..9db5fb8e9a 100644
--- a/doc/design/ops/sequence_decoder.md
+++ b/doc/design/ops/sequence_decoder.md
@@ -154,7 +154,7 @@ In this way, users can customize anything on the input or output of beam search,
 2. Remove some specific candidate in `selected_ids`.
 3. Get the final `translation_ids`, remove the translation sequence in it.
 
-The implementation of sequence decoder can reuse the C++ class [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
+The implementation of sequence decoder can reuse the C++ class:  [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
 so the python syntax is quite similar to that of an  [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).
 
 Both of them are two-level `LoDTensors`:

From 7a1a586355844eb18fb6c87304cee5bbf70d078d Mon Sep 17 00:00:00 2001
From: wangmeng28 <wangmeng28@baidu.com>
Date: Thu, 16 Nov 2017 17:15:03 +0800
Subject: [PATCH 16/52] Update variable names and docs for factorization
 machine layer

---
 .../layers/FactorizationMachineLayer.cpp      | 110 +++++++++---------
 .../layers/FactorizationMachineLayer.h        |  31 +++--
 paddle/gserver/tests/test_LayerGrad.cpp       |   1 +
 paddle/math/CpuSparseMatrix.cpp               |   8 +-
 .../paddle/trainer_config_helpers/layers.py   |  14 ++-
 5 files changed, 94 insertions(+), 70 deletions(-)

diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp
index 3bd8d7cb4c..f0f1738f30 100644
--- a/paddle/gserver/layers/FactorizationMachineLayer.cpp
+++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp
@@ -32,12 +32,10 @@ bool FactorizationMachineLayer::init(const LayerMap& layerMap,
 
   /* initialize the latentVectors_ */
   CHECK_EQ(inputLayers_.size(), 1UL);
-  size_t height = inputLayers_[0]->getSize();
-  CHECK_EQ(parameters_[0]->getSize(), height * factorSize_);
-  latentVectors_ =
-      std::unique_ptr<Weight>(new Weight(height, factorSize_, parameters_[0]));
-
-  v2_ = Matrix::create(height, factorSize_, false, useGpu_);
+  size_t inputSize = inputLayers_[0]->getSize();
+  CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_);
+  latentVectors_ = std::unique_ptr<Weight>(
+      new Weight(inputSize, factorSize_, parameters_[0]));
 
   return true;
 }
@@ -48,79 +46,85 @@ void FactorizationMachineLayer::forward(PassType passType) {
   const MatrixPtr& inputV = getInputValue(0);
 
   size_t batchSize = inputV->getHeight();
-  size_t size = getSize();
-  reserveOutput(batchSize, size);
+  size_t outputSize = getSize();
+  size_t inputSize = inputLayers_[0]->getSize();
+  reserveOutput(batchSize, outputSize);
 
   MatrixPtr outV = getOutputValue();
 
-  Matrix::resizeOrCreate(tmpMul_, batchSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      latentVectorsSquare_, inputSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      inputMulFactor_, batchSize, factorSize_, false, useGpu_);
   Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
 
-  REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
-  tmpMul_->mul(*inputV, *latentVectors_->getW());
-  tmpMul_->square2(*tmpOut_);
+  REGISTER_TIMER_INFO("InputMulFactorTimer", getName().c_str());
+  inputMulFactor_->mul(*inputV, *latentVectors_->getW());
+  inputMulFactor_->square2(*tmpOut_);
   outV->sumRows(*tmpOut_, 0.5, 0);
 
-  x2_ = inputV->clone(0, 0, useGpu_);
-  if (dynamic_cast<CpuSparseMatrix*>(x2_.get())) {
-    x2_->copyFrom(*inputV);
-    (dynamic_cast<CpuSparseMatrix*>(x2_.get()))->square2();
+  inputSquare_ = inputV->clone(0, 0, useGpu_);
+  if (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get())) {
+    inputSquare_->copyFrom(*inputV);
+    (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2();
   } else {
-    inputV->square2(*x2_);
+    inputV->square2(*inputSquare_);
   }
-  latentVectors_->getW()->square2(*v2_);
-  tmpOut_->mul(*x2_, *v2_);
+  latentVectors_->getW()->square2(*latentVectorsSquare_);
+  tmpOut_->mul(*inputSquare_, *latentVectorsSquare_);
   outV->sumRows(*tmpOut_, -0.5, 1.0);
 
   /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    REGISTER_TIMER_INFO("FmAtvTimer", getName().c_str());
     forwardActivation();
   }
 }
 
 void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
+  /* Do derivation */ { backwardActivation(); }
 
   const MatrixPtr& inputV = getInputValue(0);
   const MatrixPtr& oGrad = getOutputGrad();
 
-  MatrixPtr tmpSum =
-      Matrix::create(1, latentVectors_->getW()->getHeight(), false, useGpu_);
-  MatrixPtr tmpSum_T = Matrix::create(tmpSum->getRowBuf(0),
-                                      latentVectors_->getW()->getHeight(),
-                                      1,
-                                      false,
-                                      useGpu_);
+  Matrix::resizeOrCreate(
+      tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_);
+  MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0),
+                                         latentVectors_->getW()->getHeight(),
+                                         1,
+                                         false,
+                                         useGpu_);
 
   /* Calculate the gradients of the latentVectors_ matrix */
   if (latentVectors_->getWGrad()) {
-    MatrixPtr tmpIn = inputV->clone(0, 0, useGpu_);
+    MatrixPtr tmpInput = inputV->clone(0, 0, useGpu_);
     if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
-      CpuSparseMatrix* inputV_s = dynamic_cast<CpuSparseMatrix*>(inputV.get());
-      CpuSparseMatrix* x2_s = dynamic_cast<CpuSparseMatrix*>(x2_.get());
-      CpuSparseMatrix* tmpIn_s = dynamic_cast<CpuSparseMatrix*>(tmpIn.get());
-      tmpIn_s->copyFrom(*inputV_s);
-      tmpIn_s->rowScale(0, *inputV_s, *oGrad);
-      latentVectors_->getWGrad()->mul(*tmpIn_s->getTranspose(), *tmpMul_, 1, 1);
-      tmpIn_s->rowScale(0, *x2_s, *oGrad);
-
-      MatrixPtr ones = Matrix::create(1, inputV->getHeight(), false, useGpu_);
-      ones->zeroMem();
-      ones->add(-1);
-      tmpSum->mul(*ones, *tmpIn_s, 1, 0);
+      CpuSparseMatrix* sparseInputV =
+          dynamic_cast<CpuSparseMatrix*>(inputV.get());
+      CpuSparseMatrix* sparseInputSquare =
+          dynamic_cast<CpuSparseMatrix*>(inputSquare_.get());
+      CpuSparseMatrix* sparseTmpInput =
+          dynamic_cast<CpuSparseMatrix*>(tmpInput.get());
+      sparseTmpInput->copyFrom(*sparseInputV);
+      sparseTmpInput->rowScale(0, *sparseInputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1);
+      sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad);
+
+      Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_);
+      negOnes_->zeroMem();
+      negOnes_->add(-1);
+      tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0);
     } else {
-      tmpIn->rowScale(0, *inputV, *oGrad);
-      latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1);
-      tmpIn->rowScale(0, *x2_, *oGrad);
+      tmpInput->rowScale(0, *inputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *tmpInput->getTranspose(), *inputMulFactor_, 1, 1);
+      tmpInput->rowScale(0, *inputSquare_, *oGrad);
 
-      tmpSum->sumCols(*tmpIn, -1, 0);
+      tmpSum_->sumCols(*tmpInput, -1, 0);
     }
 
     latentVectors_->getWGrad()->addRowScale(
-        0, *latentVectors_->getW(), *tmpSum_T);
+        0, *latentVectors_->getW(), *tmpSumTrans);
 
     /* Increasing the number of gradient */
     latentVectors_->getParameterPtr()->incUpdate(callback);
@@ -129,10 +133,10 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
   /* Calculate the input layers gradient */
   MatrixPtr inGrad = getInputGrad(0);
   if (inGrad != NULL) {
-    MatrixPtr latentVectors_T = latentVectors_->getW()->getTranspose();
-    inGrad->mul(*tmpMul_, *latentVectors_T, 1, 1);
-    tmpSum_T->sumRows(*v2_, -1, 0);
-    inGrad->addColScale(0, *inputV, *tmpSum);
+    inGrad->mul(
+        *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1);
+    tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0);
+    inGrad->addColScale(0, *inputV, *tmpSum_);
     inGrad->rowScale(0, *inGrad, *oGrad);
   }
 }
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h
index 7cf064690f..85d40fdb1e 100644
--- a/paddle/gserver/layers/FactorizationMachineLayer.h
+++ b/paddle/gserver/layers/FactorizationMachineLayer.h
@@ -34,27 +34,36 @@ namespace paddle {
  *     y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
  * \f]
  *
+ * The detailed calculation for forward and backward can be found at this paper:
+ *
+ *     Rendle, Steffen. Factorization machines. IEEE 10th International
+ *     Conference on Data Mining (ICDM). IEEE, 2010.
+ *
  * The config file api is factorization_machine.
  */
 
 class FactorizationMachineLayer : public Layer {
 protected:
-  /// The latent vectors, shape: (size, factorSize_)
-  /// Each row of the latentVectors_ matrix is the latent vector
-  /// corresponding to one input feature dimension
+  // The latent vectors, shape: (size, factorSize_)
+  // Each row of the latentVectors_ matrix is the latent vector
+  // corresponding to one input feature dimension
   std::unique_ptr<Weight> latentVectors_;
-  /// The hyperparameter that defines the dimensionality of the factorization
+  // The hyperparameter that defines the dimensionality of the factorization
   size_t factorSize_;
 
 private:
-  /// The result of input matrix * letent vector matrix that will be used in
-  /// both forward and backward step
-  MatrixPtr tmpMul_;
+  // Store the square values of the letent vectors matrix
+  MatrixPtr latentVectorsSquare_;
+  // Store the square values of input matrix
+  MatrixPtr inputSquare_;
+  // The result of input matrix * latent vector matrix that will be used in
+  // both forward and backward step
+  MatrixPtr inputMulFactor_;
+  // Temporary calculation result store
   MatrixPtr tmpOut_;
-  /// Store the square values of the letent vectors matrix
-  MatrixPtr v2_;
-  /// Store the square values of input matrix
-  MatrixPtr x2_;
+  MatrixPrt tmpSum_;
+  // Negative identity matrix
+  MatrixPtr negOnes_;
 
 public:
   explicit FactorizationMachineLayer(const LayerConfig& config)
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 072d75c23d..04ff618c21 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -2442,6 +2442,7 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) {
 TEST(Layer, FactorizationMachineLayer) {
   for (auto useGpu : {false, true}) {
     testFactorizationMachineLayer(INPUT_DATA, useGpu);
+    testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, useGpu);
   }
 }
 
diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
index e211c23a7e..6a432cd16b 100644
--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -262,15 +262,15 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const {
 
 void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
   CHECK(getFormat() != SPARSE_CSC) << "Not supported";
-  CHECK(height_ == b.getHeight());
-  CHECK(width_ == b.getWidth());
+  CHECK_EQ(height_, b.getHeight());
+  CHECK_EQ(width_, b.getWidth());
   real* A = getValue();
   real* B = b.getValue();
   for (size_t i = 0; i < height_; i++) {
     size_t start = getRowStartIdx(i);
     size_t end = getRowStartIdx(i + 1);
-    CHECK(start == b.getRowStartIdx(i));
-    CHECK(end == b.getRowStartIdx(i + 1));
+    CHECK_EQ(start, b.getRowStartIdx(i));
+    CHECK_EQ(end, b.getRowStartIdx(i + 1));
     for (size_t j = start; j < end; j++) {
       A[j] = B[j] * c.getElement(i, cCol);
     }
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 30e334e7c8..7e38383bd6 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -7161,16 +7161,26 @@ def factorization_machine(input,
     The Factorization Machine models pairwise feature interactions as inner
     product of the learned latent vectors corresponding to each input feature.
     The Factorization Machine can effectively capture feature interactions
-    especially when the input is sparse. In practice, usually order 2 feature
-    interactions are considered using Factorization Machine with the formula:
+    especially when the input is sparse.
+
+    This implementation only considers the 2-order feature interactions using
+    Factorization Machine with the formula:
+
     .. math::
         y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+
     Note:
         X is the input vector with size n. V is the factor matrix. Each row of V
         is the latent vector corresponding to each input dimesion. The size of
         each latent vector is k.
+
+    For details of Factorization Machine, please refer to the paper:
+        Rendle, Steffen. Factorization machines. IEEE 10th International
+        Conference on Data Mining (ICDM). IEEE, 2010.
+
     .. code-block:: python
        factor_machine = factorization_machine(input=input_layer, factor_size=10)
+
     :param input: The input layer.
     :type input: LayerOutput
     :param factor_size: The hyperparameter that defines the dimensionality of

From 0b6afb589cb74c4cb24b8ee5461f1d8b12674143 Mon Sep 17 00:00:00 2001
From: wangmeng28 <wangmeng28@baidu.com>
Date: Thu, 16 Nov 2017 19:11:40 +0800
Subject: [PATCH 17/52] Fix typo in factorization machine layer

---
 paddle/gserver/layers/FactorizationMachineLayer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h
index 85d40fdb1e..85ac175657 100644
--- a/paddle/gserver/layers/FactorizationMachineLayer.h
+++ b/paddle/gserver/layers/FactorizationMachineLayer.h
@@ -61,7 +61,7 @@ private:
   MatrixPtr inputMulFactor_;
   // Temporary calculation result store
   MatrixPtr tmpOut_;
-  MatrixPrt tmpSum_;
+  MatrixPtr tmpSum_;
   // Negative identity matrix
   MatrixPtr negOnes_;
 

From 09f4f9257981dc3744e9131dabcebebaa5eb7f91 Mon Sep 17 00:00:00 2001
From: wangmeng28 <wangmeng28@baidu.com>
Date: Thu, 16 Nov 2017 20:33:25 +0800
Subject: [PATCH 18/52] Add unitest for factorization machine layer with sparse
 input

---
 paddle/gserver/tests/test_LayerGrad.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 589db0bd6c..7ad9866ecf 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -2444,8 +2444,8 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) {
 TEST(Layer, FactorizationMachineLayer) {
   for (auto useGpu : {false, true}) {
     testFactorizationMachineLayer(INPUT_DATA, useGpu);
-    testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, useGpu);
   }
+  testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false);
 }
 
 int main(int argc, char** argv) {

From d5a6c81dc55057ba437efe417992c0521e87c754 Mon Sep 17 00:00:00 2001
From: wangmeng28 <wangmeng28@baidu.com>
Date: Mon, 20 Nov 2017 11:48:52 +0800
Subject: [PATCH 19/52] Update docs for factorization machine layer

---
 paddle/gserver/layers/FactorizationMachineLayer.h | 5 ++---
 python/paddle/trainer_config_helpers/layers.py    | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h
index 85ac175657..3bc36daaab 100644
--- a/paddle/gserver/layers/FactorizationMachineLayer.h
+++ b/paddle/gserver/layers/FactorizationMachineLayer.h
@@ -36,8 +36,7 @@ namespace paddle {
  *
  * The detailed calculation for forward and backward can be found at this paper:
  *
- *     Rendle, Steffen. Factorization machines. IEEE 10th International
- *     Conference on Data Mining (ICDM). IEEE, 2010.
+ *     Factorization machines.
  *
  * The config file api is factorization_machine.
  */
@@ -59,7 +58,7 @@ private:
   // The result of input matrix * latent vector matrix that will be used in
   // both forward and backward step
   MatrixPtr inputMulFactor_;
-  // Temporary calculation result store
+  // Store temporary calculation result
   MatrixPtr tmpOut_;
   MatrixPtr tmpSum_;
   // Negative identity matrix
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index cc1bf923dd..37214a53d3 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -3876,7 +3876,7 @@ def recurrent_layer(input,
     :type input: LayerOutput
     :param act: Activation type. TanhActivation is the default activation.
     :type act: BaseActivation
-    :param bias_attr: The parameter attribute for bias. If this parameter is set to 
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
                       False or an object whose type is not ParameterAttribute,
                       no bias is defined. If the parameter is set to True,
                       the bias is initialized to zero.
@@ -7307,8 +7307,7 @@ def factorization_machine(input,
         each latent vector is k.
 
     For details of Factorization Machine, please refer to the paper:
-        Rendle, Steffen. Factorization machines. IEEE 10th International
-        Conference on Data Mining (ICDM). IEEE, 2010.
+    Factorization machines.
 
     .. code-block:: python
        factor_machine = factorization_machine(input=input_layer, factor_size=10)

From 6fed6f2079902c86c43161f916c3450094fde6d0 Mon Sep 17 00:00:00 2001
From: wangmeng28 <wangmeng28@baidu.com>
Date: Mon, 20 Nov 2017 20:44:52 +0800
Subject: [PATCH 20/52] Add support of sparse_binary_vector as input for fm
 layer

---
 .../layers/FactorizationMachineLayer.cpp      | 20 +++++++++-----
 .../layers/FactorizationMachineLayer.h        |  1 +
 paddle/math/CpuSparseMatrix.cpp               | 26 ++++++++++++++-----
 3 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp
index f0f1738f30..b665fb6dfc 100644
--- a/paddle/gserver/layers/FactorizationMachineLayer.cpp
+++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp
@@ -96,15 +96,20 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
 
   /* Calculate the gradients of the latentVectors_ matrix */
   if (latentVectors_->getWGrad()) {
-    MatrixPtr tmpInput = inputV->clone(0, 0, useGpu_);
     if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+      Matrix::resizeOrCreateSparseMatrix(tmpInput_,
+                                         inputV->getHeight(),
+                                         inputV->getWidth(),
+                                         inputV->getElementCnt());
+
       CpuSparseMatrix* sparseInputV =
           dynamic_cast<CpuSparseMatrix*>(inputV.get());
       CpuSparseMatrix* sparseInputSquare =
           dynamic_cast<CpuSparseMatrix*>(inputSquare_.get());
       CpuSparseMatrix* sparseTmpInput =
-          dynamic_cast<CpuSparseMatrix*>(tmpInput.get());
+          dynamic_cast<CpuSparseMatrix*>(tmpInput_.get());
       sparseTmpInput->copyFrom(*sparseInputV);
+
       sparseTmpInput->rowScale(0, *sparseInputV, *oGrad);
       latentVectors_->getWGrad()->mul(
           *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1);
@@ -115,12 +120,15 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
       negOnes_->add(-1);
       tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0);
     } else {
-      tmpInput->rowScale(0, *inputV, *oGrad);
+      Matrix::resizeOrCreate(
+          tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
+
+      tmpInput_->rowScale(0, *inputV, *oGrad);
       latentVectors_->getWGrad()->mul(
-          *tmpInput->getTranspose(), *inputMulFactor_, 1, 1);
-      tmpInput->rowScale(0, *inputSquare_, *oGrad);
+          *tmpInput_->getTranspose(), *inputMulFactor_, 1, 1);
+      tmpInput_->rowScale(0, *inputSquare_, *oGrad);
 
-      tmpSum_->sumCols(*tmpInput, -1, 0);
+      tmpSum_->sumCols(*tmpInput_, -1, 0);
     }
 
     latentVectors_->getWGrad()->addRowScale(
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h
index 3bc36daaab..df20a49934 100644
--- a/paddle/gserver/layers/FactorizationMachineLayer.h
+++ b/paddle/gserver/layers/FactorizationMachineLayer.h
@@ -61,6 +61,7 @@ private:
   // Store temporary calculation result
   MatrixPtr tmpOut_;
   MatrixPtr tmpSum_;
+  MatrixPtr tmpInput_;
   // Negative identity matrix
   MatrixPtr negOnes_;
 
diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
index 6a432cd16b..dc6979cf5a 100644
--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -266,13 +266,25 @@ void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
   CHECK_EQ(width_, b.getWidth());
   real* A = getValue();
   real* B = b.getValue();
-  for (size_t i = 0; i < height_; i++) {
-    size_t start = getRowStartIdx(i);
-    size_t end = getRowStartIdx(i + 1);
-    CHECK_EQ(start, b.getRowStartIdx(i));
-    CHECK_EQ(end, b.getRowStartIdx(i + 1));
-    for (size_t j = start; j < end; j++) {
-      A[j] = B[j] * c.getElement(i, cCol);
+  if (b.getValueType() == FLOAT_VALUE) {
+    for (size_t i = 0; i < height_; i++) {
+      size_t start = getRowStartIdx(i);
+      size_t end = getRowStartIdx(i + 1);
+      CHECK_EQ(start, b.getRowStartIdx(i));
+      CHECK_EQ(end, b.getRowStartIdx(i + 1));
+      for (size_t j = start; j < end; j++) {
+        A[j] = B[j] * c.getElement(i, cCol);
+      }
+    }
+  } else if (b.getValueType() == NO_VALUE) {
+    for (size_t i = 0; i < height_; i++) {
+      size_t start = getRowStartIdx(i);
+      size_t end = getRowStartIdx(i + 1);
+      CHECK_EQ(start, b.getRowStartIdx(i));
+      CHECK_EQ(end, b.getRowStartIdx(i + 1));
+      for (size_t j = start; j < end; j++) {
+        A[j] = c.getElement(i, cCol);
+      }
     }
   }
 }

From 74a699a72ef9046a7f302e339c8e20a8152ae9d8 Mon Sep 17 00:00:00 2001
From: wangmeng28 <wangmeng28@baidu.com>
Date: Mon, 20 Nov 2017 22:14:24 +0800
Subject: [PATCH 21/52] change clone to resizeOrCreate in fm layer

---
 .../gserver/layers/FactorizationMachineLayer.cpp   | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp
index b665fb6dfc..be26b9ba88 100644
--- a/paddle/gserver/layers/FactorizationMachineLayer.cpp
+++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp
@@ -58,16 +58,22 @@ void FactorizationMachineLayer::forward(PassType passType) {
       inputMulFactor_, batchSize, factorSize_, false, useGpu_);
   Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
 
-  REGISTER_TIMER_INFO("InputMulFactorTimer", getName().c_str());
+  REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str());
   inputMulFactor_->mul(*inputV, *latentVectors_->getW());
   inputMulFactor_->square2(*tmpOut_);
   outV->sumRows(*tmpOut_, 0.5, 0);
 
-  inputSquare_ = inputV->clone(0, 0, useGpu_);
-  if (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get())) {
+  if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+    Matrix::resizeOrCreateSparseMatrix(inputSquare_,
+                                       inputV->getHeight(),
+                                       inputV->getWidth(),
+                                       inputV->getElementCnt(),
+                                       inputV->getValueType());
     inputSquare_->copyFrom(*inputV);
     (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2();
   } else {
+    Matrix::resizeOrCreate(
+        inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
     inputV->square2(*inputSquare_);
   }
   latentVectors_->getW()->square2(*latentVectorsSquare_);
@@ -75,7 +81,7 @@ void FactorizationMachineLayer::forward(PassType passType) {
   outV->sumRows(*tmpOut_, -0.5, 1.0);
 
   /* activation */ {
-    REGISTER_TIMER_INFO("FmAtvTimer", getName().c_str());
+    REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str());
     forwardActivation();
   }
 }

From 7fe61a7fa823e2b611ca42aacad76f5ca02a7217 Mon Sep 17 00:00:00 2001
From: Kavya Srinet <kavyasrinet@baidu.com>
Date: Wed, 22 Nov 2017 10:55:28 -0800
Subject: [PATCH 22/52] Editing and re-writing parts of Data Reader design doc

---
 doc/design/reader/README.md | 70 ++++++++++++++++++++-----------------
 1 file changed, 37 insertions(+), 33 deletions(-)

diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md
index 320dccec3d..2cd4b6225b 100644
--- a/doc/design/reader/README.md
+++ b/doc/design/reader/README.md
@@ -1,25 +1,25 @@
 # Python Data Reader Design Doc
 
-At training and testing time, PaddlePaddle programs need to read data. To ease the users' work to write data reading code, we define that
+During the training and testing phases, PaddlePaddle programs need to read data. To help users write code that reads input data, we define the following:
 
-- A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items.
-- A *reader creator* is a function that returns a reader function.
-- A *reader decorator* is a function, which accepts one or more readers, and returns a reader.
-- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.
+- A *reader*: A function that reads data (from file, network, random number generator, etc) and yields the data items.
+- A *reader creator*: A function that returns a reader function.
+- A *reader decorator*: A function, which takes in one or more readers, and returns a reader.
+- A *batch reader*: A function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.
 
-and provide function which converts reader to batch reader, frequently used reader creators and reader decorators.
+and also provide a function which can convert a reader to a batch reader, frequently used reader creators and reader decorators.
 
 ## Data Reader Interface
 
-Indeed, *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`):
+*Data reader* doesn't have to be a function that reads and yields data items. It can just be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`) as follows:
 
 ```
 iterable = data_reader()
 ```
 
-Element produced from the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of items. Item should be of [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int)
+The item produced from the iterable should be a **single** entry of data and **not** a mini batch. The entry of data could be a single item or a tuple of items. Item should be of one of the [supported types](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int etc.)
 
-An example implementation for single item data reader creator:
+An example implementation for single item data reader creator is as follows:
 
 ```python
 def reader_creator_random_image(width, height):
@@ -29,7 +29,7 @@ def reader_creator_random_image(width, height):
     return reader
 ```
 
-An example implementation for multiple item data reader creator:
+An example implementation for multiple item data reader creator is as follows:
 ```python
 def reader_creator_random_image_and_label(width, height, label):
     def reader():
@@ -40,9 +40,10 @@ def reader_creator_random_image_and_label(width, height, label):
 
 ## Batch Reader Interface
 
-*batch reader* can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple.
+*Batch reader* can be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list should be a tuple.
+
+Here are some valid outputs:
 
-Here are valid outputs:
 ```python
 # a mini batch of three data items. Each data item consist three columns of data, each of which is 1.
 [(1, 1, 1),
@@ -58,20 +59,22 @@ Here are valid outputs:
 Please note that each item inside the list must be a tuple, below is an invalid output:
 ```python
  # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],).
- # Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
- # or three column of datas, each of which is 1.
+ # Otherwise it is ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
+ # or three columns of data, each of which is 1.
 [[1,1,1],
 [2,2,2],
 [3,3,3]]
 ```
 
-It's easy to convert from reader to batch reader:
+It is easy to convert from a reader to a batch reader:
+
 ```python
 mnist_train = paddle.dataset.mnist.train()
 mnist_train_batch_reader = paddle.batch(mnist_train, 128)
 ```
 
-Also easy to create custom batch reader:
+It is also straightforward to create a custom batch reader:
+
 ```python
 def custom_batch_reader():
     while True:
@@ -85,7 +88,8 @@ mnist_random_image_batch_reader = custom_batch_reader
 
 ## Usage
 
-batch reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`:
+Following is how we can use the reader with PaddlePaddle:
+The batch reader, a mapping from item(s) to data layer, the batch size and the number of total passes will be passed into `paddle.train` as follows:
 
 ```python
 # two data layer is created:
@@ -99,13 +103,13 @@ paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...)
 
 ## Data Reader Decorator
 
-*Data reader decorator* takes a single or multiple data reader, returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` syntax.
+The *Data reader decorator* takes in a single reader or multiple data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` in the syntax.
 
-Since we have a strict interface for data readers (no parameter, return a single data item). Data reader can be used flexiable via data reader decorators. Following are a few examples:
+Since we have a strict interface for data readers (no parameters and return a single data item), a data reader can be used in a flexible way using data reader decorators. Following are a few examples:
 
 ### Prefetch Data
 
-Since reading data may take time and training can not proceed without data. It is generally a good idea to prefetch data.
+Since reading data may take some time and training can not proceed without data, it is generally a good idea to prefetch the data.
 
 Use `paddle.reader.buffered` to prefetch data:
 
@@ -117,9 +121,9 @@ buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100)
 
 ### Compose Multiple Data Readers
 
-For example, we want to use a source of real images (reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
+For example, suppose we want to use a source of real images (say, reusing the mnist dataset) and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
 
-We can do:
+We can do the following:
 
 ```python
 def reader_creator_random_image(width, height):
@@ -139,13 +143,13 @@ false_reader = reader_creator_bool(False)
 
 reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader)
 # Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry.
-# And we don't care second item at this time.
+# And we don't care about the second item at this time.
 paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
 ```
 
 ### Shuffle
 
-Given shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader that buffers `n` data entries and shuffle them before a data entry is read.
+Given the shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read.
 
 Example:
 ```python
@@ -154,21 +158,21 @@ reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512)
 
 ## Q & A
 
-### Why reader return only a single entry, but not a mini batch?
+### Why does a reader return only a single entry, and not a mini batch?
 
-Always returning a single entry make reusing existing data readers much easier (e.g., if existing reader return not a single entry but 3 entries, training code will be more complex because it need to handle cases like batch size 2).
+Returning a single entry makes reusing existing data readers much easier (for example, if an existing reader returns 3 entries instead of a single entry, the training code will be more complicated because it needs to handle cases like a batch size of 2).
 
-We provide function `paddle.batch` to turn (single entry) reader into batch reader.
+We provide the function `paddle.batch` to turn a (single entry) reader into a batch reader.
 
-### Why do we need batch reader, isn't train take reader and batch_size as arguments sufficient?
+### Why do we need a batch reader, isn't it sufficient to give the reader and batch_size as arguments during training?
 
-In most of the case, train taking reader and batch_size as arguments would be sufficent. However sometimes user want to customize order of data entries inside a mini batch. Or even change batch size dynamically.
+In most cases, it would be sufficient to give the reader and batch_size as arguments to the train method. However, sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. For these cases, using a batch reader is very efficient and helpful.
 
-### Why use a dictionary but not a list to provide mapping?
+### Why use a dictionary instead of a list to provide mapping?
 
-We decided to use dictionary (`{"image":0, "label":1}`) instead of list (`["image", "label"]`) is because that user can easily resue item (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip item (e.g., using `{"image_a":0, "label":2}`).
+Using a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) gives the advantage that the user can easily reuse the items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or even skip an item (e.g., using `{"image_a":0, "label":2}`).
 
-### How to create custom data reader creator
+### How to create a custom data reader creator?
 
 ```python
 def image_reader_creator(image_path, label_path, n):
@@ -192,7 +196,7 @@ paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...)
 
 ### How is `paddle.train` implemented
 
-An example implementation of paddle.train could be:
+An example implementation of paddle.train is:
 
 ```python
 def train(batch_reader, mapping, batch_size, total_pass):

From 7046e0249a45b00729c551d0d1ecd64af2c06af5 Mon Sep 17 00:00:00 2001
From: Kavya Srinet <kavyasrinet@baidu.com>
Date: Wed, 22 Nov 2017 18:35:05 -0800
Subject: [PATCH 23/52] Updated the design doc for distributed training
 architecture

---
 .../refactor/distributed_architecture.md      | 168 +++++-------------
 1 file changed, 45 insertions(+), 123 deletions(-)

diff --git a/doc/design/refactor/distributed_architecture.md b/doc/design/refactor/distributed_architecture.md
index ac7e98ccf1..2b4f921ae9 100644
--- a/doc/design/refactor/distributed_architecture.md
+++ b/doc/design/refactor/distributed_architecture.md
@@ -2,106 +2,70 @@
 
 ## Abstract
 
-PaddlePaddle v0.10.0 uses the "trainer-parameter server"
-architecture. We run multiple replicated instances of trainers (runs
-the same code written by the user) and parameter servers for
-distributed training. This architecture served us well, but has some
-limitations:
+PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has a few limitations:
 
-1. Need to write special code to handle tasks which should only be run
-  by a single trainer. E.g., initializing model and saving model.
+1. There is a need to write special code that handles tasks which should only be run on a single trainer. E.g., initializing the model, saving the model etc.
 
-2. Model parallelism is hard: need to write if-else branches conditioned
-  on the trainer ID to partition model onto each trainer, and manually
-  write the inter-model-shard communication code.
+2. Model parallelism is hard: the user needs to write if-else branches conditioned on the trainer ID to partition the model onto the trainers, and manually write the inter-model-shard communication code to communicate between different trainers.
 
-3. The user can not directly specify the parameter update rule: need
-   to modify the parameter server C++ code and compile a new
-   binary. This adds complication for researchers: A lot of extra
-   effort is required. Besides, the training job submission program
-   may not allow running arbitrary binaries.
+3. The user can not directly specify the parameter update rule: doing so requires modifying the parameter server code and compiling a new binary. This makes things more complicated for researchers: a lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries.
 
-This design doc discusses PaddlePaddle's new distributed training
-architecture that addresses the above limitations.
+This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations.
 
 ## Analysis
 
-We will assume the user writes the trainer program by Python, the same
-analysis holds if the trainer program is written in C++.
+The assumption is that the user writes the trainer program in either Python or C++.
 
 ### Limitation 1
 
-If we look at the Python code that the user writes, there are two
-kinds of functionalities:
+There are two basic functionalities in the trainer program:
 
-- The training logic such as load / save model and print log.
-- The neural network definition such as the definition of the data
-  layer, the fully connected layer, the cost function and the
+1. The training logic such as loading / saving the model and printing out the logs.
+2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the
   optimizer.
 
-When we training with PaddlePaddle v0.10.0 distributedly, multiple
-replicated Python instances are running on different nodes: both the
-training logic and the neural network computation is replicated.
+When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both the
+training logic and the neural network computation logic are replicated.
 
-The tasks that should only run once all belong to the training logic,
-if we only replicate the neural network computation, but do **not**
-replicate the training logic, the limitation could be solved.
+The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not**
+replicate the training logic, the limitation mentioned above can be avoided.
 
 ### Limitation 2
 
-Model parallelism means running a single model on multiple nodes by
-partitioning the model onto different nodes and managing the
-inter-model-shard communications.
+Model parallelism means that a single model is partitioned into different components and each node runs one of the components separately. This comes at the extra cost of managing the
+inter-model-shard communication between nodes.
 
-PaddlePaddle should be able to modify the nerual network computation
-definition to support model parallelism automatically. However, the
-computation is only specified in Python code, and PaddlePaddle can not
-modify Python code.
+PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the
+computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup.
 
-Just like compiler uses a intermediate representation (IR) so that
-programmer does not need to manually optimize their code in most of
-the cases - the compiler will optimize the IR:
+Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
 
 <img src="src/compiler.png"/>
 
-We can have our own IR too: PaddlePaddle can support model parallel by
-converting the IR so the user no longer need to manually do it in
-Python:
+PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
 
 <img src="src/paddle-compile.png"/>
 
-The IR for PaddlePaddle after refactor is called `Block`, it specifies
-the computation dependency graph and the variables used in the
-computation.
+The IR for PaddlePaddle after refactoring is called a `Block`; it specifies the computation dependency graph and the variables used in the computation.
 
 ### Limitation 3
 
-The user can not directly specify the parameter update rule for the
-parameter server because the parameter server does not use the same
-computation definition as the trainer. Instead, the update rule is
-baked in the parameter server. The user can not specify the update
-rule in the same way of specifying the trainer computation.
+The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.
 
-This could be fixed by making the parameter server run the same
-computation definition as the trainer. For a detailed explanation,
-please
-see
+This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document -
 [Design Doc: Operation Graph Based Parameter Server](./dist_train.md)
 
 ## Distributed Training Architecture
 
-The new distributed training architecture can address the above
-limitations. Below is the illustration:
+The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
 
 <img src="src/distributed_architecture.png"/>
 
-The architecture includes major components: *PaddlePaddle Python*,
-*PaddlePaddle converter* and *PaddlePaddle runtime*:
+The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*.
 
 ### PaddlePaddle Python
 
-PaddlePaddle Python is the Python library that user's Python trainer
-invoke to build the neural network topology, start training, etc.
+PaddlePaddle Python is the Python library that the user's Python code invokes to read the data, build the neural network topology, start training, etc.
 
 ```Python
 paddle.init()
@@ -117,102 +81,60 @@ for i in range(1000):
 	print cost_val
 ```
 
-The code above is a typical Python trainer code, the neural network
-topology is built using helper functions such as
-`paddle.layer.fc`. The training is done by calling `session.eval`
-iteratively.
+The above code is a typical Python trainer code: the neural network topology is built using helper functions such as `paddle.layer.fc`. Training is done by calling `session.eval` iteratively.
 
 #### session.eval
 
-As shown in the graph, `session.eval` sends the IR and the evaluation
-inputs/targets to the PaddlePaddle cluster for evaluation. The
-targets can be any variable in the computation graph. When the target
-is the `optimizer` variable, the neural network will be optimized
-once. When the target is the `cost` variable, `session.eval` returns
-the cost value.
+As shown in the graph, `session.eval` sends the IR and the evaluation inputs/targets to the PaddlePaddle cluster for evaluation.
+The targets can be any variable in the computation graph. When the target is, say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken.
 
-The Python `session` is a wrapper of the C++ `Session` class. For more
-information about `Session`, please
-see [Design Doc: Session](./session.md).
+The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document - [Design Doc: Session](./session.md).
 
 ### PaddlePaddle Converter
 
-PaddlePaddle converter automatically converts the IR in the request
-(IR and evaluation inputs/targets) from PaddlePaddle Python to new
-partitioned IRs and dispatch the new IRs and evaluation inputs/targets
-to different PaddlePaddle runtimes. Below are the steps:
+The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed:
 
-1. Add `feed` OP that feeds the eval inputs, and `fetch` OP that
-   fetches the eval targets to the IR.
+1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR.
 
-1. Extract a new computation (sub)graph with `feed` and `fetch` OP as
-   the boundary. The runtime does not need to run the OP that is not
-   dependent by the `fetch` OP.
+2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OP that is not dependent on the `fetch` OP.
 
-1. Optimizes the computation graph.
+3. Optimize the computation graph.
 
-1. Place the OPs in the graph onto different devices on different
-   PaddlePaddle runtime according to a placement algorithm and device
-   constraint specified by the user.
+4. Place the OPs in the graph onto different devices on different PaddlePaddle runtimes according to a placement algorithm and the device constraints specified by the user.
 
-1. Partition the graph according to runtime boundaries and add `send` /
-   `recv` OP pair on the runtime boundaries.
+5. Partition the graph according to runtime boundaries and add `send` / `recv` OP pair on the runtime boundaries.
 
-1. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+6. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+
+7. PaddlePaddle runtimes with the `fetch` OP report evaluation results back to the converter, and the converter reports the evaluation results back to PaddlePaddle Python.
 
-1. PaddlePaddle runtimes with the `fetch` OP reports evaluation
-   results back to the converter, the convert reports the evaluation
-   results back to the PaddlePaddle Python.
-   
 The output IRs will be cached to optimize the conversion latency.
 
 
 #### Placement Algorithm
 
-Our first implementation will only support "trainer-parameter server"
-placement: the parameters, initializers, and optimizers are placed on
-the PaddlePaddle runtimes with the parameter server role. And
-everything else will be placed on the PaddlePaddle runtimes with the
-trainer role. This has the same functionality of our
-"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but
-is more general and flexible.
+Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.
 
-In the future, we will implement the general placement algorithm,
-which makes placements according to the input IR, and a model of
-device computation time and device communication time. Model
-parallelism requires the general placement algorithm.
+In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
 
 
 ### PaddlePaddle Runtime
 
-The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and
-runs the IR. The runtime does not need to do OP placement since it's
-already done by the converter.
+The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter.
 
 
 ### Local Training Architecture
 
-The local training architecture will be the same as the distributed
-training architecture, the differences are everything runs locally,
-and there is just one PaddlePaddle runtime:
+The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
 
 <img src="src/local_architecture.png"/>
 
 
 ### Training Data
 
-In PaddlePaddle v0.10.0, training data is typically read
-with [data reader](../reader/README.md) from Python. This approach is
-no longer efficient when training distributedly since the Python
-process no longer runs on the same node with the trainer processes,
-the Python reader will need to read from the distributed filesystem
-(assuming it has the access) and send to the trainers, doubling the
-network traffic.
-
-When doing distributed training, the user can still use Python data
-reader: the training data are sent with `session.eval`. However should
-be used for debugging purpose only. The users are encouraged to use
-the read data OPs.
+In PaddlePaddle v0.10.0, training data is typically read with a [data reader](../reader/README.md) from Python. This approach is no longer efficient when training in a distributed fashion since the Python process no longer runs on the same node with the trainer processes. The Python reader will need to read from the distributed filesystem (assuming it has the required access) and send to the trainers, doubling the network traffic.
+
+When doing distributed training, the user can still use a Python data reader: the training data are sent with `session.eval`. However, this should be used for debugging purposes only. The users are encouraged to use the read data OPs.
 
 
 ## References:

From 6b29904bad2e38ea6a717af9bec2d2ac7ffe070e Mon Sep 17 00:00:00 2001
From: wanghaoshuang <wanghaoshuang@baidu.com>
Date: Thu, 23 Nov 2017 19:05:31 +0800
Subject: [PATCH 24/52] Add size, height and width for crop layer. Add size for
 switch order layer

---
 python/paddle/trainer/config_parser.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 5ba0e50c6b..9510194576 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2401,6 +2401,15 @@ class CropLayer(LayerBase):
         image_conf.channels = input_layer.size / (input_layer.width *
                                                   input_layer.height)
 
+        if (len(self.config.inputs) == 2):
+            self.set_layer_height_width(
+                self.get_input_layer(1).height, self.get_input_layer(1).width)
+            self.set_layer_size(self.get_input_layer(1).size)
+        else:
+            # NCHW order
+            self.set_layer_height_width(shape[-2], shape[-1])
+            self.set_layer_size(reduce(lambda x, y: x * y, shape))
+
 
 @config_layer('batch_norm')
 class BatchNormLayer(LayerBase):
@@ -3850,6 +3859,16 @@ class SwitchOrderLayer(LayerBase):
             name, 'switch_order', 0, inputs=inputs, **xargs)
         self.config.reshape_conf.height_axis.extend(reshape['height'])
         self.config.reshape_conf.width_axis.extend(reshape['width'])
+        input_layer = self.get_input_layer(0)
+        if reshape is None:
+            self.set_layer_size(input_layer.size)
+        else:
+            inH = input_layer.height
+            inW = input_layer.width
+            inC = input_layer.size / inH / inW
+            out_dims = [0, inH, inW, inC]
+            size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:])
+            self.set_layer_size(size)
 
 
 @config_layer('scale_sub_region')

From e4c8de9ef5be7ea866d8e6c831ba9cb86ddaac54 Mon Sep 17 00:00:00 2001
From: ranqiu <ranqiu@baidu.com>
Date: Fri, 24 Nov 2017 11:45:51 +0800
Subject: [PATCH 25/52] Update the annotations of layers.py

---
 .../paddle/trainer_config_helpers/layers.py   | 110 ++++++++++--------
 1 file changed, 63 insertions(+), 47 deletions(-)

diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 8e127c9489..469e667e80 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1900,9 +1900,12 @@ def repeat_layer(input,
     A layer for repeating the input for num_repeats times.
 
     If as_row_vector:
+
     .. math::
        y  = [x_1,\cdots, x_n, \cdots, x_1, \cdots, x_n]
+
     If not as_row_vector:
+
     .. math::
        y  = [x_1,\cdots, x_1, \cdots, x_n, \cdots, x_n]
 
@@ -1915,19 +1918,19 @@ def repeat_layer(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param num_repeats: Repeat the input so many times
+    :param num_repeats: The number of times to repeat the input.
     :type num_repeats: int
     :param name: The name of this layer. It is optional.
-    :param as_row_vector: True for treating input as row vector and repeating
-                          in the column direction.  This is equivalent to apply
-                          concat_layer() with num_repeats same input.
-                          False for treating input as column vector and repeating
-                          in the row direction.
+    :type name: basestring
+    :param as_row_vector: Whether to treat the input as row vectors or not. If
+                          the parameter is set to True, the repeating operation
+                          will be performed in the column direction. Otherwise,
+                          it will be performed in the row direction.
     :type as_row_vector: bool
     :param act: Activation type. IdentityActivation is the default activation.
     :type act: BaseActivation
-    :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1974,13 +1977,14 @@ def seq_reshape_layer(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param reshape_size: the size of reshaped sequence.
+    :param reshape_size: The dimension of the reshaped sequence.
     :type reshape_size: int
     :param name: The name of this layer. It is optional.
     :type name: basestring
     :param act: Activation type. IdentityActivation is the default activation.
     :type act: BaseActivation
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :param bias_attr: The bias attribute. If the parameter is set to False or an object
                       whose type is not ParameterAttribute, no bias is defined. If the
@@ -2008,7 +2012,7 @@ def seq_reshape_layer(input,
 @layer_support()
 def interpolation_layer(input, weight, name=None, layer_attr=None):
     """
-    This layer is for linear interpolation with two inputs,
+    This layer performs linear interpolation on two inputs,
     which is used in NEURAL TURING MACHINE.
 
     .. math::
@@ -2030,7 +2034,8 @@ def interpolation_layer(input, weight, name=None, layer_attr=None):
     :type weight: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2064,7 +2069,7 @@ def bilinear_interp_layer(input,
                           name=None,
                           layer_attr=None):
     """
-    This layer is to implement bilinear interpolation on conv layer output.
+    This layer implements bilinear interpolation on a convolutional layer's output.
 
     Please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation
 
@@ -2074,18 +2079,19 @@ def bilinear_interp_layer(input,
 
        bilinear = bilinear_interp_layer(input=layer1, out_size_x=64, out_size_y=64)
 
-    :param   input:        A input layer.
-    :type    input:        LayerOutput.
-    :param   out_size_x:   bilinear interpolation output width.
-    :type    out_size_x:   int | None
-    :param   out_size_y:   bilinear interpolation output height.
-    :type    out_size_y:   int | None
-    :param   name:         The layer's name, which cna not be specified.
-    :type    name:         None | basestring
-    :param   layer_attr:   Extra Layer attribute.
-    :type    layer_attr:   ExtraLayerAttribute
+    :param input: The input of this layer.
+    :type input: LayerOutput.
+    :param out_size_x: The width of the output.
+    :type out_size_x: int
+    :param out_size_y: The height of the output.
+    :type out_size_y: int
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
-    :rtype:  LayerOutput
+    :rtype: LayerOutput
     """
     assert input.layer_type == LayerType.CONV_LAYER
     assert isinstance(input.activation, LinearActivation)
@@ -2120,8 +2126,8 @@ def power_layer(input, weight, name=None, layer_attr=None):
     .. math::
        y = x^w
 
-    where :math:`x` is a input vector, :math:`w` is scalar weight,
-    and :math:`y` is a output vector.
+    where :math:`x` is an input vector, :math:`w` is a scalar exponent,
+    and :math:`y` is an output vector.
 
     The example usage is:
 
@@ -2131,11 +2137,12 @@ def power_layer(input, weight, name=None, layer_attr=None):
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param weight: Weight layer.
+    :param weight: The exponent of the power.
     :type weight: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2175,11 +2182,12 @@ def scaling_layer(input, weight, name=None, layer_attr=None):
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param weight: Weight layer.
+    :param weight: The weight of each sample.
     :type weight: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2217,7 +2225,8 @@ def trans_layer(input, name=None, layer_attr=None):
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2253,11 +2262,14 @@ def rotate_layer(input, height, width, name=None, layer_attr=None):
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param height: The height of the sample matrix
+    :param height: The height of the sample matrix.
     :type height: int
+    :param width: The width of the sample matrix.
+    :type width: int
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2302,15 +2314,15 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param a: input layer a
+    :param a: The first input of this layer.
     :type a: LayerOutput
-    :param b: input layer b
+    :param b: The second input of this layer.
     :type b: LayerOutput
-    :param scale: scale for cosine value. default is 5.
+    :param scale: The scale of the cosine similarity. 1 is the default value.
     :type scale: float
-    :param size: layer size. NOTE size_a * size should equal size_b.
+    :param size: The dimension of this layer. NOTE size_a * size should equal size_b.
     :type size: int
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2395,8 +2407,10 @@ def hsigmoid(input,
     """
     Organize the classes into a binary tree. At each node, a sigmoid function
     is used to calculate the probability of belonging to the right branch.
-    This idea is from "F. Morin, Y. Bengio (AISTATS 05):
-    Hierarchical Probabilistic Neural Network Language Model."
+
+    Reference:
+        `Hierarchical Probabilistic Neural Network Language Model
+        <http://www.gatsby.ucl.ac.uk/aistats/fullpapers/208.pdf>`_
 
     The example usage is:
 
@@ -2407,19 +2421,21 @@ def hsigmoid(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput | list | tuple
-    :param label: Label layer.
+    :param label: The input label.
     :type label: LayerOutput
-    :param num_classes: number of classes.
-    :type num_classes: int | None
+    :param num_classes: The number of classes. And it should be larger than 2. If the parameter
+                        is not set or set to None, its actual value will be automatically set to
+                        the number of labels.
+    :type num_classes: int
     :param name: The name of this layer. It is optional.
     :type name: basestring
     :param bias_attr: The bias attribute. If the parameter is set to False or an object
                       whose type is not ParameterAttribute, no bias is defined. If the
                       parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: Parameter Attribute. None means default parameter.
-    :type param_attr: ParameterAttribute | None
-    :param layer_attr: Extra Layer Attribute.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -4241,7 +4257,7 @@ def dot_prod_layer(input1, input2, name=None, layer_attr=None):
     :param name: The name of this layer. It is optional.
     :type name: basestring
     :param input1: The first input layer.
-    :type input: LayerOutput
+    :type input1: LayerOutput
     :param input2: The second input layer.
     :type input2: LayerOutput
     :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for

From 52be2a2a86f4f1cd74dc12a989341f699c67b9ed Mon Sep 17 00:00:00 2001
From: wanghaoshuang <wanghaoshuang@baidu.com>
Date: Fri, 24 Nov 2017 15:41:04 +0800
Subject: [PATCH 26/52] Add depth dim

---
 python/paddle/trainer/config_parser.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 9510194576..b342a90fb6 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -3865,9 +3865,18 @@ class SwitchOrderLayer(LayerBase):
         else:
             inH = input_layer.height
             inW = input_layer.width
-            inC = input_layer.size / inH / inW
-            out_dims = [0, inH, inW, inC]
-            size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:])
+            if input_layer.has_depth():
+                inD = input_layer.depth
+                inC = input_layer.size / inH / inW / inD
+                out_dims = [0, inD, inH, inW, inC]
+                size = reduce(lambda x, y: x * y,
+                              out_dims[reshape['width'][0]:])
+            else:
+                inC = input_layer.size / inH / inW
+                out_dims = [0, inH, inW, inC]
+                size = reduce(lambda x, y: x * y,
+                              out_dims[reshape['width'][0]:])
+
             self.set_layer_size(size)
 
 

From 6ace929c3d330bf427465a2dc720a77e7d6b50ed Mon Sep 17 00:00:00 2001
From: wanghaoshuang <wanghaoshuang@baidu.com>
Date: Fri, 24 Nov 2017 18:30:35 +0800
Subject: [PATCH 27/52] Rename variable name.

---
 python/paddle/trainer/config_parser.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index b342a90fb6..9ec6ba6347 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -3863,17 +3863,17 @@ class SwitchOrderLayer(LayerBase):
         if reshape is None:
             self.set_layer_size(input_layer.size)
         else:
-            inH = input_layer.height
-            inW = input_layer.width
+            in_h = input_layer.height
+            in_w = input_layer.width
             if input_layer.has_depth():
-                inD = input_layer.depth
-                inC = input_layer.size / inH / inW / inD
-                out_dims = [0, inD, inH, inW, inC]
+                in_d = input_layer.depth
+                in_c = input_layer.size / in_h / in_w / in_d
+                out_dims = [0, in_d, in_h, in_w, in_c]
                 size = reduce(lambda x, y: x * y,
                               out_dims[reshape['width'][0]:])
             else:
-                inC = input_layer.size / inH / inW
-                out_dims = [0, inH, inW, inC]
+                in_c = input_layer.size / in_h / in_w
+                out_dims = [0, in_h, in_w, in_c]
                 size = reduce(lambda x, y: x * y,
                               out_dims[reshape['width'][0]:])
 

From cda3a7747a657e630164c6802b9f1382e29c855b Mon Sep 17 00:00:00 2001
From: peterzhang2029 <zhangchao41@baidu.com>
Date: Mon, 27 Nov 2017 12:55:52 +0800
Subject: [PATCH 28/52] bug fix when using hsigmoid with gpu

---
 .../layers/HierarchicalSigmoidLayer.cpp       | 140 ++++++++++++++++--
 .../gserver/layers/HierarchicalSigmoidLayer.h |  10 ++
 2 files changed, 134 insertions(+), 16 deletions(-)

diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
index d62a8d846e..f93a9937d1 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
@@ -64,49 +64,113 @@ void HierarchicalSigmoidLayer::forward(PassType passType) {
                          batchSize,
                          codeLength_,
                          /* trans */ false,
-                         useGpu(deviceId_));
+                         false);
   Matrix::resizeOrCreate(preOutput_.grad,
                          batchSize,
                          codeLength_,
                          /* trans */ false,
-                         useGpu(deviceId_));
-
+                         false);
   IVectorPtr label = getInput(*getLabelLayer()).ids;
-
   preOutput_.value->zeroMem();
 
+  if (useGpu_) {
+    Matrix::resizeOrCreate(cpuOutput_,
+                         output_.value->getHeight(),
+                         output_.value->getWidth(),
+                          /* trans */ false,
+                          false);
+    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+    cpuLabel_->copyFrom(*label);
+    cpuOutput_->copyFrom(*output_.value);
+  } else {
+    cpuOutput_ = output_.value;
+    cpuLabel_ = label;
+  }
   /* add the bias-vector */
   if (biases_.get() != NULL) {
-    preOutput_.value->addByBitCode(numClasses_, *label, *biases_->getW());
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuBias_,
+                          1,
+                          numClasses_ - 1,
+                          /* trans */ false,
+                          false);
+      cpuBias_->copyFrom(*biases_->getW());
+    } else {
+      cpuBias_ = biases_->getW();
+    }
+    preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_);
   }
   for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
     MatrixPtr input = getInputValue(i);
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuInput_,
+                          input->getHeight(),
+                          input->getWidth(),
+                          /* trans */ false,
+                          false);
+      Matrix::resizeOrCreate(cpuWeight_,
+                          weights_[i]->getW()->getHeight(),
+                          weights_[i]->getW()->getWidth(),
+                          /* trans */ false,
+                          false);
+      cpuInput_->copyFrom(*input);
+      cpuWeight_->copyFrom(*weights_[i]->getW());
+    } else {
+      cpuInput_ = input;
+      cpuWeight_ = weights_[i]->getW();
+    }
     preOutput_.value->mulByBitCode(
-        numClasses_, *label, *weights_[i]->getW(), *input);
+        numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_);
   }
   // keep consistent with the clipping in the following softrelu
   preOutput_.value->clip(-40.0, 40.0);
   preOutput_.value->sumByBitCode(numClasses_,
-                                 *label,
-                                 *output_.value,
+                                 *cpuLabel_,
+                                 *cpuOutput_,
                                  -1);  // scaleSum
   preOutput_.value->softrelu(*preOutput_.value);
   MatrixPtr sum =
-      Matrix::create(batchSize, 1, /* trans= */ false, useGpu(deviceId_));
+      Matrix::create(batchSize, 1, /* trans= */ false, false);
   preOutput_.value->rowSum(*sum);
-  output_.value->add(*sum);
+  cpuOutput_->add(*sum);
+  if (useGpu_) {
+    output_.value->copyFrom(*cpuOutput_);
+  } else {
+    output_.value = cpuOutput_;
+  }
 }
 
 void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
   IVectorPtr label = getInput(*getLabelLayer()).ids;
+  if (useGpu_) {
+    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+    cpuLabel_->copyFrom(*label);
+  } else {
+    cpuLabel_ = label;
+  }
   preOutput_.grad->one();
   preOutput_.grad->softreluDerivative(*preOutput_.value);
-  preOutput_.grad->subByBitCode(numClasses_, *label);
+  preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_);
 
   if (biases_ && biases_->getWGrad()) {
+    MatrixPtr biases_grad = biases_->getWGrad();
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuBias_,
+                            1,
+                            numClasses_ - 1,
+                            /* trans */ false,
+                            false);
+      cpuBias_->copyFrom(*biases_grad);
+    } else {
+      cpuBias_ = biases_grad;
+    }
     preOutput_.grad->addByBitCodeBackward(
-        numClasses_, *label, *biases_->getWGrad());
-
+        numClasses_, *cpuLabel_, *cpuBias_);
+    if (useGpu_) {  // copy the CPU-side result back to the bias gradient
+      biases_grad->copyFrom(*cpuBias_);
+    } else {
+      biases_grad = cpuBias_;
+    }
     /* Increasing the number of gradient */
     biases_->getParameterPtr()->incUpdate(callback);
   }
@@ -115,9 +179,31 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
     /* Calculate the W-gradient for the current layer */
     MatrixPtr input = getInputValue(i);
     if (weights_[i]->getWGrad()) {
+      MatrixPtr weights_grad = weights_[i]->getWGrad();
+      if (useGpu_) {
+        Matrix::resizeOrCreate(cpuInput_,
+                              input->getHeight(),
+                              input->getWidth(),
+                              /* trans */ false,
+                              false);
+        Matrix::resizeOrCreate(cpuWeightGrad_,
+                            weights_grad->getHeight(),
+                            weights_grad->getWidth(),
+                            /* trans */ false,
+                            false);
+        cpuInput_->copyFrom(*input);
+        cpuWeightGrad_->copyFrom(*weights_grad);
+      } else {
+        cpuInput_ = input;
+        cpuWeightGrad_ = weights_grad;
+      }
       preOutput_.grad->mulByBitCodeBackwardWeight(
-          numClasses_, *label, *weights_[i]->getWGrad(), *input);
-
+          numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_);
+      if (useGpu_) {
+        weights_grad->copyFrom(*cpuWeightGrad_);
+      } else {
+        weights_grad = cpuWeightGrad_;
+      }
       /* Increasing the number of gradient */
       weights_[i]->getParameterPtr()->incUpdate(callback);
     }
@@ -125,8 +211,30 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
     /* Calculate the input layers error */
     MatrixPtr inputGrad = getInputGrad(i);
     if (inputGrad) {
+      if (useGpu_) {
+        Matrix::resizeOrCreate(cpuInputGrad_,
+                              inputGrad->getHeight(),
+                              inputGrad->getWidth(),
+                              /* trans */ false,
+                              false);
+        Matrix::resizeOrCreate(cpuWeight_,
+                              weights_[i]->getW()->getHeight(),
+                              weights_[i]->getW()->getWidth(),
+                              /* trans */ false,
+                              false);
+        cpuInputGrad_->copyFrom(*inputGrad);
+        cpuWeight_->copyFrom(*weights_[i]->getW());
+      } else {
+        cpuInputGrad_ = inputGrad;
+        cpuWeight_ = weights_[i]->getW();
+      }
       preOutput_.grad->mulByBitCodeBackwardError(
-          numClasses_, *label, *weights_[i]->getW(), *inputGrad);
+          numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_);
+      if (useGpu_) {
+        inputGrad->copyFrom(*cpuInputGrad_);
+      } else {
+        inputGrad = cpuInputGrad_;
+      }
     }
   }
 }
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
index 9afd40b167..2483572ded 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
@@ -80,6 +80,16 @@ protected:
   int codeLength_;
   /// temporary result of output_
   Argument preOutput_;
+
+  /// The temporary variables in CPU memory.
+  MatrixPtr cpuWeight_;
+  MatrixPtr cpuWeightGrad_;
+  MatrixPtr cpuInput_;
+  MatrixPtr cpuInputGrad_;
+  MatrixPtr cpuBias_;
+  MatrixPtr cpuOutput_;
+  IVectorPtr cpuLabel_;
+
 };
 
 }  // namespace paddle

From c8bb66314173e68aec897f8e4a3f988ad227adc0 Mon Sep 17 00:00:00 2001
From: guosheng <guosheng@baidu.com>
Date: Mon, 27 Nov 2017 14:21:34 +0800
Subject: [PATCH 29/52] Refine roi_pool_op to avoid warning

---
 paddle/operators/roi_pool_op.h | 49 +++++++++++++++-------------------
 1 file changed, 21 insertions(+), 28 deletions(-)
 mode change 100755 => 100644 paddle/operators/roi_pool_op.h

diff --git a/paddle/operators/roi_pool_op.h b/paddle/operators/roi_pool_op.h
old mode 100755
new mode 100644
index bd7736d631..3812c66c65
--- a/paddle/operators/roi_pool_op.h
+++ b/paddle/operators/roi_pool_op.h
@@ -133,54 +133,47 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
     auto* in = ctx.Input<framework::Tensor>("X");
     auto* rois = ctx.Input<framework::Tensor>("ROIs");
     auto* argmax = ctx.Input<framework::Tensor>("Argmax");
-
     auto* out_grad =
         ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* x_grad =
-        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
 
     auto pooled_height = ctx.Attr<int>("pooled_height");
     auto pooled_width = ctx.Attr<int>("pooled_width");
 
-    if (x_grad) {
-      int channels = in->dims()[1];
-      auto in_stride = framework::stride(in->dims());
-      auto roi_stride = framework::stride(rois->dims());
-
+    if (in_grad) {
       const int64_t* rois_data = rois->data<int64_t>();
-      int rois_num = rois->dims()[0];
-
-      T* x_grad_data = x_grad->mutable_data<T>(ctx.GetPlace());
+      const T* out_grad_data = out_grad->data<T>();
+      const int64_t* argmax_data = argmax->data<int64_t>();
+      T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
       math::SetConstant<Place, T> set_zero;
-      set_zero(ctx.device_context(), x_grad, static_cast<T>(0));
+      set_zero(ctx.device_context(), in_grad, static_cast<T>(0));
 
-      size_t roi_offset = roi_stride[0];
-      size_t batch_offset = in_stride[0];
-      size_t channel_offset = in_stride[1];
+      auto in_stride = framework::stride(in->dims());
+      auto argmax_stride = framework::stride(argmax->dims());
+      auto roi_stride = framework::stride(rois->dims());
+      auto out_stride = framework::stride(out_grad->dims());
 
-      const T* out_grad_data = out_grad->data<T>();
-      size_t pool_channel_offset = pooled_height * pooled_width;
-      const int64_t* argmax_data = argmax->data<int64_t>();
+      int rois_num = rois->dims()[0];
+      int channels = in->dims()[1];
 
-      for (size_t n = 0; n < rois_num; ++n) {
-        size_t roi_batch_idx = rois_data[0];
-        T* batch_grad_data = x_grad_data + batch_offset * roi_batch_idx;
+      for (int n = 0; n < rois_num; ++n) {
+        int roi_batch_idx = rois_data[0];
+        T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0];
         for (int c = 0; c < channels; ++c) {
           for (int ph = 0; ph < pooled_height; ++ph) {
             for (int pw = 0; pw < pooled_width; ++pw) {
-              size_t pool_index = ph * pooled_width + pw;
-
+              int pool_index = ph * pooled_width + pw;
               if (argmax_data[pool_index] >= 0) {
-                size_t index = static_cast<size_t>(argmax_data[pool_index]);
+                auto index = argmax_data[pool_index];
                 batch_grad_data[index] += out_grad_data[pool_index];
               }
             }
           }
-          batch_grad_data += channel_offset;
-          out_grad_data += pool_channel_offset;
-          argmax_data += pool_channel_offset;
+          batch_grad_data += in_stride[1];
+          out_grad_data += out_stride[1];
+          argmax_data += argmax_stride[1];
         }
-        rois_data += roi_offset;
+        rois_data += roi_stride[0];
       }
     }
   }

From c9a96575d5aa89d143025d36ce105b05ed572be3 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Mon, 27 Nov 2017 16:42:08 +0800
Subject: [PATCH 30/52] py_test and test_image_classification_train support
 argument (#5934)

* py_test supports arguments; test_image_classification_train supports arguments

* use REMOVE_ITEM to rm item from list in cmake
---
 cmake/generic.cmake                           |  6 +++---
 .../paddle/v2/fluid/tests/book/CMakeLists.txt |  6 ++++++
 .../book/test_image_classification_train.py   | 19 ++++++++++++++-----
 3 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 404717187d..7b82d409a3 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -459,11 +459,11 @@ function(py_test TARGET_NAME)
   if(WITH_TESTING)
     set(options STATIC static SHARED shared)
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})  
+    set(multiValueArgs SRCS DEPS ARGS)
+    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
              COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
-             ${PYTHON_EXECUTABLE} ${py_test_SRCS}
+             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
   endif()
 endfunction()
diff --git a/python/paddle/v2/fluid/tests/book/CMakeLists.txt b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
index 4d7664469e..a35abe3e0c 100644
--- a/python/paddle/v2/fluid/tests/book/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
@@ -1,5 +1,11 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+list(REMOVE_ITEM TEST_OPS test_image_classification_train)
+py_test(test_image_classification_train_resnet SRCS test_image_classification_train.py ARGS resnet)
+py_test(test_image_classification_train_vgg SRCS test_image_classification_train.py ARGS vgg)
+
+# default test
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
index 690c533971..cc45b10b90 100644
--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
@@ -1,7 +1,9 @@
 from __future__ import print_function
+
 import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+import sys
 
 
 def resnet_cifar10(input, depth=32):
@@ -80,11 +82,18 @@ data_shape = [3, 32, 32]
 images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
 label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
-# Add neural network config
-# option 1. resnet
-# net = resnet_cifar10(images, 32)
-# option 2. vgg
-net = vgg16_bn_drop(images)
+net_type = "vgg"
+if len(sys.argv) >= 2:
+    net_type = sys.argv[1]
+
+if net_type == "vgg":
+    print("train vgg net")
+    net = vgg16_bn_drop(images)
+elif net_type == "resnet":
+    print("train resnet")
+    net = resnet_cifar10(images, 32)
+else:
+    raise ValueError("%s network is not supported" % net_type)
 
 predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
 cost = fluid.layers.cross_entropy(input=predict, label=label)

From d89ff5b6144461a967bd73fa739d251691f2a8bc Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Mon, 27 Nov 2017 17:09:07 +0800
Subject: [PATCH 31/52] Restore the param infos in Program.clone() (#5873)

* Restore the param infos in Program.clone()

Program.clone only clones the variables and ops
in the program into a new program. However, the
information of the Parameters is not cloned.

So we need to restore the information of the Parameters.

Fix #5871

* Follow comments

* Fix CI

* Fix CI

* Fix CI
---
 python/paddle/v2/fluid/framework.py          | 56 +++++++++++++++++++-
 python/paddle/v2/fluid/tests/test_program.py | 24 +++++++--
 2 files changed, 75 insertions(+), 5 deletions(-)

diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index 9a62698b86..6d6ea23f55 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -395,7 +395,11 @@ class Block(object):
         return v
 
     def all_parameters(self):
-        return {v for k, v in self.vars.iteritems() if isinstance(v, Parameter)}
+        return list(self.iter_parameters())
+
+    def iter_parameters(self):
+        return (item[1] for item in self.vars.iteritems()
+                if isinstance(item[1], Parameter))
 
     def create_var(self, *args, **kwargs):
         var = Variable(self, *args, **kwargs)
@@ -469,6 +473,37 @@ class Block(object):
         for index in range(len(self.ops)):
             assert self.ops[index].desc == ops_in_cpp[index]
 
+    def copy_param_info_from(self, other):
+        """
+        Copy the information of parameters from other block
+        Args:
+            other(Block): other block 
+
+        Returns:
+            None
+        """
+        if not isinstance(other, Block):
+            raise TypeError("copy_param_info_from should be invoked with Block")
+        for p in other.iter_parameters():
+            assert isinstance(p, Parameter)
+            v = self.vars.get(p.name, None)
+            if v is None:
+                raise ValueError("copy_param_info_from should be invoked with "
+                                 "same topology")
+            assert isinstance(v, Variable)
+            new_p = Parameter(
+                block=self,
+                shape=v.shape,
+                dtype=v.dtype,
+                type=v.type,
+                lod_level=v.lod_level,
+                stop_gradient=p.stop_gradient,
+                trainable=p.trainable,
+                optimize_attr=p.optimize_attr,
+                regularizer=p.regularizer,
+                name=v.name)
+            self.vars[new_p.name] = new_p
+
 
 class Program(object):
     def __init__(self):
@@ -489,6 +524,7 @@ class Program(object):
         p.desc = core.ProgramDesc(self.desc)
         p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
         p.sync_with_cpp()
+        p.copy_param_info_from(self)
         return p
 
     def prune(self, targets):
@@ -572,6 +608,24 @@ class Program(object):
         for block in self.blocks:
             block.sync_with_cpp()
 
+    def copy_param_info_from(self, other):
+        """
+        Copy the information of parameters from other program. 
+        Args:
+            other(Program): Other program
+
+        Returns:
+            None
+        """
+        if not isinstance(other, Program):
+            raise TypeError("copy_param_info_from should be invoked with "
+                            "Program")
+
+        if len(self.blocks) != len(other.blocks):
+            raise ValueError("copy_param_info_from should be invoked with two "
+                             "program, with represent the same topology")
+        self.global_block().copy_param_info_from(other.global_block())
+
     def list_vars(self):
         for each_block in self.blocks:
             for each_var in each_block.vars.itervalues():
diff --git a/python/paddle/v2/fluid/tests/test_program.py b/python/paddle/v2/fluid/tests/test_program.py
index e9bcefd215..15653a1dbf 100644
--- a/python/paddle/v2/fluid/tests/test_program.py
+++ b/python/paddle/v2/fluid/tests/test_program.py
@@ -1,7 +1,9 @@
+from __future__ import print_function
 import unittest
 
 from paddle.v2.fluid.framework import Program
 from paddle.v2.fluid.framework import g_main_program
+import paddle.v2.fluid.layers as layers
 
 
 class TestProgram(unittest.TestCase):
@@ -48,8 +50,8 @@ class TestProgram(unittest.TestCase):
 
         # FIXME(yuyang18): We manual compare the output string, since the order
         # of variable could be changed.
-        print prog
-        print prog.clone()
+        print(prog)
+        print(prog.clone())
 
     def test_parse_program_from_string(self):
         prog = Program()
@@ -67,8 +69,8 @@ class TestProgram(unittest.TestCase):
         binary_str = prog.desc.serialize_to_string()
         prog_restored = Program.parse_from_string(binary_str)
 
-        print prog
-        print prog_restored
+        print(prog)
+        print(prog_restored)
 
     def test_append_backward(self):
         prog = Program()
@@ -123,6 +125,20 @@ class TestProgram(unittest.TestCase):
             actual_ops.append(op.type)
         self.assertEqual(actual_ops, expect_ops)
 
+    def test_program_clone_with_parameter(self):
+        main_program = Program()
+        startup_program = Program()
+        kwargs = {
+            'main_program': main_program,
+            'startup_program': startup_program
+        }
+        d = layers.data(name='x', shape=[784], dtype='float32', **kwargs)
+        hidden = layers.fc(input=d, size=100, **kwargs)
+        layers.fc(input=hidden, size=100, **kwargs)
+
+        new_program = main_program.clone()
+        self.assertNotEqual(0, len(new_program.blocks[0].all_parameters()))
+
 
 if __name__ == '__main__':
     unittest.main()

From b28b2f172b2763dd8917833c2708309f98299a0a Mon Sep 17 00:00:00 2001
From: QI JUN <qijun1994@hotmail.com>
Date: Mon, 27 Nov 2017 18:35:57 +0800
Subject: [PATCH 32/52] refine test_recognize_digits_mlp and format codes
 (#5937)

---
 paddle/capi/Matrix.cpp                        |   4 +-
 paddle/capi/matrix.h                          |   8 +-
 paddle/framework/tensor_util.h                |   9 +-
 paddle/operators/math/maxouting.cc            |  31 ++-
 paddle/operators/math/maxouting.cu            |  80 ++++---
 paddle/operators/math/maxouting.h             |   8 +-
 paddle/operators/maxout_op.cc                 |  38 ++-
 paddle/operators/maxout_op.cu.cc              |   8 +-
 paddle/operators/maxout_op.h                  |   2 +-
 paddle/operators/roi_pool_op.cc               |  24 +-
 paddle/operators/roi_pool_op.cu               | 216 ++++++++----------
 paddle/operators/roi_pool_op.h                |   3 +-
 paddle/operators/sequence_slice_op.cc         |   5 +-
 python/paddle/v2/dataset/uci_housing.py       |   4 +-
 .../tests/book/test_recognize_digits_mlp.py   |  12 +-
 .../paddle/v2/fluid/tests/test_maxout_op.py   |   4 +-
 .../paddle/v2/fluid/tests/test_roi_pool_op.py |  48 ++--
 17 files changed, 231 insertions(+), 273 deletions(-)
 mode change 100755 => 100644 paddle/operators/roi_pool_op.cc
 mode change 100755 => 100644 paddle/operators/roi_pool_op.cu
 mode change 100755 => 100644 paddle/operators/roi_pool_op.h
 mode change 100755 => 100644 paddle/operators/sequence_slice_op.cc

diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
index d5b55e1c95..30f3a766f0 100644
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -55,7 +55,7 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
 }
 
 PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
-                                          paddle_real* value) {
+                                            paddle_real* value) {
   if (mat == nullptr || value == nullptr) return kPD_NULLPTR;
   auto ptr = cast(mat);
   if (ptr->mat == nullptr) return kPD_NULLPTR;
@@ -75,7 +75,7 @@ PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
 }
 
 PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
-                                          paddle_real* result) {
+                                            paddle_real* result) {
   if (mat == nullptr || result == nullptr) return kPD_NULLPTR;
   auto ptr = cast(mat);
   if (ptr->mat == nullptr) return kPD_NULLPTR;
diff --git a/paddle/capi/matrix.h b/paddle/capi/matrix.h
index 01b8bad2ee..8cc3e0034e 100644
--- a/paddle/capi/matrix.h
+++ b/paddle/capi/matrix.h
@@ -79,7 +79,7 @@ PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat,
  * @note  value should contain enough element of data to init the mat
  */
 PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
-                                          paddle_real* value);
+                                            paddle_real* value);
 
 /**
  * @brief PDMatGetRow Get raw row buffer from matrix
@@ -93,14 +93,14 @@ PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat,
                                           paddle_real** rawRowBuffer);
 
 /**
- * @brief copy data from the matrix 
+ * @brief copy data from the matrix
  * @param [in] mat Target matrix
- * @param [out] result pointer to store the matrix data 
+ * @param [out] result pointer to store the matrix data
  * @return paddle_error
  * @note the space of the result should allocated before invoke this API
  */
 PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
-                                          paddle_real* result);
+                                            paddle_real* result);
 /**
  * @brief PDMatCreateNone Create None Matrix
  * @return
diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h
index 8ee2e15a59..4e34b90d57 100644
--- a/paddle/framework/tensor_util.h
+++ b/paddle/framework/tensor_util.h
@@ -135,18 +135,17 @@ inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
   auto dst_ptr = static_cast<void*>(dst->data());
 
   if (platform::is_cpu_place(src.place())) {
-    memory::Copy(dst_place, dst_ptr, boost::get<platform::CPUPlace>(src.place()), 
-		    src_ptr, size);
+    memory::Copy(dst_place, dst_ptr,
+                 boost::get<platform::CPUPlace>(src.place()), src_ptr, size);
   }
 #ifdef PADDLE_WITH_CUDA
   else if (platform::is_gpu_place(src.place())) {  // NOLINT
     memory::Copy(
-        dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()), src_ptr,
-        size,
+        dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()),
+        src_ptr, size,
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
   }
 #endif
-
 }
 
 }  // namespace framework
diff --git a/paddle/operators/math/maxouting.cc b/paddle/operators/math/maxouting.cc
index e5168ce7af..c9003962d3 100644
--- a/paddle/operators/math/maxouting.cc
+++ b/paddle/operators/math/maxouting.cc
@@ -23,8 +23,7 @@ template <typename T>
 class MaxOutFunctor<platform::CPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input,
-                  framework::Tensor * output,
+                  const framework::Tensor& input, framework::Tensor* output,
                   int groups) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
@@ -37,34 +36,30 @@ class MaxOutFunctor<platform::CPUPlace, T> {
     T* output_data = output->mutable_data<T>(context.GetPlace());
 
     for (int i = 0; i < batch_size; ++i) {
-      int new_bindex =  c_size * i;
+      int new_bindex = c_size * i;
       for (int c = 0; c < output_channels; ++c) {
         int new_cindex = fea_size * c;
         for (int f = 0; f < fea_size; ++f) {
           T ele = static_cast<T>(-FLT_MAX);
           for (int ph = 0; ph < groups; ++ph) {
-            T x = input_data[(new_bindex + new_cindex) * groups
-              + ph * fea_size + f];
+            T x = input_data[(new_bindex + new_cindex) * groups +
+                             ph * fea_size + f];
             ele = ele > x ? ele : x;
           }
-          output_data[(new_bindex+new_cindex+f)] = ele;
+          output_data[(new_bindex + new_cindex + f)] = ele;
         }
       }
     }
   }
 };
 
-
-
 template <class T>
 class MaxOutGradFunctor<platform::CPUPlace, T> {
-public:
+ public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input,
-                  framework::Tensor * input_grad,
+                  const framework::Tensor& input, framework::Tensor* input_grad,
                   const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  int groups) {
+                  const framework::Tensor& output_grad, int groups) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
@@ -84,11 +79,11 @@ public:
           bool continue_match = true;
           int output_idx = blen + clen + f;
           for (int g = 0; g < groups && continue_match; ++g) {
-              int input_idx = input_idx0 + fea_size * g;
-              if (input_data[input_idx] == output_data[output_idx]) {
-                input_grad_data[input_idx] += output_grad_data[output_idx];
-                continue_match = false;
-              }
+            int input_idx = input_idx0 + fea_size * g;
+            if (input_data[input_idx] == output_data[output_idx]) {
+              input_grad_data[input_idx] += output_grad_data[output_idx];
+              continue_match = false;
+            }
           }
         }
       }
diff --git a/paddle/operators/math/maxouting.cu b/paddle/operators/math/maxouting.cu
index 7c698577b8..c3fabcae08 100644
--- a/paddle/operators/math/maxouting.cu
+++ b/paddle/operators/math/maxouting.cu
@@ -21,9 +21,9 @@ namespace math {
 
 template <typename T>
 __global__ void KernelMaxOut(const int nthreads, const T* input_data,
-                            const int channels,
-                             const int input_height, const int input_width,
-                             int groups, T* output_data ) {
+                             const int channels, const int input_height,
+                             const int input_width, int groups,
+                             T* output_data) {
   const int size = input_height * input_width * channels / groups;
   const int feat_len = input_height * input_width;
   int index = blockIdx.x * blockDim.x + threadIdx.x;
@@ -34,7 +34,7 @@ __global__ void KernelMaxOut(const int nthreads, const T* input_data,
     int channel_idx = batch_offset / feat_len;
     int feat_idx = batch_offset % feat_len;
     int data_idx =
-      (batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
+        (batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
     T ele = static_cast<T>(-FLT_MAX);
     for (int g = 0; g < groups; ++g) {
       T x = input_data[data_idx + g * feat_len];
@@ -44,34 +44,35 @@ __global__ void KernelMaxOut(const int nthreads, const T* input_data,
   }
 }
 template <typename T>
-__global__ void KernelMaxoutGrad(
-    const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, T* input_grad, const int channels,
-    const int input_height, const int input_width, int groups) {
-    const int size = input_height * input_width * channels / groups;
-    const int feat_len = input_height * input_width;
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    int offset = blockDim.x * gridDim.x;
-    for (int i = index; i < nthreads; i += offset) {
-      int batch_idx = i / size;
-      int batch_offset = i % size;
-      int channel_idx = batch_offset / feat_len;
-      int feat_idx = batch_offset % feat_len;
-      int data_idx =
+__global__ void KernelMaxoutGrad(const int nthreads, const T* input_data,
+                                 const T* output_data, const T* output_grad,
+                                 T* input_grad, const int channels,
+                                 const int input_height, const int input_width,
+                                 int groups) {
+  const int size = input_height * input_width * channels / groups;
+  const int feat_len = input_height * input_width;  // one H*W feature map
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;  // threads in the whole grid
+  for (int i = index; i < nthreads; i += offset) {
+    int batch_idx = i / size;
+    int batch_offset = i % size;
+    int channel_idx = batch_offset / feat_len;
+    int feat_idx = batch_offset % feat_len;
+    int data_idx =
         (batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
-      int max_index = -1;
-      bool continue_match = true;
-      for (int g = 0; g < groups && continue_match; ++g) {
-        if (input_data[data_idx + g * feat_len] == output_data[i]) {
-          max_index = data_idx + g * feat_len;
-          continue_match = false;
-          break;
-        }
-      }
-      if (max_index != -1) {
-        input_grad[max_index] += output_grad[index];
+    int max_index = -1;  // -1: no input equal to output_data[i] found yet
+    bool continue_match = true;
+    for (int g = 0; g < groups && continue_match; ++g) {
+      if (input_data[data_idx + g * feat_len] == output_data[i]) {
+        max_index = data_idx + g * feat_len;
+        continue_match = false;
+        break;
       }
     }
+    if (max_index != -1) {
+      input_grad[max_index] += output_grad[i];  // i, not index: strided loop
+    }
+  }
 }
 /*
  * All tensors are in NCHW format.
@@ -80,7 +81,7 @@ template <typename T>
 class MaxOutFunctor<platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor * output,
+                  const framework::Tensor& input, framework::Tensor* output,
                   int groups) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
@@ -92,7 +93,7 @@ class MaxOutFunctor<platform::GPUPlace, T> {
 
     const T* input_data = input.data<T>();
     T* output_data = output->mutable_data<T>(context.GetPlace());
-    int nthreads =  output->numel();
+    int nthreads = output->numel();
     int blocks = (nthreads + 1024 - 1) / 1024;
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
@@ -101,8 +102,7 @@ class MaxOutFunctor<platform::GPUPlace, T> {
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
                  .stream()>>>(nthreads, input_data, input_channels,
-                              input_height, input_width, groups,
-                              output_data);
+                              input_height, input_width, groups, output_data);
   }
 };
 /*
@@ -112,11 +112,9 @@ template <typename T>
 class MaxOutGradFunctor<platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input,
-                  framework::Tensor * input_grad,
+                  const framework::Tensor& input, framework::Tensor* input_grad,
                   const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  int groups) {
+                  const framework::Tensor& output_grad, int groups) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
@@ -129,7 +127,7 @@ class MaxOutGradFunctor<platform::GPUPlace, T> {
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
     T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-    int nthreads =  output.numel();
+    int nthreads = output.numel();
     int blocks = (nthreads + 1024 - 1) / 1024;
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
@@ -137,9 +135,9 @@ class MaxOutGradFunctor<platform::GPUPlace, T> {
     KernelMaxoutGrad<
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_grad_data,
-        input_channels, input_height, input_width, groups);
+                 .stream()>>>(nthreads, input_data, output_data,
+                              output_grad_data, input_grad_data, input_channels,
+                              input_height, input_width, groups);
   }
 };
 
diff --git a/paddle/operators/math/maxouting.h b/paddle/operators/math/maxouting.h
index d4c9da38ab..2d9069b0b3 100644
--- a/paddle/operators/math/maxouting.h
+++ b/paddle/operators/math/maxouting.h
@@ -21,15 +21,14 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-#define FLT_MAX \
-    __FLT_MAX__
+#define FLT_MAX __FLT_MAX__
 
 template <typename Place, typename T>
 
 class MaxOutFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor * output,
+                  const framework::Tensor& input, framework::Tensor* output,
                   int groups);
 };
 
@@ -37,8 +36,7 @@ template <typename Place, class T>
 class MaxOutGradFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input,
-                  framework::Tensor * input_grad,
+                  const framework::Tensor& input, framework::Tensor* input_grad,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, int groups);
 };
diff --git a/paddle/operators/maxout_op.cc b/paddle/operators/maxout_op.cc
index 95467f2e69..e203a25d54 100644
--- a/paddle/operators/maxout_op.cc
+++ b/paddle/operators/maxout_op.cc
@@ -22,16 +22,17 @@ class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   MaxOutOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X",
+    AddInput(
+        "X",
         "(Tensor) The input tensor of maxout operator. "
         "The format of input tensor is NCHW. Where N is batch size, C is the "
         "number of channels, H and W is the height and width of feature.");
     AddOutput("Out",
-        "(Tensor) The output tensor of maxout operator."
-        "The format of output tensor is also NCHW."
-        "Where N is batch size, C is "
-        "the number of channels, H and W is the height and "
-        "width of feature.");
+              "(Tensor) The output tensor of maxout operator. "
+              "The format of output tensor is also NCHW. "
+              "Where N is batch size, C is "
+              "the number of channels, H and W is the height and "
+              "width of feature.");
     AddAttr<int>(
         "groups",
         R"DOC("Specifies how many groups the input tensor will be split"
@@ -59,21 +60,19 @@ class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
-
 class MaxOutOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MaxoutOp"
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of MaxoutOp "  // space joins the two literals
                    "should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of MaxoutOp should not be null.");
     auto in_x_dims = ctx->GetInputDim("X");
     int groups = ctx->Attrs().Get<int>("groups");
     // check groups > 1
-    PADDLE_ENFORCE_GT(
-        groups, 1,
-        "groups should be larger than 1 in maxoutop");
+    PADDLE_ENFORCE_GT(groups, 1, "groups should be larger than 1 in maxoutop");
     std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1] / groups});
     output_shape.push_back(in_x_dims[2]);
     output_shape.push_back(in_x_dims[3]);
@@ -87,18 +86,17 @@ class MaxOutOpGrad : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-    "Input(X@GRAD) should not be null.");
+                   "Output(X@GRAD) should not be null.");  // HasOutput check
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
 };
-}    // namespace operators
-}    // namespace paddle
+}  // namespace operators
+}  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad,
-                        ops::MaxOutOpGrad);
-REGISTER_OP_CPU_KERNEL(maxout, ops::MaxOutKernel<paddle::platform::CPUPlace,
-                       float>);
-REGISTER_OP_CPU_KERNEL(maxout_grad,
-                       ops::MaxOutGradKernel<paddle::platform::CPUPlace,
-                       float>);
+            ops::MaxOutOpGrad);
+REGISTER_OP_CPU_KERNEL(maxout,
+                       ops::MaxOutKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    maxout_grad, ops::MaxOutGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/maxout_op.cu.cc b/paddle/operators/maxout_op.cu.cc
index a5823fba68..decd43913d 100644
--- a/paddle/operators/maxout_op.cu.cc
+++ b/paddle/operators/maxout_op.cu.cc
@@ -18,8 +18,6 @@ namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(maxout,
                        ops::MaxOutKernel<paddle::platform::GPUPlace, float>,
                        ops::MaxOutKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(maxout_grad,
-                       ops::MaxOutGradKernel<paddle::platform::GPUPlace,
-                        float>,
-                       ops::MaxOutGradKernel<paddle::platform::GPUPlace,
-                        double>);
+REGISTER_OP_GPU_KERNEL(
+    maxout_grad, ops::MaxOutGradKernel<paddle::platform::GPUPlace, float>,
+    ops::MaxOutGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/maxout_op.h b/paddle/operators/maxout_op.h
index c404cd16a9..44a0d073dd 100644
--- a/paddle/operators/maxout_op.h
+++ b/paddle/operators/maxout_op.h
@@ -53,7 +53,7 @@ class MaxOutGradKernel : public framework::OpKernel<T> {
       zero(device_ctx, in_x_grad, static_cast<T>(0.0));
       math::MaxOutGradFunctor<Place, T> maxout_backward;
       maxout_backward(context.device_context(), *in_x, in_x_grad, *out,
-        *out_grad, groups);
+                      *out_grad, groups);
     }
   }
 };
diff --git a/paddle/operators/roi_pool_op.cc b/paddle/operators/roi_pool_op.cc
old mode 100755
new mode 100644
index 156db93586..2b5e66c96b
--- a/paddle/operators/roi_pool_op.cc
+++ b/paddle/operators/roi_pool_op.cc
@@ -43,8 +43,8 @@ class ROIPoolOp : public framework::OperatorWithKernel {
                    "ROIs should be a 2-D tensor of shape (num_rois, 5)"
                    "given as [[batch_id, x1, y1, x2, y2], …].");
     PADDLE_ENFORCE(rois_dims[1] == kROISize,
-                "ROIs should be a 2-D tensor of shape (num_rois, 5)"
-                "given as [[batch_id, x1, y1, x2, y2], …].");
+                   "ROIs should be a 2-D tensor of shape (num_rois, 5)"
+                   "given as [[batch_id, x1, y1, x2, y2], …].");
 
     int pooled_height = ctx->Attrs().Get<int>("pooled_height");
     int pooled_width = ctx->Attrs().Get<int>("pooled_width");
@@ -65,7 +65,7 @@ class ROIPoolOp : public framework::OperatorWithKernel {
 
     ctx->SetOutputDim("Out", out_dims);
     ctx->SetOutputDim("Argmax", out_dims);
-    }
+  }
 
  protected:
   framework::OpKernelType GetKernelType(
@@ -100,7 +100,7 @@ class ROIPoolGradOp : public framework::OperatorWithKernel {
 class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ROIPoolOpMaker(framework::OpProto* proto,
-                       framework::OpAttrChecker* op_checker)
+                 framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(Tensor), "
@@ -125,21 +125,22 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
               "(Tensor), "
               "Argmaxes corresponding to indices in X used "
               "for gradient computation. Only output "
-              "if arg “is_test” is false.").AsIntermediate();
+              "if arg “is_test” is false.")
+        .AsIntermediate();
     AddAttr<float>("spatial_scale",
                    "(float, default 1.0), "
                    "Multiplicative spatial scale factor "
                    "to translate ROI coords from their input scale "
                    "to the scale used when pooling.")
-                   .SetDefault(1.0);
+        .SetDefault(1.0);
     AddAttr<int>("pooled_height",
                  "(int, default 1), "
                  "The pooled output height.")
-                 .SetDefault(1);
+        .SetDefault(1);
     AddAttr<int>("pooled_width",
                  "(int, default 1), "
                  "The pooled output width.")
-                 .SetDefault(1);
+        .SetDefault(1);
     AddComment(R"DOC(
 ROIPool operator
 
@@ -153,11 +154,10 @@ https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker,
-            roi_pool_grad, ops::ROIPoolGradOp);
+REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad,
+            ops::ROIPoolGradOp);
 REGISTER_OP_CPU_KERNEL(
-    roi_pool,
-    ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, float>,
+    roi_pool, ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, float>,
     ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
     roi_pool_grad,
diff --git a/paddle/operators/roi_pool_op.cu b/paddle/operators/roi_pool_op.cu
old mode 100755
new mode 100644
index 97df45f1b5..9a4c8ca752
--- a/paddle/operators/roi_pool_op.cu
+++ b/paddle/operators/roi_pool_op.cu
@@ -29,101 +29,95 @@ static inline int NumBlocks(const int N) {
                   kNumMaxinumNumBlocks);
 }
 
-  template <typename T>
-  __global__ void GPUROIPoolForward(
-      const int nthreads, const T* input_data, const int64_t* input_rois,
-      const float spatial_scale, const int channels, const int height,
-      const int width, const int pooled_height, const int pooled_width,
-      T* output_data, int64_t* argmax_data) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    int offset = blockDim.x * gridDim.x;
-    for (size_t i = index; i < nthreads; i += offset) {
-      int pw = index % pooled_width;
-      int ph = (index / pooled_width) % pooled_height;
-      int c = (index / pooled_width / pooled_height) % channels;
-      int n = index / pooled_width / pooled_height / channels;
-
-      const int64_t* offset_input_rois = input_rois + n * kROISize;
-      int roi_batch_ind = offset_input_rois[0];
-      int roi_start_w = round(offset_input_rois[1] * spatial_scale);
-      int roi_start_h = round(offset_input_rois[2] * spatial_scale);
-      int roi_end_w = round(offset_input_rois[3] * spatial_scale);
-      int roi_end_h = round(offset_input_rois[4] * spatial_scale);
-
-      int roi_width = max(roi_end_w - roi_start_w + 1, 1);
-      int roi_height = max(roi_end_h - roi_start_h + 1, 1);
-      T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
-      T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
-
-      int hstart = static_cast<int>(floor(static_cast<T>(ph) * bin_size_h));
-      int wstart = static_cast<int>(floor(static_cast<T>(pw) * bin_size_w));
-      int hend = static_cast<int>(ceil(static_cast<T>(ph + 1) * bin_size_h));
-      int wend = static_cast<int>(ceil(static_cast<T>(pw + 1) * bin_size_w));
-
-      hstart = min(max(hstart + roi_start_h, 0), height);
-      hend = min(max(hend + roi_start_h, 0), height);
-      wstart = min(max(wstart + roi_start_w, 0), width);
-      wend = min(max(wend + roi_start_w, 0), width);
-      bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-      T maxval = is_empty ? 0 : -std::numeric_limits<T>::max();
-      int maxidx = -1;
-      const T* offset_input_data =
-          input_data + (roi_batch_ind * channels + c) * height * width;
-      for (int h = hstart; h < hend; ++h) {
-        for (int w = wstart; w < wend; ++w) {
-          int input_data_index = h * width + w;
-          if (offset_input_data[input_data_index] > maxval) {
-            maxval = offset_input_data[input_data_index];
-            maxidx = input_data_index;
-          }
+template <typename T>
+__global__ void GPUROIPoolForward(const int nthreads, const T* input_data,
+                                  const int64_t* input_rois,
+                                  const float spatial_scale, const int channels,
+                                  const int height, const int width,
+                                  const int pooled_height,
+                                  const int pooled_width, T* output_data,
+                                  int64_t* argmax_data) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;  // threads in the whole grid
+  for (int i = index; i < nthreads; i += offset) {
+    int pw = i % pooled_width;  // unflatten i -> (n, c, ph, pw)
+    int ph = (i / pooled_width) % pooled_height;
+    int c = (i / pooled_width / pooled_height) % channels;
+    int n = i / pooled_width / pooled_height / channels;
+
+    const int64_t* offset_input_rois = input_rois + n * kROISize;
+    int roi_batch_ind = offset_input_rois[0];  // [batch_id, x1, y1, x2, y2]
+    int roi_start_w = round(offset_input_rois[1] * spatial_scale);
+    int roi_start_h = round(offset_input_rois[2] * spatial_scale);
+    int roi_end_w = round(offset_input_rois[3] * spatial_scale);
+    int roi_end_h = round(offset_input_rois[4] * spatial_scale);
+
+    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
+    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    int hstart = static_cast<int>(floor(static_cast<T>(ph) * bin_size_h));
+    int wstart = static_cast<int>(floor(static_cast<T>(pw) * bin_size_w));
+    int hend = static_cast<int>(ceil(static_cast<T>(ph + 1) * bin_size_h));
+    int wend = static_cast<int>(ceil(static_cast<T>(pw + 1) * bin_size_w));
+
+    hstart = min(max(hstart + roi_start_h, 0), height);  // clamp to the image
+    hend = min(max(hend + roi_start_h, 0), height);
+    wstart = min(max(wstart + roi_start_w, 0), width);
+    wend = min(max(wend + roi_start_w, 0), width);
+    bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+    T maxval = is_empty ? 0 : -std::numeric_limits<T>::max();
+    int maxidx = -1;  // stays -1 when no element is selected
+    const T* offset_input_data =
+        input_data + (roi_batch_ind * channels + c) * height * width;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        int input_data_index = h * width + w;
+        if (offset_input_data[input_data_index] > maxval) {
+          maxval = offset_input_data[input_data_index];
+          maxidx = input_data_index;
         }
       }
-      output_data[index] = maxval;
-      if (argmax_data) {
-        argmax_data[index] = maxidx;
-      }
+    }
+    output_data[i] = maxval;  // i, not index: one output per iteration
+    if (argmax_data) {
+      argmax_data[i] = maxidx;
     }
   }
+}
 
 template <typename T>
 __global__ void GPUROIPoolBackward(
-    const int nthreads,
-    const int64_t* input_rois,
-    const T* output_grad,
-    const int64_t* argmax_data,
-    const int num_rois,
-    const float spatial_scale,
-    const int channels,
-    const int height,
-    const int width,
-    const int pooled_height,
-    const int pooled_width,
-    T* input_grad) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    int offset = blockDim.x * gridDim.x;
-    for (int i = index; i < nthreads; i += offset) {
-      int pw = index % pooled_width;
-      int ph = (index / pooled_width) % pooled_height;
-      int c = (index / pooled_width / pooled_height) % channels;
-      int n = index / pooled_width / pooled_height / channels;
-
-      const int64_t* offset_input_rois = input_rois + n * kROISize;
-      int roi_batch_ind = offset_input_rois[0];
-      int input_offset = (roi_batch_ind * channels + c) * height * width;
-      int output_offset = (n * channels + c) * pooled_height * pooled_width;
-      const T* offset_output_grad = output_grad + output_offset;
-      T* offset_input_grad = input_grad + input_offset;
-      const int64_t* offset_argmax_data = argmax_data + output_offset;
-
-      int argmax = offset_argmax_data[ph * pooled_width + pw];
-      if (argmax != -1) {
-        platform::CudaAtomicAdd(offset_input_grad + argmax,
+    const int nthreads, const int64_t* input_rois, const T* output_grad,
+    const int64_t* argmax_data, const int num_rois, const float spatial_scale,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, T* input_grad) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;  // threads in the whole grid
+  for (int i = index; i < nthreads; i += offset) {
+    int pw = i % pooled_width;  // unflatten i -> (n, c, ph, pw)
+    int ph = (i / pooled_width) % pooled_height;
+    int c = (i / pooled_width / pooled_height) % channels;
+    int n = i / pooled_width / pooled_height / channels;
+
+    const int64_t* offset_input_rois = input_rois + n * kROISize;
+    int roi_batch_ind = offset_input_rois[0];
+    int input_offset = (roi_batch_ind * channels + c) * height * width;
+    int output_offset = (n * channels + c) * pooled_height * pooled_width;
+    const T* offset_output_grad = output_grad + output_offset;
+    T* offset_input_grad = input_grad + input_offset;
+    const int64_t* offset_argmax_data = argmax_data + output_offset;
+
+    int argmax = offset_argmax_data[ph * pooled_width + pw];
+    if (argmax != -1) {  // skip bins with no recorded argmax
+      platform::CudaAtomicAdd(
+          offset_input_grad + argmax,
           static_cast<T>(offset_output_grad[ph * pooled_width + pw]));
-      }
     }
   }
-
+}
 
 template <typename Place, typename T>
 class GPUROIPoolOpKernel : public framework::OpKernel<T> {
@@ -145,25 +139,18 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
     int width = in_dims[3];
 
     size_t rois_num = rois->dims()[0];
-    if (rois_num== 0) return;
+    if (rois_num == 0) return;
 
     int output_size = out->numel();
     int blocks = NumBlocks(output_size);
     int threads = kNumCUDAThreads;
 
-    GPUROIPoolForward<T>
-      <<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-      output_size,
-      in->data<T>(),
-      rois->data<int64_t>(),
-      spatial_scale,
-      channels,
-      height,
-      width,
-      pooled_height,
-      pooled_width,
-      out->mutable_data<T>(ctx.GetPlace()),
-      argmax->mutable_data<int64_t>(ctx.GetPlace()));
+    GPUROIPoolForward<
+        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+        output_size, in->data<T>(), rois->data<int64_t>(), spatial_scale,
+        channels, height, width, pooled_height, pooled_width,
+        out->mutable_data<T>(ctx.GetPlace()),
+        argmax->mutable_data<int64_t>(ctx.GetPlace()));
   }
 };
 
@@ -175,10 +162,8 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
     auto* rois = ctx.Input<Tensor>("ROIs");
     auto* argmax = ctx.Input<Tensor>("Argmax");
 
-    auto* out_grad =
-        ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* x_grad =
-        ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
 
     auto pooled_height = ctx.Attr<int>("pooled_height");
     auto pooled_width = ctx.Attr<int>("pooled_width");
@@ -199,21 +184,13 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
       int threads = kNumCUDAThreads;
 
       if (output_grad_size > 0) {
-        GPUROIPoolBackward<T>
-          <<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-          output_grad_size,
-          rois->data<int64_t>(),
-          out_grad->data<T>(),
-          argmax->data<int64_t>(),
-          rois_num,
-          spatial_scale,
-          channels,
-          height,
-          width,
-          pooled_height,
-          pooled_width,
-          x_grad->mutable_data<T>(ctx.GetPlace()));
-        }
+        GPUROIPoolBackward<
+            T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+            output_grad_size, rois->data<int64_t>(), out_grad->data<T>(),
+            argmax->data<int64_t>(), rois_num, spatial_scale, channels, height,
+            width, pooled_height, pooled_width,
+            x_grad->mutable_data<T>(ctx.GetPlace()));
+      }
     }
   }
 };
@@ -223,8 +200,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
-    roi_pool,
-    ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, float>,
+    roi_pool, ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, float>,
     ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
     roi_pool_grad,
diff --git a/paddle/operators/roi_pool_op.h b/paddle/operators/roi_pool_op.h
old mode 100755
new mode 100644
index bd7736d631..1691eb482b
--- a/paddle/operators/roi_pool_op.h
+++ b/paddle/operators/roi_pool_op.h
@@ -136,8 +136,7 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
 
     auto* out_grad =
         ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* x_grad =
-        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
 
     auto pooled_height = ctx.Attr<int>("pooled_height");
     auto pooled_width = ctx.Attr<int>("pooled_width");
diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc
old mode 100755
new mode 100644
index cbe0b42331..255683a572
--- a/paddle/operators/sequence_slice_op.cc
+++ b/paddle/operators/sequence_slice_op.cc
@@ -45,7 +45,7 @@ class SequenceSliceOp : public framework::OperatorWithKernel {
     // Initialize the output's dims to maximum,
     // and re-set to real dims by the value of Offset and Length at kernel
     ctx->SetOutputDim("Out", input_dims);
-    }
+  }
 
  protected:
   framework::OpKernelType GetKernelType(
@@ -93,8 +93,7 @@ class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Tensor), "
              "a vector<int> to describe the length of every input sequence for "
              "sub sequence item.");
-    AddOutput("Out",
-              "(LoDTensor), the output of SequenceSliceOp.");
+    AddOutput("Out", "(LoDTensor), the output of SequenceSliceOp.");
     AddComment(R"DOC(
 Sequence slice operator
 
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
index 98b97c75ca..f10bf7e42a 100644
--- a/python/paddle/v2/dataset/uci_housing.py
+++ b/python/paddle/v2/dataset/uci_housing.py
@@ -38,6 +38,7 @@ UCI_TEST_DATA = None
 URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar'
 MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b'
 
+
 def feature_range(maximums, minimums):
     import matplotlib
     matplotlib.use('Agg')
@@ -114,7 +115,8 @@ def test():
 
 
 def model():
-    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar', MD5_MODEL)
+    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar',
+                                                 MD5_MODEL)
     with open(tar_file, 'r') as f:
         parameters = Parameters.from_tar(f)
     return parameters
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
index c96d186ffe..8ca45134dc 100644
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
@@ -35,6 +35,13 @@ opts = optimizer.minimize(avg_cost)
 
 accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
 
+inference_program = fluid.default_main_program().clone()
+test_accuracy = fluid.evaluator.Accuracy(
+    input=predict, label=label, main_program=inference_program)
+test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
+inference_program = fluid.io.get_inference_program(
+    test_target, main_program=inference_program)
+
 train_reader = paddle.batch(
     paddle.reader.shuffle(
         paddle.dataset.mnist.train(), buf_size=8192),
@@ -69,11 +76,6 @@ for pass_id in range(PASS_NUM):
         acc = np.array(outs[1])
         pass_acc = accuracy.eval(exe)
 
-        test_accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
-
-        test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
-        inference_program = fluid.io.get_inference_program(test_target)
-
         test_accuracy.reset(exe)
         for data in test_reader():
             x_data = np.array(map(lambda x: x[0], data)).astype("float32")
diff --git a/python/paddle/v2/fluid/tests/test_maxout_op.py b/python/paddle/v2/fluid/tests/test_maxout_op.py
index 05e42f3158..5fbed43e25 100644
--- a/python/paddle/v2/fluid/tests/test_maxout_op.py
+++ b/python/paddle/v2/fluid/tests/test_maxout_op.py
@@ -30,9 +30,7 @@ class TestMaxOutOp(OpTest):
     def init_test_case(self):
         self.MaxOut_forward_naive = maxout_forward_naive
         self.shape = [100, 6, 2, 2]
-        self.groups=2
-
-
+        self.groups = 2
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_roi_pool_op.py b/python/paddle/v2/fluid/tests/test_roi_pool_op.py
index 7cedb930ca..a28d9c7f82 100644
--- a/python/paddle/v2/fluid/tests/test_roi_pool_op.py
+++ b/python/paddle/v2/fluid/tests/test_roi_pool_op.py
@@ -4,24 +4,22 @@ import math
 import sys
 from op_test import OpTest
 
+
 class TestROIPoolOp(OpTest):
     def set_data(self):
         self.init_test_case()
         self.make_rois()
         self.calc_roi_pool()
 
-        self.inputs = {
-            'X': self.x, 
-            'ROIs': self.rois}
-        
+        self.inputs = {'X': self.x, 'ROIs': self.rois}
+
         self.attrs = {
             'spatial_scale': self.spatial_scale,
             'pooled_height': self.pooled_height,
-            'pooled_width': self.pooled_width}
+            'pooled_width': self.pooled_width
+        }
 
-        self.outputs = {
-            'Out': self.outs,
-            'Argmax': self.argmaxes}
+        self.outputs = {'Out': self.outs, 'Argmax': self.argmaxes}
 
     def init_test_case(self):
         self.batch_size = 5
@@ -30,10 +28,9 @@ class TestROIPoolOp(OpTest):
         self.width = 4
 
         # n, c, h, w
-        self.x_dim = (self.batch_size, self.channels,
-                      self.height, self.width)
+        self.x_dim = (self.batch_size, self.channels, self.height, self.width)
 
-        self.spatial_scale = 1.0/4.0
+        self.spatial_scale = 1.0 / 4.0
         self.pooled_height = 2
         self.pooled_width = 2
         self.rois_num = 2
@@ -41,13 +38,11 @@ class TestROIPoolOp(OpTest):
         self.x = np.random.random(self.x_dim).astype('float32')
 
     def calc_roi_pool(self):
-        out_data = np.zeros(
-            (self.rois_num, self.channels,
-            self.pooled_height, self.pooled_width))
-        argmax_data = np.zeros(
-            (self.rois_num, self.channels,
-            self.pooled_height, self.pooled_width))
-            
+        out_data = np.zeros((self.rois_num, self.channels, self.pooled_height,
+                             self.pooled_width))
+        argmax_data = np.zeros((self.rois_num, self.channels,
+                                self.pooled_height, self.pooled_width))
+
         for i in range(self.rois_num):
             roi = self.rois[i]
             roi_batch_id = roi[0]
@@ -56,8 +51,8 @@ class TestROIPoolOp(OpTest):
             roi_end_w = int(round(roi[3] * self.spatial_scale))
             roi_end_h = int(round(roi[4] * self.spatial_scale))
 
-            roi_height = int(max(roi_end_h - roi_start_h + 1, 1));
-            roi_width = int(max(roi_end_w - roi_start_w + 1, 1));
+            roi_height = int(max(roi_end_h - roi_start_h + 1, 1))
+            roi_width = int(max(roi_end_w - roi_start_w + 1, 1))
 
             x_i = self.x[roi_batch_id]
 
@@ -84,7 +79,7 @@ class TestROIPoolOp(OpTest):
                             out_data[i, c, ph, pw] = -sys.float_info.max
 
                         argmax_data[i, c, ph, pw] = -1
-                        
+
                         for h in range(hstart, hend):
                             for w in range(wstart, wend):
                                 if x_i[c, h, w] > out_data[i, c, ph, pw]:
@@ -104,11 +99,11 @@ class TestROIPoolOp(OpTest):
             y1 = np.random.random_integers(
                 0, self.height / self.spatial_scale - self.pooled_height)
 
-            x2 = np.random.random_integers(
-                x1 + self.pooled_width, self.width / self.spatial_scale)
-            y2 = np.random.random_integers(
-                y1 + self.pooled_height, self.height / self.spatial_scale)
-            
+            x2 = np.random.random_integers(x1 + self.pooled_width,
+                                           self.width / self.spatial_scale)
+            y2 = np.random.random_integers(y1 + self.pooled_height,
+                                           self.height / self.spatial_scale)
+
             roi = [batch_ids[i], x1, y1, x2, y2]
             rois.append(roi)
         self.rois = np.array(rois).astype("int64")
@@ -123,5 +118,6 @@ class TestROIPoolOp(OpTest):
     def test_check_grad(self):
         self.check_grad(['X'], 'Out')
 
+
 if __name__ == '__main__':
     unittest.main()

From 539462839bced49df37f77a06838de5cf6354410 Mon Sep 17 00:00:00 2001
From: peterzhang2029 <zhangchao41@baidu.com>
Date: Mon, 27 Nov 2017 12:57:39 +0800
Subject: [PATCH 33/52] bug fix when using hsigmoid with gpu

---
 .../layers/HierarchicalSigmoidLayer.cpp       | 78 +++++++++----------
 .../gserver/layers/HierarchicalSigmoidLayer.h |  1 -
 2 files changed, 38 insertions(+), 41 deletions(-)

diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
index f93a9937d1..6317b66a45 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
@@ -75,10 +75,10 @@ void HierarchicalSigmoidLayer::forward(PassType passType) {
 
   if (useGpu_) {
     Matrix::resizeOrCreate(cpuOutput_,
-                         output_.value->getHeight(),
-                         output_.value->getWidth(),
-                          /* trans */ false,
-                          false);
+                           output_.value->getHeight(),
+                           output_.value->getWidth(),
+                           /* trans */ false,
+                           false);
     IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
     cpuLabel_->copyFrom(*label);
     cpuOutput_->copyFrom(*output_.value);
@@ -90,10 +90,10 @@ void HierarchicalSigmoidLayer::forward(PassType passType) {
   if (biases_.get() != NULL) {
     if (useGpu_) {
       Matrix::resizeOrCreate(cpuBias_,
-                          1,
-                          numClasses_ - 1,
-                          /* trans */ false,
-                          false);
+                             1,
+                             numClasses_ - 1,
+                             /* trans */ false,
+                             false);
       cpuBias_->copyFrom(*biases_->getW());
     } else {
       cpuBias_ = biases_->getW();
@@ -104,15 +104,15 @@ void HierarchicalSigmoidLayer::forward(PassType passType) {
     MatrixPtr input = getInputValue(i);
     if (useGpu_) {
       Matrix::resizeOrCreate(cpuInput_,
-                          input->getHeight(),
-                          input->getWidth(),
-                          /* trans */ false,
-                          false);
+                             input->getHeight(),
+                             input->getWidth(),
+                             /* trans */ false,
+                             false);
       Matrix::resizeOrCreate(cpuWeight_,
-                          weights_[i]->getW()->getHeight(),
-                          weights_[i]->getW()->getWidth(),
-                          /* trans */ false,
-                          false);
+                             weights_[i]->getW()->getHeight(),
+                             weights_[i]->getW()->getWidth(),
+                             /* trans */ false,
+                             false);
       cpuInput_->copyFrom(*input);
       cpuWeight_->copyFrom(*weights_[i]->getW());
     } else {
@@ -129,8 +129,7 @@ void HierarchicalSigmoidLayer::forward(PassType passType) {
                                  *cpuOutput_,
                                  -1);  // scaleSum
   preOutput_.value->softrelu(*preOutput_.value);
-  MatrixPtr sum =
-      Matrix::create(batchSize, 1, /* trans= */ false, false);
+  MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false);
   preOutput_.value->rowSum(*sum);
   cpuOutput_->add(*sum);
   if (useGpu_) {
@@ -156,16 +155,15 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
     MatrixPtr biases_grad = biases_->getWGrad();
     if (useGpu_) {
       Matrix::resizeOrCreate(cpuBias_,
-                            1,
-                            numClasses_ - 1,
-                            /* trans */ false,
-                            false);
+                             1,
+                             numClasses_ - 1,
+                             /* trans */ false,
+                             false);
       cpuBias_->copyFrom(*biases_grad);
     } else {
       cpuBias_ = biases_grad;
     }
-    preOutput_.grad->addByBitCodeBackward(
-        numClasses_, *cpuLabel_, *cpuBias_);
+    preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_);
     if (useGpu) {
       biases_grad->copyFrom(*cpuBias_);
     } else {
@@ -182,15 +180,15 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
       MatrixPtr weights_grad = weights_[i]->getWGrad();
       if (useGpu_) {
         Matrix::resizeOrCreate(cpuInput_,
-                              input->getHeight(),
-                              input->getWidth(),
-                              /* trans */ false,
-                              false);
+                               input->getHeight(),
+                               input->getWidth(),
+                               /* trans */ false,
+                               false);
         Matrix::resizeOrCreate(cpuWeightGrad_,
-                            weights_grad->getHeight(),
-                            weights_grad->getWidth(),
-                            /* trans */ false,
-                            false);
+                               weights_grad->getHeight(),
+                               weights_grad->getWidth(),
+                               /* trans */ false,
+                               false);
         cpuInput_->copyFrom(*input);
         cpuWeightGrad_->copyFrom(*weights_grad);
       } else {
@@ -213,15 +211,15 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
     if (inputGrad) {
       if (useGpu_) {
         Matrix::resizeOrCreate(cpuInputGrad_,
-                              inputGrad->getHeight(),
-                              inputGrad->getWidth(),
-                              /* trans */ false,
-                              false);
+                               inputGrad->getHeight(),
+                               inputGrad->getWidth(),
+                               /* trans */ false,
+                               false);
         Matrix::resizeOrCreate(cpuWeight_,
-                              weights_[i]->getW()->getHeight(),
-                              weights_[i]->getW()->getWidth(),
-                              /* trans */ false,
-                              false);
+                               weights_[i]->getW()->getHeight(),
+                               weights_[i]->getW()->getWidth(),
+                               /* trans */ false,
+                               false);
         cpuInputGrad_->copyFrom(*inputGrad);
         cpuWeight_->copyFrom(*weights_[i]->getW());
       } else {
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
index 2483572ded..7f896e61ca 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
@@ -89,7 +89,6 @@ protected:
   MatrixPtr cpuBias_;
   MatrixPtr cpuOutput_;
   IVectorPtr cpuLabel_;
-
 };
 
 }  // namespace paddle

From 8a283dbc9e78f8c2f00d04180986abfb7d6b29df Mon Sep 17 00:00:00 2001
From: wangmeng28 <wangmeng28@baidu.com>
Date: Mon, 27 Nov 2017 19:13:28 +0800
Subject: [PATCH 34/52] Update docs for fm layer

---
 .../paddle/trainer_config_helpers/layers.py   | 21 ++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 32287cce6c..288aebb5b4 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -7423,18 +7423,25 @@ def factorization_machine(input,
     Factorization machines.
 
     .. code-block:: python
-       factor_machine = factorization_machine(input=input_layer, factor_size=10)
-
-    :param input: The input layer.
+        first_order = paddle.layer.fc(input=input,
+                                      size=1,
+                                      act=paddle.activation.Linear())
+        second_order = paddle.layer.factorization_machine(input=input,
+                                                          factor_size=10)
+        fm = paddle.layer.addto(input=[first_order, second_order],
+                                act=paddle.activation.Linear(),
+                                bias_attr=False)
+
+    :param input: The input layer. Supported input types: all input data types
+                  on CPU, and only dense input types on GPU.
     :type input: LayerOutput
     :param factor_size: The hyperparameter that defines the dimensionality of
-                        the latent vector size
+                        the latent vector size.
     :type context_len: int
     :param act: Activation Type. Default is linear activation.
     :type act: BaseActivation
-    :param param_attr: The Parameter Attribute. If None, the latent vectors will
-                       be initialized smartly. It's better to set it by
-                       yourself.
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
     :param layer_attr: Extra Layer config.
     :type layer_attr: ExtraLayerAttribute|None

From 90fc4a6cd5c47eff93fc5554f0c456841fec1272 Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Mon, 27 Nov 2017 19:34:11 +0800
Subject: [PATCH 35/52] Complete shrink_rnn_memory_op comments (#5935)

* Complete shrink_rnn_memory_op comments

* Update
---
 paddle/operators/shrink_rnn_memory_op.cc | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc
index 48597c1d2a..c380e60686 100644
--- a/paddle/operators/shrink_rnn_memory_op.cc
+++ b/paddle/operators/shrink_rnn_memory_op.cc
@@ -57,11 +57,21 @@ class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker {
   ShrinkRNNMemoryOpProtoMaker(framework::OpProto *proto,
                               framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "");
-    AddInput("RankTable", "");
-    AddInput("I", "");
-    AddOutput("Out", "");
-    AddComment("");
+    AddInput("X", "(LoDTensor) The RNN step memory to be shrinked.");
+    AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN.");
+    AddInput("I",
+             "(LoDTensor) The step index. The RNN step memory 'X' will be "
+             "shrinked to match the size of the input of the index'th step.");
+    AddOutput("Out", "(LoDTensor) The shrinked RNN step memory.");
+    AddComment(
+        R"DOC(
+        In dynamic RNN, we are able to handle sequences of different lengths. 
+        Because of the multiple lengths, the size of each step input can be 
+        different, which may lead to a mismatching between the input of
+        the current step and the memory generated by the previous one. This 
+        operator shrinks memory according to the size of the next step input, 
+        to make sure that they can match each other.
+        )DOC");
   }
 };
 

From d4c2f2f219d3719a32f48a0c2975b736cd8f5c02 Mon Sep 17 00:00:00 2001
From: ranqiu <ranqiu@baidu.com>
Date: Mon, 27 Nov 2017 19:57:56 +0800
Subject: [PATCH 36/52] Refine the doc of layers.py

---
 .../paddle/trainer_config_helpers/layers.py   | 48 +++++++++----------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 469e667e80..b0f21bdb46 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -2985,8 +2985,8 @@ def spp_layer(input,
     A layer performs spatial pyramid pooling.
 
     Reference:
-        Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition
-        https://arxiv.org/abs/1406.4729
+        `Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition
+        <https://arxiv.org/abs/1406.4729>`_
 
     The example usage is:
 
@@ -3087,8 +3087,8 @@ def img_cmrnorm_layer(input,
     Response normalization across feature maps.
 
     Reference:
-        ImageNet Classification with Deep Convolutional Neural Networks
-        http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf
+        `ImageNet Classification with Deep Convolutional Neural Networks
+        <http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf>`_
 
     The example usage is:
 
@@ -3154,9 +3154,9 @@ def batch_norm_layer(input,
         y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
 
     Reference:
-        Batch Normalization: Accelerating Deep Network Training by Reducing
+        `Batch Normalization: Accelerating Deep Network Training by Reducing
         Internal Covariate Shift
-        http://arxiv.org/abs/1502.03167
+        <http://arxiv.org/abs/1502.03167>`_
 
     The example usage is:
 
@@ -5413,10 +5413,10 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
     to be devided by groups.
 
     Reference:
-        Maxout Networks
-        http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
-        Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks
-        https://arxiv.org/pdf/1312.6082v4.pdf
+        `Maxout Networks
+        <http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf>`_
+        `Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks
+        <https://arxiv.org/pdf/1312.6082v4.pdf>`_
 
     .. math::
        y_{si+j} = \max_k x_{gsi + sk + j}
@@ -5481,9 +5481,9 @@ def ctc_layer(input,
     alignment between the inputs and the target labels is unknown.
 
     Reference:
-        Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
+        `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
         with Recurrent Neural Networks
-        http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
+        <http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf>`_
 
     Note:
         Considering the 'blank' label needed by CTC, you need to use (num_classes + 1)
@@ -5555,9 +5555,9 @@ def warp_ctc_layer(input,
     install it to :code:`third_party/install/warpctc` directory.
 
     Reference:
-        Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
+        `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
         with Recurrent Neural Networks
-        http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
+        <http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf>`_
 
     Note:
         - Let num_classes represents the category number. Considering the 'blank'
@@ -5777,8 +5777,8 @@ def nce_layer(input,
     Noise-contrastive estimation.
 
     Reference:
-        A fast and simple algorithm for training neural probabilistic language
-        models. https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf
+        `A fast and simple algorithm for training neural probabilistic language
+        models. <https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf>`_
 
     The example usage is:
 
@@ -5893,8 +5893,8 @@ def rank_cost(left,
     A cost Layer for learning to rank using gradient descent.
 
     Reference:
-        Learning to Rank using Gradient Descent
-        http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf
+        `Learning to Rank using Gradient Descent
+        <http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf>`_
 
     .. math::
 
@@ -6429,8 +6429,8 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
         smooth_{L1}(x) = \\begin{cases} 0.5x^2& \\text{if}  \\ |x| < 1 \\\\ |x|-0.5& \\text{otherwise} \end{cases}
 
     Reference:
-        Fast R-CNN
-        https://arxiv.org/pdf/1504.08083v2.pdf
+        `Fast R-CNN
+        <https://arxiv.org/pdf/1504.08083v2.pdf>`_
 
     The example usage is:
 
@@ -6636,8 +6636,8 @@ def prelu_layer(input,
     The Parametric Relu activation that actives outputs with a learnable weight.
 
     Reference:
-        Delving Deep into Rectifiers: Surpassing Human-Level Performance on
-        ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf
+        `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+        ImageNet Classification <http://arxiv.org/pdf/1502.01852v1.pdf>`_
 
     .. math::
        z_i &\\quad if \\quad z_i > 0 \\\\
@@ -6733,8 +6733,8 @@ def gated_unit_layer(input,
     product between :match:`X'` and :math:`\sigma` is finally returned.
 
     Reference:
-        Language Modeling with Gated Convolutional Networks
-        https://arxiv.org/abs/1612.08083
+        `Language Modeling with Gated Convolutional Networks
+        <https://arxiv.org/abs/1612.08083>`_
 
     .. math::
        y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c)

From ef3420e2b940d23bbc5cbb1b80d4bca457507257 Mon Sep 17 00:00:00 2001
From: Abhinav Arora <abhinavarora28@gmail.com>
Date: Mon, 27 Nov 2017 19:02:42 +0530
Subject: [PATCH 37/52] Fix the latex comment syntax in sgd_op.cc (#5940)

* Fix the latex comment syntax in sgd_op.cc

* Change \textunderscore to \_
---
 paddle/operators/sgd_op.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc
index 72f4e4d5cb..5576d7b8be 100644
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -55,7 +55,7 @@ SGD operator
 
 This operator implements one step of the stochastic gradient descent algorithm.
 
-$$param_out = param - learning_rate * grad$$
+$$param\_out = param - learning\_rate * grad$$
 
 )DOC");
   }

From 966a442eb0799b6e25d601d2f27affc1cc74aefd Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Mon, 27 Nov 2017 21:53:16 +0800
Subject: [PATCH 38/52] fix grep socket error in lscpu command

---
 python/paddle/v2/__init__.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
index 33a0829ba8..70f61e8499 100644
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -83,11 +83,10 @@ def set_omp_mkl_env_vars(trainer_count):
         '''Get the number of physical cores'''
         if platform.system() == "Linux":
             num_sockets = int(
-                os.popen("lscpu |grep \"Socket\" |awk -F':' '{print $2}'|xargs")
+                os.popen("grep 'physical id' /proc/cpuinfo | sort -u | wc -l")
                 .read())
             num_cores_per_socket = int(
-                os.popen(
-                    "lscpu |grep \"per socket\" |awk -F':' '{print $2}'|xargs")
+                os.popen("grep 'core id' /proc/cpuinfo | sort -u | wc -l")
                 .read())
             return num_sockets * num_cores_per_socket
         else:

From f96bc313e87a8a8ef73907d153c28e117e3c8d3f Mon Sep 17 00:00:00 2001
From: Yancey <yancey1989@gmail.com>
Date: Tue, 28 Nov 2017 10:34:49 +0800
Subject: [PATCH 39/52] fix path env in build.sh (#5948)

---
 paddle/scripts/docker/build.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index fda2a2f1b7..a2fdc5ce69 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -16,11 +16,13 @@ function cmake_gen() {
         echo "using python abi: $1"
         if [ "$1" == "cp27-cp27m" ]; then
             export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
+            export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
             PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
         -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
         -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
         elif [ "$1" == "cp27-cp27mu" ]; then
             export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
+            export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
             PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
         -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
         -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"

From dc82a30908d0d75948491b0a669abfd690b4acce Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 28 Nov 2017 10:41:07 +0800
Subject: [PATCH 40/52] Refine CheckStyle Script (#5942)

* Refine CheckStyle Script

* Disable linkchecker for build_doc.sh
---
 .travis.yml                        | 2 +-
 paddle/scripts/travis/build_doc.sh | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index c51e02eb79..e2d49daa19 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -42,7 +42,7 @@ before_install:
 script:
   - |
     timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
-    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
+    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi;
   - |
     if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
     if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh
index 28d82343ed..7d54f0254c 100755
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -11,8 +11,9 @@ make -j `nproc` gen_proto_py
 make -j `nproc` paddle_docs paddle_docs_cn
 
 # check websites for broken links
-linkchecker doc/en/html/index.html
-linkchecker doc/cn/html/index.html
+# Disabled for now because the link check currently fails.
+#linkchecker doc/en/html/index.html
+#linkchecker doc/cn/html/index.html
 
 # Parse Github URL
 REPO=`git config remote.origin.url`

From a88d98c413d3ba70c37228e3d9d5e1cda77e9fa0 Mon Sep 17 00:00:00 2001
From: wanghaoshuang <wanghaoshuang@baidu.com>
Date: Tue, 28 Nov 2017 10:46:31 +0800
Subject: [PATCH 41/52] Add comments

---
 python/paddle/trainer/config_parser.py         | 16 ++++++++--------
 python/paddle/trainer_config_helpers/layers.py |  1 +
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 9ec6ba6347..deb77e6fd7 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2400,15 +2400,14 @@ class CropLayer(LayerBase):
         image_conf.img_size_y = input_layer.height
         image_conf.channels = input_layer.size / (input_layer.width *
                                                   input_layer.height)
-
+        # Only 4-D inputs in NCHW order are supported.
         if (len(self.config.inputs) == 2):
             self.set_layer_height_width(
                 self.get_input_layer(1).height, self.get_input_layer(1).width)
             self.set_layer_size(self.get_input_layer(1).size)
         else:
-            # NCHW order
             self.set_layer_height_width(shape[-2], shape[-1])
-            self.set_layer_size(reduce(lambda x, y: x * y, shape))
+            self.set_layer_size(reduce(lambda x, y: x * y, shape[1:]))
 
 
 @config_layer('batch_norm')
@@ -3865,18 +3864,19 @@ class SwitchOrderLayer(LayerBase):
         else:
             in_h = input_layer.height
             in_w = input_layer.width
+            out_dims = None
             if input_layer.has_depth():
                 in_d = input_layer.depth
                 in_c = input_layer.size / in_h / in_w / in_d
+                # batch_size, depth, height, width, channel
                 out_dims = [0, in_d, in_h, in_w, in_c]
-                size = reduce(lambda x, y: x * y,
-                              out_dims[reshape['width'][0]:])
             else:
                 in_c = input_layer.size / in_h / in_w
+                # batch_size, height, width, channel
                 out_dims = [0, in_h, in_w, in_c]
-                size = reduce(lambda x, y: x * y,
-                              out_dims[reshape['width'][0]:])
-
+            # (reshape['width'][0] > 0) is always true,
+            # so out_dims[0] is never used.
+            size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:])
             self.set_layer_size(size)
 
 
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 8e127c9489..bfa395ee13 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -6854,6 +6854,7 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
 
     :param input: The input of this layer. If two inputs are given, the second one
                   will be regarded as the reference.
+                  The input must be 4-dimensional and in NCHW order.
     :type input: LayerOutput | Sequence
     :param offset: The crop offset.
     :type offset: Sequence

From 0a8a86e0c9733dd85e82c58d2042d1abb7c85b73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=AD=A6=E6=AF=85?= <typhoonzero1986@gmail.com>
Date: Tue, 28 Nov 2017 11:02:24 +0800
Subject: [PATCH 42/52] Send recv op (#5520)

* WIP send recv op

* WIP send recv

* put grpc impl in details

* put grpc impl in details

* update wip

* update proto

* update proto

* update proto

* clean cmake

* wip on op implementations

* wip on op implementations

* compile ok adding ut

* wip unitest

* add extern cares for linking

* wip add ut

* working version send recv

* revert optimizer.py

* update test cmake

* add libtool to dockerfile

* update cmake dependency

* update cmake depends

* update cmake grpc depends

* fix cmake dependency

* fix compile error

* fix compile

* follow comments

* update

* update copyfrom
---
 .clang-format                                |   1 -
 CMakeLists.txt                               |   2 +
 Dockerfile                                   |   2 +-
 cmake/external/cares.cmake                   |  45 +++++
 cmake/external/grpc.cmake                    |  58 +++++++
 cmake/external/zlib.cmake                    |   2 +
 cmake/generic.cmake                          |  47 ++++++
 paddle/framework/lod_tensor.cc               | 163 +++++++++++++++++--
 paddle/framework/lod_tensor.h                |   9 +
 paddle/operators/CMakeLists.txt              |  25 ++-
 paddle/operators/detail/CMakeLists.txt       |   1 +
 paddle/operators/detail/recv_impl.cc         |  44 +++++
 paddle/operators/detail/send_impl.cc         |  54 ++++++
 paddle/operators/detail/send_recv.proto      |  37 +++++
 paddle/operators/detail/send_recv_impl.h     |  87 ++++++++++
 paddle/operators/detail/simple_block_queue.h |  52 ++++++
 paddle/operators/load_op.cc                  |  56 +------
 paddle/operators/recv_op.cc                  | 121 ++++++++++++++
 paddle/operators/save_op.cc                  |  68 +-------
 paddle/operators/send_op.cc                  |  84 ++++++++++
 paddle/operators/send_recv_op_test.cc        | 125 ++++++++++++++
 21 files changed, 941 insertions(+), 142 deletions(-)
 create mode 100644 cmake/external/cares.cmake
 create mode 100644 cmake/external/grpc.cmake
 create mode 100644 paddle/operators/detail/CMakeLists.txt
 create mode 100644 paddle/operators/detail/recv_impl.cc
 create mode 100644 paddle/operators/detail/send_impl.cc
 create mode 100644 paddle/operators/detail/send_recv.proto
 create mode 100644 paddle/operators/detail/send_recv_impl.h
 create mode 100644 paddle/operators/detail/simple_block_queue.h
 create mode 100644 paddle/operators/recv_op.cc
 create mode 100644 paddle/operators/send_op.cc
 create mode 100644 paddle/operators/send_recv_op_test.cc

diff --git a/.clang-format b/.clang-format
index 9ba433b173..aff93435f5 100644
--- a/.clang-format
+++ b/.clang-format
@@ -25,4 +25,3 @@ AllowAllParametersOfDeclarationOnNextLine: true
 BinPackParameters: false
 BinPackArguments: false
 ...
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 65164b8472..e76512166f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -133,6 +133,8 @@ include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/nccl)
+include(external/cares)
+include(external/grpc)
 
 include(cudnn)              # set cudnn libraries, must before configure
 include(configure)          # add paddle env configuration
diff --git a/Dockerfile b/Dockerfile
index 150344a811..857d3f3e5f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,7 +29,7 @@ RUN apt-get update && \
     automake locales clang-format swig doxygen cmake  \
     liblapack-dev liblapacke-dev libboost-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
-    net-tools && \
+    net-tools libtool && \
     apt-get clean -y
 
 # Install Go and glide
diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake
new file mode 100644
index 0000000000..e05111ee18
--- /dev/null
+++ b/cmake/external/cares.cmake
@@ -0,0 +1,45 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+IF(MOBILE_INFERENCE)
+    return()
+ENDIF()
+
+include (ExternalProject)
+
+# NOTE: c-ares is needed when linking with grpc.
+
+SET(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares)
+SET(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares)
+SET(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." FORCE)
+
+ExternalProject_Add(
+    extern_cares
+    GIT_REPOSITORY "https://github.com/c-ares/c-ares.git"
+    GIT_TAG "cares-1_13_0"
+    PREFIX          ${CARES_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR}
+    BUILD_IN_SOURCE 1
+    BUILD_COMMAND   make
+    INSTALL_COMMAND make install
+)
+
+ADD_LIBRARY(cares STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET cares PROPERTY IMPORTED_LOCATION
+             "${CARES_INSTALL_DIR}/lib/libcares.a")
+
+include_directories(${CARES_INCLUDE_DIR})
+ADD_DEPENDENCIES(cares extern_cares)
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
new file mode 100644
index 0000000000..f431c037fd
--- /dev/null
+++ b/cmake/external/grpc.cmake
@@ -0,0 +1,58 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+IF(MOBILE_INFERENCE)
+    return()
+ENDIF()
+
+include (ExternalProject)
+
+SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
+SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
+SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
+SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
+
+ExternalProject_Add(
+    extern_grpc
+    DEPENDS protobuf zlib
+    GIT_REPOSITORY "https://github.com/grpc/grpc.git"
+    GIT_TAG "v1.7.x"
+    PREFIX          ${GRPC_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_IN_SOURCE 1
+    BUILD_COMMAND   make
+    INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
+)
+
+# FIXME(typhoonzero): hack to get the static lib paths; find a better way, e.g. merging them.
+ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
+             "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")
+
+ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION
+            "${GRPC_INSTALL_DIR}/lib/libgrpc++.a")
+ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION
+            "${GRPC_INSTALL_DIR}/lib/libgpr.a")
+
+ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION
+            "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a")
+
+include_directories(${GRPC_INCLUDE_DIR})
+ADD_DEPENDENCIES(grpc++_unsecure extern_grpc)
+
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index a98e069b7c..1638cd8fdf 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -50,6 +50,8 @@ ExternalProject_Add(
 )
 
 LIST(APPEND external_project_dependencies zlib)
+ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
 
 IF(WITH_C_API)
   INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 7b82d409a3..c917ca0ff4 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -467,3 +467,50 @@ function(py_test TARGET_NAME)
              WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
   endif()
 endfunction()
+
+# grpc_library generates gRPC code using grpc_cpp_plugin and protoc,
+# then builds the generated protobuf and gRPC code together with your
+# implementation sources. Use the SRCS argument for your
+# implementation source files and the PROTO argument for your .proto
+# file.
+#
+# Usage: grpc_library(my_target SRCS my_client.cc PROTO my_target.proto DEPS my_dep)
+
+function(grpc_library TARGET_NAME)
+  set(oneValueArgs PROTO)
+  set(multiValueArgs SRCS DEPS)
+  set(options "")
+  cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  message(STATUS "generating grpc ${grpc_library_PROTO}")
+
+  get_filename_component(ABS_PROTO ${grpc_library_PROTO} ABSOLUTE)
+  get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE)
+  get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
+
+  protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
+  set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
+  set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
+  cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}")
+
+  add_custom_command(
+          OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}"
+          COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+          ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
+          --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
+          DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
+
+  # FIXME(typhoonzero): grpc-generated code lacks virtual destructors, so demote
+  # these diagnostics from errors to warnings. Should try to remove the warnings too.
+  set_source_files_properties(
+    ${grpc_grpc_srcs}
+    PROPERTIES
+    COMPILE_FLAGS  "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}")
+
+  set_source_files_properties(
+    ${grpc_library_SRCS}
+    PROPERTIES
+    COMPILE_FLAGS  "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
+endfunction()
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index a0f2906c74..fdf6de4bab 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -13,6 +13,8 @@
    limitations under the License. */
 
 #include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
 
 #include "paddle/memory/memcpy.h"
 #include "paddle/memory/memory.h"
@@ -27,11 +29,11 @@
 namespace paddle {
 namespace framework {
 
-std::ostream& operator<<(std::ostream& os, const LoD& lod) {
+std::ostream &operator<<(std::ostream &os, const LoD &lod) {
   os << "{";
-  for (auto& v : lod) {
+  for (auto &v : lod) {
     os << "{";
-    for (auto& i : v) {
+    for (auto &i : v) {
       os << i << ",";
     }
     os << "}";
@@ -41,7 +43,7 @@ std::ostream& operator<<(std::ostream& os, const LoD& lod) {
   return os;
 }
 
-LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
+LoD SliceLevels(const LoD &in, size_t level_begin, size_t level_end) {
   LoD new_lod;
   new_lod.reserve(level_end - level_begin);
   for (size_t i = level_begin; i < level_end; i++) {
@@ -53,7 +55,7 @@ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
   return new_lod;
 }
 
-LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
+LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
                  size_t elem_end) {
   PADDLE_ENFORCE_LT(level, in.size());
   PADDLE_ENFORCE_LT(elem_end, in[level].size());
@@ -64,9 +66,9 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
   res[0].assign(in[level].begin() + elem_begin,
                 in[level].begin() + elem_end + 1);
   for (size_t lvl = 1; lvl < res.size(); lvl++) {
-    const auto& in_level = in[level + lvl];
-    const auto& above_level = res[lvl - 1];
-    auto& out_level = res[lvl];
+    const auto &in_level = in[level + lvl];
+    const auto &above_level = res[lvl - 1];
+    auto &out_level = res[lvl];
     out_level.assign(in_level.begin() + above_level.front(),
                      in_level.begin() + above_level.back() + 1);
   }
@@ -74,33 +76,33 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
     // to make the first offset equals 0, all the elements minus the first
     // element
     size_t front = res[lvl].front();
-    for (auto& ele : res[lvl]) {
+    for (auto &ele : res[lvl]) {
       ele -= front;
     }
   }
   return res;
 }
 
-LoD ToAbsOffset(const LoD& in) {
+LoD ToAbsOffset(const LoD &in) {
   // the lowest level stores relative offsets
   if (in.empty() || in.size() == 1) return in;
   LoD result = in;
   for (int level = result.size() - 2; level >= 0; level--) {
-    for (auto& ele : result[level]) {
+    for (auto &ele : result[level]) {
       ele = result[level + 1][ele];
     }
   }
   return result;
 }
 
-bool operator==(const LoD& a, const LoD& b) {
+bool operator==(const LoD &a, const LoD &b) {
   if (a.size() != b.size()) {
     return false;
   }
 
   for (size_t i = 0; i < a.size(); i++) {
-    const auto& a_level = a[i];
-    const auto& b_level = b[i];
+    const auto &a_level = a[i];
+    const auto &b_level = b[i];
     if (a_level.size() != b_level.size()) {
       return false;
     }
@@ -151,7 +153,7 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
 }
 
 using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
-LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
+LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
                                         size_t end_idx, size_t start_level) {
   LoD sub_lod;
 
@@ -170,7 +172,7 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
   return LoDAndOffset{sub_lod, {start_idx, end_idx}};
 }
 
-void AppendLoD(LoD* lod, const LoD& lod_length) {
+void AppendLoD(LoD *lod, const LoD &lod_length) {
   PADDLE_ENFORCE(
       lod->empty() || lod->size() == lod_length.size(),
       "The lod_length should has the same size with the appended lod.");
@@ -178,12 +180,139 @@ void AppendLoD(LoD* lod, const LoD& lod_length) {
     *lod = LoD(lod_length.size(), std::vector<size_t>({0}));
   }
   for (size_t i = 0; i < lod->size(); ++i) {
-    auto& level = (*lod)[i];
+    auto &level = (*lod)[i];
     for (size_t len : lod_length[i]) {
       level.push_back(level.back() + len);
     }
   }
 }
 
+void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
+                       const platform::DeviceContext &dev_ctx) {
+  // TODO(typhoonzero): serialize to ostream
+  {  // the 1st field, uint32_t version
+    constexpr uint32_t version = 0;
+    os.write(reinterpret_cast<const char *>(&version), sizeof(version));
+  }
+  {  // the 2nd field, tensor description
+     // int32_t  size
+     // void*    protobuf message
+    framework::TensorDesc desc;
+    desc.set_data_type(framework::ToDataType(tensor.type()));
+    auto dims = framework::vectorize(tensor.dims());
+    auto *pb_dims = desc.mutable_dims();
+    pb_dims->Resize(static_cast<int>(dims.size()), 0);
+    std::copy(dims.begin(), dims.end(), pb_dims->begin());
+    int32_t size = desc.ByteSize();
+    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+    auto out = desc.SerializeAsString();
+    os.write(out.data(), size);
+  }
+  {  // the 3rd field, tensor data
+    uint64_t size = tensor.memory_size();
+    auto *data_ptr = tensor.data<void>();
+    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
+                   "Index overflow when writing tensor");
+    if (platform::is_gpu_place(tensor.place())) {
+#ifdef PADDLE_WITH_CUDA
+      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
+      std::unique_ptr<char[]> buf(new char[kBufSize]);
+      auto &gpu_dev_ctx =
+          static_cast<const platform::CUDADeviceContext &>(dev_ctx);
+      platform::CPUPlace cpu;
+      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
+      while (size != 0) {
+        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
+        memory::Copy(cpu, buf.get(),
+                     boost::get<platform::GPUPlace>(tensor.place()),
+                     reinterpret_cast<const void *>(data), size_to_write,
+                     gpu_dev_ctx.stream());
+        gpu_dev_ctx.Wait();
+        os.write(buf.get(), size_to_write);
+        data += size_to_write;
+        size -= size_to_write;
+      }
+#else
+      PADDLE_THROW("Unexpected branch");
+#endif
+    } else {
+      os.write(static_cast<const char *>(data_ptr),
+               static_cast<std::streamsize>(size));
+    }
+  }
+  {  // the 4th field, lod information
+     // uint64_t lod_level
+     // uint64_t lod_level_1 size in bytes.
+     // size_t*  lod_level_1 data
+     // ...
+    auto lod = tensor.lod();
+    uint64_t size = lod.size();
+    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+
+    for (auto &each : lod) {
+      size = each.size() * sizeof(framework::LoD::value_type::value_type);
+      os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+      os.write(reinterpret_cast<const char *>(each.data()),
+               static_cast<std::streamsize>(size));
+    }
+  }
+}
+
+void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
+  uint32_t version;
+  is.read(reinterpret_cast<char *>(&version), sizeof(version));
+  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  framework::TensorDesc desc;
+  {  // int32_t size
+     // proto buffer
+    int32_t size;
+    is.read(reinterpret_cast<char *>(&size), sizeof(size));
+    std::unique_ptr<char[]> buf(new char[size]);
+    is.read(reinterpret_cast<char *>(buf.get()), size);
+    PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
+                   "Cannot parse tensor desc");
+  }
+  {  // read tensor
+    std::vector<int64_t> dims;
+    dims.reserve(static_cast<size_t>(desc.dims().size()));
+    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
+    tensor->Resize(framework::make_ddim(dims));
+
+    void *buf;
+    platform::Place cpu = platform::CPUPlace();
+    switch (desc.data_type()) {
+      case framework::FP32:
+        buf = tensor->mutable_data<float>(cpu);
+        break;
+      case framework::FP64:
+        buf = tensor->mutable_data<double>(cpu);
+        break;
+      case framework::INT32:
+        buf = tensor->mutable_data<int>(cpu);
+        break;
+      case framework::INT64:
+        buf = tensor->mutable_data<int64_t>(cpu);
+        break;
+      default:
+        PADDLE_THROW("DataType %d not supported", desc.data_type());
+    }
+    is.read(static_cast<char *>(buf), tensor->memory_size());
+  }
+  {  // read lod
+    uint64_t lod_level;
+    is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
+    auto &lod = *tensor->mutable_lod();
+    lod.resize(lod_level);
+    for (uint64_t i = 0; i < lod_level; ++i) {
+      uint64_t size;
+      is.read(reinterpret_cast<char *>(&size), sizeof(size));
+      std::vector<size_t> tmp(size / sizeof(size_t));
+      is.read(reinterpret_cast<char *>(tmp.data()),
+              static_cast<std::streamsize>(size));
+      lod[i] = tmp;
+    }
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 21bdfca111..9411c96aea 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -189,5 +189,14 @@ std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
 
 void AppendLoD(LoD* lod, const LoD& lod_length);
 
+/*
+ * Serialize/deserialize a LoDTensor to/from a std::ostream/std::istream.
+ * You can pass an ofstream or ostringstream to serialize to a file
+ * or to an in-memory string. A GPU tensor will be copied to CPU.
+ */
+void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
+                       const platform::DeviceContext& dev_ctx);
+void DeserializeFromStream(std::istream& is, LoDTensor* tensor);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index a4c4374cf2..7e5d4fd640 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -205,8 +205,24 @@ set(DEPS_OPS
     tensor_array_read_write_op
     gru_op
     adagrad_op
-    sgd_op)
+    sgd_op
+    save_op
+    load_op
+    send_op
+    recv_op)
 
+add_subdirectory(detail)
+op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
+set_source_files_properties(
+    send_op.cc
+    PROPERTIES
+    COMPILE_FLAGS  "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+
+op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
+set_source_files_properties(
+    recv_op.cc
+    PROPERTIES
+    COMPILE_FLAGS  "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
 
 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
@@ -235,6 +251,10 @@ op_library(conv_transpose_op DEPS vol2col)
 op_library(gru_op DEPS sequence2batch gru_compute)
 op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
 
+# FIXME(typhoonzero): save/load depend on the lod_tensor serialization functions
+op_library(save_op DEPS lod_tensor)
+op_library(load_op DEPS lod_tensor)
+
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
     op_library(${src})
@@ -242,6 +262,8 @@ endforeach()
 
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 
+
+
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
@@ -251,3 +273,4 @@ if(WITH_GPU)
   cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
+cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
diff --git a/paddle/operators/detail/CMakeLists.txt b/paddle/operators/detail/CMakeLists.txt
new file mode 100644
index 0000000000..f6bdc63cc2
--- /dev/null
+++ b/paddle/operators/detail/CMakeLists.txt
@@ -0,0 +1 @@
+grpc_library(sendrecvop_grpc SRCS recv_impl.cc send_impl.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
diff --git a/paddle/operators/detail/recv_impl.cc b/paddle/operators/detail/recv_impl.cc
new file mode 100644
index 0000000000..89dc504522
--- /dev/null
+++ b/paddle/operators/detail/recv_impl.cc
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "send_recv_impl.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+Status SendRecvServerImpl::SendVariable(ServerContext *context,
+                                        const VariableMessage *in_var,
+                                        VariableMessage *out_var) {
+  framework::LoDTensor t;
+  // TODO(typhoonzero): deserialize in_tensor and run pserver network.
+  std::istringstream iss(in_var->serialized());
+  framework::DeserializeFromStream(iss, &t);
+  lodtensor_queue_.Push(std::move(t));
+  // Block until the sub graph is done.
+  t = lodtensor_return_queue_.Pop();
+  std::ostringstream oss;
+  // FIXME(typhoonzero): get context from op.
+  framework::SerializeToStream(oss, t, platform::CPUDeviceContext());
+  std::string *varname = out_var->mutable_varname();
+  *varname = in_var->varname();
+  std::string *serialized = out_var->mutable_serialized();
+  *serialized = oss.str();
+
+  return Status::OK;
+}
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/send_impl.cc b/paddle/operators/detail/send_impl.cc
new file mode 100644
index 0000000000..da1ddf75d2
--- /dev/null
+++ b/paddle/operators/detail/send_impl.cc
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "send_recv_impl.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+// Sends scope variable `inname` to the server; stores the reply in `outname`.
+bool RPCClient::SendVariable(const framework::Scope& scope,
+                             const std::string& inname,
+                             const std::string& outname) {
+  ClientContext context;
+  VariableMessage msg, out_msg;
+  // FIXME(typhoonzero): pass device context to here.
+  auto ctx = platform::CPUDeviceContext();
+  auto* var = scope.FindVar(inname);
+  PADDLE_ENFORCE(var);
+  // TODO(typhoonzero): support SelectedRows
+  PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
+                 "Only support LoDTensor, %s has wrong type", inname);
+  const framework::LoDTensor& tensor = var->Get<framework::LoDTensor>();
+  std::ostringstream oss;
+  framework::SerializeToStream(oss, tensor, ctx);
+  msg.set_varname(inname);
+  msg.set_serialized(oss.str());
+  Status status = stub_->SendVariable(&context, msg, &out_msg);
+  if (!status.ok()) return false;  // RPC failed; the output is left untouched.
+  std::istringstream iss(out_msg.serialized());
+  framework::LoDTensor ret_tensor;
+  framework::DeserializeFromStream(iss, &ret_tensor);
+  auto* outvar = scope.FindVar(outname);
+  PADDLE_ENFORCE(outvar != nullptr, "Output variable %s not found", outname);
+  framework::LoDTensor* out_tensor = outvar->GetMutable<framework::LoDTensor>();
+  // FIXME(typhoonzero): do not copy.
+  framework::CopyFrom(ret_tensor, ctx.GetPlace(), ctx, out_tensor);
+  return true;
+}
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto
new file mode 100644
index 0000000000..66f84678b3
--- /dev/null
+++ b/paddle/operators/detail/send_recv.proto
@@ -0,0 +1,37 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+syntax = "proto3";
+
+package sendrecv;
+
+service SendRecvService {
+  // For parameter-server round-robin-like hashing, tensors are not split.
+  // Sends and receives exactly one tensor per call.
+  rpc SendVariable(VariableMessage) returns (VariableMessage) {}
+}
+
+// VariableMessage is serialized paddle variable message.
+// It can be:
+// Tensor
+// LoDTensor
+// SelectedRows
+message VariableMessage {
+  string varname = 1;
+  bytes serialized = 2;
+}
+
+message VoidMessage {
+
+}
\ No newline at end of file
diff --git a/paddle/operators/detail/send_recv_impl.h b/paddle/operators/detail/send_recv_impl.h
new file mode 100644
index 0000000000..b9a5340a86
--- /dev/null
+++ b/paddle/operators/detail/send_recv_impl.h
@@ -0,0 +1,87 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/selected_rows.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+
+// #include <grpc++/channel.h>
+// #include <grpc++/client_context.h>
+// #include <grpc++/create_channel.h>
+// #include <grpc++/security/credentials.h>
+#include "paddle/operators/detail/send_recv.grpc.pb.h"
+#include "paddle/operators/detail/send_recv.pb.h"
+
+#include <grpc++/grpc++.h>
+
+using grpc::Channel;
+using grpc::Server;
+using grpc::ServerContext;
+using grpc::ServerReader;
+using grpc::ServerBuilder;
+
+using grpc::ClientContext;
+using grpc::ClientReader;
+using grpc::ClientReaderWriter;
+using grpc::ClientWriter;
+using grpc::Status;
+using sendrecv::SendRecvService;
+using sendrecv::VariableMessage;
+using sendrecv::VoidMessage;
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+class SendRecvServerImpl final : public SendRecvService::Service {
+ public:
+  explicit SendRecvServerImpl() {}
+
+  Status SendVariable(ServerContext *context, const VariableMessage *in_var,
+                      VariableMessage *out_var) override;
+
+  const framework::LoDTensor Get() { return this->lodtensor_queue_.Pop(); }
+
+  void Push(const framework::LoDTensor &tensor) {
+    this->lodtensor_return_queue_.Push(tensor);
+  }
+
+ private:
+  SimpleBlockQueue<framework::LoDTensor> lodtensor_queue_;
+  SimpleBlockQueue<framework::LoDTensor> lodtensor_return_queue_;
+  SimpleBlockQueue<framework::SelectedRows> selected_rows_queue_;
+  SimpleBlockQueue<framework::SelectedRows> selected_rows_return_queue_;
+};
+
+// RPCClient is a class to send tensors to pserver sub-network
+// using different hashing methods.
+class RPCClient {
+ public:
+  RPCClient(std::shared_ptr<Channel> channel)
+      : stub_(SendRecvService::NewStub(channel)) {}
+
+  bool SendVariable(const framework::Scope &scope, const std::string &inname,
+                    const std::string &outname);
+
+ private:
+  std::unique_ptr<SendRecvService::Stub> stub_;
+};
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/simple_block_queue.h b/paddle/operators/detail/simple_block_queue.h
new file mode 100644
index 0000000000..4489921757
--- /dev/null
+++ b/paddle/operators/detail/simple_block_queue.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+template <typename T>
+class SimpleBlockQueue {
+ private:
+  std::mutex mutex_;
+  std::condition_variable condition_;
+  std::deque<T> queue_;
+
+ public:
+  void Push(T const& value) {
+    {
+      std::unique_lock<std::mutex> lock(this->mutex_);
+      queue_.push_front(value);
+    }
+    this->condition_.notify_one();
+  }
+
+  T Pop() {
+    std::unique_lock<std::mutex> lock(this->mutex_);
+    this->condition_.wait(lock, [=] { return !this->queue_.empty(); });
+    T rc(std::move(this->queue_.back()));
+    this->queue_.pop_back();
+    return rc;
+  }
+};
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc
index b0838eed16..4e58b84430 100644
--- a/paddle/operators/load_op.cc
+++ b/paddle/operators/load_op.cc
@@ -38,61 +38,7 @@ class LoadOp : public framework::OperatorBase {
                    out_var_name);
 
     auto *tensor = out_var->GetMutable<framework::LoDTensor>();
-
-    uint32_t version;
-    fin.read(reinterpret_cast<char *>(&version), sizeof(version));
-    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-    framework::TensorDesc desc;
-    {  // int32_t size
-       // proto buffer
-      int32_t size;
-      fin.read(reinterpret_cast<char *>(&size), sizeof(size));
-      std::unique_ptr<char[]> buf(new char[size]);
-      fin.read(reinterpret_cast<char *>(buf.get()), size);
-      PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
-                     "Cannot parse tensor desc");
-    }
-    {  // read tensor
-      std::vector<int64_t> dims;
-      dims.reserve(static_cast<size_t>(desc.dims().size()));
-      std::copy(desc.dims().begin(), desc.dims().end(),
-                std::back_inserter(dims));
-      tensor->Resize(framework::make_ddim(dims));
-
-      void *buf;
-      platform::Place cpu = platform::CPUPlace();
-      switch (desc.data_type()) {
-        case framework::FP32:
-          buf = tensor->mutable_data<float>(cpu);
-          break;
-        case framework::FP64:
-          buf = tensor->mutable_data<double>(cpu);
-          break;
-        case framework::INT32:
-          buf = tensor->mutable_data<int>(cpu);
-          break;
-        case framework::INT64:
-          buf = tensor->mutable_data<int64_t>(cpu);
-          break;
-        default:
-          PADDLE_THROW("DataType %d not supported", desc.data_type());
-      }
-      fin.read(static_cast<char *>(buf), tensor->memory_size());
-    }
-    {  // read lod
-      uint64_t lod_level;
-      fin.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
-      auto &lod = *tensor->mutable_lod();
-      lod.resize(lod_level);
-      for (uint64_t i = 0; i < lod_level; ++i) {
-        uint64_t size;
-        fin.read(reinterpret_cast<char *>(&size), sizeof(size));
-        std::vector<size_t> tmp(size / sizeof(size_t));
-        fin.read(reinterpret_cast<char *>(tmp.data()),
-                 static_cast<std::streamsize>(size));
-        lod[i] = tmp;
-      }
-    }
+    framework::DeserializeFromStream(fin, tensor);
 
     auto place = dev_ctx.GetPlace();
     if (platform::is_gpu_place(place)) {
diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc
new file mode 100644
index 0000000000..c69e416e10
--- /dev/null
+++ b/paddle/operators/recv_op.cc
@@ -0,0 +1,121 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <stdint.h>
+#include <sys/stat.h>
+#include <ostream>
+#include <thread>
+
+#include <unistd.h>
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/executor.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/detail/send_recv_impl.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+
+namespace paddle {
+namespace operators {
+
+void RunServer(Server **rpc_server,  // out: receives a non-owning pointer to the started server
+               std::shared_ptr<detail::SendRecvServerImpl> service,
+               const std::string &server_address) {
+  ServerBuilder builder;
+  builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
+  builder.RegisterService(service.get());
+  std::unique_ptr<Server> server(builder.BuildAndStart());  // NOTE(review): result is not null-checked; confirm it cannot fail here
+  *rpc_server = server.get();  // borrowed only: `server` is destroyed when this function returns
+  LOG(INFO) << "Server listening on " << server_address << std::endl;
+  server->Wait();  // presumably returns only after Shutdown() is called on the server; TODO confirm
+}
+
+class RecvOp : public framework::OperatorBase {  // receives one tensor over RPC, runs OptimizeBlock on it, pushes back "Out"
+ public:
+  RecvOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {
+    if (!rpc_service_) {
+      rpc_service_.reset(new detail::SendRecvServerImpl());
+      std::string endpoint = Attr<std::string>("endpoint");
+      server_thread_.reset(  // RunServer ends in server->Wait(), so it runs on its own thread
+          new std::thread(RunServer, &rpc_server_, rpc_service_, endpoint));
+    }
+  }
+
+  virtual ~RecvOp() {
+    rpc_server_->Shutdown();  // NOTE(review): rpc_server_ is written by the server thread without synchronization and may still be nullptr here
+    server_thread_->join();
+  }
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    // Blocking call: fetch one variable sent by a client.
+    const framework::LoDTensor &t = rpc_service_->Get();
+    framework::Scope &recv_scope = scope.NewScope();
+    // Create the sub-graph input variable under the name bound to input "RX".
+    auto *var = recv_scope.Var(Input("RX"));
+    auto *tensor = var->GetMutable<framework::LoDTensor>();
+    // FIXME(typhoonzero): do not copy
+    framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor);
+
+    auto *block = Attr<framework::BlockDescBind *>("OptimizeBlock");
+    auto *program = block->Program();
+    framework::Executor executor(dev_ctx);
+    // Run the OptimizeBlock sub-graph inside recv_scope to get the optimized tensor.
+    executor.Run(*program, &recv_scope, block->ID(),
+                 false /*create_local_scope*/);
+
+    auto *out_var = recv_scope.FindVar("Out");  // NOTE(review): name is hard-coded and the result is not null-checked
+    // Hand the result to the RPC service.
+    rpc_service_->Push(out_var->Get<framework::LoDTensor>());
+  }
+
+ protected:
+  // gRPC server instance, kept so the destructor can call Shutdown() on it.
+  // Borrowed pointer: the server object itself is owned by the server thread.
+  Server *rpc_server_{nullptr};
+  // gRPC send/recv service implementation registered with the server.
+  std::shared_ptr<detail::SendRecvServerImpl> rpc_service_;
+  std::shared_ptr<std::thread> server_thread_;  // thread running RunServer; joined in the destructor
+};
+
+class RecvOpMaker : public framework::OpProtoAndCheckerMaker {  // declares the proto (input, attrs, doc) of the `recv` op
+ public:
+  RecvOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("RX", "(Tensor) Input tensor received from send_op");
+    AddComment(R"DOC(
+Recv operator
+
+This operator will recv tensor from send_op
+)DOC");
+    AddAttr<std::string>("endpoint",
+                         "(string, default 127.0.0.1:6164) "
+                         "IP address to listen on.")
+        .SetDefault("127.0.0.1:6164")
+        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
+    AddAttr<framework::BlockDescBind *>(  // single description string: a 3rd argument would bind to AddAttr's `bool generated`
+        "OptimizeBlock", "(BlockDescBind*) optimize network run in server");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker);
diff --git a/paddle/operators/save_op.cc b/paddle/operators/save_op.cc
index 56909fb65f..d4921cb80c 100644
--- a/paddle/operators/save_op.cc
+++ b/paddle/operators/save_op.cc
@@ -88,73 +88,7 @@ class SaveOp : public framework::OperatorBase {
                    "SaveOp only support LoDTensor, %s has wrong type", iname);
 
     auto &tensor = var->Get<framework::LoDTensor>();
-
-    {  // the 1st field, uint32_t version
-      constexpr uint32_t version = 0;
-      fout.write(reinterpret_cast<const char *>(&version), sizeof(version));
-    }
-    {  // the 2nd field, tensor description
-       // int32_t  size
-       // void*    protobuf message
-      framework::TensorDesc desc;
-      desc.set_data_type(framework::ToDataType(tensor.type()));
-      auto dims = framework::vectorize(tensor.dims());
-      auto *pb_dims = desc.mutable_dims();
-      pb_dims->Resize(static_cast<int>(dims.size()), 0);
-      std::copy(dims.begin(), dims.end(), pb_dims->begin());
-      int32_t size = desc.ByteSize();
-      fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
-      auto out = desc.SerializeAsString();
-      fout.write(out.data(), size);
-    }
-    {  // the 3rd field, tensor data
-      uint64_t size = tensor.memory_size();
-      auto *data_ptr = tensor.data<void>();
-      PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
-                     "Index overflow when writing tensor");
-      if (platform::is_gpu_place(tensor.place())) {
-#ifdef PADDLE_WITH_CUDA
-        constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
-        std::unique_ptr<char[]> buf(new char[kBufSize]);
-        auto &gpu_dev_ctx =
-            static_cast<const platform::CUDADeviceContext &>(dev_ctx);
-        platform::CPUPlace cpu;
-        uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
-        while (size != 0) {
-          size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
-          memory::Copy(cpu, buf.get(),
-                       boost::get<platform::GPUPlace>(tensor.place()),
-                       reinterpret_cast<const void *>(data), size_to_write,
-                       gpu_dev_ctx.stream());
-          gpu_dev_ctx.Wait();
-          fout.write(buf.get(), size_to_write);
-          data += size_to_write;
-          size -= size_to_write;
-        }
-#else
-        PADDLE_THROW("Unexpected branch");
-#endif
-      } else {
-        fout.write(static_cast<const char *>(data_ptr),
-                   static_cast<std::streamsize>(size));
-      }
-    }
-    {  // the 4th field, lod information
-       // uint64_t lod_level
-       // uint64_t lod_level_1 size in byte.
-       // int*     lod_level_1 data
-       // ...
-      auto lod = tensor.lod();
-      uint64_t size = lod.size();
-      fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
-
-      for (auto &each : lod) {
-        size = each.size() * sizeof(framework::LoD::value_type::value_type);
-        fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
-        fout.write(reinterpret_cast<const char *>(each.data()),
-                   static_cast<std::streamsize>(size));
-      }
-    }
+    framework::SerializeToStream(fout, tensor, dev_ctx);
   }
 };
 
diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc
new file mode 100644
index 0000000000..a3059847f2
--- /dev/null
+++ b/paddle/operators/send_op.cc
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <ostream>
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+
+#include "paddle/operators/detail/send_recv_impl.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+
+namespace paddle {
+namespace operators {
+
+// TODO(typhoonzero): this is a simple implementation which only sends
+// one tensor
+class SendOp : public framework::OperatorBase {  // sends input "X" through an RPCClient; the reply is stored under output "Out"
+ public:
+  SendOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {
+    // Initialize the client when the operator is created at runtime.
+    if (!client_) {
+      std::string endpoint = Attr<std::string>("endpoint");
+      client_.reset(new detail::RPCClient(
+          grpc::CreateChannel(endpoint, grpc::InsecureChannelCredentials())));
+      // TODO(typhoonzero): how to call InitVariables
+    }
+  }
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto iname = Input("X");  // name of the variable to send
+    auto oname = Output("Out");  // name of the variable that receives the reply
+    // TODO(typhoonzero): currently it's non-blocking,
+    // should block until server responds.
+    bool ret = client_->SendVariable(scope, iname, oname);
+    if (!ret) {
+      LOG(ERROR) << "send variable error";  // failure is only logged; Run() still returns normally
+    }
+  }
+
+ protected:
+  std::shared_ptr<detail::RPCClient> client_{nullptr};  // created in the constructor from the "endpoint" attribute
+};
+
+class SendOpMaker : public framework::OpProtoAndCheckerMaker {  // declares the proto (input, output, attrs, doc) of the `send` op
+ public:
+  SendOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) Input tensor to be sent");
+    AddOutput("Out", "(Tensor) Output fetched from server");
+    AddComment(R"DOC(
+Send operator
+
+This operator will send tensor to recv_op
+)DOC");
+    AddAttr<std::string>("endpoint",
+                         "(string, default 127.0.0.1:6164) "
+                         "Server endpoint to send the tensor to.")
+        .SetDefault("127.0.0.1:6164")
+        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(send, ops::SendOp, ops::SendOpMaker);
diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc
new file mode 100644
index 0000000000..ac03eb3752
--- /dev/null
+++ b/paddle/operators/send_recv_op_test.cc
@@ -0,0 +1,125 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+// TODO(typhoonzero): add python bindings for this test as
+// a RemoteOptimizer.
+
+#include <unistd.h>
+#include <thread>
+
+#include "gtest/gtest.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/program_desc.h"
+
+USE_NO_KERNEL_OP(send);
+USE_NO_KERNEL_OP(recv);
+USE_OP(sum);
+
+// global for simplicity.
+std::unique_ptr<paddle::framework::OperatorBase> recv_op;
+
+void InitTensorsInScope(paddle::framework::Scope &scope,  // creates X = [0, 1, ..., 99] and an allocated 10x10 Out
+                        paddle::platform::CPUPlace &place) {
+  paddle::platform::CPUDeviceContext ctx(place);
+  auto var = scope.Var("X");
+  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+  tensor->Resize({10, 10});
+  float *expect = tensor->mutable_data<float>(place);
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    expect[i] = static_cast<float>(i);
+  }
+
+  auto out_var = scope.Var("Out");
+  auto out_tensor = out_var->GetMutable<paddle::framework::LoDTensor>();
+  out_tensor->Resize({10, 10});
+  out_tensor->mutable_data<float>(place);  // allocate Out (previously re-allocated X by mistake)
+}
+
+void AddOp(const std::string &type,  // appends an op of `type` to `block`, declaring its outputs as FP32 vars
+           const paddle::framework::VariableNameMap &inputs,
+           const paddle::framework::VariableNameMap &outputs,
+           paddle::framework::AttributeMap attrs,
+           paddle::framework::BlockDescBind *block) {
+  // insert output (iterate by const reference to avoid copying each entry)
+  for (const auto &kv : outputs) {
+    for (const auto &v : kv.second) {
+      auto var = block->Var(v);
+      var->SetDataType(paddle::framework::DataType::FP32);
+    }
+  }
+
+  // insert op
+  auto op = block->AppendOp();
+  op->SetType(type);
+  for (auto &kv : inputs) {
+    op->SetInput(kv.first, kv.second);
+  }
+  for (auto &kv : outputs) {
+    op->SetOutput(kv.first, kv.second);
+  }
+  op->SetAttrMap(attrs);
+}
+
+void StartServerNet() {  // builds the server-side recv op (stored in the global recv_op) and runs it once
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+  InitTensorsInScope(scope, place);
+
+  // Sub-program executed by recv_op; this simple test uses a single sum op.
+  paddle::framework::ProgramDescBind program;
+  paddle::framework::BlockDescBind *block = program.MutableBlock(0);
+  // X for server side tensors, RX for received tensors, must be of same shape.
+  AddOp("sum", {{"X", {"X", "RX"}}}, {{"Out", {"Out"}}}, {}, block);
+
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
+  attrs.insert({"OptimizeBlock", block});  // raw pointer into the local `program`
+  recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}},
+                                                    {{"Out", {"Out"}}}, attrs);
+  paddle::platform::CPUDeviceContext ctx(place);
+  recv_op->Run(scope, ctx);  // handles one received tensor, then returns
+}
+
+TEST(SendRecvOp, CPU) {  // round trip: send X to the recv server, which sums it with its own X, so Out == 2 * X
+  std::thread server_thread(StartServerNet);
+  sleep(5);  // fixed delay to let the server start; there is no readiness handshake
+  // local net
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+  InitTensorsInScope(scope, place);
+
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
+
+  auto send_op = paddle::framework::OpRegistry::CreateOp(
+      "send", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);
+  paddle::platform::CPUDeviceContext ctx(place);
+  send_op->Run(scope, ctx);
+
+  auto in_var = scope.Var("X");
+  auto tensor = in_var->GetMutable<paddle::framework::LoDTensor>();
+  float *expected = tensor->data<float>();
+
+  auto out_var = scope.Var("Out");
+  auto target = out_var->GetMutable<paddle::framework::LoDTensor>();
+  // Guard against a failed send: the output tensor must hold data.
+  EXPECT_NE(target->memory_size(), size_t(0));
+  float *actual = target->data<float>();
+  for (int64_t i = 0; i < target->numel(); ++i) {
+    EXPECT_EQ(expected[i] * 2, actual[i]);
+  }
+  recv_op.reset();  // RecvOp's destructor shuts down the gRPC server and joins its thread
+  server_thread.join();
+}

From 3022a790828dc75ce17deb1fb114d45838124f3d Mon Sep 17 00:00:00 2001
From: peterzhang2029 <zhangchao41@baidu.com>
Date: Tue, 28 Nov 2017 10:59:29 +0800
Subject: [PATCH 43/52] add gpu test in test_LayerGrad

---
 paddle/gserver/tests/test_LayerGrad.cpp | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index cacf106929..12d6600916 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -681,12 +681,13 @@ TEST(Layer, hsigmoidLayer) {
   config.layerConfig.add_inputs();
   config.layerConfig.add_inputs();
 
-  // Not support GPU now
-  testLayerGrad(config,
-                "hsigmoid",
-                100,
-                /* trans */ false, /* useGpu */
-                false);
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "hsigmoid",
+                  100,
+                  /* trans */ false, /* useGpu */
+                  useGpu);
+  }
 }
 
 TEST(Layer, multi_cross) {

From ab1af66b1281b941c75d5c000141ce912ab1e37b Mon Sep 17 00:00:00 2001
From: peterzhang2029 <zhangchao41@baidu.com>
Date: Tue, 28 Nov 2017 11:30:18 +0800
Subject: [PATCH 44/52] --amend

---
 paddle/gserver/tests/test_LayerGrad.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 60a4feff03..c5359f272b 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -685,8 +685,8 @@ TEST(Layer, hsigmoidLayer) {
     testLayerGrad(config,
                   "hsigmoid",
                   100,
-                  /* trans */ false, /* useGpu */
-                  useGpu);
+                  /* trans */ false,
+                  /* useGpu */ useGpu);
   }
 }
 

From 0aceeee1fae98c0ad012f1c85adf91a49b4365fd Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 28 Nov 2017 12:03:16 +0800
Subject: [PATCH 45/52] Feature/remove g program (#5930)

* Unify fluid submodules to fluid module

Change books just use `import fluid`, not submodules

* Remove g_main_program/g_startup_program

Use default_main_program/default_startup_program instead

* Typo

* Fix CI
---
 python/paddle/v2/fluid/evaluator.py           |  4 ++--
 python/paddle/v2/fluid/executor.py            |  4 ++--
 python/paddle/v2/fluid/framework.py           | 10 ++++-----
 python/paddle/v2/fluid/io.py                  | 19 ++++++++---------
 python/paddle/v2/fluid/layer_helper.py        |  7 +++----
 python/paddle/v2/fluid/layers.py              |  2 +-
 .../fluid/tests/test_array_read_write_op.py   |  4 ++--
 .../v2/fluid/tests/test_conditional_block.py  |  8 ++++---
 .../v2/fluid/tests/test_executor_and_mul.py   | 12 +++++------
 .../v2/fluid/tests/test_lod_rank_table.py     |  3 +--
 .../v2/fluid/tests/test_operator_desc.py      |  8 +++++--
 .../paddle/v2/fluid/tests/test_parameter.py   | 10 +++++----
 python/paddle/v2/fluid/tests/test_program.py  | 21 ++++++++++---------
 .../v2/fluid/tests/test_shrink_rnn_memory.py  |  6 ++++--
 python/paddle/v2/fluid/tests/test_variable.py |  4 ++--
 15 files changed, 64 insertions(+), 58 deletions(-)

diff --git a/python/paddle/v2/fluid/evaluator.py b/python/paddle/v2/fluid/evaluator.py
index bd4a6fda1f..137c573622 100644
--- a/python/paddle/v2/fluid/evaluator.py
+++ b/python/paddle/v2/fluid/evaluator.py
@@ -26,9 +26,9 @@ class Evaluator(object):
         name(str): The name of evaluator. such as, "accuracy". Used for generate 
             temporary variable name.
         main_program(Program, optional): The evaluator should be added to this 
-            main_program. Default g_main_program 
+            main_program. Default default_main_program()
         startup_program(Program, optional):The parameter should be added to this 
-            startup_program. Default g_startup_program
+            startup_program. Default default_startup_program()
             
     Attributes:
         states(list): The list of state variables. states will be reset to zero 
diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py
index 3e26d1b983..bdc82eede9 100644
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
@@ -1,6 +1,6 @@
 import numpy as np
 from . import core
-from framework import Program, g_main_program
+from framework import Program, default_main_program
 
 __all__ = ['Executor', 'g_scope']
 
@@ -103,7 +103,7 @@ class Executor(object):
             fetch_list = []
 
         if program is None:
-            program = g_main_program
+            program = default_main_program()
 
         if not isinstance(program, Program):
             raise TypeError()
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index 6d6ea23f55..1c42e4d44f 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -6,7 +6,7 @@ import proto.framework_pb2 as framework_pb2
 
 __all__ = [
     'Block', 'Variable', 'Program', 'Operator', 'default_startup_program',
-    'default_main_program', 'g_startup_program', 'g_main_program'
+    'default_main_program'
 ]
 
 
@@ -654,13 +654,13 @@ class Parameter(Variable):
 
 
 # program is a global instance.
-g_main_program = Program()
-g_startup_program = Program()
+_main_program_ = Program()
+_startup_program_ = Program()
 
 
 def default_startup_program():
-    return g_startup_program
+    return _startup_program_
 
 
 def default_main_program():
-    return g_main_program
+    return _main_program_
diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py
index e5b2aa3b91..e147ac22ad 100644
--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
@@ -1,8 +1,7 @@
 import os
 import cPickle as pickle
 
-from paddle.v2.fluid.framework import Program, Parameter, g_main_program, \
-    Variable
+from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable
 
 __all__ = [
     'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
@@ -46,7 +45,7 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
     """
     if vars is None:
         if main_program is None:
-            main_program = g_main_program
+            main_program = default_main_program()
         if not isinstance(main_program, Program):
             raise TypeError("program should be as Program type or None")
 
@@ -98,7 +97,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
     :param executor: executor that save variable
     :param dirname: directory path
     :param main_program: program. If vars is None, then filter all variables in this
-    program which fit `predicate`. Default g_program.
+    program which fit `predicate`. Default default_main_program().
     :param predicate: The Predicate describes a callable that returns a variable
     as a bool. If it returns true, the variables will be loaded.
     :param vars: variables need to be loaded. If specify vars, program &
@@ -107,7 +106,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
     """
     if vars is None:
         if main_program is None:
-            main_program = g_main_program
+            main_program = default_main_program()
         if not isinstance(main_program, Program):
             raise TypeError("program's type should be Program")
 
@@ -154,7 +153,7 @@ def load_persistables(executor, dirname, main_program=None):
 
 def get_inference_program(target_vars, main_program=None):
     if main_program is None:
-        main_program = g_main_program
+        main_program = default_main_program()
     if not isinstance(target_vars, list):
         target_vars = [target_vars]
 
@@ -177,12 +176,12 @@ def save_inference_model(dirname,
     :param target_vars: Variables from which we can get inference results.
     :param executor: executor that save inference model
     :param main_program: original program, which will be pruned to build the inference model.
-    Default g_main_program.
+            Default default_main_program().
 
     :return: None
     """
     if main_program is None:
-        main_program = g_main_program
+        main_program = default_main_program()
     if not isinstance(target_vars, list):
         target_vars = [target_vars]
 
@@ -272,10 +271,10 @@ def get_parameter_value_by_name(name, executor, program=None):
     :param executor: executor for retrieving the value
     :param name: the name of the parameter
     :param program: the program where the variable is found
-    Default g_main_program.
+            Default default_main_program().
     :return: the LoDTensor for the variable
     """
     if program is None:
-        program = g_main_program
+        program = default_main_program()
     var = program.global_block().var(name)
     return get_parameter_value(var, executor)
diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py
index 5f88555511..7762b0d88f 100644
--- a/python/paddle/v2/fluid/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
@@ -1,8 +1,7 @@
 import copy
 import itertools
 
-from framework import Variable, g_main_program, \
-    g_startup_program, unique_name, dtype_is_floating
+from framework import Variable, default_main_program, default_startup_program, unique_name, dtype_is_floating
 from paddle.v2.fluid.initializer import Constant, Xavier
 
 
@@ -22,7 +21,7 @@ class LayerHelper(object):
     def main_program(self):
         prog = self.kwargs.get('main_program', None)
         if prog is None:
-            return g_main_program
+            return default_main_program()
         else:
             return prog
 
@@ -30,7 +29,7 @@ class LayerHelper(object):
     def startup_program(self):
         prog = self.kwargs.get('startup_program', None)
         if prog is None:
-            return g_startup_program
+            return default_startup_program()
         else:
             return prog
 
diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py
index 28bc3d214b..5a76c79db1 100644
--- a/python/paddle/v2/fluid/layers.py
+++ b/python/paddle/v2/fluid/layers.py
@@ -1,4 +1,4 @@
-from . import core
+import core
 import proto.framework_pb2 as framework_pb2
 from framework import OpProtoHolder, Variable, Program, Operator
 from initializer import Constant, Normal, Xavier
diff --git a/python/paddle/v2/fluid/tests/test_array_read_write_op.py b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
index b7790b0106..f6120aedec 100644
--- a/python/paddle/v2/fluid/tests/test_array_read_write_op.py
+++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
@@ -3,7 +3,7 @@ import paddle.v2.fluid.core as core
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.backward import append_backward_ops
-from paddle.v2.fluid.framework import g_main_program
+from paddle.v2.fluid.framework import default_main_program
 import numpy
 
 
@@ -66,7 +66,7 @@ class TestArrayReadWrite(unittest.TestCase):
 
         append_backward_ops(total_sum_scaled)
 
-        g_vars = map(g_main_program.global_block().var,
+        g_vars = map(default_main_program().global_block().var,
                      [each_x.name + "@GRAD" for each_x in x])
         g_out = [
             item.sum()
diff --git a/python/paddle/v2/fluid/tests/test_conditional_block.py b/python/paddle/v2/fluid/tests/test_conditional_block.py
index d953ee7ddc..2b9d8f351a 100644
--- a/python/paddle/v2/fluid/tests/test_conditional_block.py
+++ b/python/paddle/v2/fluid/tests/test_conditional_block.py
@@ -1,7 +1,7 @@
 import unittest
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.core as core
-from paddle.v2.fluid.framework import g_startup_program, g_main_program
+from paddle.v2.fluid.framework import default_startup_program, default_main_program
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.backward import append_backward_ops
 import numpy
@@ -19,7 +19,7 @@ class ConditionalBlock(unittest.TestCase):
 
         cpu = core.CPUPlace()
         exe = Executor(cpu)
-        exe.run(g_startup_program)
+        exe.run(default_startup_program())
 
         x = numpy.random.random(size=(10, 1)).astype('float32')
 
@@ -29,7 +29,9 @@ class ConditionalBlock(unittest.TestCase):
         append_backward_ops(loss=loss)
         outs = exe.run(
             feed={'X': x},
-            fetch_list=[g_main_program.block(0).var(data.name + "@GRAD")])[0]
+            fetch_list=[
+                default_main_program().block(0).var(data.name + "@GRAD")
+            ])[0]
         print outs
 
 
diff --git a/python/paddle/v2/fluid/tests/test_executor_and_mul.py b/python/paddle/v2/fluid/tests/test_executor_and_mul.py
index 558273e30d..b1ef87c5cb 100644
--- a/python/paddle/v2/fluid/tests/test_executor_and_mul.py
+++ b/python/paddle/v2/fluid/tests/test_executor_and_mul.py
@@ -1,9 +1,10 @@
 import unittest
-from paddle.v2.fluid.layers import mul, data, sequence_pool
+
+import numpy
 import paddle.v2.fluid.core as core
+
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.framework import g_main_program
-import numpy
+from paddle.v2.fluid.layers import mul, data
 
 
 class TestExecutor(unittest.TestCase):
@@ -19,10 +20,7 @@ class TestExecutor(unittest.TestCase):
         a_np = numpy.random.random((100, 784)).astype('float32')
         b_np = numpy.random.random((784, 100)).astype('float32')
         exe = Executor(place)
-        outs = exe.run(g_main_program,
-                       feed={'a': a_np,
-                             'b': b_np},
-                       fetch_list=[out])
+        outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out])
         out = outs[0]
         self.assertEqual((100, 100), out.shape)
         self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
diff --git a/python/paddle/v2/fluid/tests/test_lod_rank_table.py b/python/paddle/v2/fluid/tests/test_lod_rank_table.py
index bbc11930b9..30d619fe31 100644
--- a/python/paddle/v2/fluid/tests/test_lod_rank_table.py
+++ b/python/paddle/v2/fluid/tests/test_lod_rank_table.py
@@ -1,6 +1,5 @@
 from paddle.v2.fluid.layers import lod_rank_table, data
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.framework import g_main_program
 import paddle.v2.fluid.core as core
 import numpy
 import unittest
@@ -18,7 +17,7 @@ class TestLoDRankTable(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(numpy.random.random(size=(17, 100)), cpu)
         tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]])
-        exe.run(g_main_program, scope=scope, feed={'x': tensor})
+        exe.run(scope=scope, feed={'x': tensor})
         var = scope.find_var(rank_table.name)
         table = var.get_lod_rank_table()
         self.assertEqual([(0, 5), (1, 1), (2, 1)], table.items())
diff --git a/python/paddle/v2/fluid/tests/test_operator_desc.py b/python/paddle/v2/fluid/tests/test_operator_desc.py
index e8362d2e9c..ce34d95ac8 100644
--- a/python/paddle/v2/fluid/tests/test_operator_desc.py
+++ b/python/paddle/v2/fluid/tests/test_operator_desc.py
@@ -1,11 +1,15 @@
 import unittest
-from paddle.v2.fluid.framework import Variable, Program, g_main_program
+
 import paddle.v2.fluid.core as core
 
+from paddle.v2.fluid.framework import Program, default_startup_program
+
+main_program = default_startup_program()
+
 
 class TestOperator(unittest.TestCase):
     def test_error_type(self):
-        block = g_main_program.create_block()
+        block = main_program.create_block()
         try:
             block.append_op()
             self.assertFail()
diff --git a/python/paddle/v2/fluid/tests/test_parameter.py b/python/paddle/v2/fluid/tests/test_parameter.py
index 13f6278ad8..694344acbb 100644
--- a/python/paddle/v2/fluid/tests/test_parameter.py
+++ b/python/paddle/v2/fluid/tests/test_parameter.py
@@ -1,17 +1,19 @@
 import unittest
-from paddle.v2.fluid.framework import g_main_program
+from paddle.v2.fluid.framework import default_main_program
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.executor import Executor
 import paddle.v2.fluid.io as io
 from paddle.v2.fluid.initializer import ConstantInitializer
 import numpy as np
 
+main_program = default_main_program()
+
 
 class TestParameter(unittest.TestCase):
     def test_param(self):
         shape = [784, 100]
         val = 1.0625
-        b = g_main_program.global_block()
+        b = main_program.global_block()
         param = b.create_parameter(
             name='fc.w',
             shape=shape,
@@ -23,9 +25,9 @@ class TestParameter(unittest.TestCase):
         self.assertEqual(core.DataType.FP32, param.dtype)
         self.assertEqual(0, param.block.idx)
         exe = Executor(core.CPUPlace())
-        p = exe.run(g_main_program, fetch_list=[param])[0]
+        p = exe.run(main_program, fetch_list=[param])[0]
         self.assertTrue(np.allclose(p, np.ones(shape) * val))
-        p = io.get_parameter_value_by_name('fc.w', exe, g_main_program)
+        p = io.get_parameter_value_by_name('fc.w', exe, main_program)
         self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val))
 
 
diff --git a/python/paddle/v2/fluid/tests/test_program.py b/python/paddle/v2/fluid/tests/test_program.py
index 15653a1dbf..1a9313c68a 100644
--- a/python/paddle/v2/fluid/tests/test_program.py
+++ b/python/paddle/v2/fluid/tests/test_program.py
@@ -1,37 +1,38 @@
 from __future__ import print_function
 import unittest
 
-from paddle.v2.fluid.framework import Program
-from paddle.v2.fluid.framework import g_main_program
+from paddle.v2.fluid.framework import Program, default_main_program
 import paddle.v2.fluid.layers as layers
 
+main_program = default_main_program()
+
 
 class TestProgram(unittest.TestCase):
     def test_program(self):
-        b = g_main_program.current_block()
+        b = main_program.current_block()
         self.assertEqual(-1, b.parent_idx)
         self.assertEqual(0, b.idx)
 
-        b = g_main_program.create_block()
+        b = main_program.create_block()
         self.assertEqual(1, b.idx)
         self.assertEqual(0, b.parent_idx)
 
-        b = g_main_program.create_block()
+        b = main_program.create_block()
         self.assertEqual(2, b.idx)
         self.assertEqual(1, b.parent_idx)
 
-        g_main_program.rollback()
+        main_program.rollback()
 
-        b = g_main_program.current_block()
+        b = main_program.current_block()
         self.assertEqual(1, b.idx)
         self.assertEqual(0, b.parent_idx)
 
-        b = g_main_program.create_block()
+        b = main_program.create_block()
         self.assertEqual(3, b.idx)
         self.assertEqual(1, b.parent_idx)
 
-        g_main_program.rollback()
-        b = g_main_program.current_block()
+        main_program.rollback()
+        b = main_program.current_block()
         self.assertEqual(1, b.idx)
         self.assertEqual(0, b.parent_idx)
 
diff --git a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
index 05f6a56064..86db4c64b4 100644
--- a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
+++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
@@ -3,9 +3,11 @@ import paddle.v2.fluid.core as core
 from paddle.v2.fluid.executor import Executor
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.backward import append_backward_ops
-from paddle.v2.fluid.framework import g_main_program
+from paddle.v2.fluid.framework import default_main_program
 import numpy
 
+main_program = default_main_program()
+
 
 class TestShrinkRNNMemory(unittest.TestCase):
     def test_shrink_rnn_memory(self):
@@ -36,7 +38,7 @@ class TestShrinkRNNMemory(unittest.TestCase):
         append_backward_ops(loss=mem3_mean)
         x_grad = exe.run(
             feed={'x': tensor},
-            fetch_list=[g_main_program.global_block().var('x@GRAD')])[0]
+            fetch_list=[main_program.global_block().var('x@GRAD')])[0]
         self.assertAlmostEqual(1.0, x_grad.sum(), delta=0.1)
 
 
diff --git a/python/paddle/v2/fluid/tests/test_variable.py b/python/paddle/v2/fluid/tests/test_variable.py
index 92ffdceb6c..f1e4c0ba21 100644
--- a/python/paddle/v2/fluid/tests/test_variable.py
+++ b/python/paddle/v2/fluid/tests/test_variable.py
@@ -1,5 +1,5 @@
 import unittest
-from paddle.v2.fluid.framework import g_main_program, Program, convert_np_dtype_to_dtype_
+from paddle.v2.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_
 import paddle.v2.fluid.core as core
 import numpy as np
 
@@ -18,7 +18,7 @@ class TestVariable(unittest.TestCase):
         self.assertRaises(ValueError, lambda: convert("int8"))
 
     def test_var(self):
-        b = g_main_program.current_block()
+        b = default_main_program().current_block()
         w = b.create_var(
             dtype="float64", shape=[784, 100], lod_level=0, name="fc.w")
         self.assertNotEqual(str(w), "")

From 985e4ab62dc6ca2eb023d8c1e0c633dc235c847a Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 28 Nov 2017 15:35:36 +0800
Subject: [PATCH 46/52] Add Python wrap of conv2d_transpose and its unittest
 (#5946)

* Add Python wrap of conv2d_transpose and its unittest

* Follow comments

* Fix format
---
 paddle/operators/conv_transpose_op.cc       | 18 ++--
 paddle/operators/detail/send_recv.proto     |  6 +-
 python/paddle/v2/fluid/layers.py            | 93 ++++++++++++++++++++-
 python/paddle/v2/fluid/tests/test_layers.py |  9 ++
 4 files changed, 112 insertions(+), 14 deletions(-)

diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc
index 3e55ef036a..314b577d00 100644
--- a/paddle/operators/conv_transpose_op.cc
+++ b/paddle/operators/conv_transpose_op.cc
@@ -74,12 +74,12 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
             "The format of output tensor is also NCHW.");
   AddAttr<std::vector<int>>(
       "strides",
-      "(vector<int> defalut:{1, 1}), the strides(h_stride, w_stride) of "
+      "(vector<int> default:{1, 1}), the strides(h_stride, w_stride) of "
       "convolution transpose operator.")
       .SetDefault({1, 1});
   AddAttr<std::vector<int>>(
       "paddings",
-      "(vector<int> defalut:{0, 0}), the paddings(h_pad, w_pad) of convolution "
+      "(vector<int> default:{0, 0}), the paddings(h_pad, w_pad) of convolution "
       "transpose operator.")
       .SetDefault({0, 0});
   AddComment(R"DOC(
@@ -101,8 +101,8 @@ Example:
   Output:
        Output shape: (N, C_out, H_out, W_out)
   where
-       H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
-       W_out = (W_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
+       H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + H_f;
+       W_out = (W_in - 1) * strides[1] - 2 * paddings[1] + W_f;
 )DOC");
 }
 
@@ -130,12 +130,12 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
             "the number of channels, D is the depth of the feature, H is the "
             "height of the feature, and W is the width of the feature.");
   AddAttr<std::vector<int>>("strides",
-                            "(vector<int> defalut:{1, 1, 1}), the "
+                            "(vector<int> default:{1, 1, 1}), the "
                             "strides{d_stride, h_stride, w_stride} of "
                             "convolution transpose operator.")
       .SetDefault({1, 1, 1});
   AddAttr<std::vector<int>>("paddings",
-                            "(vector<int> defalut:{0, 0, 0}), paddings(d_pad, "
+                            "(vector<int> default:{0, 0, 0}), paddings(d_pad, "
                             "h_pad, w_pad) of convolution transpose operator.")
       .SetDefault({0, 0, 0});
   AddComment(R"DOC(
@@ -158,9 +158,9 @@ Example:
   Output:
        Output shape: (N, C_out, D_out, H_out, W_out)
   where
-       D_out = (D_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
-       H_out = (H_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
-       W_out = (W_in - 1) * strides[2] - 2 * paddings[2] + filter_size[2];
+       D_out = (D_in - 1) * strides[0] - 2 * paddings[0] + D_f;
+       H_out = (H_in - 1) * strides[1] - 2 * paddings[1] + H_f;
+       W_out = (W_in - 1) * strides[2] - 2 * paddings[2] + W_f;
 )DOC");
 }
 
diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto
index 66f84678b3..962c7d5981 100644
--- a/paddle/operators/detail/send_recv.proto
+++ b/paddle/operators/detail/send_recv.proto
@@ -17,7 +17,7 @@ syntax = "proto3";
 package sendrecv;
 
 service SendRecvService {
-  // For parameter server round-robin like hashing, do not split tensors. 
+  // For parameter server round-robin like hashing, do not split tensors.
   // Send and recv only one tensor
   rpc SendVariable(VariableMessage) returns (VariableMessage) {}
 }
@@ -32,6 +32,4 @@ message VariableMessage {
   bytes serialized = 2;
 }
 
-message VoidMessage {
-
-}
\ No newline at end of file
+message VoidMessage {}
\ No newline at end of file
diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py
index 5a76c79db1..6adfac3a32 100644
--- a/python/paddle/v2/fluid/layers.py
+++ b/python/paddle/v2/fluid/layers.py
@@ -1,7 +1,7 @@
 import core
 import proto.framework_pb2 as framework_pb2
 from framework import OpProtoHolder, Variable, Program, Operator
-from initializer import Constant, Normal, Xavier
+from initializer import Constant, Normal, Xavier, Initializer
 from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
 import re
 import cStringIO
@@ -1587,6 +1587,97 @@ def array_length(array, main_program=None):
     return tmp
 
 
+def conv2d_transpose(input,
+                     num_filters,
+                     output_size=None,
+                     filter_size=None,
+                     padding=None,
+                     stride=None,
+                     param_attr=None,
+                     param_initializer=None,
+                     main_program=None,
+                     startup_program=None):
+    """
+    The transpose of conv2d layer, also known as the deconvolution layer.
+
+    Args:
+        input(Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of filters, i.e. the number of channels
+            of the output image.
+        output_size(int|tuple|None): The output image size. If it is a tuple,
+            it must contain two integers, (image_H, image_W). It is only used
+            (and is then required) when filter_size is None.
+        filter_size(int|tuple|None): The filter size. If it is a tuple, it
+            must contain two integers, (filter_size_H, filter_size_W); an int
+            gives a square filter. If None, it is derived from output_size.
+        padding(int|tuple|None): The padding size. An int is used for both
+            dimensions; a tuple must be (padding_H, padding_W).
+        stride(int|tuple|None): The stride size. An int is used for both
+            dimensions; a tuple must be (stride_H, stride_W).
+        param_attr: Parameter Attribute.
+        param_initializer(Initializer): Parameter Initializer. Default is Xavier
+        main_program(Program): the main program
+        startup_program(Program): the startup program
+
+    Returns:
+        Variable: Output image.
+
+    Raises:
+        TypeError: If input is not a Variable.
+        ValueError: If filter_size and output_size are both None, since the
+            filter size then cannot be determined.
+    """
+    helper = LayerHelper("conv2d_transpose", **locals())
+    if not isinstance(input, Variable):
+        raise TypeError("Input of conv2d_transpose must be Variable")
+    input_channel = input.shape[1]
+
+    # Attributes left unset here fall back to the operator's own defaults.
+    op_attr = dict()
+    if isinstance(padding, int):
+        op_attr['paddings'] = [padding, padding]
+    elif padding is not None:
+        op_attr['paddings'] = list(padding)
+
+    if isinstance(stride, int):
+        op_attr['strides'] = [stride, stride]
+    elif stride is not None:
+        op_attr['strides'] = list(stride)
+
+    if filter_size is None:
+        if output_size is None:
+            raise ValueError("output_size must be set when filter_size is None")
+        if isinstance(output_size, int):
+            output_size = [output_size, output_size]
+
+        padding = op_attr.get('paddings', [0, 0])
+        stride = op_attr.get('strides', [1, 1])
+        # Solve H_out = (H_in - 1) * stride - 2 * padding + H_f for H_f.
+        h_in = input.shape[2]
+        w_in = input.shape[3]
+        filter_size_h = output_size[0] - (h_in - 1) * stride[0] + 2 * padding[0]
+        filter_size_w = output_size[1] - (w_in - 1) * stride[1] + 2 * padding[1]
+        filter_size = [filter_size_h, filter_size_w]
+    elif isinstance(filter_size, int):
+        filter_size = [filter_size, filter_size]
+
+    filter_shape = [input_channel, num_filters] + list(filter_size)
+    img_filter = helper.create_parameter(
+        dtype=input.dtype,
+        shape=filter_shape,
+        attr=helper.param_attr,
+        initializer=param_initializer)
+
+    out = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='conv2d_transpose',
+        inputs={'Input': [input],
+                'Filter': [img_filter]},
+        outputs={'Output': out},
+        attrs=op_attr)
+    return out
+
+
 class ConditionalBlockGuard(BlockGuard):
     def __init__(self, block):
         if not isinstance(block, ConditionalBlock):
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
index 87dc6d1a62..62b2a0f9a1 100644
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -65,6 +65,15 @@ class TestBook(unittest.TestCase):
 
         print str(program)
 
+    def test_conv2d_transpose(self):
+        # Build-only check: the filter size is inferred from output_size.
+        prog = Program()
+        image = layers.data(
+            name='pixel', shape=[3, 2, 2], dtype='float32', main_program=prog)
+        layers.conv2d_transpose(
+            input=image, num_filters=10, output_size=28, main_program=prog)
+        print str(prog)
+
     def test_recognize_digits_conv(self):
         program = Program()
 

From 696b0253e597a38edb948daf3278adc52a69b004 Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Tue, 28 Nov 2017 18:28:35 +0800
Subject: [PATCH 47/52] Refine paddle/v2/fluid/profiler.py.

---
 paddle/platform/cuda_profiler.h               |  8 +-
 python/paddle/v2/fluid/profiler.py            | 78 ++++++-------------
 python/paddle/v2/fluid/tests/test_profiler.py |  2 +-
 3 files changed, 30 insertions(+), 58 deletions(-)

diff --git a/paddle/platform/cuda_profiler.h b/paddle/platform/cuda_profiler.h
index c096ce37c5..b6311cb23d 100644
--- a/paddle/platform/cuda_profiler.h
+++ b/paddle/platform/cuda_profiler.h
@@ -29,10 +29,10 @@ void CudaProfilerInit(std::string output_file, std::string output_mode,
   memcpy(buf.data(), tmpl.data(), tmpl.size());
   auto result = mktemp(buf.data());
   PADDLE_ENFORCE(strlen(result) != 0);
-  std::string config = result;
+  std::string config_file = result;
 
   {
-    std::ofstream ofs(config, std::ios::out | std::ios::trunc);
+    std::ofstream ofs(config_file, std::ios::out | std::ios::trunc);
     PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate());
     for (const auto& line : config_flags) {
       ofs << line << std::endl;
@@ -42,12 +42,12 @@ void CudaProfilerInit(std::string output_file, std::string output_mode,
   PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv");
   cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
   PADDLE_ENFORCE(
-      cudaProfilerInitialize(config.c_str(), output_file.c_str(), mode));
+      cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode));
 }
 
 void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); }
 
-void CudaProfilerStop() { PADDLE_ENFORCE((cudaProfilerStop())); }
+void CudaProfilerStop() { PADDLE_ENFORCE(cudaProfilerStop()); }
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py
index f31d6f0a61..2dbba72c64 100644
--- a/python/paddle/v2/fluid/profiler.py
+++ b/python/paddle/v2/fluid/profiler.py
@@ -1,9 +1,9 @@
 import paddle.v2.fluid.core as core
-import subprocess
+from contextlib import contextmanager
 
 __all__ = ['CudaProfiler']
 
-NV_FLAGS = [
+NVPROF_CONFIG = [
     "gpustarttimestamp",
     "gpuendtimestamp",
     "gridsize3d",
@@ -14,61 +14,33 @@ NV_FLAGS = [
 ]
 
 
-def nvporf_init(output_file, output_mode=None, flags=None):
-    """
-    Initialize the CUDA profiler.
-    This methods must be called before nvprof_start.
-
-    :param output_file: The output file name.
-    :type output_file: string
-    :param output_mode: The output mode has Key-Value pair format and
-                        Comma separated values format.
-                        It should be 'kv' or 'csv'.
-    :type output_mode: string
+@contextmanager
+def cuda_profiler(output_file, output_mode=None, config=None):
+    """The CUDA profiler.
+    This function is used to profile CUDA program by CUDA runtime application
+    programming interface. The profiling result will be written into
+    `output_file` with Key-Value pair format or Comma separated values format.
+    The user can set the output mode by `output_mode` argument and set the
+    counters/options for profiling by `config` argument. The default config
+    contains 'gpustarttimestamp', 'gpuendtimestamp', 'gridsize3d',
+    'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace'.
+
+    Args:
+        output_file (string) : The output file name, the result will be
+            written into this file.
+        output_mode (string) : The output mode has Key-Value pair format and
+            Comma separated values format. It should be 'kv' or 'csv'.
+        config (list of string) : The profiler options and counters, see
+            "Compute Command Line Profiler User Guide".
     """
     if output_mode is None:
         output_mode = 'csv'
     if output_mode not in ['kv', 'csv']:
         raise ValueError("The output mode must be 'key-value' or 'csv'.")
-    flags = NV_FLAGS if flags is None else flags
-    core.nvprof_init(output_file, output_mode, flags)
-
-
-def nvporf_start():
-    """
-    Enables profiler collection by the active CUDA profiling tool.
-    """
+    config = NVPROF_CONFIG if config is None else config
+    core.nvprof_init(output_file, output_mode, config)
+    # Enables profiler collection by the active CUDA profiling tool.
     core.nvprof_start()
-
-
-def nvporf_stop():
-    """
-    Disables profiler collection.
-    """
+    yield
+    # Disables profiler collection (not reached if the with-body raises).
     core.nvprof_stop()
-
-
-class CudaProfiler(object):
-    def __init__(self, output_file, output_mode=None, flags=None, enabled=True):
-        self.enabled = enabled
-        if not self.enabled:
-            return
-        self.entered = False
-        self.out_file = output_file
-        nvporf_init(output_file, output_mode, flags)
-
-    def __enter__(self):
-        if not self.enabled:
-            return
-        if self.entered:
-            raise RuntimeError("The profiler traces are not reentrant")
-        self.entered = True
-        nvporf_start()
-        return self
-
-    def __exit__(self, exc_type, exc_value, tb):
-        if exc_value is not None:
-            raise exc_value
-        if not self.enabled:
-            return
-        nvporf_stop()
diff --git a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py
index 1fec5c99bf..e8f24251b9 100644
--- a/python/paddle/v2/fluid/tests/test_profiler.py
+++ b/python/paddle/v2/fluid/tests/test_profiler.py
@@ -18,7 +18,7 @@ class TestProfiler(unittest.TestCase):
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
 
-        with profiler.CudaProfiler("cuda_profiler.txt", 'csv') as nvprof:
+        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
             for i in range(epoc):
                 input = np.random.random(dshape).astype("float32")
                 exe.run(fluid.default_main_program(), feed={'data': input})

From 6375c8cacbf72da741590361c887758d7a5323f5 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 28 Nov 2017 18:53:37 +0800
Subject: [PATCH 48/52] Fix MacOS compile (#5978)

* Fix MacOS compile

* Update GRPC

* Unset PROTOBUF_EXEC
---
 cmake/external/grpc.cmake     | 12 ++++++++++--
 cmake/external/protobuf.cmake | 24 +++++++++++++++++-------
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index f431c037fd..1330ef82dc 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -23,6 +23,11 @@ SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
 SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
 SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
 SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
+IF(APPLE)
+  SET(BUILD_CMD make -n | sed "s/-Werror//g" | sh)
+ELSE()
+  SET(BUILD_CMD make)
+ENDIF()
 
 ExternalProject_Add(
     extern_grpc
@@ -33,7 +38,11 @@ ExternalProject_Add(
     UPDATE_COMMAND  ""
     CONFIGURE_COMMAND ""
     BUILD_IN_SOURCE 1
-    BUILD_COMMAND   make
+    # NOTE(yuyang18):
+    # Disable -Werror, otherwise the build fails on macOS.
+    # It seems that this cannot be configured through the make command line,
+    # so dry-run make, strip `-Werror` from the output, and run the result with sh.
+    BUILD_COMMAND  ${BUILD_CMD}
     INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
 )
 
@@ -55,4 +64,3 @@ SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION
 
 include_directories(${GRPC_INCLUDE_DIR})
 ADD_DEPENDENCIES(grpc++_unsecure extern_grpc)
-
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index be7f6a9465..7cfe1e6807 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -15,7 +15,18 @@
 INCLUDE(ExternalProject)
 # Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
 FIND_PACKAGE(Protobuf QUIET)
-SET(PROTOBUF_FOUND "OFF")
+macro(UNSET_VAR VAR_NAME)
+    UNSET(${VAR_NAME} CACHE)
+    UNSET(${VAR_NAME})
+endmacro()
+UNSET_VAR(PROTOBUF_INCLUDE_DIR)
+UNSET_VAR(PROTOBUF_FOUND)
+UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
+UNSET_VAR(PROTOBUF_PROTOC_LIBRARY)
+UNSET_VAR(PROTOBUF_LITE_LIBRARY)
+UNSET_VAR(PROTOBUF_LIBRARY)
+UNSET_VAR(PROTOBUF_INCLUDE_DIR)
+UNSET_VAR(Protobuf_PROTOC_EXECUTABLE)
 
 if(NOT COMMAND protobuf_generate_python)  # before cmake 3.4, protobuf_genrerate_python is not defined.
     function(protobuf_generate_python SRCS)
@@ -110,7 +121,6 @@ macro(PROMPT_PROTOBUF_LIB)
     # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`.
     # make `protobuf_generate_cpp` happy.
     SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
-
     FOREACH(dep ${protobuf_DEPS})
         ADD_DEPENDENCIES(protobuf ${dep})
         ADD_DEPENDENCIES(protobuf_lite ${dep})
@@ -128,11 +138,11 @@ endmacro()
 
 set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
 if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
-    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include)
-    find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib)
-    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib)
-    find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib)
-    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin)
+    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
+    find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
     if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
         message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
         SET_PROTOBUF_VERSION()

From 23b3fef062ce41d7b19060fb1190452c9160da59 Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Tue, 28 Nov 2017 19:06:50 +0800
Subject: [PATCH 49/52] Make 'scale_op' supporting int and int64 (#5986)

* Make 'scale_op' supporting int and int64

* refine .cu file
---
 paddle/operators/scale_op.cc | 4 +++-
 paddle/operators/scale_op.cu | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc
index 5745580504..e5c10fec4d 100644
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -77,4 +77,6 @@ REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>,
                   ops::ScaleGradMaker);
 REGISTER_OP_CPU_KERNEL(scale,
                        ops::ScaleKernel<paddle::platform::CPUPlace, float>,
-                       ops::ScaleKernel<paddle::platform::CPUPlace, double>);
+                       ops::ScaleKernel<paddle::platform::CPUPlace, double>,
+                       ops::ScaleKernel<paddle::platform::CPUPlace, int>,
+                       ops::ScaleKernel<paddle::platform::CPUPlace, int64_t>);
diff --git a/paddle/operators/scale_op.cu b/paddle/operators/scale_op.cu
index 820fd4e685..0d70775159 100644
--- a/paddle/operators/scale_op.cu
+++ b/paddle/operators/scale_op.cu
@@ -16,4 +16,6 @@
 
 REGISTER_OP_GPU_KERNEL(
     scale, paddle::operators::ScaleKernel<paddle::platform::GPUPlace, float>,
-    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, double>);
+    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, double>,
+    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, int>,
+    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, int64_t>);

From c975fe1bdeac914847f59bee588feba0c76220f9 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Tue, 28 Nov 2017 19:34:03 +0800
Subject: [PATCH 50/52] batch norm support matrix input (#5980)

* batch norm support matrix input

* update gpu code

* format code
---
 paddle/operators/batch_norm_op.cc             | 15 ++---
 paddle/operators/batch_norm_op.cu.cc          | 31 ++++++----
 .../book/test_image_classification_train.py   |  3 +-
 .../v2/fluid/tests/test_batch_norm_op.py      | 60 +++++++++++++++----
 .../tests/test_image_classification_layer.py  | 28 +++++----
 5 files changed, 93 insertions(+), 44 deletions(-)

diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc
index f884e6efa9..ac97bd83ab 100644
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -62,13 +62,14 @@ class BatchNormOp : public framework::OperatorWithKernel {
     const auto x_dims = ctx->GetInputDim("X");
     const TensorFormat tensor_format =
         StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format"));
+
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "Input X must have 2 to 5 dimensions.");
+
     const int C =
         (tensor_format == TensorFormat::NCHW ? x_dims[1]
                                              : x_dims[x_dims.size() - 1]);
 
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "Input X must have 3 to 5 dimensions.");
-
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
@@ -146,8 +147,8 @@ class BatchNormKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {
 
     const auto *x = ctx.Input<Tensor>("X");
     const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
     const int N = x_dims[0];
     const int C =
         (tensor_format == TensorFormat::NCHW ? x_dims[1]
@@ -339,8 +340,8 @@ class BatchNormGradKernel<platform::CPUPlace, T>
     // Get the size for each dimension.
     // NCHW [batch_size, in_channels, in_height, in_width]
     const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
     const int N = x_dims[0];
     const int C =
         (tensor_format == TensorFormat::NCHW ? x_dims[1]
diff --git a/paddle/operators/batch_norm_op.cu.cc b/paddle/operators/batch_norm_op.cu.cc
index 726d1ea1b8..7b2f318700 100644
--- a/paddle/operators/batch_norm_op.cu.cc
+++ b/paddle/operators/batch_norm_op.cu.cc
@@ -29,14 +29,21 @@ void ExtractNCWHD(const framework::DDim &dims,
                   const TensorFormat &tensor_format, int *N, int *C, int *H,
                   int *W, int *D) {
   *N = dims[0];
-  *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1];
-  *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1];
-  *W = dims.size() > 3
-           ? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2])
-           : 1;
-  *D = dims.size() > 4
-           ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3])
-           : 1;
+  if (dims.size() == 2) {
+    *C = dims[1];
+    *H = 1;
+    *W = 1;
+    *D = 1;
+  } else {
+    *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1];
+    *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1];
+    *W = dims.size() > 3
+             ? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2])
+             : 1;
+    *D = dims.size() > 4
+             ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3])
+             : 1;
+  }
 }
 
 template <typename T>
@@ -56,8 +63,8 @@ class BatchNormKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
     // NCHW [batch_size, in_channels, in_height, in_width]
     const auto *x = ctx.Input<Tensor>("X");
     const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
     int N, C, H, W, D;
     ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
 
@@ -180,8 +187,8 @@ class BatchNormGradKernel<platform::GPUPlace, T>
 
     const auto &x_dims = x->dims();
 
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
     int N, C, H, W, D;
     ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
 
diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
index cc45b10b90..0f0cc5b540 100644
--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
@@ -69,8 +69,7 @@ def vgg16_bn_drop(input):
 
     drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
     fc1 = fluid.layers.fc(input=drop, size=512, act=None)
-    reshape1 = fluid.layers.reshape(x=fc1, shape=list(fc1.shape + (1, 1)))
-    bn = fluid.layers.batch_norm(input=reshape1, act='relu')
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
     drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
     fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
     return fc2
diff --git a/python/paddle/v2/fluid/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
index 71f9599e0d..e766a68c0e 100644
--- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
@@ -21,6 +21,13 @@ def get_backward_op(scope, op, no_grad_set):
 
 
 def _reference_training(x, scale, offset, epsilon, data_format):
+    x_shape = x.shape
+    if len(x_shape) == 2:
+        if data_format == "NCHW":
+            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
+        else:
+            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
+
     if data_format == "NCHW":
         n, c, h, w = x.shape
         x_square = x * x
@@ -39,6 +46,8 @@ def _reference_training(x, scale, offset, epsilon, data_format):
         offset_tile = np.reshape(offset, (1, c, 1, 1))
         offset_tile = np.reshape(offset_tile, (1, c, 1, 1))
         y = normalized * scale_tile + offset_tile
+        if len(x_shape) == 2:
+            y = np.reshape(y, (y.shape[0], y.shape[1]))
         return y, mean, var
     elif data_format == "NHWC":
         x_square = x * x
@@ -48,7 +57,10 @@ def _reference_training(x, scale, offset, epsilon, data_format):
         mean = x_sum / element_count
         var = x_square_sum / element_count - mean * mean
         normalized = (x - mean) / np.sqrt(var + epsilon)
-        return (normalized * scale + offset), mean, var
+        y = normalized * scale + offset
+        if len(x_shape) == 2:
+            y = np.reshape(y, x_shape)
+        return y, mean, var
     else:
         raise ValueError("Unknown data order.")
 
@@ -65,6 +77,18 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
     #   (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
 
     # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
+    x_shape = x.shape
+
+    if len(x_shape) == 2:
+        if data_format == "NCHW":
+            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
+            grad_y = np.reshape(grad_y,
+                                (grad_y.shape[0], grad_y.shape[1], 1, 1))
+        else:
+            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
+            grad_y = np.reshape(grad_y,
+                                (grad_y.shape[0], 1, 1, grad_y.shape[1]))
+
     if data_format == "NCHW":
         x = np.transpose(x, (0, 2, 3, 1))
         grad_y = np.transpose(grad_y, (0, 2, 3, 1))
@@ -83,6 +107,9 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
         grad_x = np.transpose(grad_x, (0, 3, 1, 2))
         x = np.transpose(x, (0, 3, 1, 2))
         grad_y = np.transpose(grad_y, (0, 3, 1, 2))
+
+    if len(x_shape) == 2:
+        grad_x = np.reshape(grad_x, x_shape)
     return grad_x, grad_scale, grad_offset
 
 
@@ -127,7 +154,7 @@ class TestBatchNormOp(OpTest):
         momentum = 0.9
 
         # N, H, W, C: 2, 3, 4, 2
-        n, h, w, c = 2, 3, 4, 2
+        n, h, w, c = 2, 3, 4, 5
         x_shape = [n, h, w, c]
         scale_shape = [c]
 
@@ -184,20 +211,23 @@ class TestBatchNormOp(OpTest):
         print 'python: NHWC, NCHW, backward checking passed'
 
     def test_forward_backward(self):
-        def test_with_place(place, tensor_format):
+        def test_with_place(place, tensor_format, shape):
             # attr
             epsilon = 0.00001
             momentum = 0.9
 
-            # N, H, W, C: 12, 3, 4, 2
-            n, h, w, c = 2, 3, 4, 2
-
-            if data_format == "NHWC":
-                x_shape = [n, h, w, c]
-            elif data_format == "NCHW":
-                x_shape = [n, c, h, w]
+            if len(shape) == 2:
+                x_shape = shape
+                c = shape[1]
             else:
-                raise ValueError("Unknown data type.")
+                # n, h, w, c = 2, 3, 4, 2
+                n, h, w, c = shape[0], shape[1], shape[2], shape[3]
+                if data_format == "NHWC":
+                    x_shape = [n, h, w, c]
+                elif data_format == "NCHW":
+                    x_shape = [n, c, h, w]
+                else:
+                    raise ValueError("Unknown data type.")
             scale_shape = [c]
 
             x_val = np.random.random_sample(x_shape).astype(np.float32)
@@ -219,7 +249,10 @@ class TestBatchNormOp(OpTest):
             #  for gradient test
             # y_grad = np.ones(x_shape).astype(np.float32)
             y_grad = np.zeros(x_shape).astype(np.float32)
-            y_grad[0, 0, 0, 0] = 1.
+            if len(y_grad.shape) == 2:
+                y_grad[0, 0] = 1.
+            else:
+                y_grad[0, 0, 0, 0] = 1.
             # y_grad = np.random.random_sample(x_shape).astype(np.float32)
             x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
                 x_val, y_grad, scale_val, saved_mean, var_ref, epsilon,
@@ -313,7 +346,8 @@ class TestBatchNormOp(OpTest):
             places.append(core.GPUPlace(0))
         for place in places:
             for data_format in ["NCHW", "NHWC"]:
-                test_with_place(place, data_format)
+                test_with_place(place, data_format, [2, 3, 4, 5])
+                test_with_place(place, data_format, [2, 3])
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_image_classification_layer.py b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
index 8e8e1b0a8c..2fd609d447 100644
--- a/python/paddle/v2/fluid/tests/test_image_classification_layer.py
+++ b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
@@ -1,6 +1,6 @@
 import unittest
 
-import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid as fluid
 import paddle.v2.fluid.nets as nets
 from paddle.v2.fluid.framework import Program
 
@@ -29,27 +29,35 @@ class TestLayer(unittest.TestCase):
     def test_batch_norm_layer(self):
         main_program = Program()
         startup_program = Program()
-        images = layers.data(
+        images = fluid.layers.data(
             name='pixel',
             shape=[3, 48, 48],
             dtype='float32',
             main_program=main_program)
-        layers.batch_norm(
+        hidden1 = fluid.layers.batch_norm(
             input=images,
             main_program=main_program,
             startup_program=startup_program)
+        hidden2 = fluid.layers.fc(input=hidden1,
+                                  size=128,
+                                  act='relu',
+                                  main_program=main_program)
+        hidden3 = fluid.layers.batch_norm(
+            input=hidden2,
+            main_program=main_program,
+            startup_program=startup_program)
 
-        # print str(main_program)
+        print str(main_program)
 
     def test_dropout_layer(self):
         main_program = Program()
         startup_program = Program()
-        images = layers.data(
+        images = fluid.layers.data(
             name='pixel',
             shape=[3, 48, 48],
             dtype='float32',
             main_program=main_program)
-        layers.dropout(
+        fluid.layers.dropout(
             x=images,
             dropout_prob=0.5,
             main_program=main_program,
@@ -61,7 +69,7 @@ class TestLayer(unittest.TestCase):
         main_program = Program()
         startup_program = Program()
 
-        images = layers.data(
+        images = fluid.layers.data(
             name='pixel',
             shape=[3, 48, 48],
             dtype='float32',
@@ -77,19 +85,19 @@ class TestLayer(unittest.TestCase):
     def test_elementwise_add_with_act(self):
         main_program = Program()
         startup_program = Program()
-        image1 = layers.data(
+        image1 = fluid.layers.data(
             name='pixel1',
             shape=[3, 48, 48],
             dtype='float32',
             main_program=main_program,
             startup_program=startup_program)
-        image2 = layers.data(
+        image2 = fluid.layers.data(
             name='pixel2',
             shape=[3, 48, 48],
             dtype='float32',
             main_program=main_program,
             startup_program=startup_program)
-        out = layers.elementwise_add(
+        out = fluid.layers.elementwise_add(
             x=image1,
             y=image2,
             act='relu',

From 6ed135413a71bc2e5a44d762af564d056a5165c3 Mon Sep 17 00:00:00 2001
From: guosheng <guosheng@baidu.com>
Date: Tue, 28 Nov 2017 21:49:39 +0800
Subject: [PATCH 51/52] Fix useGpu in HierarchicalSigmoidLayer

---
 paddle/gserver/layers/HierarchicalSigmoidLayer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
index 6317b66a45..236f8096bd 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
@@ -164,7 +164,7 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
       cpuBias_ = biases_grad;
     }
     preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_);
-    if (useGpu) {
+    if (useGpu_) {
       biases_grad->copyFrom(*cpuBias_);
     } else {
       biases_grad = cpuBias_;

From dcf3ffd98033ffa492932ed9ffb7880d0bf010a0 Mon Sep 17 00:00:00 2001
From: kavyasrinet <kavyasrinet@baidu.com>
Date: Tue, 28 Nov 2017 18:02:28 -0800
Subject: [PATCH 52/52] Adding log loss operator (#5854)

* Adding log loss operator

* Removing comments
---
 paddle/operators/log_loss_op.cc               | 115 ++++++++++++++++++
 paddle/operators/log_loss_op.cu               |  22 ++++
 paddle/operators/log_loss_op.h                |  75 ++++++++++++
 .../paddle/v2/fluid/tests/test_log_loss_op.py |  33 +++++
 4 files changed, 245 insertions(+)
 create mode 100644 paddle/operators/log_loss_op.cc
 create mode 100644 paddle/operators/log_loss_op.cu
 create mode 100644 paddle/operators/log_loss_op.h
 create mode 100644 python/paddle/v2/fluid/tests/test_log_loss_op.py

diff --git a/paddle/operators/log_loss_op.cc b/paddle/operators/log_loss_op.cc
new file mode 100644
index 0000000000..257e5c8a49
--- /dev/null
+++ b/paddle/operators/log_loss_op.cc
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/log_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Forward operator for log loss: checks inputs and infers the Loss shape.
+class LogLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Predicted"),
+                   "Input(Predicted) must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) must be initialized.");
+    // Both inputs must share the shape [batch_size, 1].
+    auto pred_dims = ctx->GetInputDim("Predicted");
+    auto label_dims = ctx->GetInputDim("Labels");
+    PADDLE_ENFORCE_EQ(pred_dims, label_dims);
+    PADDLE_ENFORCE_EQ(pred_dims.size(), 2,
+                      "The rank of Input(Predicted) must be 2 and the shape is "
+                      "[batch_size, 1].");
+    PADDLE_ENFORCE_EQ(pred_dims[1], 1,
+                      "Each row of Input(Predicted) contains a real value, "
+                      "so the 2nd dimension of Input(Predicted) must be 1.");
+
+    ctx->SetOutputDim("Loss", {pred_dims[0], 1});
+    ctx->ShareLoD("Predicted", "Loss");
+  }
+};
+
+// Declares the inputs, output, epsilon attribute and docs of the log_loss op.
+template <typename AttrType>
+class LogLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LogLossOpMaker(framework::OpProto* proto,
+                 framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Predicted",
+             "The input value (Predicted) of Log loss op. "
+             "Predicted is a 2-D tensor with shape [batch_size, 1].");
+    AddInput("Labels",
+             "The target value (Labels) of Log loss op. "
+             "Labels is a 2-D tensor with shape [batch_size, 1].");
+    AddOutput("Loss", "The output tensor with shape [batch_size, 1] which "
+              "represents the log loss.");
+    AddAttr<AttrType>("epsilon", "Epsilon in log loss.");
+    AddComment(R"DOC(
+LogLoss Operator.
+
+Log loss is a loss function used for binary classification. Log Loss quantifies
+the accuracy of a classifier by penalising false classifications. Minimising the
+Log Loss is equivalent to maximising the accuracy of the classifier. We define
+Predicted as the values predicted by our model and Labels as the target ground
+truth value. Log loss can evaluate how close the predicted values are to the
+target. The shapes of Predicted and Labels are both [batch_size, 1].
+The equation is:
+
+$$
+Loss = - Labels * log(Predicted + \epsilon) -
+        (1 - Labels) * log(1 - Predicted + \epsilon)
+$$
+
+)DOC");
+  }
+};
+
+// Gradient operator for log loss: infers the shape of Predicted@GRAD.
+class LogLossGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Predicted"),
+                   "Input(Predicted) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
+                   "Input(Loss@GRAD) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Predicted")),
+                   "Output(Predicted@GRAD) should not be null.");
+
+    auto pred_dims = ctx->GetInputDim("Predicted");
+    auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
+    PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
+    // Predicted@GRAD takes the same shape as Predicted.
+    auto pred_grad_name = framework::GradVarName("Predicted");
+    ctx->SetOutputDim(pred_grad_name, pred_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>, log_loss_grad,
+            ops::LogLossGradOp);
+REGISTER_OP_CPU_KERNEL(log_loss,
+                       ops::LogLossKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    log_loss_grad, ops::LogLossGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/log_loss_op.cu b/paddle/operators/log_loss_op.cu
new file mode 100644
index 0000000000..6c189ef341
--- /dev/null
+++ b/paddle/operators/log_loss_op.cu
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/log_loss_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(log_loss,
+                       ops::LogLossKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    log_loss_grad, ops::LogLossGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/log_loss_op.h b/paddle/operators/log_loss_op.h
new file mode 100644
index 0000000000..73404fce91
--- /dev/null
+++ b/paddle/operators/log_loss_op.h
@@ -0,0 +1,75 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+// Forward kernel: Loss = -y * log(p + eps) - (1 - y) * log(1 - p + eps).
+template <typename Place, typename T, typename AttrType = T>
+class LogLossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const T eps = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
+    const T one = static_cast<T>(1);
+
+    auto* out_tensor = ctx.Output<Tensor>("Loss");
+    out_tensor->mutable_data<T>(ctx.GetPlace());
+
+    // Flatten inputs and output for element-wise evaluation.
+    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
+    auto y = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
+    auto out = EigenVector<T>::Flatten(*out_tensor);
+
+    auto place = ctx.GetEigenDevice<Place>();
+    out.device(place) =
+        -(y * (p + eps).log()) - ((one - y) * (one - p + eps).log());
+  }
+};
+
+// Backward kernel: dP = dLoss * (-y / (p + eps) + (1 - y) / (1 - p + eps)).
+template <typename Place, typename T, typename AttrType = T>
+class LogLossGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const T eps = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
+    const T one = static_cast<T>(1);
+    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
+    auto y = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
+
+    auto* grad_loss = ctx.Input<Tensor>(framework::GradVarName("Loss"));
+    auto* grad_pred = ctx.Output<Tensor>(framework::GradVarName("Predicted"));
+    auto dl = EigenVector<T>::Flatten(*grad_loss);
+    auto place = ctx.GetEigenDevice<Place>();
+
+    // Skip the computation when no Predicted@GRAD output is provided.
+    if (grad_pred == nullptr) return;
+
+    grad_pred->mutable_data<T>(ctx.GetPlace());
+    auto dp = framework::EigenVector<T>::Flatten(*grad_pred);
+    dp.device(place) =
+        dl * (-(y / (p + eps)) + ((one - y) / (one - p + eps)));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/v2/fluid/tests/test_log_loss_op.py b/python/paddle/v2/fluid/tests/test_log_loss_op.py
new file mode 100644
index 0000000000..2eeaa90758
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_log_loss_op.py
@@ -0,0 +1,33 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestLogLossOp(OpTest):
+    """Checks log_loss forward output and gradient w.r.t. Predicted."""
+
+    def setUp(self):
+        batch, eps = 32, 1e-4
+        shape = (batch, 1)
+        # Predictions are drawn from [0.1, 1.0); labels are 0/1 as float32.
+        pred = np.random.uniform(0.1, 1.0, shape).astype("float32")
+        target = np.random.randint(0, 2, shape).astype("float32")
+
+        self.op_type = 'log_loss'
+        self.inputs = {'Predicted': pred, 'Labels': target}
+        self.attrs = {'epsilon': eps}
+
+        # NumPy reference for the expected loss.
+        pos_term = -target * np.log(pred + eps)
+        neg_term = (1 - target) * np.log(1 - pred + eps)
+        self.outputs = {'Loss': pos_term - neg_term}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['Predicted'], 'Loss', max_relative_error=0.03)
+
+
+if __name__ == '__main__':
+    unittest.main()