Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into lookup_table

8 years ago · 1795e57671
parent 0f3b9e4112 ce723af062
commit 1795e57671
33 changed files with 393 additions and 135 deletions
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@ -362,6 +362,11 @@ trans
 ..  autoclass:: paddle.v2.layer.trans
    :noindex:
 scale_shift
 -----------
 ..  autoclass:: paddle.v2.layer.scale_shift
    :noindex:
 Sampling Layers
 ===============
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@ -110,7 +110,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
                       dup_output_ops[out].emplace_back(local_op_id);
                       return false;
                     });
-      net->AddOp(std::move(bwd));
+      net->AppendOp(std::move(bwd));
    }
    // Get unique ID for this method.
    auto uid = uniq_id++;
@ -163,7 +163,8 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
        // If part of input gradient of that operator is not calculated, fill
        // zero variables to that input gradient.
-        net->AddOp(OpRegistry::CreateOp("fill_zeros_like", {{"Src", {prefix}}},
+        net->AppendOp(OpRegistry::CreateOp("fill_zeros_like",
                                           {{"Src", {prefix}}},
                                           {{"Dst", {grad_input}}}, {}));
      }
      return false;
@ -195,7 +196,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
    if (net->ops_.empty()) {  // Current no aux op is added to network
      return grad_op;
    }
-    net->AddOp(std::move(grad_op));
+    net->AppendOp(std::move(grad_op));
  }
  net->SetType("@GENERATED_BACKWARD@");
  net->CompleteAddOp();
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@ -75,13 +75,13 @@ class FcOp : public operators::NetOp {
  FcOp(const std::string &type, const VarNameMap &inputs,
       const VarNameMap &outputs, const AttributeMap &attrs)
      : NetOp(type, inputs, outputs, attrs) {
-    AddOp(OpRegistry::CreateOp("mul",
+    AppendOp(OpRegistry::CreateOp("mul",
                                  {{"X", {Input("X")}}, {"Y", {Input("W")}}},
                                  {{"Out", {Output("mul_result")}}}, {}));
    auto input_b = Inputs("b");
    std::string before_act = "mul_result";
    if (input_b.size() != 0) {
-      AddOp(OpRegistry::CreateOp(
+      AppendOp(OpRegistry::CreateOp(
          "rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}},
          {{"Out", {Output("add_result")}}}, {}));
      before_act = "add_result";
@ -92,7 +92,7 @@ class FcOp : public operators::NetOp {
      }
    }
-    AddOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}},
+    AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}},
                                  {{"Out", {Output("Out")}}}, {}));
    CompleteAddOp(false);
  }
@ -234,13 +234,13 @@ TEST(Backward, net_fc_backward_not_have_b) {
 TEST(Backward, net_input_of_network_not_need_grad) {
  ops::NetOp net;
-  net.AddOp(f::OpRegistry::CreateOp(
+  net.AppendOp(f::OpRegistry::CreateOp(
      "fc", {{"X", {"x"}}, {"W", {"W1"}}, {"b", {"b1"}}},
      {{"mul_result", {"mul_tmp_0"}},
       {"add_result", {"add_tmp_0"}},
       {"Out", {"hidden0"}}},
      {}));
-  net.AddOp(f::OpRegistry::CreateOp(
+  net.AppendOp(f::OpRegistry::CreateOp(
      "fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}},
      {{"mul_result", {"mul_tmp_1"}},
       {"add_result", {"add_tmp_1"}},
@ -273,9 +273,9 @@ TEST(Backward, net_input_of_network_not_need_grad) {
 TEST(Backward, net_shared_weight) {
  ops::NetOp net;
-  net.AddOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}},
+  net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}},
                                       {{"Out", {"out"}}}, {}));
-  net.AddOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}},
+  net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}},
                                       {{"Out", {"FinalOut"}}}, {}));
  net.CompleteAddOp();
@ -357,19 +357,19 @@ TEST(Backward, op_part_of_input_are_not_need) {
 TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
  ops::NetOp net;
-  net.AddOp(f::OpRegistry::CreateOp(
+  net.AppendOp(f::OpRegistry::CreateOp(
      "fc", {{"X", {"x1"}}, {"W", {"w1"}}, {"b", {"b1"}}},
      {{"mul_result", {"mul_out1"}},
       {"add_result", {"add_out1"}},
       {"Out", {"out1"}}},
      {}));
-  net.AddOp(f::OpRegistry::CreateOp(
+  net.AppendOp(f::OpRegistry::CreateOp(
      "fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}},
      {{"mul_result", {"mul_out2"}},
       {"add_result", {"tmp_out2"}},
       {"Out", {"out2"}}},
      {}));
-  net.AddOp(f::OpRegistry::CreateOp(
+  net.AppendOp(f::OpRegistry::CreateOp(
      "fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}},
      {{"mul_result", {"mul_out3"}},
       {"add_result", {"tmp_out3"}},
--- a/paddle/framework/pybind.cc
+++ b/paddle/framework/pybind.cc
@ -223,8 +223,8 @@ All parameter, weight, gradient are variables in Paddle.
                    retv->SetType("plain_net");
                    return retv;
                  })
-      .def("add_op", [](operators::NetOp &self,
+      .def("append_op", [](operators::NetOp &self,
-                        const OperatorBase &op) { self.AddOp(op); })
+                           const OperatorBase &op) { self.AppendOp(op); })
      .def("complete_add_op", &operators::NetOp::CompleteAddOp)
      .def("complete_add_op", [](std::shared_ptr<operators::NetOp> &self) {
        self->CompleteAddOp();
--- a/paddle/function/GemmFunctor.cpp
+++ b/paddle/function/GemmFunctor.cpp
@ -84,7 +84,7 @@ struct BlasGemm<DEVICE_TYPE_GPU, T> {
  }
 };
-template class BlasGemm<DEVICE_TYPE_CPU, real>;
+template struct BlasGemm<DEVICE_TYPE_CPU, real>;
-template class BlasGemm<DEVICE_TYPE_GPU, real>;
+template struct BlasGemm<DEVICE_TYPE_GPU, real>;
 }  // namespace paddle
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@ -202,7 +202,7 @@ void NeuralNetwork::prefetch(const std::vector<Argument>& inArgs) {
        auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
            para->getMat(PARAMETER_VALUE).get());
        para->clearGradient();
-        mat->clearIndices();
+        if (mat) mat->clearIndices();
      }
    }
  }
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@ -184,7 +184,7 @@ public:
  }
  void backward(const UpdateCallback& callback) override {
-    if (biases_) {
+    if (biases_ && biases_->getWGrad()) {
      backwardActivation();
      biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
      biases_->getParameterPtr()->incUpdate(callback);
--- a/paddle/gserver/layers/ScaleShiftLayer.cpp
+++ b/paddle/gserver/layers/ScaleShiftLayer.cpp
@ -0,0 +1,107 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "Layer.h"
 namespace paddle {
 /**
 * A layer that applies a linear transformation to each element in each row
 * of the input matrix. For each element, the layer first rescales it and
 * then adds a bias to it.
 *
 * \f[
 *    y = wx + b
 * \f]
 *
 * Here, w is the scale and b is the bias. Both w and b are trainable scalars.
 *
 */
 class ScaleShiftLayer : public Layer {
 protected:
  // Scale factor, held as a 1x1 Weight built from parameters_[0] in init().
  std::unique_ptr<Weight> scale_;
  // Optional additive offset, held as a 1x1 Weight built from biasParameter_;
  // stays null when the layer has no bias parameter.
  std::unique_ptr<Weight> offset_;
 public:
  explicit ScaleShiftLayer(const LayerConfig& config) : Layer(config) {}
  // Creates scale_ (and offset_ when a bias parameter exists). CHECKs that
  // the layer has exactly one input.
  bool init(const LayerMap& layerMap,
            const ParameterMap& parameterMap) override;
  // Computes output = scale * input, then adds the offset when present.
  void forward(PassType passType) override;
  // Accumulates the scale/offset gradients (when they have gradient buffers)
  // and adds scale * outputGrad into the input gradient (when it exists).
  void backward(const UpdateCallback& callback = nullptr) override;
 };
 // Registers ScaleShiftLayer for the `scale_shift` layer type string.
 REGISTER_LAYER(scale_shift, ScaleShiftLayer);
 // Initializes the layer's trainable scalars.
 //
 // Runs the base-class initialization first and returns false if it fails.
 // CHECKs that the layer has exactly one input, then wraps parameters_[0] in
 // a 1x1 Weight as the scale and, when a bias parameter is configured, wraps
 // biasParameter_ in a 1x1 Weight as the offset.
 //
 // Returns true on success.
 bool ScaleShiftLayer::init(const LayerMap& layerMap,
                           const ParameterMap& parameterMap) {
  // Do not continue with a half-initialized base class.
  if (!Layer::init(layerMap, parameterMap)) {
    return false;
  }
  CHECK_EQ(inputLayers_.size(), 1U);
  scale_.reset(new Weight(1, 1, parameters_[0]));
  // The offset is optional: offset_ stays null when there is no bias.
  if (biasParameter_.get() != nullptr) {
    offset_.reset(new Weight(1, 1, biasParameter_));
  }
  return true;
 }
 // Forward pass: output = scale * input, plus the offset when one exists.
 // The output is reset to the same height and width as the single input.
 void ScaleShiftLayer::forward(PassType passType) {
  Layer::forward(passType);

  MatrixPtr input = getInputValue(0);
  resetOutput(input->getHeight(), input->getWidth());
  MatrixPtr output = getOutputValue();

  // Multiply every input element by the scalar stored at (0, 0) of scale_.
  const real w = scale_->getW()->getElement(0, 0);
  output->mulScalar(*input, w);

  // Without an offset weight the layer is a pure rescale.
  if (!offset_) {
    return;
  }
  const real b = offset_->getW()->getElement(0, 0);
  output->add(b);
 }
 // Backward pass for output = scale * input + offset.
 //
 // - Scale gradient (only if scale_ has a gradient buffer): accumulates the
 //   sum over all elements of input * outputGrad, computed as per-row sums
 //   followed by a column sum, then notifies the parameter via incUpdate.
 // - Offset gradient (only if offset_ exists and has a gradient buffer):
 //   accumulates the sum over all elements of outputGrad the same way.
 // - Input gradient (only if the input has one): adds scale * outputGrad.
 //
 // callback is forwarded to incUpdate() of each updated parameter.
 void ScaleShiftLayer::backward(const UpdateCallback& callback) {
  MatrixPtr inV = getInputValue(0);
  MatrixPtr inG = getInputGrad(0);
  MatrixPtr outG = getOutputGrad();

  /* Calculate the parameter gradient for the current layer */
  if (scale_->getWGrad()) {
    MatrixPtr rowSumMtx;
    Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_);
    // this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij}
    rowSumMtx->sumOfProducts(
        /* b= */ *inV, /* c= */ *outG, /* scaleSum= */ 1, /* scaleDest= */ 0.);
    // this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji}
    // scaleDest = 1 so the result is accumulated into the existing gradient.
    scale_->getWGrad()->sumCols(
        /* b= */ *rowSumMtx, /* scaleSum= */ 1., /* scaleDest= */ 1.);
    scale_->getParameterPtr()->incUpdate(callback);
  }
  if (offset_ && offset_->getWGrad()) {
    MatrixPtr rowSumMtx;
    Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_);
    // Per-row sums of the output gradient, overwriting rowSumMtx.
    rowSumMtx->sumRows(*outG, 1., 0.);
    // Column sum of the row sums, accumulated into the offset gradient.
    offset_->getWGrad()->sumCols(*rowSumMtx, 1., 1.);
    offset_->getParameterPtr()->incUpdate(callback);
  }

  /* Calculate the input layers error */
  if (inG) {
    real scaleValue = scale_->getW()->getElement(0, 0);
    inG->add(*outG, scaleValue);
  }
 }
 }  // namespace paddle
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@ -2007,6 +2007,21 @@ TEST(Layer, RowL2NormLayer) {
  }
 }
 // Gradient check for the scale_shift layer on both the CPU and GPU paths,
 // with one input of width 32, a size-1 input parameter and a size-1 bias.
 TEST(Layer, ScaleShiftLayer) {
  constexpr size_t kBatchSize = 16;
  constexpr size_t kWidth = 32;

  TestConfig cfg;
  cfg.layerConfig.set_type("scale_shift");
  cfg.layerConfig.set_size(kWidth);
  cfg.biasSize = 1;
  cfg.inputDefs.push_back(
      {INPUT_DATA, "input", /* dim= */ kWidth, /* paraSize= */ 1});
  cfg.layerConfig.add_inputs();

  const bool gpuModes[] = {false, true};
  for (const bool onGpu : gpuModes) {
    testLayerGrad(cfg, "scale_shift", kBatchSize, false, onGpu, false);
  }
 }
 int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  initMain(argc, argv);
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@ -269,7 +269,8 @@ TEST(Compare, img_conv2) {
  bool useGpu = FLAGS_use_gpu;
  double eps = FLAGS_checkgrad_eps;
  FLAGS_use_gpu = true;
-  FLAGS_checkgrad_eps = 1e-2;
+  // Sometimes, this unit test will fail with 1e-2
  FLAGS_checkgrad_eps = 4e-2;
  compareNetwork(config_file_a, config_file_b);
  FLAGS_use_gpu = useGpu;
  FLAGS_checkgrad_eps = eps;
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@ -1,11 +1,8 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
   http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -19,25 +16,25 @@ namespace paddle {
 namespace operators {
 template <typename T>
-class GaussianRandomKernel : public framework::OpKernel {
+class CPUGaussianRandomKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    float mean = context.op_.GetAttr<float>("mean");
    float std = context.op_.GetAttr<float>("std");
-    auto* tensor = context.Output<framework::Tensor>(0);
+    auto* tensor = context.Output<framework::Tensor>("Out");
    T* data = tensor->mutable_data<T>(context.GetPlace());
-    // TODO(dzh): attribute does not support unsigned int.
+    unsigned int seed =
-    // And we need a global random seed configuration.
+        static_cast<unsigned int>(context.op_.GetAttr<int>("seed"));
-    int seed = context.op_.GetAttr<int>("seed");
+    std::minstd_rand engine;
    if (seed == 0) {
      seed = std::random_device()();
    }
-    std::mt19937 g(seed);
+    engine.seed(seed);
-    std::normal_distribution<T> distribution(mean, std);
+    std::normal_distribution<T> dist(mean, std);
    ssize_t size = framework::product(tensor->dims());
-    for (int i = 0; i < size; ++i) {
+    for (ssize_t i = 0; i < size; ++i) {
-      data[i] = distribution(g);
+      data[i] = dist(engine);
    }
  }
 };
@ -48,7 +45,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext& context) const override {
-    auto* tensor = context.Output<framework::Tensor>(0);
+    auto* tensor = context.Output<framework::Tensor>("Out");
    auto dims = GetAttr<std::vector<int>>("dims");
    PADDLE_ENFORCE(dims.size() > 0UL,
                   "dims can be one int or array. dims must be set.");
@ -68,8 +65,8 @@ Use to initialize tensor with gaussian random generator.
 )DOC");
    AddAttr<std::vector<int>>("dims", "The dimension of random tensor.");
-    AddAttr<float>("mean", "mean value of random.").SetDefault(.0f);
+    AddAttr<float>("mean", "mean of random tensor.").SetDefault(.0f);
-    AddAttr<float>("std", "minimum value of random value.").SetDefault(1.0f);
+    AddAttr<float>("std", "std of random tensor.").SetDefault(1.0f);
    AddAttr<int>("seed",
                 "Random seed of generator."
                 "0 means use system wide seed")
@ -83,4 +80,4 @@ Use to initialize tensor with gaussian random generator.
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp,
                             ops::GaussianRandomOpMaker);
-REGISTER_OP_CPU_KERNEL(gaussian_random, ops::GaussianRandomKernel<float>);
+REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel<float>);
--- a/paddle/operators/gaussian_random_op.cu
+++ b/paddle/operators/gaussian_random_op.cu
@ -1,53 +1,65 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
   http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */
-#include <memory>
+#include <thrust/device_ptr.h>
-#include <random>
+#include <thrust/iterator/counting_iterator.h>
-#include "paddle/platform/dynload/curand.h"
+#include <thrust/random.h>
-#include "paddle/platform/gpu_info.h"
+#include <thrust/transform.h>
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 namespace paddle {
 namespace operators {
 template <typename T>
-class GaussianRandomKernel : public framework::OpKernel {
+struct GaussianGenerator {
  // Mean and standard deviation passed to the normal distribution.
  T mean_, std_;
  // Seed applied to a fresh engine on every call to operator().
  unsigned int seed_;
  // Note: `seed` is received as int and stored in the unsigned seed_ member.
  __host__ __device__ GaussianGenerator(T mean, T std, int seed)
      : mean_(mean), std_(std), seed_(seed) {}
  // Returns one normal(mean_, std_) sample for index n. A local engine is
  // seeded with seed_ and advanced by n steps before drawing, so the result
  // is determined by this functor's members and n; no state is kept between
  // calls.
  __host__ __device__ T operator()(const unsigned int n) const {
    thrust::minstd_rand rng;
    rng.seed(seed_);
    thrust::normal_distribution<T> dist(mean_, std_);
    rng.discard(n);
    return dist(rng);
  }
 };
 template <typename T>
 class GPUGaussianRandomKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    float mean = context.op_.GetAttr<float>("mean");
+    auto* tensor = context.Output<framework::Tensor>("Out");
    float std = context.op_.GetAttr<float>("std");
    auto* tensor = context.Output<framework::Tensor>(0);
    T* data = tensor->mutable_data<T>(context.GetPlace());
-
+    unsigned int seed =
-    int seed = context.op_.GetAttr<int>("seed");
+        static_cast<unsigned int>(context.op_.GetAttr<int>("seed"));
    if (seed == 0) {
      std::random_device rd;
      seed = rd();
    }
-    curandGenerator_t g;
+    T mean = static_cast<T>(context.op_.GetAttr<float>("mean"));
-    PADDLE_ENFORCE(platform::dynload::curandCreateGenerator(
+    T std = static_cast<T>(context.op_.GetAttr<float>("std"));
-        &g, CURAND_RNG_PSEUDO_DEFAULT));
+    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
-    PADDLE_ENFORCE(
+    ssize_t N = framework::product(tensor->dims());
-        platform::dynload::curandSetPseudoRandomGeneratorSeed(g, seed));
+    thrust::transform(index_sequence_begin, index_sequence_begin + N,
-    platform::dynload::curandGenerateNormal(
+                      thrust::device_ptr<T>(data),
-        g, data, framework::product(tensor->dims()), mean, std);
+                      GaussianGenerator<T>(mean, std, seed));
  }
 };
 }  // namespace operators
 }  // namespace paddle
-namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(gaussian_random,
-REGISTER_OP_GPU_KERNEL(gaussian_random, ops::GaussianRandomKernel<float>);
+                       paddle::operators::GPUGaussianRandomKernel<float>);
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@ -13,7 +13,6 @@
   limitations under the License. */
 #include "paddle/operators/mul_op.h"
 #include "paddle/operators/math/math_function.h"
 namespace paddle {
 namespace operators {
--- a/paddle/operators/net_op.h
+++ b/paddle/operators/net_op.h
@ -84,13 +84,14 @@ class NetOp : public framework::OperatorBase {
    return true;
  }
-  void AddOp(const framework::OperatorBase& op) { AddOp(op.Clone()); }
+  void AppendOp(const framework::OperatorBase& op) { AppendOp(op.Clone()); }
  /**
   * @brief Add an operator by ptr
   */
-  void AddOp(std::unique_ptr<framework::OperatorBase> op) {
+  void AppendOp(std::unique_ptr<framework::OperatorBase> op) {
-    PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed");
+    PADDLE_ENFORCE(!add_op_done_,
                   "Cannot AppendOp when this network is sealed");
    PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
    ops_.push_back(std::move(op));
  }
--- a/paddle/operators/net_op_test.cc
+++ b/paddle/operators/net_op_test.cc
@ -38,10 +38,10 @@ TEST(OpKernel, all) {
  auto net = std::make_shared<NetOp>();
  ASSERT_NE(net, nullptr);
-  net->AddOp(std::unique_ptr<TestOp>(
+  net->AppendOp(std::unique_ptr<TestOp>(
      new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
                 {{"Out", {"y"}}}, {})));
-  net->AddOp(std::unique_ptr<TestOp>(
+  net->AppendOp(std::unique_ptr<TestOp>(
      new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}},
                 {{"Out", {"z"}}}, {})));
@ -61,7 +61,7 @@ TEST(NetOp, insert_op) {
  auto op1 = std::unique_ptr<framework::NOP>(
      new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
                         {{"Out", {"y"}}}, {}));
-  net.AddOp(*op1);
+  net.AppendOp(*op1);
  net.InsertOp(0, *op1);
  ASSERT_EQ(2UL, net.ops_.size());
  net.InsertOp(2, std::move(op1));
@ -70,9 +70,9 @@ TEST(NetOp, insert_op) {
 TEST(NetOp, Clone) {
  NetOp net;
-  net.AddOp(
+  net.AppendOp(
      std::unique_ptr<framework::NOP>(new framework::NOP{"empty", {}, {}, {}}));
-  net.AddOp(std::unique_ptr<framework::NOP>(
+  net.AppendOp(std::unique_ptr<framework::NOP>(
      new framework::NOP{"empty2", {}, {}, {}}));
  net.CompleteAddOp(true);
  auto new_net_op = net.Clone();
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
+Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
+you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
+Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
+distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
+See the License for the specific language governing permissions and
-   limitations under the License. */
+limitations under the License. */
 #pragma once
 #include "paddle/framework/eigen.h"
@ -63,7 +63,7 @@ class RowwiseAddGradKernel : public framework::OpKernel {
    // https://eigen.tuxfamily.org/dox/unsupported/TensorBase_8h_source.html
    // colwise add
-    Eigen::array<int, 1> dims{{1}}; /* dimension to reduce */
+    Eigen::array<int, 1> dims{{0}}; /* dimension to reduce */
    EigenVector<T>::Flatten(*db).device(place) = OutGrad.sum(dims);
  }
 };
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@ -1,11 +1,8 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
   http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -39,7 +36,8 @@ class CPUUniformRandomKernel : public framework::OpKernel {
    std::uniform_real_distribution<T> dist(
        static_cast<T>(context.op_.GetAttr<float>("min")),
        static_cast<T>(context.op_.GetAttr<float>("max")));
-    for (ssize_t i = 0; i < framework::product(tensor->dims()); ++i) {
+    ssize_t size = framework::product(tensor->dims());
    for (ssize_t i = 0; i < size; ++i) {
      data[i] = dist(engine);
    }
  }
@ -66,7 +64,6 @@ class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddOutput("Out", "The output tensor of uniform random op");
    AddComment(R"DOC(Uniform random operator.
 Used to initialize tensor with uniform random generator.
 )DOC");
    AddAttr<std::vector<int>>("dims", "the dimension of random tensor");
--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
@ -1,11 +1,8 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
   http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@ -65,8 +65,11 @@ public:
  size_t getSize() const { return config_.size(); }
  // True when the PARAMETER_VALUE buffer exists and its size equals the
  // configured parameter size; false when that buffer is absent.
  bool isFullSize() const {
    const auto& valueBuf = bufs_[PARAMETER_VALUE];
    if (!valueBuf) {
      return false;
    }
    return this->getSize() == valueBuf->getSize();
  }
  inline bool useGpu() const { return useGpu_; }
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@ -114,9 +114,6 @@ CUDADeviceContext::~CUDADeviceContext() {
    PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
  }
  if (curand_generator_) {
    PADDLE_ENFORCE(dynload::curandDestroyGenerator(curand_generator_));
  }
  eigen_stream_.reset();
  eigen_device_.reset();
  PADDLE_ENFORCE(cudaStreamDestroy(stream_));
@ -152,19 +149,6 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() {
 cudaStream_t CUDADeviceContext::stream() { return stream_; }
 curandGenerator_t CUDADeviceContext::curand_generator() {
  if (!curand_generator_) {
    SetDeviceId(place_.device);
    PADDLE_ENFORCE(dynload::curandCreateGenerator(&curand_generator_,
                                                  CURAND_RNG_PSEUDO_DEFAULT));
    PADDLE_ENFORCE(
        dynload::curandSetPseudoRandomGeneratorSeed(curand_generator_, seed_));
    PADDLE_ENFORCE(dynload::curandSetStream(curand_generator_, stream_));
  }
  return curand_generator_;
 }
 #endif  // PADDLE_ONLY_CPU
 }  // namespace platform
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@ -17,7 +17,6 @@ limitations under the License. */
 #ifndef PADDLE_ONLY_CPU
 #include "paddle/platform/dynload/cublas.h"
 #include "paddle/platform/dynload/cudnn.h"
 #include "paddle/platform/dynload/curand.h"
 #include "paddle/platform/gpu_info.h"
 #define EIGEN_USE_GPU
 #endif
@ -40,7 +39,7 @@ class DeviceContext {
 class CPUDeviceContext : public DeviceContext {
 public:
  CPUDeviceContext();
-  explicit CPUDeviceContext(CPUPlace);
+  explicit CPUDeviceContext(CPUPlace place);
  virtual ~CPUDeviceContext() {}
  Eigen::DefaultDevice* eigen_device() const;
@ -56,7 +55,7 @@ class EigenCudaStreamDevice;
 class CUDADeviceContext : public DeviceContext {
 public:
-  explicit CUDADeviceContext(GPUPlace);
+  explicit CUDADeviceContext(GPUPlace place);
  virtual ~CUDADeviceContext();
  /*! \brief  Wait for all operations completion in the stream. */
@ -75,9 +74,6 @@ class CUDADeviceContext : public DeviceContext {
  /*! \brief  Return cudnn  handle in the device context. */
  cudnnHandle_t     cudnn_handle();
  /*! \brief  Return curand handle in the device context. */
  curandGenerator_t curand_generator();
  /*! \brief  Return cuda stream in the device context. */
  cudaStream_t      stream();
  // clang-format on
@ -85,18 +81,13 @@ class CUDADeviceContext : public DeviceContext {
 private:
  GPUPlace place_;
 private:
  std::unique_ptr<Eigen::GpuDevice> eigen_device_;
  std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
 private:
  uint64_t seed_;
  // clang-format off
  cudaStream_t       stream_{nullptr};
  cudnnHandle_t      cudnn_handle_{nullptr};
  cublasHandle_t     cublas_handle_{nullptr};
  curandGenerator_t  curand_generator_{nullptr};
  // clang-format on
 };
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
@ -43,8 +43,6 @@ TEST(Device, CUDADeviceContext) {
    ASSERT_NE(nullptr, cudnn_handle);
    cublasHandle_t cublas_handle = device_context->cublas_handle();
    ASSERT_NE(nullptr, cublas_handle);
    curandGenerator_t curand_handle = device_context->curand_generator();
    ASSERT_NE(nullptr, curand_handle);
    ASSERT_NE(nullptr, device_context->stream());
    delete device_context;
  }
--- a/paddle/pserver/ParameterClient2.cpp
+++ b/paddle/pserver/ParameterClient2.cpp
@ -65,7 +65,6 @@ void ParameterClient2::initThreads() {
    LOG(INFO) << "parallel_thread_num dosent need to set";
  }
  syncThreadPool_.reset(new SyncThreadPool(threadNum_));
  startThreads();
 }
@ -224,6 +223,14 @@ void ParameterClient2::prepareSendData(
    request.set_cost(cost);
    request.set_batch_status(batchStatus);
    CHECK_EQ(request.blocks_size(), 0);
    VLOG(10) << "request: trainer_id: " << request.trainer_id()
             << " update_mode" << request.update_mode()
             << " send_back_parameter: " << request.send_back_parameter()
             << " send_back_parameter_type: "
             << request.send_back_parameter_type()
             << " num_samples: " << request.num_samples()
             << " cost: " << request.cost()
             << " batch_status: " << request.batch_status();
  }
  for (const auto& segments : parameterSegments) {
    const auto it = parameterMap_.find(segments.id);
@ -251,11 +258,17 @@ void ParameterClient2::prepareSendData(
      CHECK(sendMat != nullptr) << "sendMat is nullptr";
      syncThreadPool_->exec([&](int tid, size_t numThreads) {
        std::lock_guard<std::mutex> guard(sparseAutoGrowthMutex_);
        const auto& localIndices = prefetchMat->getLocalIndices();
        /// num of sparse rows
        size_t nLocalBlocks = localIndices.size();
        uint64_t beginDim = 0;
        uint64_t endDim = 0;
        // FIXME(typhoonzero): let it resize first
        prefetchMat->getLocalRow(nLocalBlocks + 1);
        sendMat->getLocalRow(nLocalBlocks + 1);
        for (size_t row = 0; row < nLocalBlocks; ++row) {
          int64_t blockId = localIndices[row];  // local row -> sparse row
          int serverId = std::abs((blockId + nameHash) % serviceNum_);
@ -275,7 +288,6 @@ void ParameterClient2::prepareSendData(
          block->set_begin_pos(row * blockSize);
          /// block len
          block->set_block_size(endDim - beginDim);
          if (sendingPara) {
            sendJob->parallelInputIovs[serverId].push_back(
                {sendMat->getLocalRow(row), sizeof(real) * (size_t)blockSize});
--- a/paddle/pserver/ParameterClient2.h
+++ b/paddle/pserver/ParameterClient2.h
@ -583,6 +583,7 @@ protected:
 #ifndef PADDLE_DISABLE_TIMER
  uint64_t forwardbackwordTime_;
 #endif
  std::mutex sparseAutoGrowthMutex_;
  /// map id to parameter used for decoding protobuf data
  std::unordered_map<size_t, ParameterPtr> parameterMap_;
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@ -2232,6 +2232,20 @@ class ClipLayer(LayerBase):
        self.config.inputs[0].clip_conf.max = max
# Config-parser class for the 'scale_shift' layer type: a single-input layer
# whose size equals its input layer's size.
@config_layer('scale_shift')
 class ScaleShiftLayer(LayerBase):
    def __init__(self, name, inputs, bias=True, **xargs):
        # The size is passed as 0 here and set from the input layer below.
        super(ScaleShiftLayer, self).__init__(
            name, 'scale_shift', 0, inputs=inputs, **xargs)
        config_assert(
            len(self.inputs) == 1,
            'ScaleShiftLayer must have one and only one input.')
        input_layer = self.get_input_layer(0)
        self.set_layer_size(input_layer.size)
        # Presumably (input index, parameter size, dims): a single 1x1
        # parameter for input 0 -- TODO confirm against LayerBase.
        self.create_input_parameter(0, 1, [1, 1])
        # Presumably creates a size-1 bias according to `bias` -- TODO confirm.
        self.create_bias_parameter(bias, 1)
 # key: cost type
 # value: cost class
 g_cost_map = {}
--- a/Show More
+++ b/Show More