Merge branch 'develop' of github.com:baidu/Paddle into feature/move_pybind_to_framework_dir

8 years ago · fa5a5a3acf
parent 3fc68f6f14 dd249a50a5
commit fa5a5a3acf
60 changed files with 917 additions and 325 deletions
--- a/2
+++ b/2
@ -27,7 +27,7 @@ RUN apt-get update && \
    git python-pip python-dev openssh-server bison  \
    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
    curl sed grep graphviz libjpeg-dev zlib1g-dev  \
-    python-numpy python-matplotlib gcc g++ \
+    python-numpy python-matplotlib gcc-4.8 g++-4.8 \
    automake locales clang-format-3.8 swig doxygen cmake  \
    liblapack-dev liblapacke-dev libboost-dev \
    clang-3.8 llvm-3.8 libclang-3.8-dev \
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@ -9,6 +9,11 @@ function(CheckCompilerCXX11Flag)
        if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
            message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
        endif()
        # TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem.
        # Use Debug mode instead for now.
        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9) 
            set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE)
        endif()
    elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
        # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
        # Apple Clang is a different compiler than upstream Clang and has different version numbers.
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@ -105,6 +105,11 @@ cross_channel_norm
 ..  autoclass:: paddle.v2.layer.cross_channel_norm
    :noindex:
 row_l2_norm
 -----------
 ..  autoclass:: paddle.v2.layer.row_l2_norm
    :noindex:
 Recurrent Layers
 ================
@ -320,6 +325,11 @@ scaling
 ..  autoclass:: paddle.v2.layer.scaling
    :noindex:
 clip
 ----
 ..  autoclass:: paddle.v2.layer.clip
    :noindex:
 slope_intercept
 ---------------
 ..  autoclass:: paddle.v2.layer.slope_intercept
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@ -1022,6 +1022,15 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
  real alpha = 1.0f;
  real beta = 1.0f;
  cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
  int batch_size = ((cudnn_tensor_descriptor)inputDesc)->batch_size;
  if (batch_size > 1024 && g_cudnn_lib_version < 6000) {
    LOG(INFO) << " To process current batch data with size " << batch_size
              << " (>1024), cudnnBatchNorm requires cuDNN version >= 6000."
              << " If there is an error complaining CUDNN_STATUS_NOT_SUPPORTED,"
              << " just recompile PaddlePaddle with cuDNN >= 6000, replacing"
              << " current version " << g_cudnn_lib_version;
  }
  CHECK_CUDNN(
      dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle,
                                                       mode,
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@ -38,7 +38,7 @@ cc_library(backward SRCS backward.cc DEPS net)
 cc_test(backward_test SRCS backward_test.cc DEPS backward)
 cc_library(paddle_pybind SHARED
    SRCS pybind.cc
-    DEPS pybind python
+    DEPS pybind python backward
 	fc_op
 	sgd_op
 	add_op
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@ -400,6 +400,14 @@ class GradOpRegisterHelper {
    return 0;                                                                  \
  }
 /**
 * Macro to forbid users from registering a gradient operator for __op_type.
 *
 * It expands to a single STATIC_ASSERT_GLOBAL_NAMESPACE invocation whose
 * unique identifier is built from __op_type, so it has to be used in the
 * global namespace.
 * NOTE(review): presumably that identifier is the same one the gradient-op
 * registration macro emits for this op, so using both for one op fails to
 * compile -- confirm against REGISTER_GRADIENT_OP.
 */
 #define NO_GRADIENT(__op_type)                          \
  STATIC_ASSERT_GLOBAL_NAMESPACE(                       \
      __reg_gradient_op__##__op_type##__op_type##_grad, \
      "NO_GRADIENT must be in global namespace")
 /**
 * Macro to Register OperatorKernel.
 */
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@ -20,16 +20,16 @@ namespace paddle {
 namespace framework {
 template <>
-Eigen::DefaultDevice* ExecutionContext::GetEigenDevice<
+Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
    platform::CPUPlace, Eigen::DefaultDevice>() const {
-  return device_context_.get_eigen_device<Eigen::DefaultDevice>();
+  return *device_context_.get_eigen_device<Eigen::DefaultDevice>();
 }
 #ifndef PADDLE_ONLY_CPU
 template <>
-Eigen::GpuDevice*
+Eigen::GpuDevice&
 ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-  return device_context_.get_eigen_device<Eigen::GpuDevice>();
+  return *device_context_.get_eigen_device<Eigen::GpuDevice>();
 }
 #endif
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@ -253,7 +253,7 @@ class ExecutionContext : public OperatorContext {
  template <typename PlaceType,
            typename DeviceType =
                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
-  DeviceType* GetEigenDevice() const;
+  DeviceType& GetEigenDevice() const;
  platform::Place GetPlace() const { return device_context_.GetPlace(); }
--- a/paddle/framework/pybind.cc
+++ b/paddle/framework/pybind.cc
@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@ -16,11 +16,14 @@ limitations under the License. */
 #include <fstream>
 #include <vector>
 #include "paddle/framework/backward.h"
 #include "paddle/framework/net.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"
 #include "paddle/framework/tensor_bind.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
@ -43,6 +46,10 @@ template <typename ClassType>
 void ExposeOperator(ClassType &m) {
  m.def("infer_shape", &ClassType::type::InferShape)
      .def("run", &ClassType::type::Run)
      .def("type",
           [](const typename ClassType::type &op) -> std::string {
             return op.type_;
           })
      .def("outputs",
           [](const typename ClassType::type &op) -> std::vector<std::string> {
             return op.outputs_;
@ -55,6 +62,14 @@ static size_t UniqueIntegerGenerator() {
  return generator.fetch_add(1);
 }
 // Reports whether this binary was compiled with GPU support, i.e. whether
 // PADDLE_ONLY_CPU was left undefined at build time.
 bool IsCompileGPU() {
 #ifndef PADDLE_ONLY_CPU
  constexpr bool kBuiltWithGpu = true;
 #else
  constexpr bool kBuiltWithGpu = false;
 #endif
  return kBuiltWithGpu;
 }
 PYBIND11_PLUGIN(core) {
  py::module m("core", "C++ core of PaddlePaddle");
@ -68,16 +83,29 @@ PYBIND11_PLUGIN(core) {
             self.Resize(make_ddim(dim));
           })
      .def("alloc_float",
-           [](Tensor &self) {
+           [](pd::Tensor &self, paddle::platform::GPUPlace &place) {
-             self.mutable_data<float>(paddle::platform::CPUPlace());
+             self.mutable_data<float>(place);
           })
      .def("alloc_float",
           [](pd::Tensor &self, paddle::platform::CPUPlace &place) {
             self.mutable_data<float>(place);
           })
      .def("alloc_int",
           [](pd::Tensor &self, paddle::platform::CPUPlace &place) {
             self.mutable_data<int>(place);
           })
      .def("alloc_int",
-           [](Tensor &self) {
+           [](pd::Tensor &self, paddle::platform::GPUPlace &place) {
-             self.mutable_data<int>(paddle::platform::CPUPlace());
+             self.mutable_data<int>(place);
           })
-      .def("set", PyTensorSetFromArray<float>)
+      .def("set", paddle::pybind::PyCPUTensorSetFromArray<float>)
-      .def("set", PyTensorSetFromArray<int>)
+      .def("set", paddle::pybind::PyCPUTensorSetFromArray<int>)
-      .def("shape", [](Tensor &self) { return vectorize(self.dims()); });
+#ifndef PADDLE_ONLY_CPU
      .def("set", paddle::pybind::PyCUDATensorSetFromArray<float>)
      .def("set", paddle::pybind::PyCUDATensorSetFromArray<int>)
 #endif
      .def("shape",
           [](pd::Tensor &self) { return pd::vectorize(self.dims()); });
  py::class_<Variable>(m, "Variable", R"DOC(Variable Class.
@ -124,13 +152,29 @@ All parameter, weight, gradient are variables in Paddle.
  m.def_submodule(
       "var_names",
       "The module will return special predefined variable name in Paddle")
-      .def("empty", OperatorBase::EMPTY_VAR_NAME)
+      .def("empty", pd::OperatorBase::EMPTY_VAR_NAME)
-      .def("temp", OperatorBase::TMP_VAR_NAME);
+      .def("temp", pd::OperatorBase::TMP_VAR_NAME);
-
+  // clang-format off
  py::class_<paddle::platform::DeviceContext>(m, "DeviceContext")
-      .def_static("cpu_context", []() -> paddle::platform::DeviceContext * {
+      .def_static("create",
                  [](paddle::platform::CPUPlace& place)
                      -> paddle::platform::DeviceContext* {
                    return new paddle::platform::CPUDeviceContext();
                  })
      .def_static("create",
                  [](paddle::platform::GPUPlace& place)
                      -> paddle::platform::DeviceContext* {
 #ifdef PADDLE_ONLY_CPU
                    PADDLE_THROW("GPUPlace is not supported in CPU device.");
 #else
                    return new paddle::platform::CUDADeviceContext(place);
 #endif
                  });
  // clang-format on
  py::class_<paddle::platform::GPUPlace>(m, "GPUPlace").def(py::init<int>());
  py::class_<paddle::platform::CPUPlace>(m, "CPUPlace").def(py::init<>());
  py::class_<OperatorBase, std::shared_ptr<OperatorBase>> operator_base(
      m, "Operator");
@ -144,6 +188,13 @@ All parameter, weight, gradient are variables in Paddle.
                   desc.InitializationErrorString());
    return OpRegistry::CreateOp(desc);
  });
  operator_base.def("backward",
                    [](const pd::OperatorBase &forwardOp,
                       const std::unordered_set<std::string> &no_grad_vars) {
                      return pd::Backward(forwardOp, no_grad_vars);
                    });
  ExposeOperator(operator_base);
  py::class_<NetOp, std::shared_ptr<NetOp>> net(m, "Net");
@ -166,6 +217,8 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("unique_integer", UniqueIntegerGenerator);
  m.def("is_compile_gpu", IsCompileGPU);
  return m.ptr();
 }
 }  // namespace framework
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@ -165,4 +165,4 @@ class Tensor {
 }  // namespace framework
 }  // namespace paddle
-#include "paddle/framework/detail/tensor-inl.h"
+#include "paddle/framework/tensor_impl.h"
--- a/paddle/framework/tensor_bind.h
+++ b/paddle/framework/tensor_bind.h
@ -13,9 +13,11 @@
   limitations under the License. */
 #pragma once
-#include <paddle/framework/tensor.h>
+#include <string>
-#include <pybind11/numpy.h>
+#include "paddle/framework/tensor.h"
-#include <pybind11/pybind11.h>
+#include "paddle/memory/memcpy.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 namespace py = pybind11;
@ -40,9 +42,6 @@ template <size_t I, typename... ARGS>
 struct CastToPyBufferImpl<true, I, ARGS...> {
  using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
  py::buffer_info operator()(framework::Tensor &tensor) {
    PADDLE_ENFORCE(paddle::platform::is_cpu_place(tensor.holder_->place()),
                   "Only CPU tensor can cast to numpy array");
    if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) {
      auto dim_vec = framework::vectorize(tensor.dims());
      std::vector<size_t> dims_outside;
@ -56,11 +55,16 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
        strides[i - 1] = sizeof(CUR_TYPE) * prod;
        prod *= dims_outside[i - 1];
      }
-
+      framework::Tensor dst_tensor;
      if (paddle::platform::is_gpu_place(tensor.holder_->place())) {
        dst_tensor.CopyFrom<CUR_TYPE>(tensor, platform::CPUPlace());
      } else if (paddle::platform::is_cpu_place(tensor.holder_->place())) {
        dst_tensor = tensor;
      }
      return py::buffer_info(
-          tensor.mutable_data<CUR_TYPE>(tensor.holder_->place()),
+          dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.holder_->place()),
          sizeof(CUR_TYPE), py::format_descriptor<CUR_TYPE>::format(),
-          (size_t)framework::arity(tensor.dims()), dims_outside, strides);
+          (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
    } else {
      constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
      return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
@ -74,9 +78,10 @@ inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
 }
 template <typename T>
-void PyTensorSetFromArray(
+void PyCPUTensorSetFromArray(
    framework::Tensor &self,
-    py::array_t<T, py::array::c_style | py::array::forcecast> array) {
+    py::array_t<T, py::array::c_style | py::array::forcecast> array,
    paddle::platform::CPUPlace &place) {
  std::vector<int> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
@ -84,9 +89,28 @@ void PyTensorSetFromArray(
  }
  self.Resize(framework::make_ddim(dims));
-  auto *dst = self.mutable_data<T>(paddle::platform::CPUPlace());
+  auto *dst = self.mutable_data<T>(place);
  std::memcpy(dst, array.data(), sizeof(T) * array.size());
 }
 #ifndef PADDLE_ONLY_CPU
 /**
 * Resize `self` to the shape of `array`, allocate its storage on the GPU
 * `place`, and fill it from the array with a synchronous host-to-device copy.
 */
 template <typename T>
 void PyCUDATensorSetFromArray(
    framework::Tensor &self,
    py::array_t<T, py::array::c_style | py::array::forcecast> array,
    paddle::platform::GPUPlace &place) {
  // Gather the array extents as ints, one entry per dimension.
  std::vector<int> shape;
  shape.reserve(array.ndim());
  for (size_t axis = 0; axis < array.ndim(); ++axis) {
    shape.push_back(static_cast<int>(array.shape()[axis]));
  }
  self.Resize(framework::make_ddim(shape));

  // Allocate after the resize so the buffer matches the new dimensions.
  T *device_ptr = self.mutable_data<T>(place);
  paddle::platform::GpuMemcpySync(device_ptr, array.data(),
                                  sizeof(T) * array.size(),
                                  cudaMemcpyHostToDevice);
 }
 #endif
 }  // namespace pybind
 }  // namespace paddle
--- a/paddle/framework/detail/tensor-inl.h
+++ b/paddle/framework/detail/tensor-inl.h
@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include "paddle/memory/memcpy.h"
 namespace paddle {
@ -62,9 +61,11 @@ inline T* Tensor::mutable_data(platform::Place place) {
    if (platform::is_cpu_place(place)) {
      holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
          boost::get<platform::CPUPlace>(place), size));
    } else if (platform::is_gpu_place(place)) {
 #ifdef PADDLE_ONLY_CPU
      PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
    }
-#ifndef PADDLE_ONLY_CPU
+#else
    else if (platform::is_gpu_place(place)) {
      holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
          boost::get<platform::GPUPlace>(place), size));
    }
--- a/paddle/function/ConvOp.h
+++ b/paddle/function/ConvOp.h
@ -109,6 +109,13 @@ protected:
    return filter[filter.ndims() - 1];
  }
  // im2col can be skipped only for a 1x1 filter applied with stride 1 and no
  // padding; any deviation from that configuration requires it.
  inline bool isNeedIm2col(const TensorShape& filter) const {
    return getFilterHeight(filter) != 1 || getFilterWidth(filter) != 1 ||
           strideH() != 1 || strideW() != 1 || paddingH() != 0 ||
           paddingW() != 0;
  }
  std::vector<size_t> strides_;
  std::vector<size_t> paddings_;
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@ -66,16 +66,23 @@ public:
    real* inputData = inputs[0].data<real>();
    real* filterData = inputs[1].data<real>();
    real* outputData = outputs[0].data<real>();
    bool needIm2col = isNeedIm2col(filter);
    TensorShape imShape =
        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-    TensorShape colShape = TensorShape({inputChannels / groups_,
+
    TensorShape colShape;
    real* colData = NULL;
    if (needIm2col) {
      colShape = TensorShape({inputChannels / groups_,
                              filterHeight,
                              filterWidth,
                              outputHeight,
                              outputWidth});
      resizeBuffer<Device>(colShape.getElements());
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
    }
    Im2ColFunctor<kCFO, Device, real> im2col;
    GemmFunctor<Device, real> gemm;
@ -86,6 +93,7 @@ public:
    for (size_t i = 0; i < batchSize; i++) {
      for (size_t g = 0; g < groups_; g++) {
        if (needIm2col) {
          im2col(inputData + g * inputOffset,
                 imShape,
                 colData,
@ -94,7 +102,9 @@ public:
                 strideW(),
                 paddingH(),
                 paddingW());
-
+        } else {
          colData = inputData + g * inputOffset;
        }
        int M = outputChannels / groups_;
        int N = outputHeight * outputWidth;
        int K = inputChannels / groups_ * filterHeight * filterWidth;
@ -159,19 +169,27 @@ public:
    real* outputGrad = inputs[0].data<real>();
    real* filterData = inputs[1].data<real>();
    real* inputGrad = outputs[0].data<real>();
    bool needIm2col = isNeedIm2col(filter);
    TensorShape imShape =
        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-    TensorShape colShape = TensorShape({inputChannels / groups_,
+
    TensorShape colShape;
    real* colData = NULL;
    if (needIm2col) {
      colShape = TensorShape({inputChannels / groups_,
                              filterHeight,
                              filterWidth,
                              outputHeight,
                              outputWidth});
      resizeBuffer<Device>(colShape.getElements());
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
    }
    Col2ImFunctor<kCFO, Device, real> col2im;
    GemmFunctor<Device, real> gemm;
    size_t inputOffset = imShape.getElements();
    size_t outputOffset =
        (outputChannels / groups_) * outputHeight * outputWidth;
@ -182,6 +200,11 @@ public:
        int K = outputChannels / groups_;
        int N = outputHeight * outputWidth;
        int M = inputChannels / groups_ * filterHeight * filterWidth;
        real scale = 0.0f;
        if (!needIm2col) {
          colData = inputGrad + g * inputOffset;
          scale = 1.0f;
        }
        gemm(CblasTrans,
             CblasNoTrans,
             M,
@ -192,9 +215,10 @@ public:
             M,
             outputGrad + g * outputOffset,
             N,
-             0.0f,
+             scale,
             colData,
             N);
        if (needIm2col) {
          col2im(inputGrad + g * inputOffset,
                 imShape,
                 colData,
@ -204,6 +228,7 @@ public:
                 paddingH(),
                 paddingW());
        }
      }
      inputGrad += inputChannels * inputHeight * inputWidth;
      outputGrad += outputChannels * outputHeight * outputWidth;
    }
@ -255,16 +280,23 @@ public:
    real* outputGrad = inputs[0].data<real>();
    real* inputData = inputs[1].data<real>();
    real* filterGrad = outputs[0].data<real>();
    bool needIm2col = isNeedIm2col(filter);
    TensorShape imShape =
        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-    TensorShape colShape = TensorShape({inputChannels / groups_,
+
    TensorShape colShape;
    real* colData = NULL;
    if (needIm2col) {
      colShape = TensorShape({inputChannels / groups_,
                              filterHeight,
                              filterWidth,
                              outputHeight,
                              outputWidth});
      resizeBuffer<Device>(colShape.getElements());
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
    }
    Im2ColFunctor<kCFO, Device, real> im2col;
    GemmFunctor<Device, real> gemm;
@ -274,6 +306,7 @@ public:
    size_t filterOffset = filter.getElements() / groups_;
    for (size_t i = 0; i < batchSize; i++) {
      for (size_t g = 0; g < groups_; g++) {
        if (needIm2col) {
          im2col(inputData + g * inputOffset,
                 imShape,
                 colData,
@ -282,7 +315,9 @@ public:
                 strideW(),
                 paddingH(),
                 paddingW());
-
+        } else {
          colData = inputData + g * inputOffset;
        }
        int M = outputChannels / groups_;
        int K = outputHeight * outputWidth;
        int N = inputChannels / groups_ * filterHeight * filterWidth;
--- a/paddle/gserver/layers/ClipLayer.cpp
+++ b/paddle/gserver/layers/ClipLayer.cpp
@ -0,0 +1,79 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "Layer.h"
 namespace paddle {
 /**
 * Layer that clamps every element of its single input into a fixed range:
 * \f[
 *   out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right)
 * \f]
 * The two bounds are read from the input's clip configuration in init().
 */
 class ClipLayer : public Layer {
 public:
  explicit ClipLayer(const LayerConfig& config) : Layer(config) {}

  bool init(const LayerMap& layerMap,
            const ParameterMap& parameterMap) override;

  void forward(PassType passType) override;
  void backward(const UpdateCallback& callback = nullptr) override;

 protected:
  // Lower clipping bound (p1 in the formula above).
  double min_;
  // Upper clipping bound (p2 in the formula above).
  double max_;
 };

 REGISTER_LAYER(clip, ClipLayer);
 /**
 * Initialize the base layer and load the clipping bounds from the clip
 * configuration attached to the layer's single input.
 *
 * @param layerMap      forwarded unchanged to Layer::init
 * @param parameterMap  forwarded unchanged to Layer::init
 * @return always true; a wrong input count or min >= max trips a CHECK_*.
 */
 bool ClipLayer::init(const LayerMap& layerMap,
                     const ParameterMap& parameterMap) {
  Layer::init(layerMap, parameterMap);
  CHECK_EQ(inputLayers_.size(), 1U);
  // Bind by const reference: a plain `auto` would copy the whole config
  // message just to read two scalars from it.
  const auto& layerConf = config_.inputs(0).clip_conf();
  min_ = layerConf.min();
  max_ = layerConf.max();
  CHECK_LT(min_, max_);
  return true;
 }
 /**
 * Forward pass: size the output like the input, copy the input into it and
 * clip the copy in place with the configured bounds.
 */
 void ClipLayer::forward(PassType passType) {
  Layer::forward(passType);

  MatrixPtr input = getInputValue(0);
  const auto rows = input->getHeight();
  const auto cols = input->getWidth();
  resetOutput(rows, cols);

  // The input matrix itself is left untouched; only the copy is clipped.
  MatrixPtr output = getOutputValue();
  output->copyFrom(*input);
  output->clip(min_, max_);
 }
 void ClipLayer::backward(const UpdateCallback& callback) {
  MatrixPtr inV = getInputValue(0);
  MatrixPtr inG = getInputGrad(0);
  if (inG) {
    MatrixPtr outV = getOutputValue();
    MatrixPtr outG = getOutputGrad();
    MatrixPtr tmpMtx;
    Matrix::resizeOrCreate(
        tmpMtx, outG->getHeight(), outG->getWidth(), false, useGpu_);
    tmpMtx->clipDerivative(*inV, min_, max_);
    inG->addDotMul(*outG, *tmpMtx, 1, 1);
  }
 }
 }  // namespace paddle
--- a/paddle/gserver/layers/RowL2NormLayer.cpp
+++ b/paddle/gserver/layers/RowL2NormLayer.cpp
@ -0,0 +1,98 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "Layer.h"
 namespace paddle {
 /**
 * Layer that rescales every row of its input to unit L2 norm:
 * \f[
 *   out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}}
 * \f]
 * Both \f$in\f$ and \f$out\f$ have size (batchSize x dataDim).
 */
 class RowL2NormLayer : public Layer {
 public:
  explicit RowL2NormLayer(const LayerConfig& config) : Layer(config) {}

  bool init(const LayerMap& layerMap,
            const ParameterMap& parameterMap) override;

  void forward(PassType passType) override;
  void backward(const UpdateCallback& callback = nullptr) override;

 protected:
  // (batchSize x dataDim) buffer: squared input in forward(), reused as
  // scratch space in backward().
  MatrixPtr inSquare_;
  // (batchSize x 1) buffer filled in forward() and read again in backward().
  MatrixPtr l2NormReciprocal_;
  // (batchSize x 1) scratch buffer used by backward().
  MatrixPtr dotSum_;
 };

 REGISTER_LAYER(row_l2_norm, RowL2NormLayer);
 /**
 * Run the base Layer initialization and verify that the layer is wired to
 * exactly one input. Always returns true; a wrong input count trips CHECK_EQ.
 */
 bool RowL2NormLayer::init(const LayerMap& layerMap,
                          const ParameterMap& parameterMap) {
  // NOTE(review): the result of Layer::init is not inspected here.
  Layer::init(layerMap, parameterMap);
  CHECK_EQ(inputLayers_.size(), 1U);
  return true;
 }
 /**
 * Forward pass: compute the reciprocal of each row's L2 norm and scale the
 * input rows by it.
 * NOTE(review): the per-step comments below assume the usual meaning of the
 * Matrix helpers (square2, rowSum, sqrt2, scalarDiv, rowScale) -- confirm.
 */
 void RowL2NormLayer::forward(PassType passType) {
  Layer::forward(passType);

  MatrixPtr inV = getInputValue(0);
  const size_t numRows = inV->getHeight();
  const size_t dataDim = getSize();
  CHECK_EQ(dataDim, inV->getWidth());

  /* (re)allocate the output and both work buffers if necessary */
  resetOutput(numRows, dataDim);
  Matrix::resizeOrCreate(inSquare_, numRows, dataDim, false, useGpu_);
  Matrix::resizeOrCreate(l2NormReciprocal_, numRows, 1, false, useGpu_);
  auto& rowNormInv = *l2NormReciprocal_;

  // Squared entries -> row sums -> square root -> 1.0 / value.
  inV->square2(*inSquare_);
  inSquare_->rowSum(rowNormInv);
  rowNormInv.sqrt2(rowNormInv);
  rowNormInv.scalarDiv(rowNormInv, 1.0);

  // Scale every input row by its reciprocal norm.
  getOutputValue()->rowScale(0, *inV, rowNormInv);
 }
 void RowL2NormLayer::backward(const UpdateCallback& callback) {
  MatrixPtr inV = getInputValue(0);
  MatrixPtr inG = getInputGrad(0);
  MatrixPtr outV = getOutputValue();
  MatrixPtr outG = getOutputGrad();
  size_t batchSize = inV->getHeight();
  // inG[ij] += outG[ij] / l2NormReciprocal
  // inG[ij] += -inV[ij] * l2NormReciprocal * l2NormReciprocal * DotMul(outG[i],
  // inV[i])
  if (inG) {
    Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_);
    dotSum_->zeroMem();
    dotSum_->rowDotMul(0, *outG, *outV);
    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
    inSquare_->rowScale(0, *inV, *dotSum_);
    inG->sub(*inSquare_);
    inG->addRowScale(0, *outG, *l2NormReciprocal_);
  }
 }
 }  // namespace paddle
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@ -1899,6 +1899,36 @@ TEST(Layer, CropLayer) {
  }
 }
 // Gradient check of the "clip" layer on CPU and GPU with a random clipping
 // range drawn from [0, 1].
 TEST(Layer, ClipLayer) {
  const size_t batchSize = 128;
  const size_t size = 512;
  TestConfig config;
  config.layerConfig.set_type("clip");
  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
  LayerInputConfig* input = config.layerConfig.add_inputs();
  ClipConfig* layerConf = input->mutable_clip_conf();
  const double p1 = std::rand() / static_cast<double>(RAND_MAX);
  const double p2 = std::rand() / static_cast<double>(RAND_MAX);
  const double lower = std::min(p1, p2);
  double upper = std::max(p1, p2);
  // ClipLayer::init enforces min < max strictly, so two equal draws must not
  // reach the layer as an empty range.
  if (!(lower < upper)) {
    upper = lower + 1.0;
  }
  layerConf->set_min(lower);
  layerConf->set_max(upper);
  for (auto useGpu : {false, true}) {
    testLayerGrad(config, "clip", batchSize, false, useGpu, false);
  }
 }
 // Gradient check of the "row_l2_norm" layer on CPU and GPU.
 TEST(Layer, RowL2NormLayer) {
  const size_t kBatchSize = 128;
  const size_t kWidth = 512;

  TestConfig config;
  // Layer side: type, output width and one (unconfigured) input slot.
  config.layerConfig.set_type("row_l2_norm");
  config.layerConfig.set_size(kWidth);
  config.layerConfig.add_inputs();
  // Data side: a single dense input of the same width.
  config.inputDefs.push_back({INPUT_DATA, "input", kWidth, 0});

  for (bool useGpu : {false, true}) {
    testLayerGrad(config, "row_l2_norm", kBatchSize, false, useGpu, false);
  }
 }
 int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  initMain(argc, argv);
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@ -442,6 +442,12 @@ DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, TWO_PARAMETER,
 template<class T>
 void BaseMatrixT<T>::clip(T p1, T p2) { applyUnary(unary::Clip<T>(p1, p2)); }
 // Element-wise op: a becomes 0 where b < p1 or b > p2, and 1 otherwise.
 DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, TWO_PARAMETER,
                                   a = b < p1 ? 0 : (b > p2 ? 0 : 1));

 // Applies the ClipDerivative op with bounds (p1, p2) to this matrix and b.
 template<class T>
 void BaseMatrixT<T>::clipDerivative(BaseMatrixT& b, T p1, T p2) {
  binary::ClipDerivative<T> derivativeOp(p1, p2);
  applyBinary(derivativeOp, b);
 }
 DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, ONE_PARAMETER,
                                 a = a > p ? 1.0f : 0.0f);
 template<class T>
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@ -488,6 +488,13 @@ public:
   */
  void clip(T p1, T p2);
  /**
   * Derivative mask of clip(p1, p2), evaluated element-wise on b:
   *
   * @code
   * this = (b < p1 || b > p2) ? 0 : 1
   * @endcode
   *
   * @param b   matrix the mask is computed from
   * @param p1  lower clipping bound
   * @param p2  upper clipping bound
   */
  void clipDerivative(BaseMatrixT& b, T p1, T p2);
  /**
   * @code
   * a = a > p ? 1.0f : 0.0f
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@ -60,10 +60,5 @@ op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)
 op_library(fc_op
    SRCS fc_op.cc
    DEPS mul_op rowwise_add_op sigmoid_op softmax_op net)
-
+op_library(recurrent_op SRCS recurrent_op.cc DEPS op_desc tensor op_registry operator net)
-op_library(recurrent_network_op
+cc_test(recurrent_op_test SRCS recurrent_op_test.cc DEPS recurrent_op gtest mul_op add_op)
    SRCS recurrent_network_op.cc
    DEPS op_desc tensor net)
 cc_test(recurrent_network_op_test
    SRCS recurrent_network_op_test.cc
    DEPS recurrent_network_op mul_op add_op)
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
@ -50,10 +50,6 @@ The equation is: Out = X + Y
 // Gradient operator class for the add op.
 class AddOpGrad : public OperatorWithKernel {
 protected:
  // Intentionally empty: no shape inference is performed here.
  void InferShape(const InferShapeContext &ctx) const override {}
  // Logs the operator name at INFO level and returns an empty string.
  std::string DebugString() const override {
    LOG(INFO) << "AddOpGrad";
    return "";
  }
 };
 }  // namespace operators
--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
@ -1,3 +1,4 @@
 #define EIGEN_USE_GPU
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/add_op.h"
--- a/paddle/operators/add_op.h
+++ b/paddle/operators/add_op.h
@ -28,10 +28,13 @@ public:
    output->mutable_data<T>(context.GetPlace());
-    EigenVector<T>::Flatten(*output).device(
+    auto X = EigenVector<T>::Flatten(*input0);
-        *(context.GetEigenDevice<Place>())) =
+    auto Y = EigenVector<T>::Flatten(*input1);
-        framework::EigenVector<T>::Flatten(*input0) +
+    auto Z = EigenVector<T>::Flatten(*output);
-        framework::EigenVector<T>::Flatten(*input1);
+
    auto place = context.GetEigenDevice<Place>();
    Z.device(place) = X + Y;
  }
 };
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@ -1,3 +1,4 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/cross_entropy_op.h"
 REGISTER_OP_GPU_KERNEL(onehot_cross_entropy,
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@ -33,13 +33,23 @@ public:
  MeanOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of mean op");
-    AddOutput("Out", "The output of mean op");
+    AddOutput("Out", "The output of mean op").IgnoreGradient();
    AddComment("Mean Operator");
  }
 };
 // Gradient operator of "mean": its only shape rule is that the gradient
 // w.r.t. X has the same dimensions as X.
 class MeanGradOp : public OperatorWithKernel {
 protected:
  void InferShape(const InferShapeContext &ctx) const override {
    // Output slot name is "X" followed by the gradient-variable suffix.
    auto *inputGrad = ctx.Output<Tensor>("X" + GRAD_VAR_SUFFIX());
    inputGrad->Resize(ctx.Input<Tensor>("X")->dims());
  }
 };
 }  // namespace operators
 }  // namespace paddle
 // Register the forward "mean" operator together with its proto maker.
 REGISTER_OP(mean, ops::MeanOp, ops::MeanOpMaker);
 // CPU float kernel for the forward operator.
 REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel<ops::CPUPlace, float>);
 // Register "mean_grad" (ops::MeanGradOp) as the gradient operator of "mean".
 REGISTER_GRADIENT_OP(mean, mean_grad, ops::MeanGradOp);
 // CPU float kernel for the gradient operator.
 REGISTER_OP_CPU_KERNEL(mean_grad, ops::MeanGradKernel<ops::CPUPlace, float>);
--- a/Show More
+++ b/Show More