Merge branch 'develop' into add-ClipLayer

8 years ago · b7b956f0ba
parent 5d644994d3 5cb29a8fbf
commit b7b956f0ba
39 changed files with 381 additions and 96 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -17,10 +17,14 @@
    -   id: detect-private-key
        files: (?!.*third_party)^.*$ | (?!.*book)^.*$
    -   id: end-of-file-fixer
-   repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
-    sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
+-   repo: local
    hooks:
-    -   id: clang-formater
+    -   id: clang-format
+        name: clang-format
+        description: Format files with ClangFormat.
+        entry: clang-format -i
+        language: system
+        files: \.(c|cc|cxx|cpp|h|hpp|hxx)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
    sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
    hooks:
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@ -9,6 +9,11 @@ function(CheckCompilerCXX11Flag)
        if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
            message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
        endif()
+        # TODO(qijun): GCC 4.9 and later versions trigger a SEGV because of an optimization problem.
+        # Force the Debug build type as a workaround for now.
+        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9) 
+            set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE)
+        endif()
    elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
        # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
        # Apple Clang is a different compiler from upstream Clang and uses different version numbers.
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@ -105,6 +105,11 @@ cross_channel_norm
 ..  autoclass:: paddle.v2.layer.cross_channel_norm
    :noindex:

+row_l2_norm
+-----------
+..  autoclass:: paddle.v2.layer.row_l2_norm
+    :noindex:
+    
 Recurrent Layers
 ================

--- a/paddle/framework/detail/tensor-inl.h
+++ b/paddle/framework/detail/tensor-inl.h
@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-
 #include "paddle/memory/memcpy.h"

 namespace paddle {
@ -62,9 +61,11 @@ inline T* Tensor::mutable_data(platform::Place place) {
    if (platform::is_cpu_place(place)) {
      holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
          boost::get<platform::CPUPlace>(place), size));
+    } else if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_ONLY_CPU
+      PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
    }
-#ifndef PADDLE_ONLY_CPU
-    else if (platform::is_gpu_place(place)) {
+#else
      holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
          boost::get<platform::GPUPlace>(place), size));
    }
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@ -20,16 +20,16 @@ namespace paddle {
 namespace framework {

 template <>
-Eigen::DefaultDevice* ExecutionContext::GetEigenDevice<
+Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
    platform::CPUPlace, Eigen::DefaultDevice>() const {
-  return device_context_.get_eigen_device<Eigen::DefaultDevice>();
+  return *device_context_.get_eigen_device<Eigen::DefaultDevice>();
 }

 #ifndef PADDLE_ONLY_CPU
 template <>
-Eigen::GpuDevice*
+Eigen::GpuDevice&
 ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-  return device_context_.get_eigen_device<Eigen::GpuDevice>();
+  return *device_context_.get_eigen_device<Eigen::GpuDevice>();
 }
 #endif

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@ -253,7 +253,7 @@ class ExecutionContext : public OperatorContext {
  template <typename PlaceType,
            typename DeviceType =
                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
-  DeviceType* GetEigenDevice() const;
+  DeviceType& GetEigenDevice() const;

  platform::Place GetPlace() const { return device_context_.GetPlace(); }

--- a/paddle/gserver/layers/RowL2NormLayer.cpp
+++ b/paddle/gserver/layers/RowL2NormLayer.cpp
@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer that applies L2 normalization to each row of its input,
+ * \f[
+ *   out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}}
+ * \f]
+ * where \f$in\f$ denotes one row of the (batchSize x dataDim) input matrix,
+ * so \f$N\f$ = dataDim, and the output matrix is also (batchSize x dataDim).
+ */
+
+class RowL2NormLayer : public Layer {
+protected:
+  MatrixPtr inSquare_;
+  MatrixPtr l2NormReciprocal_;
+  MatrixPtr dotSum_;
+
+public:
+  explicit RowL2NormLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(row_l2_norm, RowL2NormLayer);
+
+bool RowL2NormLayer::init(const LayerMap& layerMap,
+                          const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 1U);
+
+  return true;
+}
+
+void RowL2NormLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV = getInputValue(0);
+
+  /* malloc memory for the output_ if necessary */
+  size_t batchSize = inV->getHeight();
+  size_t dataDim = getSize();
+  CHECK_EQ(dataDim, inV->getWidth());
+  resetOutput(batchSize, dataDim);
+  MatrixPtr outV = getOutputValue();
+
+  Matrix::resizeOrCreate(inSquare_, batchSize, dataDim, false, useGpu_);
+  inV->square2(*inSquare_);
+  Matrix::resizeOrCreate(l2NormReciprocal_, batchSize, 1, false, useGpu_);
+  inSquare_->rowSum(*l2NormReciprocal_);
+  l2NormReciprocal_->sqrt2(*l2NormReciprocal_);
+  l2NormReciprocal_->scalarDiv(*l2NormReciprocal_, 1.0);
+  outV->rowScale(0, *inV, *l2NormReciprocal_);
+}
+
+void RowL2NormLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr outV = getOutputValue();
+  MatrixPtr outG = getOutputGrad();
+  size_t batchSize = inV->getHeight();
+
+  // inG[ij] += outG[ij] * l2NormReciprocal[i]
+  // inG[ij] += -inV[ij] * l2NormReciprocal[i] * l2NormReciprocal[i] *
+  //            DotMul(outG[i], outV[i])
+  if (inG) {
+    Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_);
+    dotSum_->zeroMem();
+    dotSum_->rowDotMul(0, *outG, *outV);
+    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
+    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
+    inSquare_->rowScale(0, *inV, *dotSum_);
+    inG->sub(*inSquare_);
+    inG->addRowScale(0, *outG, *l2NormReciprocal_);
+  }
+}
+
+}  // namespace paddle
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@ -1916,6 +1916,19 @@ TEST(Layer, ClipLayer) {
  }
 }

+TEST(Layer, RowL2NormLayer) {
+  const size_t batchSize = 128;
+  const size_t size = 512;
+  TestConfig config;
+  config.layerConfig.set_type("row_l2_norm");
+  config.layerConfig.set_size(size);
+  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+  config.layerConfig.add_inputs();
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "row_l2_norm", batchSize, false, useGpu, false);
+  }
+}
+
 int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  initMain(argc, argv);
--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
@ -1,3 +1,4 @@
+#define EIGEN_USE_GPU
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/add_op.h"

--- a/paddle/operators/add_op.h
+++ b/paddle/operators/add_op.h
@ -28,8 +28,7 @@ public:

    output->mutable_data<T>(context.GetPlace());

-    EigenVector<T>::Flatten(*output).device(
-        *(context.GetEigenDevice<Place>())) =
+    EigenVector<T>::Flatten(*output).device(context.GetEigenDevice<Place>()) =
        framework::EigenVector<T>::Flatten(*input0) +
        framework::EigenVector<T>::Flatten(*input1);
  }
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@ -1,3 +1,4 @@
+#define EIGEN_USE_GPU
 #include "paddle/operators/cross_entropy_op.h"

 REGISTER_OP_GPU_KERNEL(onehot_cross_entropy,
--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
@ -27,7 +27,7 @@ public:

    output->mutable_data<T>(context.GetPlace());

-    EigenScalar<T>::From(*output).device(*(context.GetEigenDevice<Place>())) =
+    EigenScalar<T>::From(*output).device(context.GetEigenDevice<Place>()) =
        EigenVector<T>::Flatten(*input).mean();
  }
 };
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
@ -12,6 +12,7 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

+#define EIGEN_USE_GPU
 #include "paddle/operators/mul_op.h"

 REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<ops::GPUPlace, float>);
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@ -29,7 +29,7 @@ public:
    auto output = context.Output<Tensor>(0);
    output->mutable_data<T>(context.GetPlace());

-    EigenMatrix<T>::From(*output).device(*(context.GetEigenDevice<Place>())) =
+    EigenMatrix<T>::From(*output).device(context.GetEigenDevice<Place>()) =
        EigenMatrix<T>::From(*context.Input<Tensor>("X"))
            .contract(EigenMatrix<T>::From(*context.Input<Tensor>("Y")),
                      dim_pair);
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
@ -1,3 +1,4 @@
+#define EIGEN_USE_GPU
 #include "paddle/operators/rowwise_add_op.h"

 REGISTER_OP_GPU_KERNEL(rowwise_add,
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@ -33,7 +33,7 @@ public:
    const int rest_size = input.size() / bias_size;
    Eigen::DSizes<int, 1> one_d(input.size());
    Eigen::DSizes<int, 1> bcast(rest_size);
-    output.reshape(one_d).device(*(context.GetEigenDevice<Place>())) =
+    output.reshape(one_d).device(context.GetEigenDevice<Place>()) =
        input.reshape(one_d) + bias.broadcast(bcast).reshape(one_d);
  }
 };
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
@ -1,3 +1,4 @@
+#define EIGEN_USE_GPU
 #include "paddle/operators/sgd_op.h"

 REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel<ops::GPUPlace, float>);
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@ -29,7 +29,7 @@ public:

    param_out->mutable_data<T>(ctx.GetPlace());

-    EigenVector<T>::Flatten(*param_out).device(*(ctx.GetEigenDevice<Place>())) =
+    EigenVector<T>::Flatten(*param_out).device(ctx.GetEigenDevice<Place>()) =
        EigenVector<T>::Flatten(*param) - lr * EigenVector<T>::Flatten(*grad);
  }
 };
--- a/paddle/operators/sigmoid_op.cu
+++ b/paddle/operators/sigmoid_op.cu
@ -1,3 +1,4 @@
+#define EIGEN_USE_GPU
 #include "paddle/operators/sigmoid_op.h"

 REGISTER_OP_GPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::GPUPlace, float>);
--- a/paddle/operators/sigmoid_op.h
+++ b/paddle/operators/sigmoid_op.h
@ -27,8 +27,7 @@ public:
    auto output = context.Output<Tensor>(0);
    output->mutable_data<T>(context.GetPlace());

-    EigenVector<T>::Flatten(*output).device(
-        *(context.GetEigenDevice<Place>())) =
+    EigenVector<T>::Flatten(*output).device(context.GetEigenDevice<Place>()) =
        1.0 / (1.0 + (-1.0 * EigenVector<T>::Flatten(*input)).exp());
  }
 };
--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu
@ -1,3 +1,4 @@
+#define EIGEN_USE_GPU
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/softmax_op.h"

--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@ -46,9 +46,9 @@ public:
                               .reshape(batch_by_one)
                               .broadcast(one_by_class));

-    softmax.device(*(context.GetEigenDevice<Place>())) = shifted_logits.exp();
+    softmax.device(context.GetEigenDevice<Place>()) = shifted_logits.exp();

-    softmax.device(*(context.GetEigenDevice<Place>())) =
+    softmax.device(context.GetEigenDevice<Place>()) =
        (softmax *
         softmax.sum(along_class)
             .inverse()
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@ -148,7 +148,7 @@ inline void throw_on_error(T e) {
  do {                                                                 \
    throw ::paddle::platform::EnforceNotMet(                           \
        std::make_exception_ptr(                                       \
-            std::runtime_error(string::Sprintf(__VA_ARGS__))), \
+            std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \
        __FILE__, __LINE__);                                           \
  } while (0)

--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@ -20,6 +20,8 @@ limitations under the License. */
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/place.h"
 #include "paddle/pybind/tensor_bind.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
@ -55,6 +57,14 @@ static size_t UniqueIntegerGenerator() {
  return generator.fetch_add(1);
 }

+bool IsCompileGPU() {
+#ifdef PADDLE_ONLY_CPU
+  return false;
+#else
+  return true;
+#endif
+}
+
 PYBIND11_PLUGIN(core) {
  py::module m("core", "C++ core of PaddlePaddle");

@ -69,15 +79,27 @@ PYBIND11_PLUGIN(core) {
             self.Resize(pd::make_ddim(dim));
           })
      .def("alloc_float",
-           [](pd::Tensor& self) {
-             self.mutable_data<float>(paddle::platform::CPUPlace());
+           [](pd::Tensor& self, paddle::platform::GPUPlace& place) {
+             self.mutable_data<float>(place);
+           })
+      .def("alloc_float",
+           [](pd::Tensor& self, paddle::platform::CPUPlace& place) {
+             self.mutable_data<float>(place);
+           })
+      .def("alloc_int",
+           [](pd::Tensor& self, paddle::platform::CPUPlace& place) {
+             self.mutable_data<int>(place);
           })
      .def("alloc_int",
-           [](pd::Tensor& self) {
-             self.mutable_data<int>(paddle::platform::CPUPlace());
+           [](pd::Tensor& self, paddle::platform::GPUPlace& place) {
+             self.mutable_data<int>(place);
           })
-      .def("set", paddle::pybind::PyTensorSetFromArray<float>)
-      .def("set", paddle::pybind::PyTensorSetFromArray<int>)
+      .def("set", paddle::pybind::PyCPUTensorSetFromArray<float>)
+      .def("set", paddle::pybind::PyCPUTensorSetFromArray<int>)
+#ifndef PADDLE_ONLY_CPU
+      .def("set", paddle::pybind::PyCUDATensorSetFromArray<float>)
+      .def("set", paddle::pybind::PyCUDATensorSetFromArray<int>)
+#endif
      .def("shape",
           [](pd::Tensor& self) { return pd::vectorize(self.dims()); });

@ -136,11 +158,27 @@ All parameter, weight, gradient are variables in Paddle.
       "The module will return special predefined variable name in Paddle")
      .def("empty", pd::OperatorBase::EMPTY_VAR_NAME)
      .def("temp", pd::OperatorBase::TMP_VAR_NAME);
-
+  // clang-format off
  py::class_<paddle::platform::DeviceContext>(m, "DeviceContext")
-      .def_static("cpu_context", []() -> paddle::platform::DeviceContext* {
+      .def_static("create",
+                  [](paddle::platform::CPUPlace& place)
+                      -> paddle::platform::DeviceContext* {
                    return new paddle::platform::CPUDeviceContext();
+                  })
+      .def_static("create",
+                  [](paddle::platform::GPUPlace& place)
+                      -> paddle::platform::DeviceContext* {
+#ifdef PADDLE_ONLY_CPU
+                    PADDLE_THROW("GPUPlace is not supported in CPU device.");
+#else
+                    return new paddle::platform::CUDADeviceContext(place);
+#endif
                  });
+  // clang-format on
+
+  py::class_<paddle::platform::GPUPlace>(m, "GPUPlace").def(py::init<int>());
+
+  py::class_<paddle::platform::CPUPlace>(m, "CPUPlace").def(py::init<>());

  py::class_<pd::OperatorBase, std::shared_ptr<pd::OperatorBase>> operator_base(
      m, "Operator");
@ -176,5 +214,7 @@ All parameter, weight, gradient are variables in Paddle.

  m.def("unique_integer", UniqueIntegerGenerator);

+  m.def("is_compile_gpu", IsCompileGPU);
+
  return m.ptr();
 }
--- a/paddle/pybind/tensor_bind.h
+++ b/paddle/pybind/tensor_bind.h
@ -13,9 +13,11 @@
   limitations under the License. */

 #pragma once
-#include <paddle/framework/tensor.h>
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
+#include <string>
+#include "paddle/framework/tensor.h"
+#include "paddle/memory/memcpy.h"
+#include "pybind11/numpy.h"
+#include "pybind11/pybind11.h"

 namespace py = pybind11;

@ -40,9 +42,6 @@ template <size_t I, typename... ARGS>
 struct CastToPyBufferImpl<true, I, ARGS...> {
  using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
  py::buffer_info operator()(framework::Tensor &tensor) {
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(tensor.holder_->place()),
-                   "Only CPU tensor can cast to numpy array");
-
    if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) {
      auto dim_vec = framework::vectorize(tensor.dims());
      std::vector<size_t> dims_outside;
@ -56,12 +55,17 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
        strides[i - 1] = sizeof(CUR_TYPE) * prod;
        prod *= dims_outside[i - 1];
      }
-
+      framework::Tensor dst_tensor;
+      if (paddle::platform::is_gpu_place(tensor.holder_->place())) {
+        dst_tensor.CopyFrom<CUR_TYPE>(tensor, platform::CPUPlace());
+      } else if (paddle::platform::is_cpu_place(tensor.holder_->place())) {
+        dst_tensor = tensor;
+      }
      return py::buffer_info(
-          tensor.mutable_data<CUR_TYPE>(tensor.holder_->place()),
+          dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.holder_->place()),
          sizeof(CUR_TYPE),
          py::format_descriptor<CUR_TYPE>::format(),
-          (size_t)framework::arity(tensor.dims()),
+          (size_t)framework::arity(dst_tensor.dims()),
          dims_outside,
          strides);
    } else {
@ -77,9 +81,10 @@ inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
 }

 template <typename T>
-void PyTensorSetFromArray(
+void PyCPUTensorSetFromArray(
    framework::Tensor &self,
-    py::array_t<T, py::array::c_style | py::array::forcecast> array) {
+    py::array_t<T, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::CPUPlace &place) {
  std::vector<int> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
@ -87,9 +92,28 @@ void PyTensorSetFromArray(
  }

  self.Resize(framework::make_ddim(dims));
-  auto *dst = self.mutable_data<T>(paddle::platform::CPUPlace());
+  auto *dst = self.mutable_data<T>(place);
  std::memcpy(dst, array.data(), sizeof(T) * array.size());
 }

+#ifndef PADDLE_ONLY_CPU
+template <typename T>
+void PyCUDATensorSetFromArray(
+    framework::Tensor &self,
+    py::array_t<T, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::GPUPlace &place) {
+  std::vector<int> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back((int)array.shape()[i]);
+  }
+
+  self.Resize(framework::make_ddim(dims));
+  auto *dst = self.mutable_data<T>(place);
+  paddle::platform::GpuMemcpySync(
+      dst, array.data(), sizeof(T) * array.size(), cudaMemcpyHostToDevice);
+}
+#endif
+
 }  // namespace pybind
 }  // namespace paddle
--- a/Show More
+++ b/Show More