Merge branch 'develop' into add-ClipLayer

8 years ago · b7b956f0ba
parent 5d644994d3 5cb29a8fbf
commit b7b956f0ba
39 changed files with 381 additions and 96 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -17,10 +17,14 @@
    -   id: detect-private-key
        files: (?!.*third_party)^.*$ | (?!.*book)^.*$
    -   id: end-of-file-fixer
-   repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
+-   repo: local
    sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
    hooks:
-    -   id: clang-formater
+    -   id: clang-format
        name: clang-format
        description: Format files with ClangFormat.
        entry: clang-format -i
        language: system
        files: \.(c|cc|cxx|cpp|h|hpp|hxx)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
    sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
    hooks:
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@ -9,6 +9,11 @@ function(CheckCompilerCXX11Flag)
        if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
            message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
        endif()
        # TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem.
        # Use Debug mode instead for now.
        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9) 
            set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE)
        endif()
    elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
        # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
        # Apple Clang is a different compiler than upstream Clang, which has different version numbers.
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@ -105,6 +105,11 @@ cross_channel_norm
 ..  autoclass:: paddle.v2.layer.cross_channel_norm
    :noindex:
 row_l2_norm
 -----------
 ..  autoclass:: paddle.v2.layer.row_l2_norm
    :noindex:
 Recurrent Layers
 ================
--- a/paddle/framework/detail/tensor-inl.h
+++ b/paddle/framework/detail/tensor-inl.h
@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include "paddle/memory/memcpy.h"
 namespace paddle {
@ -62,9 +61,11 @@ inline T* Tensor::mutable_data(platform::Place place) {
    if (platform::is_cpu_place(place)) {
      holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
          boost::get<platform::CPUPlace>(place), size));
    } else if (platform::is_gpu_place(place)) {
 #ifdef PADDLE_ONLY_CPU
      PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
    }
-#ifndef PADDLE_ONLY_CPU
+#else
    else if (platform::is_gpu_place(place)) {
      holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
          boost::get<platform::GPUPlace>(place), size));
    }
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@ -20,16 +20,16 @@ namespace paddle {
 namespace framework {
 template <>
-Eigen::DefaultDevice* ExecutionContext::GetEigenDevice<
+Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
    platform::CPUPlace, Eigen::DefaultDevice>() const {
-  return device_context_.get_eigen_device<Eigen::DefaultDevice>();
+  return *device_context_.get_eigen_device<Eigen::DefaultDevice>();
 }
 #ifndef PADDLE_ONLY_CPU
 template <>
-Eigen::GpuDevice*
+Eigen::GpuDevice&
 ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-  return device_context_.get_eigen_device<Eigen::GpuDevice>();
+  return *device_context_.get_eigen_device<Eigen::GpuDevice>();
 }
 #endif
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@ -253,7 +253,7 @@ class ExecutionContext : public OperatorContext {
  template <typename PlaceType,
            typename DeviceType =
                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
-  DeviceType* GetEigenDevice() const;
+  DeviceType& GetEigenDevice() const;
  platform::Place GetPlace() const { return device_context_.GetPlace(); }
--- a/paddle/gserver/layers/RowL2NormLayer.cpp
+++ b/paddle/gserver/layers/RowL2NormLayer.cpp
@ -0,0 +1,98 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "Layer.h"
 namespace paddle {
 /**
 * A layer for L2 normalization in each row. For every row of the input,
 * \f[
 *   out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}}
 * \f]
 * where \f$N\f$ is dataDim (the row width),
 * the size of \f$in\f$ is (batchSize x dataDim),
 * and the size of \f$out\f$ is (batchSize x dataDim).
 *
 * The layer takes exactly one input (checked in init()).
 */
 class RowL2NormLayer : public Layer {
 protected:
  // (batchSize x dataDim) buffer. forward() fills it from the input via
  // square2(); backward() overwrites it and uses it as scratch space.
  MatrixPtr inSquare_;
  // (batchSize x 1) buffer computed in forward() and read again in backward().
  // As the name says, it is meant to hold 1 / (L2 norm) of each input row.
  MatrixPtr l2NormReciprocal_;
  // (batchSize x 1) scratch buffer used only in backward().
  MatrixPtr dotSum_;
 public:
  explicit RowL2NormLayer(const LayerConfig& config) : Layer(config) {}
  bool init(const LayerMap& layerMap,
            const ParameterMap& parameterMap) override;
  void forward(PassType passType) override;
  void backward(const UpdateCallback& callback = nullptr) override;
 };
 REGISTER_LAYER(row_l2_norm, RowL2NormLayer);
 /**
  * Initializes the base Layer and verifies that exactly one input layer
  * is configured.
  *
  * @param layerMap     forwarded unchanged to Layer::init.
  * @param parameterMap forwarded unchanged to Layer::init.
  * @return false if Layer::init reports failure, true otherwise.
  */
 bool RowL2NormLayer::init(const LayerMap& layerMap,
                          const ParameterMap& parameterMap) {
  // Propagate a base-class initialization failure instead of discarding it.
  if (!Layer::init(layerMap, parameterMap)) {
    return false;
  }
  // Row-wise L2 normalization is defined for a single input only.
  CHECK_EQ(inputLayers_.size(), 1U);
  return true;
 }
 // Forward pass: writes to the output each input row scaled by the per-row
 // value stored in l2NormReciprocal_ (intended to be 1 / L2 norm of the row).
 // inSquare_ and l2NormReciprocal_ are (re)allocated here and kept as members
 // because backward() reads them.
 void RowL2NormLayer::forward(PassType passType) {
  Layer::forward(passType);
  MatrixPtr inV = getInputValue(0);
  /* malloc memory for the output_ if necessary */
  size_t batchSize = inV->getHeight();
  size_t dataDim = getSize();
  // The configured layer size must equal the input width.
  CHECK_EQ(dataDim, inV->getWidth());
  resetOutput(batchSize, dataDim);
  MatrixPtr outV = getOutputValue();
  // Scratch buffer with the same shape as the input.
  Matrix::resizeOrCreate(inSquare_, batchSize, dataDim, false, useGpu_);
  // Presumably stores the element-wise square of inV into inSquare_
  // -- confirm against Matrix::square2.
  inV->square2(*inSquare_);
  // One value per row.
  Matrix::resizeOrCreate(l2NormReciprocal_, batchSize, 1, false, useGpu_);
  // The next three calls operate in place on l2NormReciprocal_:
  // row sum of squares -> square root -> 1.0 / value (presumably; confirm
  // the argument order of scalarDiv).
  // NOTE(review): no guard is visible here for a row whose sum of squares is
  // zero; confirm how scalarDiv behaves for a zero divisor.
  inSquare_->rowSum(*l2NormReciprocal_);
  l2NormReciprocal_->sqrt2(*l2NormReciprocal_);
  l2NormReciprocal_->scalarDiv(*l2NormReciprocal_, 1.0);
  // Scale each row of inV by its entry in l2NormReciprocal_ and store in outV.
  outV->rowScale(0, *inV, *l2NormReciprocal_);
 }
 // Backward pass: accumulates the gradient w.r.t. the input into inG.
 // Relies on l2NormReciprocal_ and inSquare_ having been set up by forward().
 // The callback argument is not used in this function.
 void RowL2NormLayer::backward(const UpdateCallback& callback) {
  MatrixPtr inV = getInputValue(0);
  MatrixPtr inG = getInputGrad(0);
  MatrixPtr outV = getOutputValue();
  MatrixPtr outG = getOutputGrad();
  size_t batchSize = inV->getHeight();
  // With r[i] = l2NormReciprocal_[i], the code below accumulates:
  //   inG[ij] += outG[ij] * r[i]
  //   inG[ij] -= inV[ij] * r[i] * r[i] * DotMul(outG[i], outV[i])
  // (the row dot product uses the OUTPUT value outV, not inV).
  // Skip everything when the input has no gradient buffer.
  if (inG) {
    Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_);
    dotSum_->zeroMem();
    // dotSum_[i] = row-wise dot product of outG and outV.
    dotSum_->rowDotMul(0, *outG, *outV);
    // Multiply by the reciprocal norm twice: dotSum_[i] *= r[i] * r[i].
    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
    // inSquare_ is reused as scratch here; its forward() contents are
    // overwritten with inV scaled row-wise by dotSum_.
    inSquare_->rowScale(0, *inV, *dotSum_);
    inG->sub(*inSquare_);
    inG->addRowScale(0, *outG, *l2NormReciprocal_);
  }
 }
 }  // namespace paddle
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@ -1916,6 +1916,19 @@ TEST(Layer, ClipLayer) {
  }
 }
 // Gradient check for the "row_l2_norm" layer, run once with useGpu = false
 // and once with useGpu = true.
 TEST(Layer, RowL2NormLayer) {
  const size_t kBatchSize = 128;
  const size_t kLayerSize = 512;

  TestConfig config;
  // Single data input whose width equals the layer size.
  config.inputDefs.push_back({INPUT_DATA, "input", kLayerSize, 0});
  config.layerConfig.set_type("row_l2_norm");
  config.layerConfig.set_size(kLayerSize);
  config.layerConfig.add_inputs();

  const bool gpuModes[] = {false, true};
  for (bool onGpu : gpuModes) {
    testLayerGrad(config, "row_l2_norm", kBatchSize, false, onGpu, false);
  }
 }
 int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  initMain(argc, argv);
--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
@ -1,3 +1,4 @@
 #define EIGEN_USE_GPU
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/add_op.h"
--- a/paddle/operators/add_op.h
+++ b/paddle/operators/add_op.h
@ -28,8 +28,7 @@ public:
    output->mutable_data<T>(context.GetPlace());
-    EigenVector<T>::Flatten(*output).device(
+    EigenVector<T>::Flatten(*output).device(context.GetEigenDevice<Place>()) =
        *(context.GetEigenDevice<Place>())) =
        framework::EigenVector<T>::Flatten(*input0) +
        framework::EigenVector<T>::Flatten(*input1);
  }
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@ -1,3 +1,4 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/cross_entropy_op.h"
 REGISTER_OP_GPU_KERNEL(onehot_cross_entropy,
--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
@ -27,7 +27,7 @@ public:
    output->mutable_data<T>(context.GetPlace());
-    EigenScalar<T>::From(*output).device(*(context.GetEigenDevice<Place>())) =
+    EigenScalar<T>::From(*output).device(context.GetEigenDevice<Place>()) =
        EigenVector<T>::Flatten(*input).mean();
  }
 };
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
@ -12,6 +12,7 @@
   See the License for the specific language governing permissions and
   limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/operators/mul_op.h"
 REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<ops::GPUPlace, float>);
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@ -29,7 +29,7 @@ public:
    auto output = context.Output<Tensor>(0);
    output->mutable_data<T>(context.GetPlace());
-    EigenMatrix<T>::From(*output).device(*(context.GetEigenDevice<Place>())) =
+    EigenMatrix<T>::From(*output).device(context.GetEigenDevice<Place>()) =
        EigenMatrix<T>::From(*context.Input<Tensor>("X"))
            .contract(EigenMatrix<T>::From(*context.Input<Tensor>("Y")),
                      dim_pair);
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
@ -1,3 +1,4 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/rowwise_add_op.h"
 REGISTER_OP_GPU_KERNEL(rowwise_add,
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@ -33,7 +33,7 @@ public:
    const int rest_size = input.size() / bias_size;
    Eigen::DSizes<int, 1> one_d(input.size());
    Eigen::DSizes<int, 1> bcast(rest_size);
-    output.reshape(one_d).device(*(context.GetEigenDevice<Place>())) =
+    output.reshape(one_d).device(context.GetEigenDevice<Place>()) =
        input.reshape(one_d) + bias.broadcast(bcast).reshape(one_d);
  }
 };
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
@ -1,3 +1,4 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/sgd_op.h"
 REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel<ops::GPUPlace, float>);
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@ -29,7 +29,7 @@ public:
    param_out->mutable_data<T>(ctx.GetPlace());
-    EigenVector<T>::Flatten(*param_out).device(*(ctx.GetEigenDevice<Place>())) =
+    EigenVector<T>::Flatten(*param_out).device(ctx.GetEigenDevice<Place>()) =
        EigenVector<T>::Flatten(*param) - lr * EigenVector<T>::Flatten(*grad);
  }
 };
--- a/paddle/operators/sigmoid_op.cu
+++ b/paddle/operators/sigmoid_op.cu
@ -1,3 +1,4 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/sigmoid_op.h"
 REGISTER_OP_GPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::GPUPlace, float>);
--- a/paddle/operators/sigmoid_op.h
+++ b/paddle/operators/sigmoid_op.h
@ -27,8 +27,7 @@ public:
    auto output = context.Output<Tensor>(0);
    output->mutable_data<T>(context.GetPlace());
-    EigenVector<T>::Flatten(*output).device(
+    EigenVector<T>::Flatten(*output).device(context.GetEigenDevice<Place>()) =
        *(context.GetEigenDevice<Place>())) =
        1.0 / (1.0 + (-1.0 * EigenVector<T>::Flatten(*input)).exp());
  }
 };
--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu
@ -1,3 +1,4 @@
 #define EIGEN_USE_GPU
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/softmax_op.h"
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@ -46,9 +46,9 @@ public:
                               .reshape(batch_by_one)
                               .broadcast(one_by_class));
-    softmax.device(*(context.GetEigenDevice<Place>())) = shifted_logits.exp();
+    softmax.device(context.GetEigenDevice<Place>()) = shifted_logits.exp();
-    softmax.device(*(context.GetEigenDevice<Place>())) =
+    softmax.device(context.GetEigenDevice<Place>()) =
        (softmax *
         softmax.sum(along_class)
             .inverse()
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@ -148,7 +148,7 @@ inline void throw_on_error(T e) {
  do {                                                                 \
    throw ::paddle::platform::EnforceNotMet(                           \
        std::make_exception_ptr(                                       \
-            std::runtime_error(string::Sprintf(__VA_ARGS__))), \
+            std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \
        __FILE__, __LINE__);                                           \
  } while (0)
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@ -20,6 +20,8 @@ limitations under the License. */
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
 #include "paddle/pybind/tensor_bind.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
@ -55,6 +57,14 @@ static size_t UniqueIntegerGenerator() {
  return generator.fetch_add(1);
 }
 // Reports whether this build includes GPU support, i.e. whether
 // PADDLE_ONLY_CPU was left undefined at compile time.
 bool IsCompileGPU() {
 #ifndef PADDLE_ONLY_CPU
  return true;
 #else
  return false;
 #endif
 }
 PYBIND11_PLUGIN(core) {
  py::module m("core", "C++ core of PaddlePaddle");
@ -69,15 +79,27 @@ PYBIND11_PLUGIN(core) {
             self.Resize(pd::make_ddim(dim));
           })
      .def("alloc_float",
-           [](pd::Tensor& self) {
+           [](pd::Tensor& self, paddle::platform::GPUPlace& place) {
-             self.mutable_data<float>(paddle::platform::CPUPlace());
+             self.mutable_data<float>(place);
           })
      .def("alloc_float",
           [](pd::Tensor& self, paddle::platform::CPUPlace& place) {
             self.mutable_data<float>(place);
           })
      .def("alloc_int",
           [](pd::Tensor& self, paddle::platform::CPUPlace& place) {
             self.mutable_data<int>(place);
           })
      .def("alloc_int",
-           [](pd::Tensor& self) {
+           [](pd::Tensor& self, paddle::platform::GPUPlace& place) {
-             self.mutable_data<int>(paddle::platform::CPUPlace());
+             self.mutable_data<int>(place);
           })
-      .def("set", paddle::pybind::PyTensorSetFromArray<float>)
+      .def("set", paddle::pybind::PyCPUTensorSetFromArray<float>)
-      .def("set", paddle::pybind::PyTensorSetFromArray<int>)
+      .def("set", paddle::pybind::PyCPUTensorSetFromArray<int>)
 #ifndef PADDLE_ONLY_CPU
      .def("set", paddle::pybind::PyCUDATensorSetFromArray<float>)
      .def("set", paddle::pybind::PyCUDATensorSetFromArray<int>)
 #endif
      .def("shape",
           [](pd::Tensor& self) { return pd::vectorize(self.dims()); });
@ -136,11 +158,27 @@ All parameter, weight, gradient are variables in Paddle.
       "The module will return special predefined variable name in Paddle")
      .def("empty", pd::OperatorBase::EMPTY_VAR_NAME)
      .def("temp", pd::OperatorBase::TMP_VAR_NAME);
-
+  // clang-format off
  py::class_<paddle::platform::DeviceContext>(m, "DeviceContext")
-      .def_static("cpu_context", []() -> paddle::platform::DeviceContext* {
+      .def_static("create",
                  [](paddle::platform::CPUPlace& place)
                      -> paddle::platform::DeviceContext* {
                    return new paddle::platform::CPUDeviceContext();
                  })
      .def_static("create",
                  [](paddle::platform::GPUPlace& place)
                      -> paddle::platform::DeviceContext* {
 #ifdef PADDLE_ONLY_CPU
                    PADDLE_THROW("GPUPlace is not supported in CPU device.");
 #else
                    return new paddle::platform::CUDADeviceContext(place);
 #endif
                  });
  // clang-format on
  py::class_<paddle::platform::GPUPlace>(m, "GPUPlace").def(py::init<int>());
  py::class_<paddle::platform::CPUPlace>(m, "CPUPlace").def(py::init<>());
  py::class_<pd::OperatorBase, std::shared_ptr<pd::OperatorBase>> operator_base(
      m, "Operator");
@ -176,5 +214,7 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("unique_integer", UniqueIntegerGenerator);
  m.def("is_compile_gpu", IsCompileGPU);
  return m.ptr();
 }
--- a/paddle/pybind/tensor_bind.h
+++ b/paddle/pybind/tensor_bind.h
@ -13,9 +13,11 @@
   limitations under the License. */
 #pragma once
-#include <paddle/framework/tensor.h>
+#include <string>
-#include <pybind11/numpy.h>
+#include "paddle/framework/tensor.h"
-#include <pybind11/pybind11.h>
+#include "paddle/memory/memcpy.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 namespace py = pybind11;
@ -40,9 +42,6 @@ template <size_t I, typename... ARGS>
 struct CastToPyBufferImpl<true, I, ARGS...> {
  using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
  py::buffer_info operator()(framework::Tensor &tensor) {
    PADDLE_ENFORCE(paddle::platform::is_cpu_place(tensor.holder_->place()),
                   "Only CPU tensor can cast to numpy array");
    if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) {
      auto dim_vec = framework::vectorize(tensor.dims());
      std::vector<size_t> dims_outside;
@ -56,12 +55,17 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
        strides[i - 1] = sizeof(CUR_TYPE) * prod;
        prod *= dims_outside[i - 1];
      }
-
+      framework::Tensor dst_tensor;
      if (paddle::platform::is_gpu_place(tensor.holder_->place())) {
        dst_tensor.CopyFrom<CUR_TYPE>(tensor, platform::CPUPlace());
      } else if (paddle::platform::is_cpu_place(tensor.holder_->place())) {
        dst_tensor = tensor;
      }
      return py::buffer_info(
-          tensor.mutable_data<CUR_TYPE>(tensor.holder_->place()),
+          dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.holder_->place()),
          sizeof(CUR_TYPE),
          py::format_descriptor<CUR_TYPE>::format(),
-          (size_t)framework::arity(tensor.dims()),
+          (size_t)framework::arity(dst_tensor.dims()),
          dims_outside,
          strides);
    } else {
@ -77,9 +81,10 @@ inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
 }
 template <typename T>
-void PyTensorSetFromArray(
+void PyCPUTensorSetFromArray(
    framework::Tensor &self,
-    py::array_t<T, py::array::c_style | py::array::forcecast> array) {
+    py::array_t<T, py::array::c_style | py::array::forcecast> array,
    paddle::platform::CPUPlace &place) {
  std::vector<int> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
@ -87,9 +92,28 @@ void PyTensorSetFromArray(
  }
  self.Resize(framework::make_ddim(dims));
-  auto *dst = self.mutable_data<T>(paddle::platform::CPUPlace());
+  auto *dst = self.mutable_data<T>(place);
  std::memcpy(dst, array.data(), sizeof(T) * array.size());
 }
 #ifndef PADDLE_ONLY_CPU
 // Resizes `self` to the shape of `array`, allocates its storage on `place`,
 // and copies the array contents into it with a host-to-device
 // GpuMemcpySync call.
 template <typename T>
 void PyCUDATensorSetFromArray(
    framework::Tensor &self,
    py::array_t<T, py::array::c_style | py::array::forcecast> array,
    paddle::platform::GPUPlace &place) {
  // Mirror the array's shape, one int per dimension.
  std::vector<int> shape;
  shape.reserve(array.ndim());
  for (size_t axis = 0; axis < array.ndim(); ++axis) {
    shape.push_back((int)array.shape()[axis]);
  }
  self.Resize(framework::make_ddim(shape));

  // Allocate the destination buffer on `place`, then copy the array into it.
  auto *device_buf = self.mutable_data<T>(place);
  paddle::platform::GpuMemcpySync(device_buf,
                                  array.data(),
                                  sizeof(T) * array.size(),
                                  cudaMemcpyHostToDevice);
 }
 #endif
 }  // namespace pybind
 }  // namespace paddle
--- a/Show More
+++ b/Show More