Merge branch 'develop' of github.com:baidu/Paddle into feature/move_pybind_to_framework_dir

cblas_new
Yu Yang 8 years ago
commit fa5a5a3acf

@ -27,7 +27,7 @@ RUN apt-get update && \
git python-pip python-dev openssh-server bison \ git python-pip python-dev openssh-server bison \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \ curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-numpy python-matplotlib gcc g++ \ python-numpy python-matplotlib gcc-4.8 g++-4.8 \
automake locales clang-format-3.8 swig doxygen cmake \ automake locales clang-format-3.8 swig doxygen cmake \
liblapack-dev liblapacke-dev libboost-dev \ liblapack-dev liblapacke-dev libboost-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \

@ -9,6 +9,11 @@ function(CheckCompilerCXX11Flag)
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8) if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.") message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
endif() endif()
# TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem.
# Use Debug mode instead for now.
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9)
set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE)
endif()
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
# cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang" # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
# Apple Clang is a different compiler than upstream Clang which havs different version numbers. # Apple Clang is a different compiler than upstream Clang which havs different version numbers.

@ -105,6 +105,11 @@ cross_channel_norm
.. autoclass:: paddle.v2.layer.cross_channel_norm .. autoclass:: paddle.v2.layer.cross_channel_norm
:noindex: :noindex:
row_l2_norm
-----------
.. autoclass:: paddle.v2.layer.row_l2_norm
:noindex:
Recurrent Layers Recurrent Layers
================ ================
@ -320,6 +325,11 @@ scaling
.. autoclass:: paddle.v2.layer.scaling .. autoclass:: paddle.v2.layer.scaling
:noindex: :noindex:
clip
----
.. autoclass:: paddle.v2.layer.clip
:noindex:
slope_intercept slope_intercept
--------------- ---------------
.. autoclass:: paddle.v2.layer.slope_intercept .. autoclass:: paddle.v2.layer.slope_intercept

@ -1022,6 +1022,15 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
real alpha = 1.0f; real alpha = 1.0f;
real beta = 1.0f; real beta = 1.0f;
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
int batch_size = ((cudnn_tensor_descriptor)inputDesc)->batch_size;
if (batch_size > 1024 && g_cudnn_lib_version < 6000) {
LOG(INFO) << " To process current batch data with size " << batch_size
<< " (>1024), cudnnBatchNorm requires cuDNN version >= 6000."
<< " If there is an error complaining CUDNN_STATUS_NOT_SUPPORTED,"
<< " just recompile PaddlePaddle with cuDNN >= 6000, replacing"
<< " current version " << g_cudnn_lib_version;
}
CHECK_CUDNN( CHECK_CUDNN(
dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle, dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle,
mode, mode,

@ -38,7 +38,7 @@ cc_library(backward SRCS backward.cc DEPS net)
cc_test(backward_test SRCS backward_test.cc DEPS backward) cc_test(backward_test SRCS backward_test.cc DEPS backward)
cc_library(paddle_pybind SHARED cc_library(paddle_pybind SHARED
SRCS pybind.cc SRCS pybind.cc
DEPS pybind python DEPS pybind python backward
fc_op fc_op
sgd_op sgd_op
add_op add_op

@ -400,6 +400,14 @@ class GradOpRegisterHelper {
return 0; \ return 0; \
} }
/**
* Macro to Forbid user register Gradient Operator.
*/
#define NO_GRADIENT(__op_type) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_gradient_op__##__op_type##__op_type##_grad, \
"NO_GRADIENT must be in global namespace")
/** /**
* Macro to Register OperatorKernel. * Macro to Register OperatorKernel.
*/ */

@ -20,16 +20,16 @@ namespace paddle {
namespace framework { namespace framework {
template <> template <>
Eigen::DefaultDevice* ExecutionContext::GetEigenDevice< Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
platform::CPUPlace, Eigen::DefaultDevice>() const { platform::CPUPlace, Eigen::DefaultDevice>() const {
return device_context_.get_eigen_device<Eigen::DefaultDevice>(); return *device_context_.get_eigen_device<Eigen::DefaultDevice>();
} }
#ifndef PADDLE_ONLY_CPU #ifndef PADDLE_ONLY_CPU
template <> template <>
Eigen::GpuDevice* Eigen::GpuDevice&
ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const { ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
return device_context_.get_eigen_device<Eigen::GpuDevice>(); return *device_context_.get_eigen_device<Eigen::GpuDevice>();
} }
#endif #endif

@ -253,7 +253,7 @@ class ExecutionContext : public OperatorContext {
template <typename PlaceType, template <typename PlaceType,
typename DeviceType = typename DeviceType =
typename EigenDeviceConverter<PlaceType>::EigenDeviceType> typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
DeviceType* GetEigenDevice() const; DeviceType& GetEigenDevice() const;
platform::Place GetPlace() const { return device_context_.GetPlace(); } platform::Place GetPlace() const { return device_context_.GetPlace(); }

@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
@ -16,11 +16,14 @@ limitations under the License. */
#include <fstream> #include <fstream>
#include <vector> #include <vector>
#include "paddle/framework/backward.h"
#include "paddle/framework/net.h" #include "paddle/framework/net.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h" #include "paddle/framework/operator.h"
#include "paddle/framework/scope.h" #include "paddle/framework/scope.h"
#include "paddle/framework/tensor_bind.h" #include "paddle/framework/tensor_bind.h"
#include "paddle/platform/enforce.h"
#include "paddle/platform/place.h"
#include "pybind11/numpy.h" #include "pybind11/numpy.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
#include "pybind11/stl.h" #include "pybind11/stl.h"
@ -43,6 +46,10 @@ template <typename ClassType>
void ExposeOperator(ClassType &m) { void ExposeOperator(ClassType &m) {
m.def("infer_shape", &ClassType::type::InferShape) m.def("infer_shape", &ClassType::type::InferShape)
.def("run", &ClassType::type::Run) .def("run", &ClassType::type::Run)
.def("type",
[](const typename ClassType::type &op) -> std::string {
return op.type_;
})
.def("outputs", .def("outputs",
[](const typename ClassType::type &op) -> std::vector<std::string> { [](const typename ClassType::type &op) -> std::vector<std::string> {
return op.outputs_; return op.outputs_;
@ -55,6 +62,14 @@ static size_t UniqueIntegerGenerator() {
return generator.fetch_add(1); return generator.fetch_add(1);
} }
bool IsCompileGPU() {
#ifdef PADDLE_ONLY_CPU
return false;
#else
return true;
#endif
}
PYBIND11_PLUGIN(core) { PYBIND11_PLUGIN(core) {
py::module m("core", "C++ core of PaddlePaddle"); py::module m("core", "C++ core of PaddlePaddle");
@ -68,16 +83,29 @@ PYBIND11_PLUGIN(core) {
self.Resize(make_ddim(dim)); self.Resize(make_ddim(dim));
}) })
.def("alloc_float", .def("alloc_float",
[](Tensor &self) { [](pd::Tensor &self, paddle::platform::GPUPlace &place) {
self.mutable_data<float>(paddle::platform::CPUPlace()); self.mutable_data<float>(place);
})
.def("alloc_float",
[](pd::Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<float>(place);
})
.def("alloc_int",
[](pd::Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<int>(place);
}) })
.def("alloc_int", .def("alloc_int",
[](Tensor &self) { [](pd::Tensor &self, paddle::platform::GPUPlace &place) {
self.mutable_data<int>(paddle::platform::CPUPlace()); self.mutable_data<int>(place);
}) })
.def("set", PyTensorSetFromArray<float>) .def("set", paddle::pybind::PyCPUTensorSetFromArray<float>)
.def("set", PyTensorSetFromArray<int>) .def("set", paddle::pybind::PyCPUTensorSetFromArray<int>)
.def("shape", [](Tensor &self) { return vectorize(self.dims()); }); #ifndef PADDLE_ONLY_CPU
.def("set", paddle::pybind::PyCUDATensorSetFromArray<float>)
.def("set", paddle::pybind::PyCUDATensorSetFromArray<int>)
#endif
.def("shape",
[](pd::Tensor &self) { return pd::vectorize(self.dims()); });
py::class_<Variable>(m, "Variable", R"DOC(Variable Class. py::class_<Variable>(m, "Variable", R"DOC(Variable Class.
@ -124,13 +152,29 @@ All parameter, weight, gradient are variables in Paddle.
m.def_submodule( m.def_submodule(
"var_names", "var_names",
"The module will return special predefined variable name in Paddle") "The module will return special predefined variable name in Paddle")
.def("empty", OperatorBase::EMPTY_VAR_NAME) .def("empty", pd::OperatorBase::EMPTY_VAR_NAME)
.def("temp", OperatorBase::TMP_VAR_NAME); .def("temp", pd::OperatorBase::TMP_VAR_NAME);
// clang-format off
py::class_<paddle::platform::DeviceContext>(m, "DeviceContext") py::class_<paddle::platform::DeviceContext>(m, "DeviceContext")
.def_static("cpu_context", []() -> paddle::platform::DeviceContext * { .def_static("create",
[](paddle::platform::CPUPlace& place)
-> paddle::platform::DeviceContext* {
return new paddle::platform::CPUDeviceContext(); return new paddle::platform::CPUDeviceContext();
})
.def_static("create",
[](paddle::platform::GPUPlace& place)
-> paddle::platform::DeviceContext* {
#ifdef PADDLE_ONLY_CPU
PADDLE_THROW("GPUPlace is not supported in CPU device.");
#else
return new paddle::platform::CUDADeviceContext(place);
#endif
}); });
// clang-format on
py::class_<paddle::platform::GPUPlace>(m, "GPUPlace").def(py::init<int>());
py::class_<paddle::platform::CPUPlace>(m, "CPUPlace").def(py::init<>());
py::class_<OperatorBase, std::shared_ptr<OperatorBase>> operator_base( py::class_<OperatorBase, std::shared_ptr<OperatorBase>> operator_base(
m, "Operator"); m, "Operator");
@ -144,6 +188,13 @@ All parameter, weight, gradient are variables in Paddle.
desc.InitializationErrorString()); desc.InitializationErrorString());
return OpRegistry::CreateOp(desc); return OpRegistry::CreateOp(desc);
}); });
operator_base.def("backward",
[](const pd::OperatorBase &forwardOp,
const std::unordered_set<std::string> &no_grad_vars) {
return pd::Backward(forwardOp, no_grad_vars);
});
ExposeOperator(operator_base); ExposeOperator(operator_base);
py::class_<NetOp, std::shared_ptr<NetOp>> net(m, "Net"); py::class_<NetOp, std::shared_ptr<NetOp>> net(m, "Net");
@ -166,6 +217,8 @@ All parameter, weight, gradient are variables in Paddle.
m.def("unique_integer", UniqueIntegerGenerator); m.def("unique_integer", UniqueIntegerGenerator);
m.def("is_compile_gpu", IsCompileGPU);
return m.ptr(); return m.ptr();
} }
} // namespace framework } // namespace framework

@ -165,4 +165,4 @@ class Tensor {
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
#include "paddle/framework/detail/tensor-inl.h" #include "paddle/framework/tensor_impl.h"

@ -13,9 +13,11 @@
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <paddle/framework/tensor.h> #include <string>
#include <pybind11/numpy.h> #include "paddle/framework/tensor.h"
#include <pybind11/pybind11.h> #include "paddle/memory/memcpy.h"
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"
namespace py = pybind11; namespace py = pybind11;
@ -40,9 +42,6 @@ template <size_t I, typename... ARGS>
struct CastToPyBufferImpl<true, I, ARGS...> { struct CastToPyBufferImpl<true, I, ARGS...> {
using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type; using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
py::buffer_info operator()(framework::Tensor &tensor) { py::buffer_info operator()(framework::Tensor &tensor) {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(tensor.holder_->place()),
"Only CPU tensor can cast to numpy array");
if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) { if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) {
auto dim_vec = framework::vectorize(tensor.dims()); auto dim_vec = framework::vectorize(tensor.dims());
std::vector<size_t> dims_outside; std::vector<size_t> dims_outside;
@ -56,11 +55,16 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
strides[i - 1] = sizeof(CUR_TYPE) * prod; strides[i - 1] = sizeof(CUR_TYPE) * prod;
prod *= dims_outside[i - 1]; prod *= dims_outside[i - 1];
} }
framework::Tensor dst_tensor;
if (paddle::platform::is_gpu_place(tensor.holder_->place())) {
dst_tensor.CopyFrom<CUR_TYPE>(tensor, platform::CPUPlace());
} else if (paddle::platform::is_cpu_place(tensor.holder_->place())) {
dst_tensor = tensor;
}
return py::buffer_info( return py::buffer_info(
tensor.mutable_data<CUR_TYPE>(tensor.holder_->place()), dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.holder_->place()),
sizeof(CUR_TYPE), py::format_descriptor<CUR_TYPE>::format(), sizeof(CUR_TYPE), py::format_descriptor<CUR_TYPE>::format(),
(size_t)framework::arity(tensor.dims()), dims_outside, strides); (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
} else { } else {
constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value; constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor); return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
@ -74,9 +78,10 @@ inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
} }
template <typename T> template <typename T>
void PyTensorSetFromArray( void PyCPUTensorSetFromArray(
framework::Tensor &self, framework::Tensor &self,
py::array_t<T, py::array::c_style | py::array::forcecast> array) { py::array_t<T, py::array::c_style | py::array::forcecast> array,
paddle::platform::CPUPlace &place) {
std::vector<int> dims; std::vector<int> dims;
dims.reserve(array.ndim()); dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) { for (size_t i = 0; i < array.ndim(); ++i) {
@ -84,9 +89,28 @@ void PyTensorSetFromArray(
} }
self.Resize(framework::make_ddim(dims)); self.Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<T>(paddle::platform::CPUPlace()); auto *dst = self.mutable_data<T>(place);
std::memcpy(dst, array.data(), sizeof(T) * array.size()); std::memcpy(dst, array.data(), sizeof(T) * array.size());
} }
#ifndef PADDLE_ONLY_CPU
template <typename T>
void PyCUDATensorSetFromArray(
framework::Tensor &self,
py::array_t<T, py::array::c_style | py::array::forcecast> array,
paddle::platform::GPUPlace &place) {
std::vector<int> dims;
dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) {
dims.push_back((int)array.shape()[i]);
}
self.Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<T>(place);
paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
cudaMemcpyHostToDevice);
}
#endif
} // namespace pybind } // namespace pybind
} // namespace paddle } // namespace paddle

@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/memory/memcpy.h" #include "paddle/memory/memcpy.h"
namespace paddle { namespace paddle {
@ -62,9 +61,11 @@ inline T* Tensor::mutable_data(platform::Place place) {
if (platform::is_cpu_place(place)) { if (platform::is_cpu_place(place)) {
holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>( holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size)); boost::get<platform::CPUPlace>(place), size));
} else if (platform::is_gpu_place(place)) {
#ifdef PADDLE_ONLY_CPU
PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
} }
#ifndef PADDLE_ONLY_CPU #else
else if (platform::is_gpu_place(place)) {
holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>( holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
boost::get<platform::GPUPlace>(place), size)); boost::get<platform::GPUPlace>(place), size));
} }

@ -109,6 +109,13 @@ protected:
return filter[filter.ndims() - 1]; return filter[filter.ndims() - 1];
} }
// determine whether im2col needs to be performed
inline bool isNeedIm2col(const TensorShape& filter) const {
return !(getFilterHeight(filter) == 1 && getFilterWidth(filter) == 1 &&
strideH() == 1 && strideW() == 1 && paddingH() == 0 &&
paddingW() == 0);
}
std::vector<size_t> strides_; std::vector<size_t> strides_;
std::vector<size_t> paddings_; std::vector<size_t> paddings_;

@ -66,16 +66,23 @@ public:
real* inputData = inputs[0].data<real>(); real* inputData = inputs[0].data<real>();
real* filterData = inputs[1].data<real>(); real* filterData = inputs[1].data<real>();
real* outputData = outputs[0].data<real>(); real* outputData = outputs[0].data<real>();
bool needIm2col = isNeedIm2col(filter);
TensorShape imShape = TensorShape imShape =
TensorShape({inputChannels / groups_, inputHeight, inputWidth}); TensorShape({inputChannels / groups_, inputHeight, inputWidth});
TensorShape colShape = TensorShape({inputChannels / groups_,
TensorShape colShape;
real* colData = NULL;
if (needIm2col) {
colShape = TensorShape({inputChannels / groups_,
filterHeight, filterHeight,
filterWidth, filterWidth,
outputHeight, outputHeight,
outputWidth}); outputWidth});
resizeBuffer<Device>(colShape.getElements()); resizeBuffer<Device>(colShape.getElements());
real* colData = reinterpret_cast<real*>(memory_->getBuf()); colData = reinterpret_cast<real*>(memory_->getBuf());
}
Im2ColFunctor<kCFO, Device, real> im2col; Im2ColFunctor<kCFO, Device, real> im2col;
GemmFunctor<Device, real> gemm; GemmFunctor<Device, real> gemm;
@ -86,6 +93,7 @@ public:
for (size_t i = 0; i < batchSize; i++) { for (size_t i = 0; i < batchSize; i++) {
for (size_t g = 0; g < groups_; g++) { for (size_t g = 0; g < groups_; g++) {
if (needIm2col) {
im2col(inputData + g * inputOffset, im2col(inputData + g * inputOffset,
imShape, imShape,
colData, colData,
@ -94,7 +102,9 @@ public:
strideW(), strideW(),
paddingH(), paddingH(),
paddingW()); paddingW());
} else {
colData = inputData + g * inputOffset;
}
int M = outputChannels / groups_; int M = outputChannels / groups_;
int N = outputHeight * outputWidth; int N = outputHeight * outputWidth;
int K = inputChannels / groups_ * filterHeight * filterWidth; int K = inputChannels / groups_ * filterHeight * filterWidth;
@ -159,19 +169,27 @@ public:
real* outputGrad = inputs[0].data<real>(); real* outputGrad = inputs[0].data<real>();
real* filterData = inputs[1].data<real>(); real* filterData = inputs[1].data<real>();
real* inputGrad = outputs[0].data<real>(); real* inputGrad = outputs[0].data<real>();
bool needIm2col = isNeedIm2col(filter);
TensorShape imShape = TensorShape imShape =
TensorShape({inputChannels / groups_, inputHeight, inputWidth}); TensorShape({inputChannels / groups_, inputHeight, inputWidth});
TensorShape colShape = TensorShape({inputChannels / groups_,
TensorShape colShape;
real* colData = NULL;
if (needIm2col) {
colShape = TensorShape({inputChannels / groups_,
filterHeight, filterHeight,
filterWidth, filterWidth,
outputHeight, outputHeight,
outputWidth}); outputWidth});
resizeBuffer<Device>(colShape.getElements()); resizeBuffer<Device>(colShape.getElements());
real* colData = reinterpret_cast<real*>(memory_->getBuf()); colData = reinterpret_cast<real*>(memory_->getBuf());
}
Col2ImFunctor<kCFO, Device, real> col2im; Col2ImFunctor<kCFO, Device, real> col2im;
GemmFunctor<Device, real> gemm; GemmFunctor<Device, real> gemm;
size_t inputOffset = imShape.getElements(); size_t inputOffset = imShape.getElements();
size_t outputOffset = size_t outputOffset =
(outputChannels / groups_) * outputHeight * outputWidth; (outputChannels / groups_) * outputHeight * outputWidth;
@ -182,6 +200,11 @@ public:
int K = outputChannels / groups_; int K = outputChannels / groups_;
int N = outputHeight * outputWidth; int N = outputHeight * outputWidth;
int M = inputChannels / groups_ * filterHeight * filterWidth; int M = inputChannels / groups_ * filterHeight * filterWidth;
real scale = 0.0f;
if (!needIm2col) {
colData = inputGrad + g * inputOffset;
scale = 1.0f;
}
gemm(CblasTrans, gemm(CblasTrans,
CblasNoTrans, CblasNoTrans,
M, M,
@ -192,9 +215,10 @@ public:
M, M,
outputGrad + g * outputOffset, outputGrad + g * outputOffset,
N, N,
0.0f, scale,
colData, colData,
N); N);
if (needIm2col) {
col2im(inputGrad + g * inputOffset, col2im(inputGrad + g * inputOffset,
imShape, imShape,
colData, colData,
@ -204,6 +228,7 @@ public:
paddingH(), paddingH(),
paddingW()); paddingW());
} }
}
inputGrad += inputChannels * inputHeight * inputWidth; inputGrad += inputChannels * inputHeight * inputWidth;
outputGrad += outputChannels * outputHeight * outputWidth; outputGrad += outputChannels * outputHeight * outputWidth;
} }
@ -255,16 +280,23 @@ public:
real* outputGrad = inputs[0].data<real>(); real* outputGrad = inputs[0].data<real>();
real* inputData = inputs[1].data<real>(); real* inputData = inputs[1].data<real>();
real* filterGrad = outputs[0].data<real>(); real* filterGrad = outputs[0].data<real>();
bool needIm2col = isNeedIm2col(filter);
TensorShape imShape = TensorShape imShape =
TensorShape({inputChannels / groups_, inputHeight, inputWidth}); TensorShape({inputChannels / groups_, inputHeight, inputWidth});
TensorShape colShape = TensorShape({inputChannels / groups_,
TensorShape colShape;
real* colData = NULL;
if (needIm2col) {
colShape = TensorShape({inputChannels / groups_,
filterHeight, filterHeight,
filterWidth, filterWidth,
outputHeight, outputHeight,
outputWidth}); outputWidth});
resizeBuffer<Device>(colShape.getElements()); resizeBuffer<Device>(colShape.getElements());
real* colData = reinterpret_cast<real*>(memory_->getBuf()); colData = reinterpret_cast<real*>(memory_->getBuf());
}
Im2ColFunctor<kCFO, Device, real> im2col; Im2ColFunctor<kCFO, Device, real> im2col;
GemmFunctor<Device, real> gemm; GemmFunctor<Device, real> gemm;
@ -274,6 +306,7 @@ public:
size_t filterOffset = filter.getElements() / groups_; size_t filterOffset = filter.getElements() / groups_;
for (size_t i = 0; i < batchSize; i++) { for (size_t i = 0; i < batchSize; i++) {
for (size_t g = 0; g < groups_; g++) { for (size_t g = 0; g < groups_; g++) {
if (needIm2col) {
im2col(inputData + g * inputOffset, im2col(inputData + g * inputOffset,
imShape, imShape,
colData, colData,
@ -282,7 +315,9 @@ public:
strideW(), strideW(),
paddingH(), paddingH(),
paddingW()); paddingW());
} else {
colData = inputData + g * inputOffset;
}
int M = outputChannels / groups_; int M = outputChannels / groups_;
int K = outputHeight * outputWidth; int K = outputHeight * outputWidth;
int N = inputChannels / groups_ * filterHeight * filterWidth; int N = inputChannels / groups_ * filterHeight * filterWidth;

@ -0,0 +1,79 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "Layer.h"
namespace paddle {
/**
* A layer for clipping the input value by the threshold.
* \f[
* out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right)
* \f]
*/
class ClipLayer : public Layer {
protected:
double min_;
double max_;
public:
explicit ClipLayer(const LayerConfig& config) : Layer(config) {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
};
REGISTER_LAYER(clip, ClipLayer);
bool ClipLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
CHECK_EQ(inputLayers_.size(), 1U);
auto layerConf = config_.inputs(0).clip_conf();
min_ = layerConf.min();
max_ = layerConf.max();
CHECK_LT(min_, max_);
return true;
}
void ClipLayer::forward(PassType passType) {
Layer::forward(passType);
MatrixPtr inV = getInputValue(0);
resetOutput(inV->getHeight(), inV->getWidth());
MatrixPtr outV = getOutputValue();
outV->copyFrom(*inV);
outV->clip(min_, max_);
}
void ClipLayer::backward(const UpdateCallback& callback) {
MatrixPtr inV = getInputValue(0);
MatrixPtr inG = getInputGrad(0);
if (inG) {
MatrixPtr outV = getOutputValue();
MatrixPtr outG = getOutputGrad();
MatrixPtr tmpMtx;
Matrix::resizeOrCreate(
tmpMtx, outG->getHeight(), outG->getWidth(), false, useGpu_);
tmpMtx->clipDerivative(*inV, min_, max_);
inG->addDotMul(*outG, *tmpMtx, 1, 1);
}
}
} // namespace paddle

@ -0,0 +1,98 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "Layer.h"
namespace paddle {
/**
* A layer for L2 normalization in each row,
* \f[
* out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}}
* \f]
* where the size of \f$in\f$ is (batchSize x dataDim),
* and the size of \f$out\f$ is (batchSize x dataDim).
*/
class RowL2NormLayer : public Layer {
protected:
MatrixPtr inSquare_;
MatrixPtr l2NormReciprocal_;
MatrixPtr dotSum_;
public:
explicit RowL2NormLayer(const LayerConfig& config) : Layer(config) {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
};
REGISTER_LAYER(row_l2_norm, RowL2NormLayer);
bool RowL2NormLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
CHECK_EQ(inputLayers_.size(), 1U);
return true;
}
void RowL2NormLayer::forward(PassType passType) {
Layer::forward(passType);
MatrixPtr inV = getInputValue(0);
/* malloc memory for the output_ if necessary */
size_t batchSize = inV->getHeight();
size_t dataDim = getSize();
CHECK_EQ(dataDim, inV->getWidth());
resetOutput(batchSize, dataDim);
MatrixPtr outV = getOutputValue();
Matrix::resizeOrCreate(inSquare_, batchSize, dataDim, false, useGpu_);
inV->square2(*inSquare_);
Matrix::resizeOrCreate(l2NormReciprocal_, batchSize, 1, false, useGpu_);
inSquare_->rowSum(*l2NormReciprocal_);
l2NormReciprocal_->sqrt2(*l2NormReciprocal_);
l2NormReciprocal_->scalarDiv(*l2NormReciprocal_, 1.0);
outV->rowScale(0, *inV, *l2NormReciprocal_);
}
void RowL2NormLayer::backward(const UpdateCallback& callback) {
MatrixPtr inV = getInputValue(0);
MatrixPtr inG = getInputGrad(0);
MatrixPtr outV = getOutputValue();
MatrixPtr outG = getOutputGrad();
size_t batchSize = inV->getHeight();
// inG[ij] += outG[ij] / l2NormReciprocal
// inG[ij] += -inV[ij] * l2NormReciprocal * l2NormReciprocal * DotMul(outG[i],
// inV[i])
if (inG) {
Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_);
dotSum_->zeroMem();
dotSum_->rowDotMul(0, *outG, *outV);
dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
inSquare_->rowScale(0, *inV, *dotSum_);
inG->sub(*inSquare_);
inG->addRowScale(0, *outG, *l2NormReciprocal_);
}
}
} // namespace paddle

@ -1899,6 +1899,36 @@ TEST(Layer, CropLayer) {
} }
} }
TEST(Layer, ClipLayer) {
const size_t batchSize = 128;
const size_t size = 512;
TestConfig config;
config.layerConfig.set_type("clip");
config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
LayerInputConfig* input = config.layerConfig.add_inputs();
ClipConfig* layerConf = input->mutable_clip_conf();
double p1 = std::rand() / (double)RAND_MAX;
double p2 = std::rand() / (double)RAND_MAX;
layerConf->set_min(std::min(p1, p2));
layerConf->set_max(std::max(p1, p2));
for (auto useGpu : {false, true}) {
testLayerGrad(config, "clip", batchSize, false, useGpu, false);
}
}
TEST(Layer, RowL2NormLayer) {
const size_t batchSize = 128;
const size_t size = 512;
TestConfig config;
config.layerConfig.set_type("row_l2_norm");
config.layerConfig.set_size(size);
config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
config.layerConfig.add_inputs();
for (auto useGpu : {false, true}) {
testLayerGrad(config, "row_l2_norm", batchSize, false, useGpu, false);
}
}
int main(int argc, char** argv) { int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
initMain(argc, argv); initMain(argc, argv);

@ -442,6 +442,12 @@ DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, TWO_PARAMETER,
template<class T> template<class T>
void BaseMatrixT<T>::clip(T p1, T p2) { applyUnary(unary::Clip<T>(p1, p2)); } void BaseMatrixT<T>::clip(T p1, T p2) { applyUnary(unary::Clip<T>(p1, p2)); }
DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, TWO_PARAMETER, a = b < p1 ? 0 : (b > p2 ? 0 : 1));
template<class T>
void BaseMatrixT<T>::clipDerivative(BaseMatrixT& b, T p1, T p2) {
applyBinary(binary::ClipDerivative<T>(p1, p2), b);
}
DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, ONE_PARAMETER, DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, ONE_PARAMETER,
a = a > p ? 1.0f : 0.0f); a = a > p ? 1.0f : 0.0f);
template<class T> template<class T>

@ -488,6 +488,13 @@ public:
*/ */
void clip(T p1, T p2); void clip(T p1, T p2);
/**
* this = b < low ? 0 : 1
*
* this = b > high ? 0 : 1
*/
void clipDerivative(BaseMatrixT& b, T p1, T p2);
/** /**
* @code * @code
* a = a > p ? 1.0f : 0.0f * a = a > p ? 1.0f : 0.0f

@ -60,10 +60,5 @@ op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)
op_library(fc_op op_library(fc_op
SRCS fc_op.cc SRCS fc_op.cc
DEPS mul_op rowwise_add_op sigmoid_op softmax_op net) DEPS mul_op rowwise_add_op sigmoid_op softmax_op net)
op_library(recurrent_op SRCS recurrent_op.cc DEPS op_desc tensor op_registry operator net)
op_library(recurrent_network_op cc_test(recurrent_op_test SRCS recurrent_op_test.cc DEPS recurrent_op gtest mul_op add_op)
SRCS recurrent_network_op.cc
DEPS op_desc tensor net)
cc_test(recurrent_network_op_test
SRCS recurrent_network_op_test.cc
DEPS recurrent_network_op mul_op add_op)

@ -50,10 +50,6 @@ The equation is: Out = X + Y
class AddOpGrad : public OperatorWithKernel { class AddOpGrad : public OperatorWithKernel {
protected: protected:
void InferShape(const InferShapeContext &ctx) const override {} void InferShape(const InferShapeContext &ctx) const override {}
std::string DebugString() const override {
LOG(INFO) << "AddOpGrad";
return "";
}
}; };
} // namespace operators } // namespace operators

@ -1,3 +1,4 @@
#define EIGEN_USE_GPU
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/operators/add_op.h" #include "paddle/operators/add_op.h"

@ -28,10 +28,13 @@ public:
output->mutable_data<T>(context.GetPlace()); output->mutable_data<T>(context.GetPlace());
EigenVector<T>::Flatten(*output).device( auto X = EigenVector<T>::Flatten(*input0);
*(context.GetEigenDevice<Place>())) = auto Y = EigenVector<T>::Flatten(*input1);
framework::EigenVector<T>::Flatten(*input0) + auto Z = EigenVector<T>::Flatten(*output);
framework::EigenVector<T>::Flatten(*input1);
auto place = context.GetEigenDevice<Place>();
Z.device(place) = X + Y;
} }
}; };

@ -1,3 +1,4 @@
#define EIGEN_USE_GPU
#include "paddle/operators/cross_entropy_op.h" #include "paddle/operators/cross_entropy_op.h"
REGISTER_OP_GPU_KERNEL(onehot_cross_entropy, REGISTER_OP_GPU_KERNEL(onehot_cross_entropy,

@ -33,13 +33,23 @@ public:
MeanOpMaker(OpProto *proto, OpAttrChecker *op_checker) MeanOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input of mean op"); AddInput("X", "The input of mean op");
AddOutput("Out", "The output of mean op"); AddOutput("Out", "The output of mean op").IgnoreGradient();
AddComment("Mean Operator"); AddComment("Mean Operator");
} }
}; };
class MeanGradOp : public OperatorWithKernel {
protected:
void InferShape(const InferShapeContext &ctx) const override {
ctx.Output<Tensor>("X" + GRAD_VAR_SUFFIX())
->Resize(ctx.Input<Tensor>("X")->dims());
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP(mean, ops::MeanOp, ops::MeanOpMaker); REGISTER_OP(mean, ops::MeanOp, ops::MeanOpMaker);
REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel<ops::CPUPlace, float>); REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel<ops::CPUPlace, float>);
REGISTER_GRADIENT_OP(mean, mean_grad, ops::MeanGradOp);
REGISTER_OP_CPU_KERNEL(mean_grad, ops::MeanGradKernel<ops::CPUPlace, float>);

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save