Merge remote-tracking branch 'ups/develop' into feature/op/fusion_lstm

dataset_flowers_-_change_md5
tensor-tang 7 years ago
commit a72f68f223

@ -204,11 +204,12 @@ include(external/snappy) # download snappy
include(external/snappystream)
include(external/threadpool)
set(WITH_ANAKIN OFF CACHE STRING "Disable Anakin first, will add it later." FORCE)
if(WITH_GPU)
include(cuda)
include(tensorrt)
include(external/anakin)
elseif()
set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
endif()
include(cudnn) # set cudnn libraries, must before configure

@ -104,6 +104,10 @@ if(WITH_GPU)
if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile")
endif()
set(ENV{CUDNN_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR})
set(ENV{CUDNN_LIBRARY} ${CUDNN_LIBRARY})
message(STATUS "cudnn include header is ${CUDNN_INCLUDE_DIR}/cudnn.h")
message(STATUS "cudnn library is ${CUDNN_LIBRARY}")
endif()
elseif(WITH_AMD_GPU)
add_definitions(-DPADDLE_WITH_HIP)

@ -35,9 +35,8 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
ExternalProject_Add(
extern_anakin
${EXTERNAL_PROJECT_LOG_ARGS}
# TODO(luotao): use PaddlePaddle/Anakin later
GIT_REPOSITORY "https://github.com/luotao1/Anakin"
GIT_TAG "3957ae9263eaa0b1986758dac60a88852afb09be"
GIT_REPOSITORY "https://github.com/PaddlePaddle/Anakin"
GIT_TAG "04256ba78fa3da0beb74e8036c8efd68c12824d6"
PREFIX ${ANAKIN_SOURCE_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DUSE_GPU_PLACE=YES

@ -155,10 +155,11 @@ paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale',
paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))

@ -20,6 +20,9 @@
DEFINE_int32(io_threadpool_size, 100,
"number of threads used for doing IO, default 100");
DEFINE_int32(dist_threadpool_size, 0,
"number of threads used for distributed executed.");
namespace paddle {
namespace framework {
@ -35,6 +38,10 @@ void ThreadPool::Init() {
if (threadpool_.get() == nullptr) {
// TODO(Yancey1989): specify the max threads number
int num_threads = std::thread::hardware_concurrency();
if (FLAGS_dist_threadpool_size > 0) {
num_threads = FLAGS_dist_threadpool_size;
VLOG(1) << "set dist_threadpool_size to " << num_threads;
}
PADDLE_ENFORCE_GT(num_threads, 0);
threadpool_.reset(new ThreadPool(num_threads));
}

@ -60,7 +60,7 @@ cc_library(paddle_inference_tensorrt_subgraph_engine
inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
endif()
if (WITH_ANAKIN) # only needed in CI
if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
# compile the libinference_anakin_api.a and anakin.so.
nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
#nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin)

@ -26,6 +26,8 @@ namespace plat = paddle::platform;
act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<double>>);
ops::grad_functor<double>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::float16>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);

@ -333,8 +333,7 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
const Out out_conj = Eigen::numext::conj(out);
dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
dx.device(d) = static_cast<T>(0.5) * dout / out;
}
};
@ -740,7 +739,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * static_cast<T>(factor) *
x.pow(static_cast<T>(factor - static_cast<T>(1)));
x.pow(static_cast<T>(factor) - static_cast<T>(1));
}
};
@ -863,10 +862,11 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
T b = static_cast<T>(beta);
auto temp1 = static_cast<T>(1) /
(static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
auto temp2 = temp1 * (static_cast<T>(1) - (beta * out));
dx.device(d) = dout * ((beta * out) + temp2);
(static_cast<T>(1) + (static_cast<T>(-b) * x).exp());
auto temp2 = temp1 * (static_cast<T>(1) - (b * out));
dx.device(d) = dout * ((b * out) + temp2);
}
};

@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/assign_value_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel<int>,
ops::AssignValueKernel<float>);
ops::AssignValueKernel<float>,
ops::AssignValueKernel<plat::float16>);

@ -39,6 +39,27 @@ using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
static_cast<size_t>(1024) * 1024 * 1024;
template <typename T, typename DeviceContext>
// bool EnableFp16(const T& dummy, const DeviceContext& dev_ctx,
bool EnableFp16(const DeviceContext& dev_ctx,
cudnnConvolutionDescriptor_t cudnn_conv_desc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor core is supported since the volta GPU and
// is only enabled when input and filter data are float16
if (dev_ctx.GetComputeCapability() >= 70 &&
std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16))) {
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
return true;
} else {
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
}
#endif
return false;
}
template <typename T>
class CUDNNConvOpKernel : public framework::OpKernel<T> {
public:
@ -128,27 +149,14 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnnConvolutionFwdAlgo_t algo;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor core is supported since the volta GPU and
// is only enabled when input and filter data are float16
if (dev_ctx.GetComputeCapability() >= 70 &&
std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16))) {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
// Currently tensor core is only enabled using this algo
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
} else {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
}
#endif
// get workspace size able to allocate
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
@ -288,6 +296,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
} else {
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
}
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
}
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
@ -307,6 +318,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
} else {
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
}
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
}
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
@ -362,7 +376,8 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>);
paddle::operators::CUDNNConvGradOpKernel<double>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
@ -370,4 +385,5 @@ REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>);
paddle::operators::CUDNNConvGradOpKernel<double>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>)

@ -13,12 +13,16 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/cross_entropy_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
using CUDACtx = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(cross_entropy,
ops::CrossEntropyOpKernel<CUDACtx, float>,
ops::CrossEntropyOpKernel<CUDACtx, double>);
REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
ops::CrossEntropyGradientOpKernel<CUDACtx, double>);
ops::CrossEntropyOpKernel<CUDACtx, double>,
ops::CrossEntropyOpKernel<CUDACtx, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);

@ -190,12 +190,15 @@ bool VariableResponse::ProcSerializedField(
#endif
}
VLOG(7) << "ProcSerializedField:" << meta_.varname()
<< ", type:" << meta_.type() << std::endl;
framework::DDim dims = GetDims(meta_.dims());
if (meta_.type() == sendrecv::LOD_TENSOR) {
PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!");
if (!CopyLodTensorData(input, *dev_ctx_, dims, num_bytes)) {
return false;
}
return true;
}
@ -206,7 +209,9 @@ bool VariableResponse::ProcSerializedField(
return true;
}
return true;
PADDLE_ENFORCE("not supported var types:", meta_.varname(), meta_.type());
return false;
}
}; // namespace distributed

@ -30,4 +30,5 @@ REGISTER_OP_CUDA_KERNEL(
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>,
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, plat::float16>);

@ -14,19 +14,24 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_div_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
elementwise_div,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
REGISTER_OP_CUDA_KERNEL(
elementwise_div_grad,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
int64_t>);
plat::float16>);

@ -14,19 +14,25 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_mul_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
elementwise_mul,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>);
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
REGISTER_OP_CUDA_KERNEL(
elementwise_mul_grad,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
int64_t>);

@ -350,7 +350,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
int j = blockIdx.x;
int i = threadIdx.x;
int tid = threadIdx.x;
T val = 0;
T val(0);
do {
int x_offset = i * w + j;
@ -418,7 +418,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
int tid = threadIdx.x;
int j = blockIdx.x;
T val = 0;
T val(0);
int ttid = tid;
while (true) {

@ -14,19 +14,25 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_sub_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
elementwise_sub,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
REGISTER_OP_CUDA_KERNEL(
elementwise_sub_grad,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
int64_t>);

@ -12,48 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
class FillConstantInferShape : public framework::InferShapeBase {
class FillConstantOp : public framework::OperatorWithKernel {
public:
void operator()(framework::InferShapeContext *ctx) const override {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FillConstantOp should not be null.");
auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
auto& shape = ctx->Attrs().Get<std::vector<int>>("shape");
ctx->SetOutputDim("Out", framework::make_ddim(shape));
}
};
class FillConstantOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(Attr<int>("dtype"));
auto value = Attr<float>("value");
auto force_cpu = Attr<bool>("force_cpu");
auto &out =
*scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
out.Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
if (force_cpu) {
auto cpu = platform::CPUPlace();
out.mutable_data(cpu, framework::ToTypeIndex(data_type));
} else {
out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
math::set_constant(dev_ctx, &out, value);
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
ctx.device_context());
}
};
@ -87,6 +67,11 @@ Fill up a variable with specified constant value.
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
ops::FillConstantInferShape, ops::FillConstantOpMaker,
REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
fill_constant,
ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, int64_t>)

@ -0,0 +1,26 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
fill_constant,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>)

@ -0,0 +1,48 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class FillConstantOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
auto value = ctx.Attr<float>("value");
auto force_cpu = ctx.Attr<bool>("force_cpu");
auto* out = ctx.Output<framework::Tensor>("Out");
out->Resize(framework::make_ddim(ctx.Attr<std::vector<int>>("shape")));
if (force_cpu) {
auto cpu = platform::CPUPlace();
out->mutable_data(cpu, framework::ToTypeIndex(data_type));
} else {
out->mutable_data(ctx.GetPlace(), framework::ToTypeIndex(data_type));
}
math::set_constant(ctx.template device_context<DeviceContext>(), out,
value);
}
};
} // namespace operators
} // namespace paddle

@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
@ -69,7 +70,6 @@ class FillOp : public framework::OperatorBase {
framework::VisitDataType(
dtype, FillOpVisitor(&tensor, Attr<std::vector<float>>("value")));
if (!force_cpu && platform::is_gpu_place(place)) {
// Copy tensor to out
platform::DeviceContextPool &pool =

@ -15,6 +15,7 @@ limitations under the License. */
#include <thrust/transform.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
@ -60,6 +61,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(gaussian_random,
paddle::operators::GPUGaussianRandomKernel<float>,
paddle::operators::GPUGaussianRandomKernel<double>);

@ -123,8 +123,11 @@ void ListenAndServOp::RunSyncLoop(
optimize_prepared.begin(),
std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
// Trainers will get all parameters from pserver in the
// startup program, so we will wait RequestGet first
rpc_service_->SetCond(distributed::kRequestGet);
rpc_service_->WaitBarrier(distributed::kRequestGet);
rpc_service_->ResetBarrierCounter();
while (true) {
rpc_service_->Profiler().OneStep();
// Get from multiple trainers, we don't care about the order in which

@ -15,11 +15,25 @@ limitations under the License. */
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
HOSTDEVICE T log(const T& val) {
return std::log(val);
}
template <>
HOSTDEVICE platform::float16 log(const platform::float16& val) {
// strage bug, hlog is not exists.
return static_cast<float16>(0);
// half tmp = static_cast<half>(val);
// return static_cast<platform::float16>(hlog(tmp));
}
namespace {
template <typename T>
__global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
@ -35,12 +49,12 @@ template <typename T>
__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
const int class_num) {
int tid = threadIdx.x;
T val = 0;
T val(0);
int idx = blockIdx.x * class_num + tid;
int end = blockIdx.x * class_num + class_num;
for (; idx < end; idx += blockDim.x) {
val += math::TolerableValue<T>()(std::log(X[idx])) * label[idx];
val += math::TolerableValue<T>()(log(X[idx])) * label[idx];
}
val = paddle::platform::reduceSum(val, tid, blockDim.x);
@ -84,6 +98,8 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
template class CrossEntropyFunctor<platform::CUDADeviceContext,
platform::float16>;
} // namespace math
} // namespace operators
} // namespace paddle

@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <limits>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
@ -33,6 +35,21 @@ struct TolerableValue {
}
};
// float16 value clip behave different.
using paddle::platform::float16;
using paddle::platform::isfinite;
template <>
struct TolerableValue<float16> {
HOSTDEVICE float16 operator()(const float16& x) const {
if (isfinite(x))
return x;
else if (x > static_cast<float16>(0))
return std::numeric_limits<float16>::max();
else
return std::numeric_limits<float16>::min();
}
};
template <typename DeviceContext, typename T>
class CrossEntropyFunctor {
public:

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save