Merge remote-tracking branch 'ups/develop' into feature/op/fusion_lstm

7 years ago · a72f68f223
parent df28a3b452 17da113c87
commit a72f68f223
51 changed files with 746 additions and 251 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -204,11 +204,12 @@ include(external/snappy)    # download snappy
 include(external/snappystream)
 include(external/threadpool)
 set(WITH_ANAKIN OFF CACHE STRING "Disable Anakin first, will add it later." FORCE)
 if(WITH_GPU)
    include(cuda)
    include(tensorrt)
    include(external/anakin)
 elseif()
    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
 endif()
 include(cudnn)              # set cudnn libraries, must before configure
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@ -104,6 +104,10 @@ if(WITH_GPU)
        if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
            message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile")
        endif()
        set(ENV{CUDNN_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR})
        set(ENV{CUDNN_LIBRARY} ${CUDNN_LIBRARY})
        message(STATUS "cudnn include header is ${CUDNN_INCLUDE_DIR}/cudnn.h")
        message(STATUS "cudnn library is ${CUDNN_LIBRARY}")
    endif()
 elseif(WITH_AMD_GPU)
    add_definitions(-DPADDLE_WITH_HIP)
--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@ -35,9 +35,8 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
 ExternalProject_Add(
    extern_anakin
    ${EXTERNAL_PROJECT_LOG_ARGS}
-    # TODO(luotao): use PaddlePaddle/Anakin later
+    GIT_REPOSITORY      "https://github.com/PaddlePaddle/Anakin"
-    GIT_REPOSITORY      "https://github.com/luotao1/Anakin"
+    GIT_TAG             "04256ba78fa3da0beb74e8036c8efd68c12824d6"
    GIT_TAG             "3957ae9263eaa0b1986758dac60a88852afb09be"
    PREFIX              ${ANAKIN_SOURCE_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DUSE_GPU_PLACE=YES
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -155,10 +155,11 @@ paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale',
 paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@ -20,6 +20,9 @@
 DEFINE_int32(io_threadpool_size, 100,
             "number of threads used for doing IO, default 100");
 DEFINE_int32(dist_threadpool_size, 0,
             "number of threads used for distributed executed.");
 namespace paddle {
 namespace framework {
@ -35,6 +38,10 @@ void ThreadPool::Init() {
  if (threadpool_.get() == nullptr) {
    // TODO(Yancey1989): specify the max threads number
    int num_threads = std::thread::hardware_concurrency();
    if (FLAGS_dist_threadpool_size > 0) {
      num_threads = FLAGS_dist_threadpool_size;
      VLOG(1) << "set dist_threadpool_size to " << num_threads;
    }
    PADDLE_ENFORCE_GT(num_threads, 0);
    threadpool_.reset(new ThreadPool(num_threads));
  }
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@ -60,7 +60,7 @@ cc_library(paddle_inference_tensorrt_subgraph_engine
 inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
 endif()
-if (WITH_ANAKIN) # only needed in CI
+if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
    # compile the libinference_anakin_api.a and anakin.so.
    nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
    #nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin)
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@ -26,6 +26,8 @@ namespace plat = paddle::platform;
      act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext,   \
                                                 ops::grad_functor<float>>, \
      ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
-                                ops::grad_functor<double>>);
+                                ops::grad_functor<double>>,                 \
      ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
                                ops::grad_functor<plat::float16>>);
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@ -333,8 +333,7 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    const Out out_conj = Eigen::numext::conj(out);
+    dx.device(d) = static_cast<T>(0.5) * dout / out;
    dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
  }
 };
@ -740,7 +739,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
    dx.device(d) = dout * static_cast<T>(factor) *
-                   x.pow(static_cast<T>(factor - static_cast<T>(1)));
+                   x.pow(static_cast<T>(factor) - static_cast<T>(1));
  }
 };
@ -863,10 +862,11 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
    T b = static_cast<T>(beta);
    auto temp1 = static_cast<T>(1) /
-                 (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
+                 (static_cast<T>(1) + (static_cast<T>(-b) * x).exp());
-    auto temp2 = temp1 * (static_cast<T>(1) - (beta * out));
+    auto temp2 = temp1 * (static_cast<T>(1) - (b * out));
-    dx.device(d) = dout * ((beta * out) + temp2);
+    dx.device(d) = dout * ((b * out) + temp2);
  }
 };
--- a/paddle/fluid/operators/assign_value_op.cu.cc
+++ b/paddle/fluid/operators/assign_value_op.cu.cc
@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/assign_value_op.h"
 #include "paddle/fluid/platform/float16.h"
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel<int>,
-                        ops::AssignValueKernel<float>);
+                        ops::AssignValueKernel<float>,
                        ops::AssignValueKernel<plat::float16>);
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@ -39,6 +39,27 @@ using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
    static_cast<size_t>(1024) * 1024 * 1024;
 template <typename T, typename DeviceContext>
 // bool EnableFp16(const T& dummy, const DeviceContext& dev_ctx,
 bool EnableFp16(const DeviceContext& dev_ctx,
                cudnnConvolutionDescriptor_t cudnn_conv_desc) {
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
  // Tensor core is supported since the volta GPU and
  // is only enabled when input and filter data are float16
  if (dev_ctx.GetComputeCapability() >= 70 &&
      std::type_index(typeid(T)) ==
          std::type_index(typeid(platform::float16))) {
    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
        cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
    return true;
  } else {
    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
        cudnn_conv_desc, CUDNN_DEFAULT_MATH));
  }
 #endif
  return false;
 }
 template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
 public:
@ -128,27 +149,14 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
    cudnnConvolutionFwdAlgo_t algo;
    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    auto handle = dev_ctx.cudnn_handle();
-
+    if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
        cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
        workspace_size_limit, &algo));
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
    // Tensor core is supported since the volta GPU and
    // is only enabled when input and filter data are float16
    if (dev_ctx.GetComputeCapability() >= 70 &&
        std::type_index(typeid(T)) ==
            std::type_index(typeid(platform::float16))) {
      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
          cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
      // Currently tensor core is only enabled using this algo
      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
    } else {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+      PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          cudnn_conv_desc, CUDNN_DEFAULT_MATH));
+          handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
          cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
          workspace_size_limit, &algo));
    }
 #endif
    // get workspace size able to allocate
    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
@ -288,6 +296,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
      } else {
        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
      }
      if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
      }
      CUDNN_ENFORCE(
          platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
@ -307,6 +318,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
      } else {
        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
      }
      if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
      }
      CUDNN_ENFORCE(
          platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
@ -362,7 +376,8 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<double>);
+                   paddle::operators::CUDNNConvGradOpKernel<double>,
                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvOpKernel<float>,
@ -370,4 +385,5 @@ REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<double>);
+                   paddle::operators::CUDNNConvGradOpKernel<double>,
                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>)
--- a/paddle/fluid/operators/cross_entropy_op.cu
+++ b/paddle/fluid/operators/cross_entropy_op.cu
@ -13,12 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/cross_entropy_op.h"
 #include "paddle/fluid/platform/float16.h"
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 using CUDACtx = paddle::platform::CUDADeviceContext;
 REGISTER_OP_CUDA_KERNEL(cross_entropy,
                        ops::CrossEntropyOpKernel<CUDACtx, float>,
-                        ops::CrossEntropyOpKernel<CUDACtx, double>);
+                        ops::CrossEntropyOpKernel<CUDACtx, double>,
-REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
+                        ops::CrossEntropyOpKernel<CUDACtx, plat::float16>);
-                        ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
+REGISTER_OP_CUDA_KERNEL(
-                        ops::CrossEntropyGradientOpKernel<CUDACtx, double>);
+    cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
    ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
    ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@ -190,12 +190,15 @@ bool VariableResponse::ProcSerializedField(
 #endif
  }
  VLOG(7) << "ProcSerializedField:" << meta_.varname()
          << ", type:" << meta_.type() << std::endl;
  framework::DDim dims = GetDims(meta_.dims());
  if (meta_.type() == sendrecv::LOD_TENSOR) {
    PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!");
    if (!CopyLodTensorData(input, *dev_ctx_, dims, num_bytes)) {
      return false;
    }
    return true;
  }
@ -206,7 +209,9 @@ bool VariableResponse::ProcSerializedField(
    return true;
  }
-  return true;
+  PADDLE_ENFORCE("not supported var types:", meta_.varname(), meta_.type());
  return false;
 }
 };  // namespace distributed
--- a/paddle/fluid/operators/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise_add_op.cu
@ -30,4 +30,5 @@ REGISTER_OP_CUDA_KERNEL(
    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>,
    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, plat::float16>);
--- a/paddle/fluid/operators/elementwise_div_op.cu
+++ b/paddle/fluid/operators/elementwise_div_op.cu
@ -14,19 +14,24 @@ limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise_div_op.h"
 #include "paddle/fluid/platform/float16.h"
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
    elementwise_div,
    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>,
    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
                              plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
    elementwise_div_grad,
    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
-                                  int64_t>);
+                                  plat::float16>);
--- a/paddle/fluid/operators/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise_mul_op.cu
@ -14,19 +14,25 @@ limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise_mul_op.h"
 #include "paddle/fluid/platform/float16.h"
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
    elementwise_mul,
    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>,
    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext,
                              plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
    elementwise_mul_grad,
    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>,
    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
                                  plat::float16>,
    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
                                  int64_t>);
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@ -350,7 +350,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
  int j = blockIdx.x;
  int i = threadIdx.x;
  int tid = threadIdx.x;
-  T val = 0;
+  T val(0);
  do {
    int x_offset = i * w + j;
@ -418,7 +418,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
  int tid = threadIdx.x;
  int j = blockIdx.x;
-  T val = 0;
+  T val(0);
  int ttid = tid;
  while (true) {
--- a/paddle/fluid/operators/elementwise_sub_op.cu
+++ b/paddle/fluid/operators/elementwise_sub_op.cu
@ -14,19 +14,25 @@ limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise_sub_op.h"
 #include "paddle/fluid/platform/float16.h"
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
    elementwise_sub,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
                              plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
    elementwise_sub_grad,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
                                  plat::float16>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
                                  int64_t>);
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@ -12,48 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/operators/fill_constant_op.h"
-#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
 namespace paddle {
 namespace operators {
-class FillConstantInferShape : public framework::InferShapeBase {
+class FillConstantOp : public framework::OperatorWithKernel {
 public:
-  void operator()(framework::InferShapeContext *ctx) const override {
+  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of FillConstantOp should not be null.");
-    auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    auto& shape = ctx->Attrs().Get<std::vector<int>>("shape");
    ctx->SetOutputDim("Out", framework::make_ddim(shape));
  }
 };
 class FillConstantOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;
 private:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &dev_place) const override {
    auto data_type =
        static_cast<framework::proto::VarType::Type>(Attr<int>("dtype"));
    auto value = Attr<float>("value");
    auto force_cpu = Attr<bool>("force_cpu");
    auto &out =
        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
    out.Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
    if (force_cpu) {
      auto cpu = platform::CPUPlace();
      out.mutable_data(cpu, framework::ToTypeIndex(data_type));
    } else {
      out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
    }
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  framework::OpKernelType GetExpectedKernelType(
-    auto &dev_ctx = *pool.Get(dev_place);
+      const framework::ExecutionContext& ctx) const override {
-    math::set_constant(dev_ctx, &out, value);
+    return framework::OpKernelType(
        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
        ctx.device_context());
  }
 };
@ -87,6 +67,11 @@ Fill up a variable with specified constant value.
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
+REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker,
                  ops::FillConstantInferShape, ops::FillConstantOpMaker,
                  paddle::framework::EmptyGradOpMaker);
 REGISTER_OP_CPU_KERNEL(
    fill_constant,
    ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, double>,
    ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, int>,
    ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, int64_t>)
--- a/paddle/fluid/operators/fill_constant_op.cu.cc
+++ b/paddle/fluid/operators/fill_constant_op.cu.cc
@ -0,0 +1,26 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/operators/fill_constant_op.h"
 #include "paddle/fluid/platform/float16.h"
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
    fill_constant,
    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, float>,
    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, double>,
    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, int>,
    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext,
                              paddle::platform::float16>)
--- a/paddle/fluid/operators/fill_constant_op.h
+++ b/paddle/fluid/operators/fill_constant_op.h
@ -0,0 +1,48 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 namespace paddle {
 namespace operators {
 template <typename DeviceContext, typename T>
 class FillConstantOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto data_type =
        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
    auto value = ctx.Attr<float>("value");
    auto force_cpu = ctx.Attr<bool>("force_cpu");
    auto* out = ctx.Output<framework::Tensor>("Out");
    out->Resize(framework::make_ddim(ctx.Attr<std::vector<int>>("shape")));
    if (force_cpu) {
      auto cpu = platform::CPUPlace();
      out->mutable_data(cpu, framework::ToTypeIndex(data_type));
    } else {
      out->mutable_data(ctx.GetPlace(), framework::ToTypeIndex(data_type));
    }
    math::set_constant(ctx.template device_context<DeviceContext>(), out,
                       value);
  }
 };
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/fill_op.cc
+++ b/paddle/fluid/operators/fill_op.cc
@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/float16.h"
 namespace paddle {
 namespace operators {
@ -69,7 +70,6 @@ class FillOp : public framework::OperatorBase {
    framework::VisitDataType(
        dtype, FillOpVisitor(&tensor, Attr<std::vector<float>>("value")));
    if (!force_cpu && platform::is_gpu_place(place)) {
      // Copy tensor to out
      platform::DeviceContextPool &pool =
--- a/paddle/fluid/operators/gaussian_random_op.cu
+++ b/paddle/fluid/operators/gaussian_random_op.cu
@ -15,6 +15,7 @@ limitations under the License. */
 #include <thrust/transform.h>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/float16.h"
 namespace paddle {
 namespace operators {
@ -60,6 +61,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(gaussian_random,
                        paddle::operators::GPUGaussianRandomKernel<float>,
                        paddle::operators::GPUGaussianRandomKernel<double>);
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@ -123,8 +123,11 @@ void ListenAndServOp::RunSyncLoop(
      optimize_prepared.begin(),
      std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
  // Trainers will get all parameters from pserver in the
  // startup program, so we will wait RequestGet first
  rpc_service_->SetCond(distributed::kRequestGet);
  rpc_service_->WaitBarrier(distributed::kRequestGet);
  rpc_service_->ResetBarrierCounter();
  while (true) {
    rpc_service_->Profiler().OneStep();
    // Get from multiple trainers, we don't care about the order in which
--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
@ -15,11 +15,25 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/float16.h"
 namespace paddle {
 namespace operators {
 namespace math {
 template <typename T>
 HOSTDEVICE T log(const T& val) {
  return std::log(val);
 }
 template <>
 HOSTDEVICE platform::float16 log(const platform::float16& val) {
  // strage bug, hlog is not exists.
  return static_cast<float16>(0);
  // half tmp = static_cast<half>(val);
  // return static_cast<platform::float16>(hlog(tmp));
 }
 namespace {
 template <typename T>
 __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
@ -35,12 +49,12 @@ template <typename T>
 __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
                                       const int class_num) {
  int tid = threadIdx.x;
-  T val = 0;
+  T val(0);
  int idx = blockIdx.x * class_num + tid;
  int end = blockIdx.x * class_num + class_num;
  for (; idx < end; idx += blockDim.x) {
-    val += math::TolerableValue<T>()(std::log(X[idx])) * label[idx];
+    val += math::TolerableValue<T>()(log(X[idx])) * label[idx];
  }
  val = paddle::platform::reduceSum(val, tid, blockDim.x);
@ -84,6 +98,8 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
 template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
 template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
 template class CrossEntropyFunctor<platform::CUDADeviceContext,
                                   platform::float16>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/math/cross_entropy.h
+++ b/paddle/fluid/operators/math/cross_entropy.h
@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include <limits>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/hostdevice.h"
 namespace paddle {
@ -33,6 +35,21 @@ struct TolerableValue {
  }
 };
 // float16 value clip behave different.
 using paddle::platform::float16;
 using paddle::platform::isfinite;
 template <>
 struct TolerableValue<float16> {
  HOSTDEVICE float16 operator()(const float16& x) const {
    if (isfinite(x))
      return x;
    else if (x > static_cast<float16>(0))
      return std::numeric_limits<float16>::max();
    else
      return std::numeric_limits<float16>::min();
  }
 };
 template <typename DeviceContext, typename T>
 class CrossEntropyFunctor {
 public:
--- a/Show More
+++ b/Show More