Merge remote-tracking branch 'origin/develop' into windows/debug

test=develop
revert-14324-fix_vlog
dzhwinter 6 years ago
commit 234a1d9248

@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
+#include <cstddef>  // for size_t
 namespace paddle {
 namespace framework {
@@ -26,6 +27,7 @@ struct ExecutionStrategy {
   bool allow_op_delay_{false};
   size_t num_iteration_per_drop_scope_{100};
   ExecutorType type_{kDefault};
+  bool dry_run_{false};
 };
 }  // namespace details

@@ -128,7 +128,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
     size_t complete = 0;
     while (op_to_run != nullptr) {
       try {
+        if (LIKELY(!strategy_.dry_run_)) {
           op_to_run->Run(strategy_.use_cuda_);
+        }
         ++complete;
       } catch (...) {
         exception_.Catch(std::current_exception());

@@ -211,7 +211,9 @@ void ThreadedSSAGraphExecutor::RunOp(
     if (VLOG_IS_ON(10)) {
       VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
     }
+    if (LIKELY(!strategy_.dry_run_)) {
       op->Run(strategy_.use_cuda_);
+    }
     VLOG(10) << op << " " << op->Name() << " Done ";
     running_ops_--;
     ready_var_q->Extend(op->Outputs());

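Taken together, the three hunks above add a dry-run mode: ExecutionStrategy grows a dry_run_ flag, and both SSA graph executors skip op->Run() when it is set. LIKELY is presumably the usual __builtin_expect branch-prediction hint, so the extra check is nearly free on the hot path. A minimal caller sketch, assuming the strategy object is wired into an executor as usual (hypothetical usage, not part of this diff):

// Hypothetical: run the graph once with every op skipped, e.g. to
// exercise scheduling, scope and variable setup without real compute.
paddle::framework::details::ExecutionStrategy strategy;
strategy.dry_run_ = true;    // RunOpAsync/RunOp will skip op->Run(...)
strategy.use_cuda_ = false;  // field referenced in the hunks above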
@@ -48,7 +48,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   // Use topological sort algorithm
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
-  ~ThreadedSSAGraphExecutor() {}
+  ~ThreadedSSAGraphExecutor() final = default;
  private:
   void RunOp(const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q,

@@ -38,9 +38,20 @@ class ParallelExecutorPrivate {
   explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
       : places_(places) {}

+  ~ParallelExecutorPrivate() {
+    if (own_local_scope_) {
+      for (size_t i = 1; i < local_scopes_.size(); ++i) {
+        // Skip the first scope, since it is the global scope.
+        Scope *local_scope = local_scopes_[i];
+        if (global_scope_->HasKid(local_scope)) {
+          global_scope_->DeleteScope(local_scope);
+        }
+      }
+    }
+  }
+
   std::vector<platform::Place> places_;
   std::vector<Scope *> local_scopes_;
-  Scope *global_scope_;
+  Scope *global_scope_;  // not owned
   std::unique_ptr<details::SSAGraphExecutor> executor_;

 #ifdef PADDLE_WITH_CUDA
@@ -306,16 +317,6 @@ ParallelExecutor::~ParallelExecutor() {
   for (auto &p : member_->places_) {
     platform::DeviceContextPool::Instance().Get(p)->Wait();
   }
-  if (member_->own_local_scope_) {
-    for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
-      Scope *local_scope = member_->local_scopes_[i];
-      if (member_->global_scope_->HasKid(local_scope)) {
-        member_->global_scope_->DeleteScope(local_scope);
-      }
-    }
-  }
   // member_ must be destructed before gcs_ since the destructor of
   // ReferenceCountOpHandle use raw pointers of gcs_ inside.
   member_.reset();

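This hunk is the counterpart of the ParallelExecutorPrivate change above: the scope cleanup moves out of ~ParallelExecutor and into the pimpl's destructor, so the existing member_.reset() call (kept so member_ dies before gcs_) now also releases the per-place scopes. A generic sketch of that pimpl-RAII pattern, with hypothetical names:

#include <memory>

struct Impl {
  ~Impl() { /* delete child scopes here, as ~ParallelExecutorPrivate does */ }
};

struct Facade {
  ~Facade() {
    member_.reset();  // Impl's cleanup runs here, before anything that
                      // must outlive it is torn down later in this body
  }
  std::unique_ptr<Impl> member_;
};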
@@ -57,10 +57,10 @@ ThreadPool::ThreadPool(int num_threads) : running_(true) {
 ThreadPool::~ThreadPool() {
   {
     // notify all threads to stop running
-    std::lock_guard<std::mutex> l(mutex_);
+    std::unique_lock<std::mutex> l(mutex_);
     running_ = false;
+    scheduled_.notify_all();
   }
-  scheduled_.notify_all();

   for (auto& t : threads_) {
     t->join();
@@ -70,19 +70,25 @@ ThreadPool::~ThreadPool() {
 void ThreadPool::TaskLoop() {
   while (true) {
-    std::unique_lock<std::mutex> lock(mutex_);
+    Task task;
+
+    {
+      std::unique_lock<std::mutex> lock(mutex_);
       scheduled_.wait(
           lock, [this] { return !this->tasks_.empty() || !this->running_; });
-    if (!running_ || tasks_.empty()) {
+      if (!running_ && tasks_.empty()) {
         return;
       }
+
+      if (tasks_.empty()) {
+        PADDLE_THROW("This thread has no task to Run");
+      }
+
       // pop a task from the task queue
-    auto task = std::move(tasks_.front());
+      task = std::move(tasks_.front());
       tasks_.pop();
-    lock.unlock();
+    }

     // run the task
     task();

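The restructured TaskLoop is the standard condition-variable consumer: wait and pop while holding the mutex, then run the task with the mutex released so other workers and enqueuers are not serialized behind it. A self-contained sketch of that shape (generic names, not Paddle's):

#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>

std::mutex m;
std::condition_variable cv;
std::queue<std::function<void()>> q;
bool running = true;

void Worker() {
  while (true) {
    std::function<void()> job;
    {
      std::unique_lock<std::mutex> lk(m);
      cv.wait(lk, [] { return !q.empty() || !running; });
      // Stop only once the queue is drained, mirroring the || -> && fix.
      if (!running && q.empty()) return;
      job = std::move(q.front());
      q.pop();
    }        // lock released before running the task
    job();   // other workers can dequeue concurrently
  }
}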
@@ -69,7 +69,6 @@ class ThreadPool {
   template <typename Callback>
   std::future<std::unique_ptr<platform::EnforceNotMet>> RunAndGetException(
       Callback fn) {
-    std::unique_lock<std::mutex> lock(mutex_);
     Task task([fn]() -> std::unique_ptr<platform::EnforceNotMet> {
       try {
         fn();
@@ -84,7 +83,13 @@ class ThreadPool {
       return nullptr;
     });
     std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
+    {
+      std::unique_lock<std::mutex> lock(mutex_);
+      if (!running_) {
+        PADDLE_THROW("enqueue on stopped ThreadPool");
+      }
       tasks_.push(std::move(task));
+    }
     scheduled_.notify_one();
     return f;
   }

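With the lock now scoped to the push and a liveness check added, enqueueing on a stopped pool fails loudly instead of racing the destructor. A hypothetical caller, assuming the int constructor from the threadpool.cc hunk above is accessible:

paddle::framework::ThreadPool pool(4);
auto fut = pool.RunAndGetException([] { /* some work that may throw */ });
// get() returns a std::unique_ptr<platform::EnforceNotMet>; non-null
// means the callback threw and the exception was captured.
if (auto err = fut.get()) {
  // handle or log the captured error
}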
@@ -1,5 +1,5 @@
 if(WITH_TESTING)
-  include(test.cmake) # some generic cmake function for inference
+  include(tests/test.cmake) # some generic cmake function for inference
 endif()
 # analysis and tensorrt must be added before creating static library,
 # otherwise, there would be undefined reference to them in static library.

@@ -18,6 +18,21 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {

+bool to_skip_merging_optimize(TensorRTEngine* engine_,
+                              const std::vector<int>& filters,
+                              const std::vector<int>& strides,
+                              const std::vector<int>& paddings,
+                              std::string input_name) {
+  if (engine_->itensor_quote_num[input_name] > 0) {
+    return true;
+  }
+  if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
+      strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
+    engine_->itensor_quote_num[input_name] += 1;
+
+  return false;
+}
+
 class Conv2dOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
@@ -31,6 +46,7 @@ class Conv2dOpConverter : public OpConverter {
   PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);

   auto* X = engine_->GetITensor(op_desc.Input("Input").front());
   // Declare weights
   auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
   PADDLE_ENFORCE_NOT_NULL(Y_v);
@@ -83,7 +99,10 @@ class Conv2dOpConverter : public OpConverter {
       std::move(weight_tensor);
   layer->getOutput(0)->setName(output_name.c_str());
   engine_->SetITensor(output_name, layer->getOutput(0));
-  if (test_mode) {
+
+  if (test_mode ||
+      to_skip_merging_optimize(engine_, {filter_h, filter_w}, strides,
+                               paddings, op_desc.Input("Input").front())) {
     engine_->DeclareOutput(output_name);
   }
 }

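A worked trace of the new guard, assuming two consecutive 1 * 1 convolutions (stride 1, padding 0) that read the same input tensor "x":

// conv #1: itensor_quote_num["x"] == 0 -> to_skip_merging_optimize returns
//          false, but the 1x1/stride-1/pad-0 match bumps the counter to 1.
// conv #2: itensor_quote_num["x"] == 1 -> returns true, so the branch above
//          calls engine_->DeclareOutput(output_name), pinning this output
//          and keeping TensorRT from fusing the two convolutions.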
@@ -133,6 +133,10 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
   buffer_sizes_[name] = 0;
 }

+bool TensorRTEngine::HasDeclared(const std::string &name) {
+  return buffer_sizes_.count(name) > 0;
+}
+
 void TensorRTEngine::DeclareOutput(const std::string &name) {
   PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                     name);

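Since DeclareOutput(name) enforces that the name is not yet in buffer_sizes_, the new predicate gives converters a safe guard; a hypothetical call site:

if (!engine_->HasDeclared(output_name)) {
  engine_->DeclareOutput(output_name);  // safe: name not registered yet
}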
@@ -91,6 +91,8 @@ class TensorRTEngine : public EngineBase {
                         const std::string& name);
   // Set the itensor_map_[name] as the network's output, and set its name.
   void DeclareOutput(const std::string& name);
+  // Check if the ITensor has been declared
+  bool HasDeclared(const std::string& name);

   // GPU memory address for an ITensor with specific name. One can operate on
   // these memory directly for acceleration, for example, output the converted
@@ -132,6 +134,16 @@ class TensorRTEngine : public EngineBase {
   std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
       weight_map;

+  // TODO(NHZLX): paddle-trt currently has a bug when running googlenet: when
+  // more than two 1 * 1 convolutions share the same input, paddle-tensorrt
+  // applies a merging optimization that fuses those convs into one conv and
+  // thereby triggers the bug. Use this counter to avoid that optimization
+  // for the time being; the bug will be fixed in the future.
+  std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
+      itensor_quote_num;
+
  private:
   // the max batch size
   int max_batch_;

@@ -1,5 +1,11 @@
 set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)

+function(download_model install_dir model_name)
+  if (NOT EXISTS ${install_dir})
+    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name})
+  endif()
+endfunction()
+
 function(download_model_and_data install_dir model_name data_name)
   if (NOT EXISTS ${install_dir})
     inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name})
@@ -13,6 +19,13 @@ function(inference_analysis_api_test target install_dir filename)
           ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
 endfunction()

+function(inference_analysis_api_test_with_fake_data target install_dir filename model_name)
+  download_model(${install_dir} ${model_name})
+  inference_analysis_test(${target} SRCS ${filename}
+    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+    ARGS --infer_model=${install_dir}/model)
+endfunction()
+
 # RNN1
 if(NOT APPLE)
   set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
@@ -66,12 +79,8 @@ endif()
 inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)

 # resnet50
-set(RESNET50_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50")
-if (NOT EXISTS ${RESNET50_INSTALL_DIR})
-  inference_download_and_uncompress(${RESNET50_INSTALL_DIR} ${INFERENCE_URL} "resnet50_model.tar.gz")
-endif()
-inference_analysis_test(test_analyzer_resnet50 SRCS analyzer_resnet50_tester.cc
-  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_INSTALL_DIR}/model)
+inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
+  "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")

 # anakin
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI

@@ -30,25 +30,7 @@ void SetConfig(AnalysisConfig *cfg) {
 }

 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
-  PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
-  PaddleTensor input;
-  // channel=3, height/width=318
-  std::vector<int> shape({FLAGS_batch_size, 3, 318, 318});
-  input.shape = shape;
-  input.dtype = PaddleDType::FLOAT32;
-
-  // fill input data, for profile easily, do not use random data here.
-  size_t size = FLAGS_batch_size * 3 * 318 * 318;
-  input.data.Resize(size * sizeof(float));
-  float *input_data = static_cast<float *>(input.data.data());
-  for (size_t i = 0; i < size; i++) {
-    *(input_data + i) = static_cast<float>(i) / size;
-  }
-
-  std::vector<PaddleTensor> input_slots;
-  input_slots.assign({input});
-  (*inputs).emplace_back(input_slots);
+  SetFakeImageInput(inputs, FLAGS_infer_model);
 }

 // Easy for profiling independently.
@@ -61,13 +43,6 @@ void profile(bool use_mkldnn = false) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
   TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
-
-  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
-    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
-    size_t size = GetSize(outputs[0]);
-    // output is a 512-dimension feature
-    EXPECT_EQ(size, 512 * FLAGS_batch_size);
-  }
 }

 TEST(Analyzer_resnet50, profile) { profile(); }
@@ -83,8 +58,7 @@ TEST(Analyzer_resnet50, fuse_statis) {
   auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
   auto fuse_statis = GetFuseStatis(
       static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
-  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+  LOG(INFO) << "num_ops: " << num_ops;
 }

 // Compare result of NativeConfig and AnalysisConfig

@@ -25,6 +25,7 @@
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/inference/tests/test_helper.h"
 #include "paddle/fluid/platform/profiler.h"

 DEFINE_string(infer_model, "", "model path");
@@ -105,6 +106,34 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
   return fuse_statis;
 }

+void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
+                       const std::string &dirname) {
+  // Set fake_image_data
+  PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
+  std::vector<std::vector<int64_t>> feed_target_shapes =
+      GetFeedTargetShapes(dirname, true, "model", "params");
+  int dim1 = feed_target_shapes[0][1];
+  int dim2 = feed_target_shapes[0][2];
+  int dim3 = feed_target_shapes[0][3];
+
+  PaddleTensor input;
+  std::vector<int> shape({FLAGS_batch_size, dim1, dim2, dim3});
+  input.shape = shape;
+  input.dtype = PaddleDType::FLOAT32;
+
+  // fill input data, for profile easily, do not use random data here.
+  size_t size = FLAGS_batch_size * dim1 * dim2 * dim3;
+  input.data.Resize(size * sizeof(float));
+  float *input_data = static_cast<float *>(input.data.data());
+  for (size_t i = 0; i < size; i++) {
+    *(input_data + i) = static_cast<float>(i) / size;
+  }
+
+  std::vector<PaddleTensor> input_slots;
+  input_slots.assign({input});
+  (*inputs).emplace_back(input_slots);
+}
+
 void TestOneThreadPrediction(
     const AnalysisConfig &config,
     const std::vector<std::vector<PaddleTensor>> &inputs,

@@ -93,11 +93,16 @@ void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
   }
 }

-TEST(trt_models_test, main) {
-  std::vector<std::string> infer_models = {"mobilenet", "resnet50",
-                                           "resnext50"};
-  for (auto &model_dir : infer_models) {
-    CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + model_dir);
-  }
-}
+TEST(trt_models_test, mobilenet) {
+  CompareTensorRTWithFluid(1, FLAGS_dirname + "/mobilenet");
+}
+
+TEST(trt_models_test, resnet50) {
+  CompareTensorRTWithFluid(1, FLAGS_dirname + "/resnet50");
+}
+
+TEST(trt_models_test, resnext50) {
+  CompareTensorRTWithFluid(1, FLAGS_dirname + "/resnext50");
+}

 }  // namespace paddle

@@ -18,7 +18,6 @@ limitations under the License. */
 #include <string>
 #include <vector>

-#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -94,15 +93,15 @@ void CheckError(const paddle::framework::LoDTensor& output1,
 std::unique_ptr<paddle::framework::ProgramDesc> InitProgram(
     paddle::framework::Executor* executor, paddle::framework::Scope* scope,
-    const std::string& dirname, const bool is_combined = false) {
+    const std::string& dirname, const bool is_combined = false,
+    const std::string& prog_filename = "__model_combined__",
+    const std::string& param_filename = "__params_combined__") {
   std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
   if (is_combined) {
     // All parameters are saved in a single file.
     // Hard-coding the file names of program and parameters in unittest.
     // The file names should be consistent with that used in Python API
     // `fluid.io.save_inference_model`.
-    std::string prog_filename = "__model_combined__";
-    std::string param_filename = "__params_combined__";
     inference_program =
         paddle::inference::Load(executor, scope, dirname + "/" + prog_filename,
                                 dirname + "/" + param_filename);
@@ -115,12 +114,15 @@ std::unique_ptr<paddle::framework::ProgramDesc> InitProgram(
 }

 std::vector<std::vector<int64_t>> GetFeedTargetShapes(
-    const std::string& dirname, const bool is_combined = false) {
+    const std::string& dirname, const bool is_combined = false,
+    const std::string& prog_filename = "__model_combined__",
+    const std::string& param_filename = "__params_combined__") {
   auto place = paddle::platform::CPUPlace();
   auto executor = paddle::framework::Executor(place);
   auto* scope = new paddle::framework::Scope();
-  auto inference_program = InitProgram(&executor, scope, dirname, is_combined);
+  auto inference_program = InitProgram(&executor, scope, dirname, is_combined,
+                                       prog_filename, param_filename);

   auto& global_block = inference_program->Block(0);
   const std::vector<std::string>& feed_target_names =
@@ -136,15 +138,6 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes(
   return feed_target_shapes;
 }

-void Compile(paddle::framework::ProgramDesc* program) {
-  std::unique_ptr<paddle::framework::ir::Graph> g(
-      new paddle::framework::ir::Graph(*program));
-  auto pass = paddle::framework::ir::PassRegistry::Instance().Get(
-      "graph_to_program_pass");
-  pass->SetNotOwned<paddle::framework::ProgramDesc>("program", program);
-  pass->Apply(std::move(g));
-}
-
 template <typename Place, bool CreateVars = true, bool PrepareContext = false>
 void TestInference(const std::string& dirname,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
@@ -182,7 +175,6 @@ void TestInference(const std::string& dirname,
         paddle::platform::DeviceContextPool::Instance().Get(place));
     inference_program = InitProgram(&executor, scope, dirname, is_combined);
   }
-  Compile(inference_program.get());

   // Disable the profiler and print the timing information
   paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
@@ -261,5 +253,3 @@ void TestInference(const std::string& dirname,
   delete scope;
 }
-
-USE_PASS(graph_to_program_pass);

@@ -26,6 +26,8 @@ namespace plat = paddle::platform;
       act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext,   \
                                                  ops::grad_functor<float>>, \
       ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
-                                ops::grad_functor<double>>);
+                                ops::grad_functor<double>>,                 \
+      ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
+                                ops::grad_functor<plat::float16>>);

 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);

@@ -333,8 +333,7 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    const Out out_conj = Eigen::numext::conj(out);
-    dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
+    dx.device(d) = static_cast<T>(0.5) * dout / out;
   }
 };
@@ -740,7 +739,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = dout * static_cast<T>(factor) *
-                   x.pow(static_cast<T>(factor - static_cast<T>(1)));
+                   x.pow(static_cast<T>(factor) - static_cast<T>(1));
   }
 };

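Both hunks are small correctness fixes. For sqrt: out = sqrt(x), so d out/dx = 1/(2*sqrt(x)) = 1/(2*out), hence dx = 0.5 * dout / out; for real-valued types the conj() was a no-op, so dropping it changes nothing. For pow: casting factor to T before the subtraction keeps the exponent arithmetic entirely in T, which matters once low-precision types like the float16 registrations above come into play. A standalone numeric check of the sqrt identity, assuming plain float:

#include <cassert>
#include <cmath>

int main() {
  float x = 9.0f;
  float out = std::sqrt(x);                        // forward output: 3
  float dout = 1.0f;                               // upstream gradient
  float dx = 0.5f * dout / out;                    // functor's formula
  float reference = 1.0f / (2.0f * std::sqrt(x));  // analytic derivative
  assert(std::fabs(dx - reference) < 1e-6f);
  return 0;
}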
@@ -119,8 +119,8 @@ struct SparseAdagradFunctor<platform::CPUDeviceContext, T> {
     auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();

     // 2. m += g_m * g_m
-    math::scatter::Mul<platform::CPUDeviceContext, T> sqare_func;
-    auto grad_square = sqare_func(context, grad_merge, grad_merge);
+    auto grad_square =
+        SquareSelectedRows<platform::CPUDeviceContext, T>(context, grad_merge);

     math::SelectedRowsAddToTensor<platform::CPUDeviceContext, T> functor;
     functor(context, grad_square, moment);

@@ -84,8 +84,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
     auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
     framework::Vector<int64_t> merge_rows(grad_merge.rows());
     // 2. m += g_m * g_m
-    math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func;
-    auto grad_square = sqare_func(context, grad_merge, grad_merge);
+    auto grad_square =
+        SquareSelectedRows<platform::CUDADeviceContext, T>(context, grad_merge);

     math::SelectedRowsAddToTensor<platform::CUDADeviceContext, T> functor;
     functor(context, grad_square, moment);

@@ -28,6 +28,20 @@ struct SparseAdagradFunctor {
                   framework::Tensor *moment, framework::Tensor *param);
 };

+template <typename DeviceContext, typename T>
+framework::SelectedRows SquareSelectedRows(
+    const DeviceContext &context, const framework::SelectedRows &input) {
+  framework::SelectedRows out;
+  out.set_rows(input.rows());
+  out.set_height(input.height());
+  out.mutable_value()->mutable_data<T>(input.value().dims(),
+                                       context.GetPlace());
+  auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
+  auto e_in = framework::EigenVector<T>::Flatten(input.value());
+  e_out.device(*context.eigen_device()) = e_in.square();
+  return out;
+}
+
 template <typename DeviceContext, typename T>
 class AdagradOpKernel : public framework::OpKernel<T> {
  public:

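SquareSelectedRows replaces the old self-multiplication through math::scatter::Mul with a direct element-wise square of the value tensor; rows and height pass through unchanged. A small worked example of what it computes:

// in.rows  = {0, 4},  in.value  = [[1, 2],
//                                  [3, 4]]
// out.rows = {0, 4},  out.value = [[1, 4],
//                                  [9, 16]]   // e_out = e_in.square()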
@@ -219,8 +219,8 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
     auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));

     d_x->mutable_data<T>(ctx.GetPlace());
-    d_scale->mutable_data<T>(ctx.GetPlace());
-    d_bias->mutable_data<T>(ctx.GetPlace());
+    d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());

     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     if ((N * H * W * D) == 1) {
@@ -272,8 +272,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
     const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
     const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
-    const void *saved_mean_data = saved_mean->template data<T>();
-    const void *saved_var_data = saved_var->template data<T>();
+    const void *saved_mean_data =
+        saved_mean->template data<BatchNormParamType<T>>();
+    const void *saved_var_data =
+        saved_var->template data<BatchNormParamType<T>>();

     CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
         dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
@@ -281,10 +283,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
         CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
         data_desc_, d_y->template data<T>(), data_desc_,
         d_x->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
-        scale->template data<T>(),
-        d_scale->template mutable_data<T>(ctx.GetPlace()),
-        d_bias->template mutable_data<T>(ctx.GetPlace()), epsilon,
-        saved_mean_data, saved_var_data));
+        scale->template data<BatchNormParamType<T>>(),
+        d_scale->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
+        d_bias->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
+        epsilon, saved_mean_data, saved_var_data));

     // clean when exit.
     CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
@@ -304,4 +306,5 @@ REGISTER_OP_CUDA_KERNEL(
     ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
     batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
-    ops::BatchNormGradKernel<plat::CUDADeviceContext, double>);
+    ops::BatchNormGradKernel<plat::CUDADeviceContext, double>,
+    ops::BatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);

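These retypings presumably exist so the new float16 registration at the end compiles and runs: cuDNN keeps batch-norm scale, bias, and saved statistics in fp32 even when the data itself is fp16, and BatchNormParamType<T> (an alias assumed to live with the cuDNN helpers, not shown in this diff) encodes that mapping, roughly:

// Assumed mapping, matching cuDNN's requirement that BN parameters stay fp32:
//   BatchNormParamType<float>          -> float
//   BatchNormParamType<double>         -> double
//   BatchNormParamType<plat::float16>  -> float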
@@ -143,9 +143,11 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
       // Currently tensor core is only enabled using this algo
       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+      VLOG(5) << "use cudnn_tensor_op_math";
     } else {
       CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
           cudnn_conv_desc, CUDNN_DEFAULT_MATH));
+      VLOG(5) << "NOT use cudnn_tensor_op_math";
     }
 #endif
@@ -361,7 +363,8 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<double>);
+                   paddle::operators::CUDNNConvGradOpKernel<double>,
+                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<float>,

@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/cross_entropy_op.h"
+#include "paddle/fluid/platform/float16.h"

+namespace plat = paddle::platform;
 namespace ops = paddle::operators;
 using CUDACtx = paddle::platform::CUDADeviceContext;
 REGISTER_OP_CUDA_KERNEL(cross_entropy,
                         ops::CrossEntropyOpKernel<CUDACtx, float>,
-                        ops::CrossEntropyOpKernel<CUDACtx, double>);
-REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
-                        ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
-                        ops::CrossEntropyGradientOpKernel<CUDACtx, double>);
+                        ops::CrossEntropyOpKernel<CUDACtx, double>,
+                        ops::CrossEntropyOpKernel<CUDACtx, plat::float16>);
+
+REGISTER_OP_CUDA_KERNEL(
+    cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
+    ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
+    ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
