Merge remote-tracking branch 'origin/develop' into windows/debug

test=develop
7 years ago · 234a1d9248
parent 2835e04409 a270fdf2db
commit 234a1d9248
79 changed files with 1279 additions and 1419 deletions
--- a/paddle/fluid/framework/details/execution_strategy.h
+++ b/paddle/fluid/framework/details/execution_strategy.h
@ -13,6 +13,7 @@
 // limitations under the License.

 #pragma once
+#include <cstddef>  // for size_t

 namespace paddle {
 namespace framework {
@ -26,6 +27,7 @@ struct ExecutionStrategy {
  bool allow_op_delay_{false};
  size_t num_iteration_per_drop_scope_{100};
  ExecutorType type_{kDefault};
+  bool dry_run_{false};
 };

 }  //  namespace details
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@ -128,7 +128,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
    size_t complete = 0;
    while (op_to_run != nullptr) {
      try {
-        op_to_run->Run(strategy_.use_cuda_);
+        if (LIKELY(!strategy_.dry_run_)) {
+          op_to_run->Run(strategy_.use_cuda_);
+        }
        ++complete;
      } catch (...) {
        exception_.Catch(std::current_exception());
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@ -211,7 +211,9 @@ void ThreadedSSAGraphExecutor::RunOp(
      if (VLOG_IS_ON(10)) {
        VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
      }
-      op->Run(strategy_.use_cuda_);
+      if (LIKELY(!strategy_.dry_run_)) {
+        op->Run(strategy_.use_cuda_);
+      }
      VLOG(10) << op << " " << op->Name() << " Done ";
      running_ops_--;
      ready_var_q->Extend(op->Outputs());
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@ -48,7 +48,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  // Use topological sort algorithm
  FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;

-  ~ThreadedSSAGraphExecutor() {}
+  ~ThreadedSSAGraphExecutor() final = default;

 private:
  void RunOp(const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q,
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@ -38,9 +38,20 @@ class ParallelExecutorPrivate {
  explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
      : places_(places) {}

+  ~ParallelExecutorPrivate() {
+    if (own_local_scope_) {
+      for (size_t i = 1; i < local_scopes_.size(); ++i) {
+        // Skip the first scope, since it is the global scope.
+        Scope *local_scope = local_scopes_[i];
+        if (global_scope_->HasKid(local_scope)) {
+          global_scope_->DeleteScope(local_scope);
+        }
+      }
+    }
+  }
  std::vector<platform::Place> places_;
  std::vector<Scope *> local_scopes_;
-  Scope *global_scope_;
+  Scope *global_scope_;  // not owned
  std::unique_ptr<details::SSAGraphExecutor> executor_;

 #ifdef PADDLE_WITH_CUDA
@ -306,16 +317,6 @@ ParallelExecutor::~ParallelExecutor() {
  for (auto &p : member_->places_) {
    platform::DeviceContextPool::Instance().Get(p)->Wait();
  }
-
-  if (member_->own_local_scope_) {
-    for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
-      Scope *local_scope = member_->local_scopes_[i];
-      if (member_->global_scope_->HasKid(local_scope)) {
-        member_->global_scope_->DeleteScope(local_scope);
-      }
-    }
-  }
-
  // member_ must be destructed before gcs_ since the destructor of
  // ReferenceCountOpHandle use raw pointers of gcs_ inside.
  member_.reset();
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@ -57,10 +57,10 @@ ThreadPool::ThreadPool(int num_threads) : running_(true) {
 ThreadPool::~ThreadPool() {
  {
    // notify all threads to stop running
-    std::lock_guard<std::mutex> l(mutex_);
+    std::unique_lock<std::mutex> l(mutex_);
    running_ = false;
-    scheduled_.notify_all();
  }
+  scheduled_.notify_all();

  for (auto& t : threads_) {
    t->join();
@ -70,19 +70,25 @@ ThreadPool::~ThreadPool() {

 void ThreadPool::TaskLoop() {
  while (true) {
-    std::unique_lock<std::mutex> lock(mutex_);
+    Task task;

-    scheduled_.wait(
-        lock, [this] { return !this->tasks_.empty() || !this->running_; });
+    {
+      std::unique_lock<std::mutex> lock(mutex_);
+      scheduled_.wait(
+          lock, [this] { return !this->tasks_.empty() || !this->running_; });

-    if (!running_ || tasks_.empty()) {
-      return;
-    }
+      if (!running_ && tasks_.empty()) {
+        return;
+      }
+
+      if (tasks_.empty()) {
+        PADDLE_THROW("This thread has no task to Run");
+      }

-    // pop a task from the task queue
-    auto task = std::move(tasks_.front());
-    tasks_.pop();
-    lock.unlock();
+      // pop a task from the task queue
+      task = std::move(tasks_.front());
+      tasks_.pop();
+    }

    // run the task
    task();
--- a/paddle/fluid/framework/threadpool.h
+++ b/paddle/fluid/framework/threadpool.h
@ -58,7 +58,7 @@ class ThreadPool {
  ~ThreadPool();

  // Run pushes a function to the task queue and returns a std::future
-  // object.  To wait for the completion of the task, call
+  // object. To wait for the completion of the task, call
  // std::future::wait().
  template <typename Callback>
  std::future<void> Run(Callback fn) {
@ -69,7 +69,6 @@ class ThreadPool {
  template <typename Callback>
  std::future<std::unique_ptr<platform::EnforceNotMet>> RunAndGetException(
      Callback fn) {
-    std::unique_lock<std::mutex> lock(mutex_);
    Task task([fn]() -> std::unique_ptr<platform::EnforceNotMet> {
      try {
        fn();
@ -84,7 +83,13 @@ class ThreadPool {
      return nullptr;
    });
    std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
-    tasks_.push(std::move(task));
+    {
+      std::unique_lock<std::mutex> lock(mutex_);
+      if (!running_) {
+        PADDLE_THROW("enqueue on stopped ThreadPool");
+      }
+      tasks_.push(std::move(task));
+    }
    scheduled_.notify_one();
    return f;
  }
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@ -1,5 +1,5 @@
 if(WITH_TESTING)
-  include(test.cmake) # some generic cmake funtion for inference
+  include(tests/test.cmake) # some generic cmake funtion for inference
 endif()
 # analysis and tensorrt must be added before creating static library,
 # otherwise, there would be undefined reference to them in static library.
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@ -18,6 +18,21 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {

+bool to_skip_merging_optimize(TensorRTEngine* engine_,
+                              const std::vector<int>& filters,
+                              const std::vector<int>& strides,
+                              const std::vector<int>& paddings,
+                              std::string input_name) {
+  if (engine_->itensor_quote_num[input_name] > 0) {
+    return true;
+  }
+  if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
+      strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
+    engine_->itensor_quote_num[input_name] += 1;
+
+  return false;
+}
+
 class Conv2dOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
@ -31,6 +46,7 @@ class Conv2dOpConverter : public OpConverter {
    PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);

    auto* X = engine_->GetITensor(op_desc.Input("Input").front());
+
    // Declare weights
    auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
    PADDLE_ENFORCE_NOT_NULL(Y_v);
@ -83,7 +99,10 @@ class Conv2dOpConverter : public OpConverter {
        std::move(weight_tensor);
    layer->getOutput(0)->setName(output_name.c_str());
    engine_->SetITensor(output_name, layer->getOutput(0));
-    if (test_mode) {
+
+    if (test_mode ||
+        to_skip_merging_optimize(engine_, {filter_h, filter_w}, strides,
+                                 paddings, op_desc.Input("Input").front())) {
      engine_->DeclareOutput(output_name);
    }
  }
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@ -133,6 +133,10 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
  buffer_sizes_[name] = 0;
 }

+bool TensorRTEngine::HasDeclared(const std::string &name) {
+  return buffer_sizes_.count(name) > 0;
+}
+
 void TensorRTEngine::DeclareOutput(const std::string &name) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                    name);
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@ -91,6 +91,8 @@ class TensorRTEngine : public EngineBase {
                     const std::string& name);
  // Set the itensor_map_[name] as the network's output, and set its name.
  void DeclareOutput(const std::string& name);
+  // Check if the ITensor has been declared
+  bool HasDeclared(const std::string& name);

  // GPU memory address for an ITensor with specific name. One can operate on
  // these memory directly for acceleration, for example, output the converted
@ -132,6 +134,16 @@ class TensorRTEngine : public EngineBase {
  std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
      weight_map;

+  // TODO: (NHZLX)
+  // In the normal case, the paddle-trt exists bug when runing the googlenet.
+  // When there are more than two convolutions of 1 * 1 with the same input, the
+  // paddle-tensorrt will do the merging optimization, which fuse those conv
+  // into
+  // one conv, and then trigger bug. So,  We should use strategy to avoid this
+  // optimization for the time being. This bug will be fixed in the future.
+  std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
+      itensor_quote_num;
+
 private:
  // the max batch size
  int max_batch_;
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@ -1,5 +1,11 @@
 set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)

+function(download_model install_dir model_name)
+    if (NOT EXISTS ${install_dir})
+        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name})
+    endif()
+endfunction()
+
 function(download_model_and_data install_dir model_name data_name)
    if (NOT EXISTS ${install_dir})
        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name})
@ -13,6 +19,13 @@ function(inference_analysis_api_test target install_dir filename)
        ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
 endfunction()

+function(inference_analysis_api_test_with_fake_data target install_dir filename model_name)
+    download_model(${install_dir} ${model_name})
+    inference_analysis_test(${target} SRCS ${filename}
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        ARGS --infer_model=${install_dir}/model)
+endfunction()
+
 # RNN1
 if(NOT APPLE)
    set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
@ -61,17 +74,13 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
 # ocr
 set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
 if (NOT EXISTS ${OCR_INSTALL_DIR})
-    inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
+  inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
 endif()
 inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)

 # resnet50
-set(RESNET50_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50")
-if (NOT EXISTS ${RESNET50_INSTALL_DIR})
-    inference_download_and_uncompress(${RESNET50_INSTALL_DIR} ${INFERENCE_URL} "resnet50_model.tar.gz")
-endif()
-inference_analysis_test(test_analyzer_resnet50 SRCS analyzer_resnet50_tester.cc
-    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_INSTALL_DIR}/model)
+inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 
+  "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")

 # anakin
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
@ -30,25 +30,7 @@ void SetConfig(AnalysisConfig *cfg) {
 }

 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
-  PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
-
-  PaddleTensor input;
-  // channel=3, height/width=318
-  std::vector<int> shape({FLAGS_batch_size, 3, 318, 318});
-  input.shape = shape;
-  input.dtype = PaddleDType::FLOAT32;
-
-  // fill input data, for profile easily, do not use random data here.
-  size_t size = FLAGS_batch_size * 3 * 318 * 318;
-  input.data.Resize(size * sizeof(float));
-  float *input_data = static_cast<float *>(input.data.data());
-  for (size_t i = 0; i < size; i++) {
-    *(input_data + i) = static_cast<float>(i) / size;
-  }
-
-  std::vector<PaddleTensor> input_slots;
-  input_slots.assign({input});
-  (*inputs).emplace_back(input_slots);
+  SetFakeImageInput(inputs, FLAGS_infer_model);
 }

 // Easy for profiling independently.
@ -61,13 +43,6 @@ void profile(bool use_mkldnn = false) {
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
-
-  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
-    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
-    size_t size = GetSize(outputs[0]);
-    // output is a 512-dimension feature
-    EXPECT_EQ(size, 512 * FLAGS_batch_size);
-  }
 }

 TEST(Analyzer_resnet50, profile) { profile(); }
@ -83,8 +58,7 @@ TEST(Analyzer_resnet50, fuse_statis) {
  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
  auto fuse_statis = GetFuseStatis(
      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
-  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+  LOG(INFO) << "num_ops: " << num_ops;
 }

 // Compare result of NativeConfig and AnalysisConfig
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@ -25,6 +25,7 @@
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/inference/tests/test_helper.h"
 #include "paddle/fluid/platform/profiler.h"

 DEFINE_string(infer_model, "", "model path");
@ -105,6 +106,34 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
  return fuse_statis;
 }

+void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
+                       const std::string &dirname) {
+  // Set fake_image_data
+  PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
+  std::vector<std::vector<int64_t>> feed_target_shapes =
+      GetFeedTargetShapes(dirname, true, "model", "params");
+  int dim1 = feed_target_shapes[0][1];
+  int dim2 = feed_target_shapes[0][2];
+  int dim3 = feed_target_shapes[0][3];
+
+  PaddleTensor input;
+  std::vector<int> shape({FLAGS_batch_size, dim1, dim2, dim3});
+  input.shape = shape;
+  input.dtype = PaddleDType::FLOAT32;
+
+  // fill input data, for profile easily, do not use random data here.
+  size_t size = FLAGS_batch_size * dim1 * dim2 * dim3;
+  input.data.Resize(size * sizeof(float));
+  float *input_data = static_cast<float *>(input.data.data());
+  for (size_t i = 0; i < size; i++) {
+    *(input_data + i) = static_cast<float>(i) / size;
+  }
+
+  std::vector<PaddleTensor> input_slots;
+  input_slots.assign({input});
+  (*inputs).emplace_back(input_slots);
+}
+
 void TestOneThreadPrediction(
    const AnalysisConfig &config,
    const std::vector<std::vector<PaddleTensor>> &inputs,
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@ -93,11 +93,16 @@ void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
  }
 }

-TEST(trt_models_test, main) {
-  std::vector<std::string> infer_models = {"mobilenet", "resnet50",
-                                           "resnext50"};
-  for (auto &model_dir : infer_models) {
-    CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + model_dir);
-  }
+TEST(trt_models_test, mobilenet) {
+  CompareTensorRTWithFluid(1, FLAGS_dirname + "/mobilenet");
+}
+
+TEST(trt_models_test, resnet50) {
+  CompareTensorRTWithFluid(1, FLAGS_dirname + "/resnet50");
 }
+
+TEST(trt_models_test, resnext50) {
+  CompareTensorRTWithFluid(1, FLAGS_dirname + "/resnext50");
+}
+
 }  // namespace paddle
--- a/paddle/fluid/inference/tests/test.cmake
+++ b/paddle/fluid/inference/tests/test.cmake
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@ -18,7 +18,6 @@ limitations under the License. */
 #include <string>
 #include <vector>

-#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/profiler.h"
@ -94,15 +93,15 @@ void CheckError(const paddle::framework::LoDTensor& output1,

 std::unique_ptr<paddle::framework::ProgramDesc> InitProgram(
    paddle::framework::Executor* executor, paddle::framework::Scope* scope,
-    const std::string& dirname, const bool is_combined = false) {
+    const std::string& dirname, const bool is_combined = false,
+    const std::string& prog_filename = "__model_combined__",
+    const std::string& param_filename = "__params_combined__") {
  std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
  if (is_combined) {
    // All parameters are saved in a single file.
    // Hard-coding the file names of program and parameters in unittest.
    // The file names should be consistent with that used in Python API
    //  `fluid.io.save_inference_model`.
-    std::string prog_filename = "__model_combined__";
-    std::string param_filename = "__params_combined__";
    inference_program =
        paddle::inference::Load(executor, scope, dirname + "/" + prog_filename,
                                dirname + "/" + param_filename);
@ -115,12 +114,15 @@ std::unique_ptr<paddle::framework::ProgramDesc> InitProgram(
 }

 std::vector<std::vector<int64_t>> GetFeedTargetShapes(
-    const std::string& dirname, const bool is_combined = false) {
+    const std::string& dirname, const bool is_combined = false,
+    const std::string& prog_filename = "__model_combined__",
+    const std::string& param_filename = "__params_combined__") {
  auto place = paddle::platform::CPUPlace();
  auto executor = paddle::framework::Executor(place);
  auto* scope = new paddle::framework::Scope();

-  auto inference_program = InitProgram(&executor, scope, dirname, is_combined);
+  auto inference_program = InitProgram(&executor, scope, dirname, is_combined,
+                                       prog_filename, param_filename);
  auto& global_block = inference_program->Block(0);

  const std::vector<std::string>& feed_target_names =
@ -136,15 +138,6 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes(
  return feed_target_shapes;
 }

-void Compile(paddle::framework::ProgramDesc* program) {
-  std::unique_ptr<paddle::framework::ir::Graph> g(
-      new paddle::framework::ir::Graph(*program));
-  auto pass = paddle::framework::ir::PassRegistry::Instance().Get(
-      "graph_to_program_pass");
-  pass->SetNotOwned<paddle::framework::ProgramDesc>("program", program);
-  pass->Apply(std::move(g));
-}
-
 template <typename Place, bool CreateVars = true, bool PrepareContext = false>
 void TestInference(const std::string& dirname,
                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
@ -182,7 +175,6 @@ void TestInference(const std::string& dirname,
        paddle::platform::DeviceContextPool::Instance().Get(place));
    inference_program = InitProgram(&executor, scope, dirname, is_combined);
  }
-  Compile(inference_program.get());

  // Disable the profiler and print the timing information
  paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
@ -261,5 +253,3 @@ void TestInference(const std::string& dirname,

  delete scope;
 }
-
-USE_PASS(graph_to_program_pass);
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@ -26,6 +26,8 @@ namespace plat = paddle::platform;
      act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext,   \
                                                 ops::grad_functor<float>>, \
      ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
-                                ops::grad_functor<double>>);
+                                ops::grad_functor<double>>,                 \
+      ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
+                                ops::grad_functor<plat::float16>>);

 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@ -333,8 +333,7 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    const Out out_conj = Eigen::numext::conj(out);
-    dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
+    dx.device(d) = static_cast<T>(0.5) * dout / out;
  }
 };

@ -740,7 +739,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
    dx.device(d) = dout * static_cast<T>(factor) *
-                   x.pow(static_cast<T>(factor - static_cast<T>(1)));
+                   x.pow(static_cast<T>(factor) - static_cast<T>(1));
  }
 };

--- a/paddle/fluid/operators/adagrad_op.cc
+++ b/paddle/fluid/operators/adagrad_op.cc
@ -119,8 +119,8 @@ struct SparseAdagradFunctor<platform::CPUDeviceContext, T> {
    auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();

    // 2. m += g_m * g_m
-    math::scatter::Mul<platform::CPUDeviceContext, T> sqare_func;
-    auto grad_square = sqare_func(context, grad_merge, grad_merge);
+    auto grad_square =
+        SquareSelectedRows<platform::CPUDeviceContext, T>(context, grad_merge);

    math::SelectedRowsAddToTensor<platform::CPUDeviceContext, T> functor;
    functor(context, grad_square, moment);
--- a/paddle/fluid/operators/adagrad_op.cu
+++ b/paddle/fluid/operators/adagrad_op.cu
@ -84,8 +84,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
    auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
    framework::Vector<int64_t> merge_rows(grad_merge.rows());
    // 2. m += g_m * g_m
-    math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func;
-    auto grad_square = sqare_func(context, grad_merge, grad_merge);
+    auto grad_square =
+        SquareSelectedRows<platform::CUDADeviceContext, T>(context, grad_merge);

    math::SelectedRowsAddToTensor<platform::CUDADeviceContext, T> functor;
    functor(context, grad_square, moment);
--- a/paddle/fluid/operators/adagrad_op.h
+++ b/paddle/fluid/operators/adagrad_op.h
@ -28,6 +28,20 @@ struct SparseAdagradFunctor {
                  framework::Tensor *moment, framework::Tensor *param);
 };

+template <typename DeviceContext, typename T>
+framework::SelectedRows SquareSelectedRows(
+    const DeviceContext &context, const framework::SelectedRows &input) {
+  framework::SelectedRows out;
+  out.set_rows(input.rows());
+  out.set_height(input.height());
+  out.mutable_value()->mutable_data<T>(input.value().dims(),
+                                       context.GetPlace());
+  auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
+  auto e_in = framework::EigenVector<T>::Flatten(input.value());
+  e_out.device(*context.eigen_device()) = e_in.square();
+  return out;
+}
+
 template <typename DeviceContext, typename T>
 class AdagradOpKernel : public framework::OpKernel<T> {
 public:
--- a/paddle/fluid/operators/batch_norm_op.cu.cc
+++ b/paddle/fluid/operators/batch_norm_op.cu.cc
@ -219,8 +219,8 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));

    d_x->mutable_data<T>(ctx.GetPlace());
-    d_scale->mutable_data<T>(ctx.GetPlace());
-    d_bias->mutable_data<T>(ctx.GetPlace());
+    d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());

    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    if ((N * H * W * D) == 1) {
@ -272,8 +272,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>

    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
    const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
-    const void *saved_mean_data = saved_mean->template data<T>();
-    const void *saved_var_data = saved_var->template data<T>();
+    const void *saved_mean_data =
+        saved_mean->template data<BatchNormParamType<T>>();
+    const void *saved_var_data =
+        saved_var->template data<BatchNormParamType<T>>();

    CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
        dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
@ -281,10 +283,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
        CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
        data_desc_, d_y->template data<T>(), data_desc_,
        d_x->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
-        scale->template data<T>(),
-        d_scale->template mutable_data<T>(ctx.GetPlace()),
-        d_bias->template mutable_data<T>(ctx.GetPlace()), epsilon,
-        saved_mean_data, saved_var_data));
+        scale->template data<BatchNormParamType<T>>(),
+        d_scale->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
+        d_bias->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
+        epsilon, saved_mean_data, saved_var_data));

    // clean when exit.
    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
@ -304,4 +306,5 @@ REGISTER_OP_CUDA_KERNEL(
    ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
    batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
-    ops::BatchNormGradKernel<plat::CUDADeviceContext, double>);
+    ops::BatchNormGradKernel<plat::CUDADeviceContext, double>,
+    ops::BatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@ -143,9 +143,11 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
          cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
      // Currently tensor core is only enabled using this algo
      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+      VLOG(5) << "use cudnn_tensor_op_math";
    } else {
      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
          cudnn_conv_desc, CUDNN_DEFAULT_MATH));
+      VLOG(5) << "NOT use cudnn_tensor_op_math";
    }
 #endif

@ -361,7 +363,8 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<double>);
+                   paddle::operators::CUDNNConvGradOpKernel<double>,
+                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>);

 REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvOpKernel<float>,
--- a/paddle/fluid/operators/cross_entropy_op.cu
+++ b/paddle/fluid/operators/cross_entropy_op.cu
@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/cross_entropy_op.h"
+#include "paddle/fluid/platform/float16.h"

+namespace plat = paddle::platform;
 namespace ops = paddle::operators;
 using CUDACtx = paddle::platform::CUDADeviceContext;
 REGISTER_OP_CUDA_KERNEL(cross_entropy,
                        ops::CrossEntropyOpKernel<CUDACtx, float>,
-                        ops::CrossEntropyOpKernel<CUDACtx, double>);
-REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
-                        ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
-                        ops::CrossEntropyGradientOpKernel<CUDACtx, double>);
+                        ops::CrossEntropyOpKernel<CUDACtx, double>,
+                        ops::CrossEntropyOpKernel<CUDACtx, plat::float16>);
+
+REGISTER_OP_CUDA_KERNEL(
+    cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
+    ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
+    ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
--- a/Show More
+++ b/Show More