Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into refactor-prefetch

test=develop
f7c96f079b
Qiao Longfei 6 years ago
commit 145c535750

@@ -214,6 +214,7 @@ if (NOT WIN32)
# there is no official support of warpctc, nccl, cupti in windows
include(external/warpctc) # download, build, install warpctc
include(cupti)
include(external/gzstream)
endif (NOT WIN32)
if(WITH_DISTRIBUTE)

@@ -0,0 +1,47 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
IF(MOBILE_INFERENCE)
return()
ENDIF()
include (ExternalProject)
# NOTE: gzstream is needed when linking with ctr reader.
SET(GZSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/gzstream)
SET(GZSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gzstream)
SET(GZSTREAM_INCLUDE_DIR "${GZSTREAM_INSTALL_DIR}/include/" CACHE PATH "gzstream include directory." FORCE)
ExternalProject_Add(
extern_gzstream
GIT_REPOSITORY "https://github.com/jacquesqiao/gzstream.git"
GIT_TAG ""
PREFIX ${GZSTREAM_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
BUILD_COMMAND make -j8
INSTALL_COMMAND mkdir -p ${GZSTREAM_INSTALL_DIR}/lib/ && mkdir -p ${GZSTREAM_INSTALL_DIR}/include/
&& cp ${GZSTREAM_SOURCES_DIR}/src/extern_gzstream/libgzstream.a ${GZSTREAM_INSTALL_DIR}/lib
&& cp -r ${GZSTREAM_SOURCES_DIR}/src/extern_gzstream/gzstream.h ${GZSTREAM_INSTALL_DIR}/include
)
ADD_LIBRARY(gzstream STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gzstream PROPERTY IMPORTED_LOCATION
"${GZSTREAM_INSTALL_DIR}/lib/libgzstream.a")
include_directories(${GZSTREAM_INCLUDE_DIR})
ADD_DEPENDENCIES(gzstream extern_gzstream zlib)

@@ -97,8 +97,8 @@ paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_ti
paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
- paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0))
+ paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
- paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
+ paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)

@@ -39,11 +39,12 @@ if (WITH_GPU)
endif()
cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
- set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass)
+ set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass)
if (WITH_GPU)
list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
endif()

@@ -0,0 +1,125 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/op_graph_view.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle {
namespace framework {
namespace details {
static constexpr char kAllOpDescs[] = "all_op_descs";
VarHandle* GetValidInput(const OpHandleBase* a) {
for (auto p : a->Inputs()) {
VarHandle* b = dynamic_cast<VarHandle*>(p);
if (b) {
return b;
}
}
return nullptr;
}
std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto graph_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
// get vars order
int order = 0;
std::unordered_map<std::string, int> vars;
// TODO(gongwb): use graph topology sort to find the order of operators.
// Note that must assert topology sort is stable
auto& ops = Get<const std::vector<OpDesc*>>(kAllOpDescs);
for (auto* op_desc : ops) {
auto outputs = op_desc->Outputs();
for (auto& o_it : outputs) {
for (auto& v : o_it.second) { // values
vars[v] = order;
}
}
order++;
}
std::vector<OpHandleBase*> dist_ops;
// get allreduce ops.
for (auto& op : graph_ops) {
// FIXME(gongwb):add broad cast.
if (op->Name() == "all_reduce" || op->Name() == "reduce") {
dist_ops.push_back(op);
}
}
VLOG(10) << "dist_ops size:" << dist_ops.size() << std::endl;
std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1,
OpHandleBase* op2) {
VarHandle* i0 = dynamic_cast<VarHandle*>(GetValidInput(op1));
VarHandle* i1 = dynamic_cast<VarHandle*>(GetValidInput(op2));
PADDLE_ENFORCE(i0 != nullptr && i1 != nullptr, "%s convert to %s error",
op1->DebugString(), op2->DebugString());
auto l_it = vars.find(i0->name_);
auto r_it = vars.find(i1->name_);
if (l_it->second < r_it->second) return true;
if (l_it->second == r_it->second) {
return i0->name_ < i1->name_;
}
return false;
});
// add dependency.
auto& sorted_ops = dist_ops;
for (size_t i = 1; i < sorted_ops.size(); ++i) {
auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
auto* pre_op = sorted_ops[i - 1];
auto* op = sorted_ops[i];
pre_op->AddOutput(dep_var);
op->AddInput(dep_var);
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
VLOG(10) << "add all_reduce sequential dependencies between " << pre_op
<< " and " << op;
VLOG(10) << "pre_op:" << pre_op->DebugString()
<< ", op:" << op->DebugString();
}
return graph;
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(all_reduce_deps_pass,
paddle::framework::details::AllReduceDepsPass)
.RequirePassAttr(paddle::framework::details::kAllOpDescs);
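The TODO inside ApplyImpl above mentions replacing the OpDesc-based ordering with a stable graph topological sort. As a standalone illustration of what a deterministic ("stable") topological order can look like (a generic sketch, not code from this patch; the node numbering and the lowest-index tie-breaking rule are assumptions):

#include <cstdio>
#include <functional>
#include <queue>
#include <vector>

// Kahn's algorithm with a min-heap: among all currently ready nodes, the one
// with the smallest index (earliest in the original listing) is emitted first,
// so the resulting order is deterministic for a fixed input graph.
std::vector<int> StableTopoSort(int n,
                                const std::vector<std::pair<int, int>>& edges) {
  std::vector<std::vector<int>> out(n);
  std::vector<int> indeg(n, 0);
  for (const auto& e : edges) {
    out[e.first].push_back(e.second);
    ++indeg[e.second];
  }
  std::priority_queue<int, std::vector<int>, std::greater<int>> ready;
  for (int i = 0; i < n; ++i) {
    if (indeg[i] == 0) ready.push(i);
  }
  std::vector<int> order;
  while (!ready.empty()) {
    int u = ready.top();
    ready.pop();
    order.push_back(u);
    for (int v : out[u]) {
      if (--indeg[v] == 0) ready.push(v);
    }
  }
  return order;  // order.size() < n means the graph contained a cycle
}

int main() {
  // 0 -> 2, 1 -> 2, 2 -> 3: always yields 0 1 2 3, regardless of edge order.
  for (int v : StableTopoSort(4, {{0, 2}, {1, 2}, {2, 3}})) std::printf("%d ", v);
  std::printf("\n");
}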

@@ -0,0 +1,33 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace details {
// TODO(gongwb): overlap allreduce with backward computation.
class AllReduceDepsPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace details
} // namespace framework
} // namespace paddle

@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/sequential_execution_pass.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
@@ -24,6 +25,10 @@ namespace paddle {
namespace framework {
namespace details {
static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
return (!strategy.enable_sequential_execution_ && strategy.num_trainers_ > 1);
}
class ParallelExecutorPassBuilder : public ir::PassBuilder {
public:
explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
@@ -70,6 +75,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// Verify that the graph is correct for multi-device executor.
AppendPass("multi_devices_check_pass");
if (SeqOnlyAllReduceOps(strategy)) {
AppendPass("all_reduce_deps_pass");
}
if (strategy_.remove_unnecessary_lock_) {
AppendPass("modify_op_lock_and_record_event_pass");
}
@@ -124,6 +133,17 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
} else if (pass->Type() == "sequential_execution_pass") {
VLOG(1) << "set enable_sequential_execution:"
<< enable_sequential_execution_;
pass->Erase(kAllOpDescs);
pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "all_reduce_deps_pass") {
VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
<< ", num_trainers:" << num_trainers_;
pass->Erase(kAllOpDescs);
pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
@@ -144,4 +164,5 @@ USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
USE_PASS(sequential_execution_pass);
USE_PASS(all_reduce_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass);

@@ -73,6 +73,7 @@ struct BuildStrategy {
bool fuse_broadcast_op_{false};
int num_trainers_{1};
bool remove_unnecessary_lock_{false};
// NOTE:

@@ -71,7 +71,7 @@ class OperatorBase;
class ExecutionContext;
/**
- * OperatorBase has the basic element that Net will call to do computation.
+ * OperatorBase has the basic elements that Net will call to do computation.
 * Only CreateOperator from OpRegistry will new Operator directly. User
 * should always construct a proto message OpDesc and call
 * OpRegistry::CreateOp(op_desc) to get an Operator instance.

@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph.h"
- #ifdef PADDLE_WITH_CUDA
+ #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
@@ -54,7 +54,7 @@ class ParallelExecutorPrivate {
Scope *global_scope_; // not owned
std::unique_ptr<details::SSAGraphExecutor> executor_;
- #ifdef PADDLE_WITH_CUDA
+ #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
bool own_local_scope_;
@@ -104,7 +104,7 @@ ParallelExecutor::ParallelExecutor(
if (member_->use_cuda_) {
// Bcast Parameters to all GPUs
- #ifdef PADDLE_WITH_CUDA
+ #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
ncclUniqueId *nccl_id = nullptr;
if (nccl_id_var != nullptr) {
@@ -124,7 +124,7 @@ ParallelExecutor::ParallelExecutor(
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
- #ifdef PADDLE_WITH_CUDA
+ #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
main_program, member_->places_, loss_var_name, params,
member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());
@@ -213,7 +213,7 @@ void ParallelExecutor::BCastParamsToDevices(
}
auto &dims = main_tensor.dims();
if (paddle::platform::is_gpu_place(main_tensor.place())) {
- #ifdef PADDLE_WITH_CUDA
+ #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::vector<void *> buffers;
size_t numel = main_tensor.numel();
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());

@@ -120,8 +120,22 @@
*/
int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false);
- void SyncIndex();
+ /*
* @brief Get the index of the key from id_to_index_ map.
*/
inline int64_t GetIndexFromId(int64_t key) {
auto iter = id_to_index_.find(key);
if (iter == id_to_index_.end()) {
return -1;
} else {
return iter->second;
}
}
void SyncIndex();
/*
* @brief Get complete Dims before
*/
DDim GetCompleteDims() const {
std::vector<int64_t> dims = vectorize(value_->dims());
dims[0] = height_;
@@ -133,9 +147,10 @@ class SelectedRows {
// SelectedRows are simply concated when adding together. Until a
// SelectedRows add a Tensor, will the duplicate rows be handled.
Vector<int64_t> rows_;
- std::unordered_map<int64_t, int64_t> id_to_index_;
+ std::unordered_map<int64_t, int64_t>
id_to_index_; // should not be used when rows_ has duplicate member
std::unique_ptr<Tensor> value_{nullptr};
- int64_t height_;
+ int64_t height_; // height indicates the underline tensor's height
std::unique_ptr<RWLock> rwlock_{nullptr};
};

@@ -17,28 +17,16 @@
namespace paddle {
namespace framework {
- // Holds all the transfer scope across the process.
std::unordered_map<size_t, Scope*>& global_transfer_data_cache() {
- typedef std::unordered_map<size_t, Scope*> map_t;
- thread_local std::unique_ptr<map_t> x(new map_t);
+ thread_local auto* x = new std::unordered_map<size_t, Scope*>;
return *x;
}
- // Holds all the transfer scope for this thread.
std::unordered_set<Scope*>& global_transfer_scope_cache() {
- typedef std::unordered_set<Scope*> set_t;
- thread_local std::unique_ptr<set_t> x(new set_t);
+ thread_local auto* x = new std::unordered_set<Scope*>;
return *x;
}
- // Try to create a transfer scope. If one cached scope has match the
- // requirement, just return that one.
- // Inputs:
- // @type0: the source kernel type.
- // @type1: the target kernel type.
- // @scope: the execution scope of this op.
- // Returns: A scope used to hold the transfer data across the different kernel
- // type.
Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
const Scope* scope) {
Scope* new_scope{nullptr};
@@ -58,5 +46,27 @@ Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
return new_scope;
}
void RemoveKidsFromTransferScopeCache(Scope* scope) {
auto it = global_transfer_scope_cache().find(scope);
if (it != global_transfer_scope_cache().end()) {
global_transfer_scope_cache().erase(it);
}
for (auto* s : scope->kids()) {
auto it = global_transfer_scope_cache().find(s);
if (it != global_transfer_scope_cache().end()) {
global_transfer_scope_cache().erase(it);
}
}
// remove global transfer data cache
auto& cache = global_transfer_data_cache();
for (auto it = cache.begin(); it != cache.end();) {
if (it->second == scope)
it = cache.erase(it);
else
it++;
}
}
} // namespace framework
} // namespace paddle
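The cache accessors rewritten above use a function-local thread_local raw pointer; a minimal standalone sketch of that pattern follows (reading it as a deliberate leak to sidestep destruction-order problems at thread exit is an assumption, not something this diff states):

#include <unordered_set>

// One cache per thread, created on first use and intentionally never deleted,
// in the same style as the thread_local auto* accessors in the hunk above.
std::unordered_set<int>& ThreadLocalCache() {
  thread_local auto* cache = new std::unordered_set<int>;
  return *cache;
}

int main() {
  ThreadLocalCache().insert(42);
  return ThreadLocalCache().count(42) == 1 ? 0 : 1;
}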

@@ -35,4 +35,5 @@ function(inference_analysis_test TARGET)
endif()
endfunction(inference_analysis_test)
- inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS reset_tensor_array paddle_inference_api)
+ inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
+ EXTRA_DEPS reset_tensor_array paddle_inference_api)

@@ -76,7 +76,8 @@ void TestWord2vecPrediction(const std::string& model_path) {
0.000932706};
const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory.
- for (size_t i = 0; i < std::min((size_t)5UL, num_elements); i++) {
+ for (size_t i = 0; i < std::min(static_cast<size_t>(5UL), num_elements);
+ i++) {
LOG(INFO) << "data: "
<< static_cast<float*>(outputs.front().data.data())[i];
PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],

@@ -284,6 +284,7 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
framework::GetFetchVariable(*scope, "fetch", idx);
auto type = fetch.type();
auto output = &(outputs->at(i));
output->name = fetchs_[idx]->Input("X")[0];
if (type == typeid(float)) {
GetFetchOne<float>(fetch, output);
output->dtype = PaddleDType::FLOAT32;

@@ -109,7 +109,7 @@ class AnalysisPredictor : public PaddlePredictor {
std::map<std::string, size_t> feed_names_;
std::vector<framework::OpDesc *> fetchs_;
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious
- // concurrency problems, so cache them.
+ // concurrency problems, wrong results and memory leak, so cache them.
std::vector<framework::LoDTensor> feed_tensors_;
details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;

@@ -185,8 +185,12 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
<< inputs.size();
return false;
}
// Cache the inputs memory for better concurrency performance.
feed_tensors_.resize(inputs.size());
for (size_t i = 0; i < inputs.size(); ++i) {
- framework::LoDTensor input;
+ auto &input = feed_tensors_[i];
framework::DDim ddim = framework::make_ddim(inputs[i].shape);
void *input_ptr;
if (inputs[i].dtype == PaddleDType::INT64) {
@@ -261,6 +265,7 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
framework::GetFetchVariable(*scope, "fetch", idx);
auto type = fetch.type();
auto output = &(outputs->at(i));
output->name = fetchs_[idx]->Input("X")[0];
if (type == typeid(float)) {
GetFetchOne<float>(fetch, output);
output->dtype = PaddleDType::FLOAT32;

@@ -69,6 +69,9 @@ class NativePaddlePredictor : public PaddlePredictor {
std::vector<framework::OpDesc *> feeds_;
std::map<std::string, size_t> feed_names_;
std::vector<framework::OpDesc *> fetchs_;
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious
// concurrency problems, wrong results and memory leak, so cache them.
std::vector<framework::LoDTensor> feed_tensors_;
// Do not use unique_ptr, use parent scope to delete
framework::Scope *sub_scope_{nullptr};
details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;

@@ -86,7 +86,11 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
munlock(p, size);
#endif
}
#ifdef _WIN32
_aligned_free(p);
#else
free(p);
#endif
}
bool CPUAllocator::UseGpu() const { return false; }
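For background on the change above: a pointer obtained from _aligned_malloc on Windows must be released with _aligned_free rather than free. A generic standalone sketch of that pairing (an illustration only, not the Paddle allocation path, which this hunk does not show):

#include <cstdlib>
#ifdef _WIN32
#include <malloc.h>
#endif

// Allocate a 64-byte-aligned block with the platform-specific call and release
// it with the matching deallocator.
void* AlignedAlloc(std::size_t size) {
#ifdef _WIN32
  return _aligned_malloc(size, 64);  // pair with _aligned_free
#else
  void* p = nullptr;
  return posix_memalign(&p, 64, size) == 0 ? p : nullptr;  // pair with free
#endif
}

void AlignedFree(void* p) {
#ifdef _WIN32
  _aligned_free(p);
#else
  std::free(p);
#endif
}

int main() {
  void* p = AlignedAlloc(256);
  AlignedFree(p);
}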

@@ -149,6 +149,13 @@ $out = \max(x, 0)$
)DOC";
UNUSED constexpr char GeluDoc[] = R"DOC(
Gelu Activation Operator.
$out = \\frac{1 + erf(\\frac{x}{\\sqrt{2}})}{2} x$
)DOC";
UNUSED constexpr char TanhDoc[] = R"DOC(
Tanh Activation Operator.
@@ -472,6 +479,7 @@ REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc);
REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc);
REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc);
REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc);
REGISTER_ACTIVATION_OP_MAKER(Gelu, GeluDoc);
REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc);
REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc);
REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc);
@@ -489,6 +497,7 @@ REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Sigmoid, sigmoid);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Relu, relu);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Gelu, gelu);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Exp, exp);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Tanh, tanh);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Ceil, ceil);
@@ -525,6 +534,7 @@ namespace ops = paddle::operators;
__macro(Round, round); \
__macro(Log, log); \
__macro(Square, square); \
__macro(Gelu, gelu); \
__macro(BRelu, brelu); \
__macro(Pow, pow); \
__macro(STanh, stanh); \

@@ -16,6 +16,11 @@ limitations under the License. */
#include <utility>
#include <vector>
#include <cmath>
#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES
#endif
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
@@ -212,6 +217,31 @@ struct ReluGradFunctor : public BaseActivationFunctor<T> {
}
};
// gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
template <typename T>
struct GeluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto temp =
((x * static_cast<T>(M_SQRT1_2)).erf()).template cast<T>().eval();
out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
}
};
template <typename T>
struct GeluGradFunctor : BaseActivationFunctor<T> {
bool Inplace() const { return IsInplace("gelu"); }
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp = (static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2) * x *
((-static_cast<T>(0.5) * x.square()).exp()))
.template cast<T>()
.eval();
dx.device(d) = dout * (out / x + temp);
}
};
// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
struct TanhFunctor : public BaseActivationFunctor<T> {
@@ -877,6 +907,7 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
__macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \
__macro(exp, ExpFunctor, ExpGradFunctor); \
__macro(relu, ReluFunctor, ReluGradFunctor); \
__macro(gelu, GeluFunctor, GeluGradFunctor); \
__macro(tanh, TanhFunctor, TanhGradFunctor); \
__macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
__macro(sqrt, SqrtFunctor, SqrtGradFunctor); \
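As a sanity check on the formulas above (a standalone sketch, not part of this patch): GELU is out = 0.5 * x * (1 + erf(x / sqrt(2))), and the gradient used by GeluGradFunctor is dout * (out / x + x * exp(-x^2 / 2) / sqrt(2 * pi)).

#include <cmath>
#include <cstdio>

// Scalar GELU: 0.5 * x * (1 + erf(x / sqrt(2))), matching GeluFunctor above.
double gelu(double x) { return 0.5 * x * (1.0 + std::erf(x / std::sqrt(2.0))); }

// dgelu/dx in the same form GeluGradFunctor uses (undefined at x == 0).
double gelu_grad(double x) {
  const double pi = std::acos(-1.0);
  return gelu(x) / x + x * std::exp(-0.5 * x * x) / std::sqrt(2.0 * pi);
}

int main() {
  for (double x : {-1.0, 0.5, 2.0}) {
    std::printf("gelu(%+.2f) = %+.6f  dgelu/dx = %+.6f\n", x, gelu(x), gelu_grad(x));
  }
}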

@@ -70,7 +70,7 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> {
if (bias) {
auto bias_vec = EigenMatrix<T>::From(*bias);
Eigen::DSizes<int, 2> bcast(batch_size, 1);
- output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat;
+ output_mat.device(place) = bias_vec.broadcast(bcast).eval() + output_mat;
}
}
};
@@ -99,13 +99,13 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
auto d_out_mat = EigenMatrix<T>::From(*d_out);
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto& dev_ctx = ctx.template device_context<DeviceContext>();
- // Create the intermediate variable to caculate the Output(Y@Grad).
+ // Create the intermediate variable to calculate the Output(Y@Grad).
Tensor x_scale;
x_scale.mutable_data<T>(framework::make_ddim({batch_size, x_dim}),
ctx.GetPlace());
auto x_scale_mat = EigenMatrix<T>::From(x_scale);
- // Create the intermediate variable to caculate the Output(X@Grad).
+ // Create the intermediate variable to calculate the Output(X@Grad).
Tensor y_scale;
y_scale.mutable_data<T>(framework::make_ddim({batch_size, y_dim}),
ctx.GetPlace());
@@ -113,65 +113,64 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
math::SetConstant<DeviceContext, T> set_zero;
- // Set Output(X@Grad) be zero.
if (d_x) {
d_x->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, d_x, static_cast<T>(0));
}
- // Set Output(Y@Grad) be zero.
if (d_y) {
d_y->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, d_y, static_cast<T>(0));
}
if (d_weight) {
d_weight->mutable_data<T>(ctx.GetPlace());
}
auto blas = math::GetBlas<DeviceContext, T>(ctx);
// Caculate the Output(X@Grad) and Output(Y@Grad).
- if (d_x || d_y) {
+ if (d_x || d_y || d_weight) {
Eigen::DSizes<int, 2> bcast_for_x(1, y_dim);
Eigen::DSizes<int, 2> bcast_for_y(1, x_dim);
Eigen::DSizes<int, 2> bcast_for_weight(1, x_dim);
for (int i = 0; i < out_dim; ++i) {
Tensor weight_i = weight->Slice(i, i + 1).Resize(
framework::make_ddim({x_dim, y_dim}));
auto output_vec = d_out_mat.chip(i, 1);
if (d_x) {
y_scale_mat.device(place) =
output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
- .broadcast(bcast_for_x) *
+ .broadcast(bcast_for_x)
.eval() *
y_mat;
blas.GEMM(CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1,
y_scale.data<T>(), weight_i.data<T>(), 1, d_x->data<T>());
}
- if (d_y) {
- x_scale_mat.device(place) =
+ if (d_y || d_weight) {
+ auto output_vec_y =
output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
- .broadcast(bcast_for_y) *
+ .broadcast(bcast_for_y)
- x_mat;
+ .eval();
x_scale_mat.device(place) = output_vec_y * x_mat;
if (d_y) {
blas.GEMM(CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1,
x_scale.data<T>(), weight_i.data<T>(), 1, d_y->data<T>());
}
- }
- }
- // Caculate the gradient of Input(Weight).
if (d_weight) {
- d_weight->mutable_data<T>(ctx.GetPlace());
- Eigen::DSizes<int, 2> bcast_for_weight(1, x_dim);
- for (int i = 0; i < out_dim; ++i) {
Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize(
framework::make_ddim({x_dim, y_dim}));
- auto output_vec = d_out_mat.chip(i, 1);
- x_scale_mat.device(place) =
- output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
- .broadcast(bcast_for_weight) *
- x_mat;
blas.GEMM(CblasTrans, CblasNoTrans, x_dim, y_dim, batch_size, 1,
x_scale.data<T>(), y->data<T>(), 0, d_weight_i.data<T>());
}
}
}
}
- // Caculate the gradient of Input(Bias).
+ // calculate the gradient of Input(Bias).
if (d_bias) {
d_bias->mutable_data<T>(ctx.GetPlace());
auto d_bias_mat = framework::EigenVector<T>::Flatten(*d_bias);

@@ -120,6 +120,7 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
"Dimensions of Input(X) and Mask must be the same.");
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
}
};

@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <string>
#include <thread> // NOLINT

@@ -19,36 +19,21 @@ limitations under the License. */
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
- #include "xbyak.h"
+ #include "xbyak/xbyak.h"
- #include "xbyak_util.h"
+ #include "xbyak/xbyak_util.h"
namespace paddle {
namespace operators {
using framework::DataLayout;
using mkldnn::memory;
using platform::StringToMKLDNNFormat;
- static mkldnn::memory::format StringToMKLDNNFormat(std::string& format) {
- std::transform(format.begin(), format.end(), format.begin(), ::tolower);
- if (!format.compare("nchw")) {
- return memory::format::nchw;
- } else if (!format.compare("nchw16c")) {
- return memory::format::nChw16c;
- } else if (!format.compare("nchw8c")) {
- return memory::format::nChw8c;
- } else if (!format.compare("nhwc")) {
- return memory::format::nhwc;
- } else {
- return memory::format::any;
- }
- }
static void UpdateDataFormat(const framework::ExecutionContext& ctx,
framework::Tensor* tensor, const char* attribute) {
if (ctx.op().HasAttr(attribute)) {
auto format_as_string = ctx.Attr<std::string>(attribute);
- auto format = StringToMKLDNNFormat(format_as_string);
+ auto format = StringToMKLDNNFormat(&format_as_string);
if (format != memory::format::any) {
tensor->set_format(format);
}
@@ -93,8 +78,8 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
auto y_dims_untrimmed = y->dims();
auto x_int_dims = paddle::framework::vectorize2int(x_dims);
- UpdateDataFormat(ctx, (Tensor*)x, "x_data_format");
+ UpdateDataFormat(ctx, const_cast<Tensor*>(x), "x_data_format");
- UpdateDataFormat(ctx, (Tensor*)y, "y_data_format");
+ UpdateDataFormat(ctx, const_cast<Tensor*>(y), "y_data_format");
Xbyak::util::Cpu cpu;
const bool is_avx512_enabled = cpu.has(Xbyak::util::Cpu::tAVX512F);
@@ -156,10 +141,10 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
if (!(is_x_nchw || is_x_nc))
- ReorderInput<T>((Tensor*)x, ctx.GetPlace(), mkldnn_engine,
+ ReorderInput<T>(const_cast<Tensor*>(x), ctx.GetPlace(), mkldnn_engine,
x->dims().size() == 4);
if (!(is_y_nchw || is_y_nc))
- ReorderInput<T>((Tensor*)y, ctx.GetPlace(), mkldnn_engine,
+ ReorderInput<T>(const_cast<Tensor*>(y), ctx.GetPlace(), mkldnn_engine,
y->dims().size() == 4);
}

Some files were not shown because too many files have changed in this diff.
