Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into refactor-prefetch

test=develop
f7c96f079b
Qiao Longfei
commit 145c535750

@@ -214,6 +214,7 @@ if (NOT WIN32)
# there is no official support of warpctc, nccl, cupti in windows
include(external/warpctc) # download, build, install warpctc
include(cupti)
include(external/gzstream)
endif (NOT WIN32)
if(WITH_DISTRIBUTE)

@@ -0,0 +1,47 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
IF(MOBILE_INFERENCE)
return()
ENDIF()
include (ExternalProject)
# NOTE: gzstream is needed when linking with ctr reader.
SET(GZSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/gzstream)
SET(GZSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gzstream)
SET(GZSTREAM_INCLUDE_DIR "${GZSTREAM_INSTALL_DIR}/include/" CACHE PATH "gzstream include directory." FORCE)
ExternalProject_Add(
extern_gzstream
GIT_REPOSITORY "https://github.com/jacquesqiao/gzstream.git"
GIT_TAG ""
PREFIX ${GZSTREAM_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
BUILD_COMMAND make -j8
INSTALL_COMMAND mkdir -p ${GZSTREAM_INSTALL_DIR}/lib/ && mkdir -p ${GZSTREAM_INSTALL_DIR}/include/
&& cp ${GZSTREAM_SOURCES_DIR}/src/extern_gzstream/libgzstream.a ${GZSTREAM_INSTALL_DIR}/lib
&& cp -r ${GZSTREAM_SOURCES_DIR}/src/extern_gzstream/gzstream.h ${GZSTREAM_INSTALL_DIR}/include
)
ADD_LIBRARY(gzstream STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gzstream PROPERTY IMPORTED_LOCATION
"${GZSTREAM_INSTALL_DIR}/lib/libgzstream.a")
include_directories(${GZSTREAM_INCLUDE_DIR})
ADD_DEPENDENCIES(gzstream extern_gzstream zlib)
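
gzstream installs a single static library plus gzstream.h, which provides an std::ifstream-like igzstream that gunzips transparently; that is what a gzip-compressed CTR data file can be read with once the library above is linked in. A minimal sketch, assuming a hypothetical data file and a link line such as g++ read_gz.cc -lgzstream -lz (neither comes from this commit):

#include <cstddef>
#include <iostream>
#include <string>

#include <gzstream.h>  // installed by the extern_gzstream target above

int main(int argc, char* argv[]) {
  if (argc < 2) {
    std::cerr << "usage: read_gz <data.gz>" << std::endl;
    return 1;
  }
  // igzstream behaves like std::ifstream but decompresses on the fly.
  igzstream in(argv[1]);
  if (!in.good()) {
    std::cerr << "failed to open " << argv[1] << std::endl;
    return 1;
  }
  std::string line;
  std::size_t count = 0;
  while (std::getline(in, line)) {
    ++count;  // a real CTR reader would parse the record here
  }
  std::cout << "read " << count << " lines" << std::endl;
  return 0;
}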

@@ -97,8 +97,8 @@ paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_ti
paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)

@@ -39,11 +39,12 @@ if (WITH_GPU)
endif()
cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass)
if (WITH_GPU)
list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
endif()

@@ -0,0 +1,125 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/op_graph_view.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle {
namespace framework {
namespace details {
static constexpr char kAllOpDescs[] = "all_op_descs";
VarHandle* GetValidInput(const OpHandleBase* a) {
for (auto p : a->Inputs()) {
VarHandle* b = dynamic_cast<VarHandle*>(p);
if (b) {
return b;
}
}
return nullptr;
}
std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto graph_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
// get vars order
int order = 0;
std::unordered_map<std::string, int> vars;
// TODO(gongwb): use graph topology sort to find the order of operators.
// Note that must assert topology sort is stable
auto& ops = Get<const std::vector<OpDesc*>>(kAllOpDescs);
for (auto* op_desc : ops) {
auto outputs = op_desc->Outputs();
for (auto& o_it : outputs) {
for (auto& v : o_it.second) { // values
vars[v] = order;
}
}
order++;
}
std::vector<OpHandleBase*> dist_ops;
// get allreduce ops.
for (auto& op : graph_ops) {
// FIXME(gongwb):add broad cast.
if (op->Name() == "all_reduce" || op->Name() == "reduce") {
dist_ops.push_back(op);
}
}
VLOG(10) << "dist_ops size:" << dist_ops.size() << std::endl;
std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1,
OpHandleBase* op2) {
VarHandle* i0 = dynamic_cast<VarHandle*>(GetValidInput(op1));
VarHandle* i1 = dynamic_cast<VarHandle*>(GetValidInput(op2));
PADDLE_ENFORCE(i0 != nullptr && i1 != nullptr, "%s convert to %s error",
op1->DebugString(), op2->DebugString());
auto l_it = vars.find(i0->name_);
auto r_it = vars.find(i1->name_);
if (l_it->second < r_it->second) return true;
if (l_it->second == r_it->second) {
return i0->name_ < i1->name_;
}
return false;
});
// add dependency.
auto& sorted_ops = dist_ops;
for (size_t i = 1; i < sorted_ops.size(); ++i) {
auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
auto* pre_op = sorted_ops[i - 1];
auto* op = sorted_ops[i];
pre_op->AddOutput(dep_var);
op->AddInput(dep_var);
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
VLOG(10) << "add all_reduce sequential dependencies between " << pre_op
<< " and " << op;
VLOG(10) << "pre_op:" << pre_op->DebugString()
<< ", op:" << op->DebugString();
}
return graph;
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(all_reduce_deps_pass,
paddle::framework::details::AllReduceDepsPass)
.RequirePassAttr(paddle::framework::details::kAllOpDescs);
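
The pass above makes the launch order of the collective ops deterministic: each all_reduce/reduce op is ranked by the program-order position of the op that produces its first input variable, and the sorted ops are then chained through control-dependency variables, so every trainer issues the collectives in exactly the same sequence. A self-contained toy sketch of that idea (ToyOp, the variable names, and the map contents are hypothetical, not Paddle types or data):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

struct ToyOp {
  std::string first_input;   // stands in for GetValidInput(op)->name_
  std::vector<ToyOp*> deps;  // stands in for the control-dependency edges
};

int main() {
  // Variable name -> position (in program order) of the op that produced it.
  std::unordered_map<std::string, int> var_order = {
      {"w1@GRAD", 2}, {"w0@GRAD", 5}, {"w2@GRAD", 0}};

  std::vector<ToyOp> ops = {{"w0@GRAD", {}}, {"w1@GRAD", {}}, {"w2@GRAD", {}}};
  std::vector<ToyOp*> ptrs;
  for (auto& op : ops) ptrs.push_back(&op);

  // Deterministic order: by producer position, ties broken by variable name.
  std::sort(ptrs.begin(), ptrs.end(), [&](ToyOp* a, ToyOp* b) {
    int la = var_order.at(a->first_input), lb = var_order.at(b->first_input);
    return la != lb ? la < lb : a->first_input < b->first_input;
  });

  // Chain the sorted ops: op i can only start after op i-1, so the collectives
  // are launched one after another in the same order on every trainer.
  for (std::size_t i = 1; i < ptrs.size(); ++i) {
    ptrs[i]->deps.push_back(ptrs[i - 1]);
  }

  for (auto* op : ptrs) std::cout << op->first_input << std::endl;  // w2, w1, w0
  return 0;
}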

@@ -0,0 +1,33 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace details {
// TODO(gongwb): overlap allreduce with backward computation.
class AllReduceDepsPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace details
} // namespace framework
} // namespace paddle

@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/sequential_execution_pass.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
@@ -24,6 +25,10 @@ namespace paddle {
namespace framework {
namespace details {
static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
return (!strategy.enable_sequential_execution_ && strategy.num_trainers_ > 1);
}
class ParallelExecutorPassBuilder : public ir::PassBuilder {
public:
explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
@@ -70,6 +75,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// Verify that the graph is correct for multi-device executor.
AppendPass("multi_devices_check_pass");
if (SeqOnlyAllReduceOps(strategy)) {
AppendPass("all_reduce_deps_pass");
}
if (strategy_.remove_unnecessary_lock_) {
AppendPass("modify_op_lock_and_record_event_pass");
}
@@ -124,6 +133,17 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
} else if (pass->Type() == "sequential_execution_pass") {
VLOG(1) << "set enable_sequential_execution:"
<< enable_sequential_execution_;
pass->Erase(kAllOpDescs);
pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "all_reduce_deps_pass") {
VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
<< ", num_trainers:" << num_trainers_;
pass->Erase(kAllOpDescs);
pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
@@ -144,4 +164,5 @@ USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
USE_PASS(sequential_execution_pass);
USE_PASS(all_reduce_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass);

@@ -73,6 +73,7 @@ struct BuildStrategy {
bool fuse_broadcast_op_{false};
int num_trainers_{1};
bool remove_unnecessary_lock_{false};
// NOTE:

@@ -71,7 +71,7 @@ class OperatorBase;
class ExecutionContext;
/**
* OperatorBase has the basic element that Net will call to do computation.
* OperatorBase has the basic elements that Net will call to do computation.
* Only CreateOperator from OpRegistry will new Operator directly. User
* should always construct a proto message OpDesc and call
* OpRegistry::CreateOp(op_desc) to get an Operator instance.

@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
@@ -54,7 +54,7 @@ class ParallelExecutorPrivate {
Scope *global_scope_; // not owned
std::unique_ptr<details::SSAGraphExecutor> executor_;
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
bool own_local_scope_;
@@ -104,7 +104,7 @@ ParallelExecutor::ParallelExecutor(
if (member_->use_cuda_) {
// Bcast Parameters to all GPUs
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
ncclUniqueId *nccl_id = nullptr;
if (nccl_id_var != nullptr) {
@@ -124,7 +124,7 @@ ParallelExecutor::ParallelExecutor(
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
main_program, member_->places_, loss_var_name, params,
member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());
@@ -213,7 +213,7 @@ void ParallelExecutor::BCastParamsToDevices(
}
auto &dims = main_tensor.dims();
if (paddle::platform::is_gpu_place(main_tensor.place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::vector<void *> buffers;
size_t numel = main_tensor.numel();
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());

@@ -120,8 +120,22 @@ class SelectedRows {
*/
int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false);
void SyncIndex();
/*
* @brief Get the index of the key from id_to_index_ map.
*/
inline int64_t GetIndexFromId(int64_t key) {
auto iter = id_to_index_.find(key);
if (iter == id_to_index_.end()) {
return -1;
} else {
return iter->second;
}
}
void SyncIndex();
/*
* @brief Get complete Dims before
*/
DDim GetCompleteDims() const {
std::vector<int64_t> dims = vectorize(value_->dims());
dims[0] = height_;
@@ -133,9 +147,10 @@ class SelectedRows {
// SelectedRows are simply concated when adding together. Until a
// SelectedRows add a Tensor, will the duplicate rows be handled.
Vector<int64_t> rows_;
std::unordered_map<int64_t, int64_t> id_to_index_;
std::unordered_map<int64_t, int64_t>
id_to_index_; // should not be used when rows_ has duplicate member
std::unique_ptr<Tensor> value_{nullptr};
int64_t height_;
int64_t height_; // height indicates the underline tensor's height
std::unique_ptr<RWLock> rwlock_{nullptr};
};
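
A small, self-contained illustration of the GetIndexFromId() contract introduced above, using a hypothetical ToyRowTable rather than SelectedRows itself: -1 means the id has no row yet, and the caller is expected to handle that case (for example by falling back to AutoGrownIndex):

#include <cstdint>
#include <iostream>
#include <unordered_map>

class ToyRowTable {
 public:
  void Insert(int64_t id, int64_t index) { id_to_index_[id] = index; }
  // Mirrors the helper above: the row offset if the id is known, otherwise -1.
  int64_t GetIndexFromId(int64_t key) const {
    auto iter = id_to_index_.find(key);
    return iter == id_to_index_.end() ? -1 : iter->second;
  }

 private:
  std::unordered_map<int64_t, int64_t> id_to_index_;
};

int main() {
  ToyRowTable table;
  table.Insert(/*id=*/42, /*index=*/3);
  std::cout << table.GetIndexFromId(42) << std::endl;  // 3
  std::cout << table.GetIndexFromId(7) << std::endl;   // -1: caller handles the miss
  return 0;
}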

@@ -17,28 +17,16 @@
namespace paddle {
namespace framework {
// Holds all the transfer scope across the process.
std::unordered_map<size_t, Scope*>& global_transfer_data_cache() {
typedef std::unordered_map<size_t, Scope*> map_t;
thread_local std::unique_ptr<map_t> x(new map_t);
thread_local auto* x = new std::unordered_map<size_t, Scope*>;
return *x;
}
// Holds all the transfer scope for this thread.
std::unordered_set<Scope*>& global_transfer_scope_cache() {
typedef std::unordered_set<Scope*> set_t;
thread_local std::unique_ptr<set_t> x(new set_t);
thread_local auto* x = new std::unordered_set<Scope*>;
return *x;
}
// Try to create a transfer scope. If one cached scope has match the
// requirement, just return that one.
// Inputs:
// @type0: the source kernel type.
// @type1: the target kernel type.
// @scope: the execution scope of this op.
// Returns: A scope used to hold the transfer data across the different kernel
// type.
Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
const Scope* scope) {
Scope* new_scope{nullptr};
@@ -58,5 +46,27 @@ Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
return new_scope;
}
void RemoveKidsFromTransferScopeCache(Scope* scope) {
auto it = global_transfer_scope_cache().find(scope);
if (it != global_transfer_scope_cache().end()) {
global_transfer_scope_cache().erase(it);
}
for (auto* s : scope->kids()) {
auto it = global_transfer_scope_cache().find(s);
if (it != global_transfer_scope_cache().end()) {
global_transfer_scope_cache().erase(it);
}
}
// remove global transfer data cache
auto& cache = global_transfer_data_cache();
for (auto it = cache.begin(); it != cache.end();) {
if (it->second == scope)
it = cache.erase(it);
else
it++;
}
}
} // namespace framework
} // namespace paddle
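
Both cache accessors above now return a deliberately leaked thread_local object instead of keeping it in a thread_local std::unique_ptr. A minimal sketch of that pattern; the stated motivation, avoiding teardown-order problems when a thread exits while the cache is still referenced, is an assumption rather than something spelled out in this commit:

#include <string>
#include <unordered_map>

// Each thread gets its own map. It is intentionally never deleted, so no code
// can touch a half-destroyed cache during thread_local teardown.
std::unordered_map<std::string, int>& PerThreadCache() {
  thread_local auto* cache = new std::unordered_map<std::string, int>;
  return *cache;
}

int main() {
  PerThreadCache()["hits"] += 1;
  return 0;
}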

@@ -35,4 +35,5 @@ function(inference_analysis_test TARGET)
endif()
endfunction(inference_analysis_test)
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS reset_tensor_array paddle_inference_api)
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
EXTRA_DEPS reset_tensor_array paddle_inference_api)

@@ -76,7 +76,8 @@ void TestWord2vecPrediction(const std::string& model_path) {
0.000932706};
const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min((size_t)5UL, num_elements); i++) {
for (size_t i = 0; i < std::min(static_cast<size_t>(5UL), num_elements);
i++) {
LOG(INFO) << "data: "
<< static_cast<float*>(outputs.front().data.data())[i];
PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],

@@ -284,6 +284,7 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
framework::GetFetchVariable(*scope, "fetch", idx);
auto type = fetch.type();
auto output = &(outputs->at(i));
output->name = fetchs_[idx]->Input("X")[0];
if (type == typeid(float)) {
GetFetchOne<float>(fetch, output);
output->dtype = PaddleDType::FLOAT32;

@@ -109,7 +109,7 @@ class AnalysisPredictor : public PaddlePredictor {
std::map<std::string, size_t> feed_names_;
std::vector<framework::OpDesc *> fetchs_;
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious
// concurrency problems, so cache them.
// concurrency problems, wrong results and memory leak, so cache them.
std::vector<framework::LoDTensor> feed_tensors_;
details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;

@@ -185,8 +185,12 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
<< inputs.size();
return false;
}
// Cache the inputs memory for better concurrency performance.
feed_tensors_.resize(inputs.size());
for (size_t i = 0; i < inputs.size(); ++i) {
framework::LoDTensor input;
auto &input = feed_tensors_[i];
framework::DDim ddim = framework::make_ddim(inputs[i].shape);
void *input_ptr;
if (inputs[i].dtype == PaddleDType::INT64) {
@@ -261,6 +265,7 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
framework::GetFetchVariable(*scope, "fetch", idx);
auto type = fetch.type();
auto output = &(outputs->at(i));
output->name = fetchs_[idx]->Input("X")[0];
if (type == typeid(float)) {
GetFetchOne<float>(fetch, output);
output->dtype = PaddleDType::FLOAT32;

@@ -69,6 +69,9 @@ class NativePaddlePredictor : public PaddlePredictor {
std::vector<framework::OpDesc *> feeds_;
std::map<std::string, size_t> feed_names_;
std::vector<framework::OpDesc *> fetchs_;
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious
// concurrency problems, wrong results and memory leak, so cache them.
std::vector<framework::LoDTensor> feed_tensors_;
// Do not use unique_ptr, use parent scope to delete
framework::Scope *sub_scope_{nullptr};
details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;

@@ -86,7 +86,11 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
munlock(p, size);
#endif
}
#ifdef _WIN32
_aligned_free(p);
#else
free(p);
#endif
}
bool CPUAllocator::UseGpu() const { return false; }
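
The new _WIN32 branch is needed because memory obtained from the Windows CRT with _aligned_malloc must be released with _aligned_free, not free(); presumably the matching CPUAllocator::Alloc path uses _aligned_malloc on Windows, although that hunk is not shown here. A minimal cross-platform sketch of the pairing (the helper names are hypothetical, not Paddle code):

#include <cstdlib>
#ifdef _WIN32
#include <malloc.h>  // _aligned_malloc / _aligned_free
#endif

void* AlignedAlloc(std::size_t alignment, std::size_t size) {
#ifdef _WIN32
  return _aligned_malloc(size, alignment);
#else
  void* p = nullptr;
  // alignment must be a power of two and a multiple of sizeof(void*).
  return posix_memalign(&p, alignment, size) == 0 ? p : nullptr;
#endif
}

void AlignedFree(void* p) {
#ifdef _WIN32
  _aligned_free(p);  // plain free() would corrupt the CRT heap here
#else
  free(p);  // posix_memalign memory is released with ordinary free()
#endif
}

int main() {
  void* p = AlignedAlloc(64, 1024);
  AlignedFree(p);
  return 0;
}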

@@ -149,6 +149,13 @@ $out = \max(x, 0)$
)DOC";
UNUSED constexpr char GeluDoc[] = R"DOC(
Gelu Activation Operator.
$out = \\frac{1 + erf(\\frac{x}{\\sqrt{2}})}{2} x$
)DOC";
UNUSED constexpr char TanhDoc[] = R"DOC(
Tanh Activation Operator.
@@ -472,6 +479,7 @@ REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc);
REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc);
REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc);
REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc);
REGISTER_ACTIVATION_OP_MAKER(Gelu, GeluDoc);
REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc);
REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc);
REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc);
@@ -489,6 +497,7 @@ REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Sigmoid, sigmoid);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Relu, relu);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Gelu, gelu);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Exp, exp);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Tanh, tanh);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Ceil, ceil);
@@ -525,6 +534,7 @@ namespace ops = paddle::operators;
__macro(Round, round); \
__macro(Log, log); \
__macro(Square, square); \
__macro(Gelu, gelu); \
__macro(BRelu, brelu); \
__macro(Pow, pow); \
__macro(STanh, stanh); \

@@ -16,6 +16,11 @@ limitations under the License. */
#include <utility>
#include <vector>
#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES
#endif
#include <cmath>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
@@ -212,6 +217,31 @@ struct ReluGradFunctor : public BaseActivationFunctor<T> {
}
};
// gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
template <typename T>
struct GeluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto temp =
((x * static_cast<T>(M_SQRT1_2)).erf()).template cast<T>().eval();
out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
}
};
template <typename T>
struct GeluGradFunctor : BaseActivationFunctor<T> {
bool Inplace() const { return IsInplace("gelu"); }
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp = (static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2) * x *
((-static_cast<T>(0.5) * x.square()).exp()))
.template cast<T>()
.eval();
dx.device(d) = dout * (out / x + temp);
}
};
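// A quick consistency check of GeluGradFunctor above (a derivation sketch,
// assuming the erf-based GELU definition from GeluDoc):
//   gelu(x)  = 0.5 * x * (1 + erf(x / sqrt(2)))
//   gelu'(x) = 0.5 * (1 + erf(x / sqrt(2))) + x * exp(-x^2 / 2) / sqrt(2 * pi)
// The first term equals out / x, and since M_2_SQRTPI = 2 / sqrt(pi) and
// M_SQRT1_2 = 1 / sqrt(2), the factor 0.5 * M_2_SQRTPI * M_SQRT1_2 equals
// 1 / sqrt(2 * pi), so temp above is the second term and
// dx = dout * (out / x + temp).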
// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
struct TanhFunctor : public BaseActivationFunctor<T> {
@@ -877,6 +907,7 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
__macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \
__macro(exp, ExpFunctor, ExpGradFunctor); \
__macro(relu, ReluFunctor, ReluGradFunctor); \
__macro(gelu, GeluFunctor, GeluGradFunctor); \
__macro(tanh, TanhFunctor, TanhGradFunctor); \
__macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
__macro(sqrt, SqrtFunctor, SqrtGradFunctor); \

@@ -70,7 +70,7 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> {
if (bias) {
auto bias_vec = EigenMatrix<T>::From(*bias);
Eigen::DSizes<int, 2> bcast(batch_size, 1);
output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat;
output_mat.device(place) = bias_vec.broadcast(bcast).eval() + output_mat;
}
}
};
@@ -99,13 +99,13 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
auto d_out_mat = EigenMatrix<T>::From(*d_out);
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto& dev_ctx = ctx.template device_context<DeviceContext>();
// Create the intermediate variable to caculate the Output(Y@Grad).
// Create the intermediate variable to calculate the Output(Y@Grad).
Tensor x_scale;
x_scale.mutable_data<T>(framework::make_ddim({batch_size, x_dim}),
ctx.GetPlace());
auto x_scale_mat = EigenMatrix<T>::From(x_scale);
// Create the intermediate variable to caculate the Output(X@Grad).
// Create the intermediate variable to calculate the Output(X@Grad).
Tensor y_scale;
y_scale.mutable_data<T>(framework::make_ddim({batch_size, y_dim}),
ctx.GetPlace());
@@ -113,65 +113,64 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
math::SetConstant<DeviceContext, T> set_zero;
// Set Output(X@Grad) be zero.
if (d_x) {
d_x->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, d_x, static_cast<T>(0));
}
// Set Output(Y@Grad) be zero.
if (d_y) {
d_y->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, d_y, static_cast<T>(0));
}
if (d_weight) {
d_weight->mutable_data<T>(ctx.GetPlace());
}
auto blas = math::GetBlas<DeviceContext, T>(ctx);
// Caculate the Output(X@Grad) and Output(Y@Grad).
if (d_x || d_y) {
if (d_x || d_y || d_weight) {
Eigen::DSizes<int, 2> bcast_for_x(1, y_dim);
Eigen::DSizes<int, 2> bcast_for_y(1, x_dim);
Eigen::DSizes<int, 2> bcast_for_weight(1, x_dim);
for (int i = 0; i < out_dim; ++i) {
Tensor weight_i = weight->Slice(i, i + 1).Resize(
framework::make_ddim({x_dim, y_dim}));
auto output_vec = d_out_mat.chip(i, 1);
if (d_x) {
y_scale_mat.device(place) =
output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
.broadcast(bcast_for_x) *
.broadcast(bcast_for_x)
.eval() *
y_mat;
blas.GEMM(CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1,
y_scale.data<T>(), weight_i.data<T>(), 1, d_x->data<T>());
}
if (d_y) {
x_scale_mat.device(place) =
if (d_y || d_weight) {
auto output_vec_y =
output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
.broadcast(bcast_for_y) *
x_mat;
blas.GEMM(CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1,
x_scale.data<T>(), weight_i.data<T>(), 1, d_y->data<T>());
.broadcast(bcast_for_y)
.eval();
x_scale_mat.device(place) = output_vec_y * x_mat;
if (d_y) {
blas.GEMM(CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1,
x_scale.data<T>(), weight_i.data<T>(), 1, d_y->data<T>());
}
if (d_weight) {
Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize(
framework::make_ddim({x_dim, y_dim}));
blas.GEMM(CblasTrans, CblasNoTrans, x_dim, y_dim, batch_size, 1,
x_scale.data<T>(), y->data<T>(), 0, d_weight_i.data<T>());
}
}
}
}
// Caculate the gradient of Input(Weight).
if (d_weight) {
d_weight->mutable_data<T>(ctx.GetPlace());
Eigen::DSizes<int, 2> bcast_for_weight(1, x_dim);
for (int i = 0; i < out_dim; ++i) {
Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize(
framework::make_ddim({x_dim, y_dim}));
auto output_vec = d_out_mat.chip(i, 1);
x_scale_mat.device(place) =
output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
.broadcast(bcast_for_weight) *
x_mat;
blas.GEMM(CblasTrans, CblasNoTrans, x_dim, y_dim, batch_size, 1,
x_scale.data<T>(), y->data<T>(), 0, d_weight_i.data<T>());
}
}
// Caculate the gradient of Input(Bias).
// calculate the gradient of Input(Bias).
if (d_bias) {
d_bias->mutable_data<T>(ctx.GetPlace());
auto d_bias_mat = framework::EigenVector<T>::Flatten(*d_bias);

@@ -120,6 +120,7 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
"Dimensions of Input(X) and Mask must be the same.");
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
}
};

@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <string>
#include <thread> // NOLINT

@@ -19,36 +19,21 @@ limitations under the License. */
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "xbyak.h"
#include "xbyak_util.h"
#include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h"
namespace paddle {
namespace operators {
using framework::DataLayout;
using mkldnn::memory;
static mkldnn::memory::format StringToMKLDNNFormat(std::string& format) {
std::transform(format.begin(), format.end(), format.begin(), ::tolower);
if (!format.compare("nchw")) {
return memory::format::nchw;
} else if (!format.compare("nchw16c")) {
return memory::format::nChw16c;
} else if (!format.compare("nchw8c")) {
return memory::format::nChw8c;
} else if (!format.compare("nhwc")) {
return memory::format::nhwc;
} else {
return memory::format::any;
}
}
using platform::StringToMKLDNNFormat;
static void UpdateDataFormat(const framework::ExecutionContext& ctx,
framework::Tensor* tensor, const char* attribute) {
if (ctx.op().HasAttr(attribute)) {
auto format_as_string = ctx.Attr<std::string>(attribute);
auto format = StringToMKLDNNFormat(format_as_string);
auto format = StringToMKLDNNFormat(&format_as_string);
if (format != memory::format::any) {
tensor->set_format(format);
}
@@ -93,8 +78,8 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
auto y_dims_untrimmed = y->dims();
auto x_int_dims = paddle::framework::vectorize2int(x_dims);
UpdateDataFormat(ctx, (Tensor*)x, "x_data_format");
UpdateDataFormat(ctx, (Tensor*)y, "y_data_format");
UpdateDataFormat(ctx, const_cast<Tensor*>(x), "x_data_format");
UpdateDataFormat(ctx, const_cast<Tensor*>(y), "y_data_format");
Xbyak::util::Cpu cpu;
const bool is_avx512_enabled = cpu.has(Xbyak::util::Cpu::tAVX512F);
@@ -156,10 +141,10 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
if (!(is_x_nchw || is_x_nc))
ReorderInput<T>((Tensor*)x, ctx.GetPlace(), mkldnn_engine,
ReorderInput<T>(const_cast<Tensor*>(x), ctx.GetPlace(), mkldnn_engine,
x->dims().size() == 4);
if (!(is_y_nchw || is_y_nc))
ReorderInput<T>((Tensor*)y, ctx.GetPlace(), mkldnn_engine,
ReorderInput<T>(const_cast<Tensor*>(y), ctx.GetPlace(), mkldnn_engine,
y->dims().size() == 4);
}

Some files were not shown because too many files have changed in this diff.