Merge branch 'develop' into mozga-intel/Mul_mkldnn

release/0.12.0
mozga-intel 7 years ago committed by GitHub
commit 171471eada

@ -33,3 +33,45 @@ Xavier
:members:
:noindex:
MSRA
------
.. autoclass:: paddle.fluid.initializer.MSRA
:members:
:noindex:
ConstantInitializer
-------------------
.. autoclass:: paddle.fluid.initializer.ConstantInitializer
:members:
:noindex:
UniformInitializer
------------------
.. autoclass:: paddle.fluid.initializer.UniformInitializer
:members:
:noindex:
NormalInitializer
-----------------
.. autoclass:: paddle.fluid.initializer.NormalInitializer
:members:
:noindex:
XavierInitializer
-----------------
.. autoclass:: paddle.fluid.initializer.XavierInitializer
:members:
:noindex:
MSRA
------
MSRAInitializer
-----------------
.. autoclass:: paddle.fluid.initializer.MSRAInitializer
:members:
:noindex:

@ -815,3 +815,8 @@ zeros
.. autofunction:: paddle.fluid.layers.zeros
:noindex:
topk
----
.. autofunction:: paddle.fluid.layers.topk
:noindex:

@ -0,0 +1,46 @@
# MPI-enabled PaddlePaddle Design doc
# Background
When we do distributed multi-GPU training, the communication overhead between servers becomes the major bottleneck, for the following reasons:
1. Data must be copied at least once from GPU to CPU memory so that it is ready to transfer, and on the pserver side, copying data from CPU back to GPU introduces further overhead.
2. GPU->CPU data transfer is about 10 times slower than data transfer between GPUs or between PCIe devices.
3. TCP connections cannot make full use of 100Gb RDMA devices.
We will add the OpenMPI API to PaddlePaddle, which brings two benefits:
1. Enable RDMA in PaddlePaddle, which brings a high-performance, low-latency network.
2. Enable GPUDirect in PaddlePaddle, which brings the highest throughput and lowest latency for GPU reads and writes.
# Change list
* Compile args: Add compile arguments to enable MPI support.
* Execute args: Add execute arguments to specify when and how to use MPI operations.
* New ops: Add the new operators ```mpi_send_op``` and ```mpi_listenandserve_op``` to support MPI send and receive.
* Transpiler optimization: Insert ```mpi_send_op``` and ```mpi_listenandserve_op``` into the running graph.
* MPI utils package: Provide an MPI utils package as the supporting low-level API.
## Compile args
Because MPI and CUDA require hardware support, we will add compile arguments to enable MPI support and control compilation. A ```WITH_MPI``` compile argument controls whether MPI is used. If ```WITH_MPI``` is ```ON```, the build system will look for the OpenMPI installation during configuration, so the OpenMPI environment must be prepared before compiling.
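As a rough illustration, the ```WITH_MPI``` option could translate into a preprocessor definition that gates MPI-specific code paths at compile time; the macro name ```PADDLE_WITH_MPI``` below is an assumption, not an existing definition.
```cpp
// Minimal sketch (assumption): the build system defines PADDLE_WITH_MPI
// when WITH_MPI=ON, so MPI-specific code compiles only in MPI builds.
#ifdef PADDLE_WITH_MPI
#include <mpi.h>
#endif

bool IsMPIEnabledBuild() {
#ifdef PADDLE_WITH_MPI
  return true;   // compiled against OpenMPI
#else
  return false;  // MPI support not compiled in
#endif
}
```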
## Execute args
Launch the script using the ```mpirun``` launcher, for example: ```mpirun -np 3 -hosts node1,node2,node3 python train.py```. This numbers the actors (trainer/pserver/master) from 0 to (n-1). Each node's number is the rank of the calling process within the communicator (an integer), and MPI processes identify each other by this rank ID. We have to create a mapping between PaddlePaddle's nodes and their rank IDs so that we can communicate with the correct destinations when using MPI operations.
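For reference, a minimal sketch using the standard MPI C API shows how each process launched by ```mpirun``` could obtain its rank and the world size; registering the node-to-rank mapping is only indicated by a comment.
```cpp
// Illustrative sketch (standard MPI C API): each process started by mpirun
// discovers its own rank and the total number of processes.
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);

  int rank = 0, world_size = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);        // this process's rank ID
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);  // number of launched processes

  // A PaddlePaddle node would register its (role, endpoint) -> rank mapping
  // here so that later MPI operations target the correct destination rank.
  std::printf("rank %d of %d\n", rank, world_size);

  MPI_Finalize();
  return 0;
}
```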
## New ops
We won't replace all gRPC requests with MPI requests: the standard gRPC library is still used for all administrative operations, while the MPI API is used to transfer tensors or SelectedRows to pservers. Based on this idea, we create two new operators to handle sends and receives: ```mpi_send_op``` and ```mpi_listenandserve_op```. They are similar to [send_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/send_op.cc) and [listen_and_serv_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/listen_and_serv_op.cc); in addition, we will build a new module to package the MPI send and receive process.
### mpi_send_op
Very similar to ```send_op```: we will replace the gRPC code that sends gradients with the ```mpi_module```, and at the same time wrap it with ```framework::Async```.
### mpi_listenandserve_op
Very similar to ```listen_and_serv_op```: we will replace the gRPC code that receives gradients with the ```mpi_module```, and at the same time wrap it with ```framework::Async```.
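A minimal sketch of the sending side, under stated assumptions: ```std::async``` stands in for the ```framework::Async``` wrapper mentioned above, and the helper name, tag, and datatype are placeholders rather than Paddle APIs.
```cpp
// Minimal sketch: non-blocking MPI send of a gradient buffer wrapped in an
// async task. Assumptions: std::async stands in for framework::Async, the
// tag and MPI_FLOAT datatype are placeholders, and the caller must keep
// `grad` alive until the returned future completes.
#include <mpi.h>
#include <future>
#include <vector>

std::future<void> AsyncMpiSendGradient(const std::vector<float>& grad,
                                       int dest_rank) {
  return std::async(std::launch::async, [&grad, dest_rank] {
    MPI_Request request;
    MPI_Isend(grad.data(), static_cast<int>(grad.size()), MPI_FLOAT,
              dest_rank, /*tag=*/0, MPI_COMM_WORLD, &request);
    MPI_Wait(&request, MPI_STATUS_IGNORE);  // finish before the task ends
  });
}
```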
## Transpiler optimization
**We can read the environment variables ```OMPI_COMM_WORLD_SIZE``` and ```OMPI_COMM_WORLD_RANK``` to determine whether MPI is in use; if the process was launched by OpenMPI, these variables must exist.**
If MPI use is confirmed, the distribute_transpiler will replace ```send_op``` with ```mpi_send_op``` and likewise replace ```listenandserve_op``` with ```mpi_listenandserve_op```.
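A minimal sketch of that environment check; ```LaunchedByOpenMPI``` is a hypothetical helper, not an existing Paddle function.
```cpp
// Minimal sketch of the environment check described above; LaunchedByOpenMPI
// is a hypothetical helper, not an existing Paddle function.
#include <cstdlib>

bool LaunchedByOpenMPI() {
  // mpirun from OpenMPI exports these variables for every launched process.
  return std::getenv("OMPI_COMM_WORLD_SIZE") != nullptr &&
         std::getenv("OMPI_COMM_WORLD_RANK") != nullptr;
}
```
When this returns true, the transpiler would substitute the MPI variants of the send/receive ops.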
## MPI utils package
In this package, we will wrap the low-level OpenMPI API for the operators above to use.
The APIs included in this package are:
* MPI send and receive module. We will build a new module to package the MPI send and receive process. MPI send and receive differ from gRPC: the MPI [receive](https://www.open-mpi.org/doc/v1.8/man3/MPI_Irecv.3.php) must know the receive buffer size and element type in advance. For this reason, we have to communicate twice: the first pass sends metadata about the gradient through gRPC, and the second pass is the real communication through MPI, which sends the gradient data to ```mpi_listenandserve_op``` (a minimal sketch of this two-phase receive follows the list below).
The detailed flow is below:
![](https://github.com/seiriosPlus/Paddle/blob/mpi_enabled/doc/fluid/design/dist_train/src/mpi_module.png)
* MPI global configuration, which stores the rank IDs and the mapping in global variables, for example:
gRPC client : MPI node rank : ``` 127.0.0.1:32004 : 3 ```
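Following up on the two-phase exchange above, here is a minimal sketch of the receiver side, assuming the gradient's element count arrived earlier as gRPC metadata; the function name and tag are placeholders, not Paddle APIs.
```cpp
// Minimal sketch of the receiver side of the two-phase exchange: the element
// count arrives first as gRPC metadata, so MPI_Irecv can be posted with a
// correctly sized buffer. ReceiveGradient and the tag are placeholders.
#include <mpi.h>
#include <vector>

std::vector<float> ReceiveGradient(int src_rank, int element_count) {
  std::vector<float> buffer(element_count);  // sized from the gRPC metadata

  MPI_Request request;
  MPI_Irecv(buffer.data(), element_count, MPI_FLOAT, src_rank,
            /*tag=*/0, MPI_COMM_WORLD, &request);
  MPI_Wait(&request, MPI_STATUS_IGNORE);  // block until the data has arrived
  return buffer;
}
```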

Binary image file not shown (after: 104 KiB).

@ -6,7 +6,43 @@ Data Reader Interface
DataTypes
=========
.. automodule:: paddle.v2.data_type
.. autofunction:: paddle.v2.data_type.dense_array
:noindex:
.. autofunction:: paddle.v2.data_type.integer_value
:noindex:
.. autofunction:: paddle.v2.data_type.integer_value_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.integer_value_sub_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_binary_vector
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_float_vector
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_float_vector_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_non_value_slot
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_value_slot
:noindex:
.. autoclass:: paddle.v2.data_type.InputType
:members:
:noindex:

@ -102,7 +102,7 @@ cc_test(init_test SRCS init_test.cc DEPS init)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
cc_test(channel_test SRCS channel_test.cc)
# cc_test(channel_test SRCS channel_test.cc)
cc_test(tuple_test SRCS tuple_test.cc )
cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op

@ -77,14 +77,9 @@ struct TestBroadcastOpHandle {
local_scopes_[input_scope_idx]->Var("input");
op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
vars_.emplace_back(new VarHandle());
VarHandle* in_var_handle = static_cast<VarHandle*>(vars_.back().get());
in_var_handle->place_ = gpu_list_[input_scope_idx];
in_var_handle->name_ = "input";
in_var_handle->version_ = 1;
in_var_handle->scope_idx_ = input_scope_idx;
in_var_handle->generated_op_ = nullptr;
auto* in_var_handle =
new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]);
vars_.emplace_back(in_var_handle);
op_handle_->AddInput(in_var_handle);
// add dummy var
@ -96,12 +91,8 @@ struct TestBroadcastOpHandle {
for (size_t j = 0; j < gpu_list_.size(); ++j) {
op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
vars_.emplace_back(new VarHandle());
VarHandle* out_var_handle = static_cast<VarHandle*>(vars_.back().get());
out_var_handle->place_ = gpu_list_[j];
out_var_handle->name_ = "out";
out_var_handle->version_ = 2;
out_var_handle->scope_idx_ = j;
VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]);
vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle);
}

@ -79,13 +79,8 @@ struct TestGatherOpHandle {
// add input
for (size_t j = 0; j < gpu_list_.size(); ++j) {
op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
vars_.emplace_back(new VarHandle());
VarHandle* in_var_handle = static_cast<VarHandle*>(vars_.back().get());
in_var_handle->place_ = gpu_list_[j];
in_var_handle->name_ = "input";
in_var_handle->version_ = 1;
in_var_handle->scope_idx_ = j;
in_var_handle->generated_op_ = nullptr;
auto* in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
vars_.emplace_back(in_var_handle);
op_handle_->AddInput(in_var_handle);
}
@ -97,12 +92,9 @@ struct TestGatherOpHandle {
op_handle_->AddInput(in_dummy_var_handle);
// add output
vars_.emplace_back(new VarHandle());
VarHandle* out_var_handle = static_cast<VarHandle*>(vars_.back().get());
out_var_handle->place_ = gpu_list_[input_scope_idx];
out_var_handle->name_ = "out";
out_var_handle->version_ = 2;
out_var_handle->scope_idx_ = input_scope_idx;
auto* out_var_handle =
new VarHandle(2, input_scope_idx, "out", gpu_list_[input_scope_idx]);
vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle);
// add dummy var

@ -177,13 +177,9 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
auto &prev_grad = vars[vars.size() - 1];
op_handle->AddInput(prev_grad.get());
vars.emplace_back(new VarHandle);
auto &var = vars.back();
var->place_ = p;
var->name_ = og;
var->version_ = vars.size() - 1;
op_handle->AddOutput(var.get());
auto var = new VarHandle(vars.size() - 1, i, og, p);
vars.emplace_back(var);
op_handle->AddOutput(var);
}
#else
PADDLE_ENFORCE("Not implemented");

@ -73,8 +73,9 @@ void NCCLAllReduceOpHandle::RunImpl() {
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto *s = local_scopes_[i];
auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &lod_tensor = s->FindVar(var_name)->Get<LoDTensor>();
auto &lod_tensor = local_scope.FindVar(var_name)->Get<LoDTensor>();
lod_tensors.emplace_back(lod_tensor);
}
@ -110,17 +111,21 @@ void NCCLAllReduceOpHandle::RunImpl() {
}
});
} else { // Special handle CPU only Operator's gradient. Like CRF
auto &trg =
*this->local_scopes_[0]->Var()->GetMutable<framework::LoDTensor>();
auto &trg = *this->local_scopes_[0]
->FindVar(kLocalExecScopeName)
->Get<Scope *>()
->Var()
->GetMutable<framework::LoDTensor>();
// Reduce All Tensor to trg in CPU
ReduceLoDTensor func(lod_tensors, &trg);
VisitDataType(ToDataType(lod_tensors[0].type()), func);
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &scope = local_scopes_[i];
auto &scope =
*local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &p = places_[i];
auto *var = scope->FindVar(var_name);
auto *var = scope.FindVar(var_name);
auto *dev_ctx = dev_ctxes_[p];
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {

@ -30,10 +30,11 @@ ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
void ScaleLossGradOpHandle::RunImpl() {
std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
float *tmp =
scope_->FindVar(var_name)->GetMutable<LoDTensor>()->mutable_data<float>(
make_ddim({1}), place_);
float *tmp = local_scope.FindVar(var_name)
->GetMutable<LoDTensor>()
->mutable_data<float>(make_ddim({1}), place_);
if (platform::is_cpu_place(place_)) {
*tmp = coeff_;

@ -54,13 +54,8 @@ VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
auto &var_holder = var_holders[each_var_name];
VarHandle *var = nullptr;
if (var_holder.empty()) {
var_holder.emplace_back(new VarHandle);
auto &init_var = var_holder[0];
init_var->place_ = place;
init_var->name_ = each_var_name;
init_var->generated_op_ = nullptr;
init_var->version_ = 0;
var = init_var.get();
var = new VarHandle(0, place_offset, each_var_name, place);
var_holder.emplace_back(var);
} else {
var = var_holder.rbegin()->get();
}
@ -73,12 +68,9 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
size_t place_offset) {
auto &vars = graph->vars_[place_offset][each_var_name];
size_t version = vars.size();
vars.emplace_back(new VarHandle());
auto &var = vars.back();
var->version_ = version;
var->name_ = each_var_name;
var->place_ = place;
op_handle->AddOutput(var.get());
auto var = new VarHandle(version, place_offset, each_var_name, place);
vars.emplace_back(var);
op_handle->AddOutput(var);
}
template <typename Callback>

@ -16,6 +16,7 @@
#include <sstream>
#include <string>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/platform/place.h"
@ -33,10 +34,10 @@ struct VarHandleBase {
// The operator who generate this variable. nullptr if the variable
// is a root node.
OpHandleBase *generated_op_;
OpHandleBase* generated_op_{nullptr};
// Operators which depend on this variable ready.
std::unordered_set<OpHandleBase *> pending_ops_;
std::unordered_set<OpHandleBase*> pending_ops_;
};
// VarHandle is actually a single version of Runtime Variable.
@ -47,6 +48,13 @@ struct VarHandleBase {
struct VarHandle : public VarHandleBase {
std::string DebugString() const override;
VarHandle(size_t version, size_t scope_index, std::string name,
platform::Place place)
: version_(version),
scope_idx_(scope_index),
name_(std::move(name)),
place_(std::move(place)) {}
// version field currently is not used, however, just store the version to
// debug easily.
size_t version_;

@ -63,13 +63,14 @@ ParallelExecutor::ParallelExecutor(
// Step 1. Bcast the params to devs.
// Create local scopes
if (local_scopes.empty()) {
for (size_t i = 0; i < member_->places_.size(); ++i) {
member_->local_scopes_.push_back(&scope->NewScope());
member_->local_scopes_.emplace_back(member_->global_scope_);
for (size_t i = 1; i < member_->places_.size(); ++i) {
member_->local_scopes_.emplace_back(&scope->NewScope());
}
} else {
PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
for (size_t i = 0; i < member_->places_.size(); ++i) {
member_->local_scopes_.push_back(local_scopes[i]);
member_->local_scopes_.emplace_back(local_scopes[i]);
}
}
@ -159,7 +160,9 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
const std::string &fetched_var_name) {
platform::RecordBlock b(0);
// Create local scopes.
for (auto &scope : member_->local_scopes_) {
for (auto it = member_->local_scopes_.rbegin();
it != member_->local_scopes_.rend(); ++it) {
auto &scope = *it;
Scope &local_scope = scope->NewScope();
*scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
&local_scope;
@ -173,7 +176,7 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
std::get<1>(name_type_pair));
} else {
InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
InitializeVariable(local_scope.Var(std::get<0>(name_type_pair)),
std::get<1>(name_type_pair));
}
}

@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/beam_search_decode_op.h"
#include <string>
#include "paddle/fluid/platform/device_context.h"
namespace paddle {

@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
@ -87,7 +88,7 @@ struct BeamSearchDecoder {
*/
std::vector<BeamNodeVector<T>> PackTwoSteps(
const LoDTensor& cur_ids, const LoDTensor& cur_scores,
std::vector<BeamNodeVector<T>>& prefixes_list,
std::vector<BeamNodeVector<T>>* prefixes_list,
std::vector<SentenceVector<T>>* sentence_vector_list) const;
/**
@ -140,7 +141,7 @@ Sentence<T> BeamSearchDecoder<T>::MakeSentence(const BeamNode<T>* node) const {
template <typename T>
std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps(
const LoDTensor& cur_ids, const LoDTensor& cur_scores,
std::vector<BeamNodeVector<T>>& prefixes_list,
std::vector<BeamNodeVector<T>>* prefixes_list,
std::vector<SentenceVector<T>>* sentence_vector_list) const {
std::vector<BeamNodeVector<T>> result;
@ -153,7 +154,7 @@ std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps(
// if prefixes size is 0, it means this is the first step. In this step,
// all candidate id is the start of candidate sentences.
if (prefixes_list.empty()) {
if (prefixes_list->empty()) {
PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(),
cur_ids.lod().at(kSentenceLevel).back(),
"in the first step");
@ -162,7 +163,7 @@ std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps(
cur_ids.data<int64_t>()[id_idx], cur_scores.data<T>()[id_idx])));
}
} else {
BeamNodeVector<T>& prefixes = prefixes_list[src_idx];
BeamNodeVector<T>& prefixes = prefixes_list->at(src_idx);
SentenceVector<T>& sentence_vector = (*sentence_vector_list)[src_idx];
PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(),
@ -262,7 +263,7 @@ void BeamSearchDecoder<T>::PackAllSteps(const LoDTensorArray& step_ids,
for (size_t step_id = 0; step_id < step_num; ++step_id) {
beamnode_vector_list =
PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id),
beamnode_vector_list, &sentence_vector_list);
&beamnode_vector_list, &sentence_vector_list);
}
// append last beam_node to result
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {

@ -125,7 +125,7 @@ TEST(BeamSearchDecodeOp, PackTwoStepsFistStep) {
BeamSearchDecoder<float> helper;
beamnode_vector_list = helper.PackTwoSteps(
ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list);
ASSERT_EQ(beamnode_vector_list.size(), 2UL);
ASSERT_EQ(beamnode_vector_list[0].size(), 2UL);
ASSERT_EQ(beamnode_vector_list[1].size(), 4UL);
@ -167,7 +167,7 @@ TEST(BeamSearchDecodeOp, PackTwoSteps) {
BeamSearchDecoder<float> helper1;
beamnode_vector_list = helper1.PackTwoSteps(
ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list);
ASSERT_EQ(sentence_vector_list[0].size(), 1UL);
ASSERT_EQ(sentence_vector_list[1].size(), 0UL);

@ -14,7 +14,10 @@ limitations under the License. */
#include "paddle/fluid/operators/beam_search_op.h"
#include <algorithm>
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"

@ -18,6 +18,8 @@ limitations under the License. */
#include "gtest/gtest.h"
#endif
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/operator.h"

@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/chunk_eval_op.h"
#include <string>
#include <vector>
namespace paddle {
namespace operators {

@ -14,6 +14,9 @@ limitations under the License. */
#pragma once
#include <set>
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
@ -36,11 +39,11 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
};
void GetSegments(const int64_t* label, int length,
std::vector<Segment>& segments, int num_chunk_types,
std::vector<Segment>* segments, int num_chunk_types,
int num_tag_types, int other_chunk_type, int tag_begin,
int tag_inside, int tag_end, int tag_single) const {
segments.clear();
segments.reserve(length);
segments->clear();
segments->reserve(length);
int chunk_start = 0;
bool in_chunk = false;
int tag = -1;
@ -58,7 +61,7 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
i - 1, // end
prev_type,
};
segments.push_back(segment);
segments->push_back(segment);
in_chunk = false;
}
if (ChunkBegin(prev_tag, prev_type, tag, type, other_chunk_type,
@ -73,7 +76,7 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
length - 1, // end
type,
};
segments.push_back(segment);
segments->push_back(segment);
}
}
@ -177,8 +180,8 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
for (int i = 0; i < num_sequences; ++i) {
int seq_length = lod[0][i + 1] - lod[0][i];
EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], seq_length,
output_segments, label_segments, *num_infer_chunks_data,
*num_label_chunks_data, *num_correct_chunks_data,
&output_segments, &label_segments, num_infer_chunks_data,
num_label_chunks_data, num_correct_chunks_data,
num_chunk_types, num_tag_types, other_chunk_type, tag_begin,
tag_inside, tag_end, tag_single, excluded_chunk_types);
}
@ -197,10 +200,10 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
}
void EvalOneSeq(const int64_t* output, const int64_t* label, int length,
std::vector<Segment>& output_segments,
std::vector<Segment>& label_segments,
int64_t& num_output_segments, int64_t& num_label_segments,
int64_t& num_correct, int num_chunk_types, int num_tag_types,
std::vector<Segment>* output_segments,
std::vector<Segment>* label_segments,
int64_t* num_output_segments, int64_t* num_label_segments,
int64_t* num_correct, int num_chunk_types, int num_tag_types,
int other_chunk_type, int tag_begin, int tag_inside,
int tag_end, int tag_single,
const std::set<int>& excluded_chunk_types) const {
@ -209,25 +212,29 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
GetSegments(label, length, label_segments, num_chunk_types, num_tag_types,
other_chunk_type, tag_begin, tag_inside, tag_end, tag_single);
size_t i = 0, j = 0;
while (i < output_segments.size() && j < label_segments.size()) {
if (output_segments[i] == label_segments[j] &&
excluded_chunk_types.count(output_segments[i].type) != 1) {
++num_correct;
while (i < output_segments->size() && j < label_segments->size()) {
if (output_segments->at(i) == label_segments->at(j) &&
excluded_chunk_types.count(output_segments->at(i).type) != 1) {
++(*num_correct);
}
if (output_segments[i].end < label_segments[j].end) {
if (output_segments->at(i).end < label_segments->at(j).end) {
++i;
} else if (output_segments[i].end > label_segments[j].end) {
} else if (output_segments->at(i).end > label_segments->at(j).end) {
++j;
} else {
++i;
++j;
}
}
for (auto& segment : label_segments) {
if (excluded_chunk_types.count(segment.type) != 1) ++num_label_segments;
for (auto& segment : (*label_segments)) {
if (excluded_chunk_types.count(segment.type) != 1) {
++(*num_label_segments);
}
}
for (auto& segment : output_segments) {
if (excluded_chunk_types.count(segment.type) != 1) ++num_output_segments;
for (auto& segment : (*output_segments)) {
if (excluded_chunk_types.count(segment.type) != 1) {
++(*num_output_segments);
}
}
}
};

@ -73,9 +73,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto src_memory =
mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
mkldnn::memory({src_md, mkldnn_engine},
reinterpret_cast<void*>(const_cast<T*>(input_data)));
auto weights_memory =
mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data);
mkldnn::memory({weights_md, mkldnn_engine},
reinterpret_cast<void*>(const_cast<T*>(filter_data)));
auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data);
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
@ -180,8 +182,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
// create memory
auto diff_dst_memory = mkldnn::memory({diff_weights_md, mkldnn_engine},
(void*)output_grad_data);
auto diff_dst_memory = mkldnn::memory(
{diff_weights_md, mkldnn_engine},
reinterpret_cast<void*>(const_cast<T*>(output_grad_data)));
// Retrieve conv_pd from device context
auto conv_pd =
std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(
@ -198,10 +201,12 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
mkldnn_engine);
// create memory
auto diff_weights_memory = mkldnn::memory(
{diff_weights_md, mkldnn_engine}, (void*)filter_grad_data);
auto diff_weights_memory =
mkldnn::memory({diff_weights_md, mkldnn_engine},
reinterpret_cast<void*>(filter_grad_data));
auto src_memory =
mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
mkldnn::memory({src_md, mkldnn_engine},
reinterpret_cast<void*>(const_cast<T*>(input_data)));
// create backward conv primitive for weights
auto conv_bwd_weights_prim = mkldnn::convolution_backward_weights(
@ -220,10 +225,12 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
strides, paddings, *conv_pd, mkldnn_engine);
// create memory
auto diff_src_memory =
mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)input_grad_data);
auto diff_src_memory = mkldnn::memory(
{diff_src_md, mkldnn_engine},
reinterpret_cast<void*>(const_cast<T*>(input_grad_data)));
auto weights_memory =
mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data);
mkldnn::memory({weights_md, mkldnn_engine},
reinterpret_cast<void*>(const_cast<T*>(filter_data)));
// create backward conv primitive for data
auto conv_bwd_data_prim = mkldnn::convolution_backward_data(

@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
@ -41,9 +42,10 @@ inline int ConvOutputSize(int input_size, int filter_size, int dilation,
return output_size;
}
inline bool IsExpand(std::vector<int64_t>& filter_dim,
std::vector<int>& strides, std::vector<int>& paddings,
std::vector<int>& dilations) {
inline bool IsExpand(const std::vector<int64_t>& filter_dim,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations) {
bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
for (size_t j = 0; j < strides.size(); ++j) {
filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);

@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection_map_op.h"
#include <string>
namespace paddle {
namespace operators {

Some files were not shown because too many files have changed in this diff.
