Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_dist_unit_test

Branch: wangkuiyi-patch-2
Author: typhoonzero
commit 745aacfc38

@@ -1,6 +1,6 @@
# A image for building paddle binaries
# Use cuda devel base image for both cpu and gpu environment
-FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04
+FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>

ARG UBUNTU_MIRROR
@@ -57,7 +57,7 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
# version(1.7.1 for now), which causes building documentation failed.
-RUN pip install --upgrade pip && \
+RUN pip install --upgrade pip==9.0.3 && \
    pip install -U wheel && \
    pip install -U docopt PyYAML sphinx==1.5.6 && \
    pip install sphinx-rtd-theme==0.1.9 recommonmark

@@ -78,7 +78,7 @@ if(NOT CMAKE_CROSSCOMPILING)
      /usr/lib/reference/
  )
else()
-  # Diable the finding of reference cblas under host's system path
+  # Disable the finding of reference cblas under host's system path
  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include)
  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
endif()

@@ -119,7 +119,7 @@ An actual Fluid example is described [here](https://github.com/PaddlePaddle/Pad
From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
-We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid.
+We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid.

## Turing Completeness

@@ -1,5 +1,5 @@
cc_library(var_handle SRCS var_handle.cc DEPS place)
-cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context)
+cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
@@ -20,3 +20,11 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
        simple_threadpool device_context)
cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory)
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory)
cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context broadcast_op_handle)
cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context gather_op_handle)

@@ -0,0 +1,111 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
Tensor *GetTensorFromVar(Variable *in_var) {
if (in_var->IsType<LoDTensor>()) {
return in_var->GetMutable<LoDTensor>();
} else if (in_var->IsType<SelectedRows>()) {
return in_var->GetMutable<SelectedRows>()->mutable_value();
} else {
PADDLE_THROW("Var should be LoDTensor or SelectedRows");
}
return nullptr;
}
BroadcastOpHandle::BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: local_scopes_(local_scopes), places_(places) {}
void BroadcastOpHandle::RunImpl() {
// the input may have dummy var.
std::vector<VarHandle *> in_var_handle;
for (auto *in : inputs_) {
auto *out_handle = dynamic_cast<VarHandle *>(in);
if (out_handle) {
in_var_handle.push_back(out_handle);
}
}
PADDLE_ENFORCE_EQ(in_var_handle.size(), 1,
"The number of input should be one.");
// the output may have dummy var.
std::vector<VarHandle *> out_var_handles;
for (auto *out : outputs_) {
auto *out_handle = dynamic_cast<VarHandle *>(out);
if (out_handle) {
out_var_handles.push_back(out_handle);
}
}
PADDLE_ENFORCE_EQ(
out_var_handles.size(), places_.size(),
"The number of output should equal to the number of places.");
// Wait until the input is ready; this Wait is an asynchronous operation.
auto &in_place = in_var_handle[0]->place_;
if (in_var_handle[0]->generated_op_) {
for (auto *out : out_var_handles) {
auto &out_p = out->place_;
in_var_handle[0]->generated_op_->Wait(dev_ctxes_[out_p]);
}
}
//
auto in_scope_idx = in_var_handle[0]->scope_idx_;
auto in_var =
local_scopes_.at(in_scope_idx)->FindVar(in_var_handle[0]->name_);
Tensor *in_tensor = GetTensorFromVar(in_var);
for (auto *out : out_var_handles) {
auto &out_p = out->place_;
auto out_var = local_scopes_.at(out->scope_idx_)->FindVar(out->name_);
PADDLE_ENFORCE_EQ(out_p.which(), in_place.which(),
"Places must be all on CPU or all on CUDA.");
if (in_var->IsType<framework::SelectedRows>()) {
auto &in_sr = in_var->Get<framework::SelectedRows>();
auto out_sr = out_var->GetMutable<framework::SelectedRows>();
if (&in_sr == out_sr) continue;
out_sr->set_height(in_sr.height());
out_sr->set_rows(in_sr.rows());
out_sr->mutable_value()->Resize(in_sr.value().dims());
out_sr->mutable_value()->mutable_data(out_p, in_sr.value().type());
} else if (in_var->IsType<framework::LoDTensor>()) {
auto &in_lod = in_var->Get<framework::LoDTensor>();
auto out_lod = out_var->GetMutable<framework::LoDTensor>();
if (&in_lod == out_lod) continue;
out_lod->set_lod(in_lod.lod());
out_lod->Resize(in_lod.dims());
out_lod->mutable_data(out_p, in_lod.type());
} else {
PADDLE_THROW("Var should be LoDTensor or SelectedRows.");
}
Tensor *out_tensor = GetTensorFromVar(out_var);
paddle::framework::TensorCopy(*in_tensor, out_p, *(dev_ctxes_[in_place]),
out_tensor);
}
}
std::string BroadcastOpHandle::Name() const { return "broadcast"; }
} // namespace details
} // namespace framework
} // namespace paddle

@@ -0,0 +1,48 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct BroadcastOpHandle : public OpHandleBase {
const std::vector<Scope *> &local_scopes_;
const std::vector<platform::Place> &places_;
BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places);
std::string Name() const override;
bool IsMultiDeviceTransfer() override { return false; };
protected:
void RunImpl() override;
};
} // namespace details
} // namespace framework
} // namespace paddle

@@ -0,0 +1,231 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
namespace f = paddle::framework;
namespace p = paddle::platform;
// test data amount
const f::DDim kDims = {20, 20};
struct TestBroadcastOpHandle {
std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
std::vector<Scope*> local_scopes_;
Scope g_scope_;
std::unique_ptr<OpHandleBase> op_handle_;
std::vector<std::unique_ptr<VarHandleBase>> vars_;
std::vector<p::Place> gpu_list_;
void WaitAll() {
for (size_t j = 0; j < ctxs_.size(); ++j) {
ctxs_[j]->Wait();
}
}
void InitCtxOnGpu(bool use_gpu) {
if (use_gpu) {
#ifdef PADDLE_WITH_CUDA
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
"device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
auto p = p::CUDAPlace(i);
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CUDADeviceContext(p));
}
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
int count = 8;
for (int i = 0; i < count; ++i) {
auto p = p::CPUPlace();
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p));
}
}
}
void InitBroadcastOp(size_t input_scope_idx) {
for (size_t j = 0; j < gpu_list_.size(); ++j) {
local_scopes_.push_back(&(g_scope_.NewScope()));
local_scopes_[j]->Var("out");
}
local_scopes_[input_scope_idx]->Var("input");
op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
vars_.emplace_back(new VarHandle());
VarHandle* in_var_handle = static_cast<VarHandle*>(vars_.back().get());
in_var_handle->place_ = gpu_list_[input_scope_idx];
in_var_handle->name_ = "input";
in_var_handle->version_ = 1;
in_var_handle->scope_idx_ = input_scope_idx;
in_var_handle->generated_op_ = nullptr;
op_handle_->AddInput(in_var_handle);
// add dummy var
vars_.emplace_back(new DummyVarHandle());
DummyVarHandle* dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
dummy_var_handle->generated_op_ = nullptr;
op_handle_->AddInput(dummy_var_handle);
for (size_t j = 0; j < gpu_list_.size(); ++j) {
op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
vars_.emplace_back(new VarHandle());
VarHandle* out_var_handle = static_cast<VarHandle*>(vars_.back().get());
out_var_handle->place_ = gpu_list_[j];
out_var_handle->name_ = "out";
out_var_handle->version_ = 2;
out_var_handle->scope_idx_ = j;
op_handle_->AddOutput(out_var_handle);
}
// add dummy var
vars_.emplace_back(new DummyVarHandle());
DummyVarHandle* out_dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
out_dummy_var_handle->generated_op_ = nullptr;
op_handle_->AddOutput(out_dummy_var_handle);
}
void TestBroadcastLodTensor(size_t input_scope_idx) {
auto in_var = local_scopes_[input_scope_idx]->Var("input");
auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k;
}
f::LoD lod{{0, 10, 20}};
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
in_lod_tensor->set_lod(lod);
op_handle_->Run(false);
WaitAll();
p::CPUPlace cpu_place;
for (size_t j = 0; j < gpu_list_.size(); ++j) {
auto out_var = local_scopes_[j]->Var("out");
auto out_tensor = out_var->Get<f::LoDTensor>();
PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
f::Tensor result_tensor;
f::TensorCopy(out_tensor, cpu_place, *(ctxs_[j]), &result_tensor);
float* ct = result_tensor.mutable_data<float>(cpu_place);
for (int64_t i = 0; i < f::product(kDims); ++i) {
ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
}
}
}
void TestBroadcastSelectedRows(size_t input_scope_idx) {
auto in_var = local_scopes_[input_scope_idx]->Var("input");
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
int height = static_cast<int>(kDims[0]) * 2;
std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
2, 4, 6, 3, 1, 1, 1, 1, 3, 7};
in_selected_rows->set_height(height);
in_selected_rows->set_rows(rows);
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k;
}
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), value);
op_handle_->Run(false);
WaitAll();
p::CPUPlace cpu_place;
for (size_t j = 0; j < gpu_list_.size(); ++j) {
auto out_var = local_scopes_[j]->Var("out");
auto& out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
"height is not equal.");
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
}
f::Tensor result_tensor;
f::TensorCopy(rt, cpu_place, *(ctxs_[j]), &result_tensor);
float* ct = result_tensor.data<float>();
for (int64_t i = 0; i < f::product(kDims); ++i) {
ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
}
}
}
};
TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(false);
test_op.InitBroadcastOp(input_scope_idx);
test_op.TestBroadcastLodTensor(input_scope_idx);
}
TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(false);
test_op.InitBroadcastOp(input_scope_idx);
test_op.TestBroadcastSelectedRows(input_scope_idx);
}
#ifdef PADDLE_WITH_CUDA
TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(true);
test_op.InitBroadcastOp(input_scope_idx);
test_op.TestBroadcastLodTensor(input_scope_idx);
}
TEST(BroadcastTester, TestGPUBroadcastTestSelectedRows) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(true);
test_op.InitBroadcastOp(input_scope_idx);
test_op.TestBroadcastSelectedRows(input_scope_idx);
}
#endif
} // namespace details
} // namespace framework
} // namespace paddle

@@ -35,7 +35,9 @@ void ComputationOpHandle::RunImpl() {
    }
  }
-  op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
+  this->RunAndRecordEvent([this] {
+    op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
+  });
}
std::string ComputationOpHandle::Name() const { return op_->Type(); }

@@ -0,0 +1,126 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/gather_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
GatherOpHandle::GatherOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: local_scopes_(local_scopes), places_(places) {}
void GatherOpHandle::RunImpl() {
// the input may have dummy var.
std::vector<VarHandle *> in_var_handles;
for (auto *in : inputs_) {
auto *in_handle = dynamic_cast<VarHandle *>(in);
if (in_handle) {
in_var_handles.push_back(in_handle);
}
}
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The number of output should equal to the number of places.");
// the output may have dummy var.
std::vector<VarHandle *> out_var_handles;
for (auto *out : outputs_) {
auto *out_handle = dynamic_cast<VarHandle *>(out);
if (out_handle) {
out_var_handles.push_back(out_handle);
}
}
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
"The number of output should be one.");
auto in_0_handle = static_cast<VarHandle *>(in_var_handles[0]);
auto pre_in_var =
local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_);
auto pre_place = in_0_handle->place_;
PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
"Currently, gather_op only can gather SelectedRows.");
PADDLE_ENFORCE_EQ(out_var_handles[0]->place_.which(), pre_place.which(),
"The place of input and output should be the same.");
// Wait until the inputs are ready; this Wait is an asynchronous operation.
for (auto *in : in_var_handles) {
if (in->generated_op_) {
in->generated_op_->Wait(dev_ctxes_[in->place_]);
}
}
std::vector<int64_t> out_rows;
std::vector<Tensor> in_tensors;
std::vector<platform::Place> in_places;
auto &pre_in = pre_in_var->Get<framework::SelectedRows>();
// gather the inputs
for (auto *in : in_var_handles) {
auto in_handle = static_cast<VarHandle *>(in);
auto in_p = in_handle->place_;
in_places.push_back(in_p);
PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(),
"Places must be all on CPU or all on CUDA.");
auto in_var =
local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
auto &in_sr = in_var->Get<framework::SelectedRows>();
PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(),
"The type of input is not consistent.");
PADDLE_ENFORCE_EQ(pre_in.height(), in_sr.height(),
"The height of inputs is not consistent.");
PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(),
"The dims of inputs is not consistent.");
auto in_sr_rows = in_sr.rows();
out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end());
in_tensors.emplace_back(in_sr.value());
}
// write the output
auto &out_place = out_var_handles[0]->place_;
auto out_scope_idx = out_var_handles[0]->scope_idx_;
auto out_var =
local_scopes_[out_scope_idx]->FindVar(out_var_handles[0]->name_);
auto out = out_var->GetMutable<framework::SelectedRows>();
out->set_height(pre_in.height());
out->set_rows(out_rows);
size_t rows = out_rows.size();
DDim out_dim = pre_in.GetCompleteDims();
out_dim[0] = static_cast<int64_t>(rows);
out->mutable_value()->Resize(out_dim);
out->mutable_value()->mutable_data(out_place, pre_in.value().type());
Tensor *out_tensor = out->mutable_value();
// copy
int s = 0, e = 0;
for (size_t j = 0; j < in_tensors.size(); ++j) {
e += in_tensors[j].dims()[0];
auto sub_out = out_tensor->Slice(s, e);
paddle::framework::TensorCopy(in_tensors[j], out_place,
*(dev_ctxes_[in_places[j]]), &sub_out);
s = e;
}
}
std::string GatherOpHandle::Name() const { return "gather"; }
} // namespace details
} // namespace framework
} // namespace paddle

@@ -0,0 +1,48 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct GatherOpHandle : public OpHandleBase {
const std::vector<Scope *> &local_scopes_;
const std::vector<platform::Place> &places_;
GatherOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places);
std::string Name() const override;
bool IsMultiDeviceTransfer() override { return false; };
protected:
void RunImpl() override;
};
} // namespace details
} // namespace framework
} // namespace paddle

@@ -0,0 +1,192 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/gather_op_handle.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
namespace f = paddle::framework;
namespace p = paddle::platform;
// test data amount
const f::DDim kDims = {20, 20};
struct TestGatherOpHandle {
std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
std::vector<Scope*> local_scopes_;
Scope g_scope_;
std::unique_ptr<OpHandleBase> op_handle_;
std::vector<std::unique_ptr<VarHandleBase>> vars_;
std::vector<p::Place> gpu_list_;
void WaitAll() {
for (size_t j = 0; j < ctxs_.size(); ++j) {
ctxs_[j]->Wait();
}
}
void InitCtxOnGpu(bool use_gpu) {
if (use_gpu) {
#ifdef PADDLE_WITH_CUDA
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
"device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
auto p = p::CUDAPlace(i);
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CUDADeviceContext(p));
}
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
int count = 8;
for (int i = 0; i < count; ++i) {
auto p = p::CPUPlace();
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p));
}
}
}
void InitGatherOp(size_t input_scope_idx) {
for (size_t j = 0; j < gpu_list_.size(); ++j) {
local_scopes_.push_back(&(g_scope_.NewScope()));
local_scopes_[j]->Var("out");
}
local_scopes_[input_scope_idx]->Var("input");
op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_));
// add input
for (size_t j = 0; j < gpu_list_.size(); ++j) {
op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
vars_.emplace_back(new VarHandle());
VarHandle* in_var_handle = static_cast<VarHandle*>(vars_.back().get());
in_var_handle->place_ = gpu_list_[j];
in_var_handle->name_ = "input";
in_var_handle->version_ = 1;
in_var_handle->scope_idx_ = j;
in_var_handle->generated_op_ = nullptr;
op_handle_->AddInput(in_var_handle);
}
// add dummy var
vars_.emplace_back(new DummyVarHandle());
DummyVarHandle* in_dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
in_dummy_var_handle->generated_op_ = nullptr;
op_handle_->AddInput(in_dummy_var_handle);
// add output
vars_.emplace_back(new VarHandle());
VarHandle* out_var_handle = static_cast<VarHandle*>(vars_.back().get());
out_var_handle->place_ = gpu_list_[input_scope_idx];
out_var_handle->name_ = "out";
out_var_handle->version_ = 2;
out_var_handle->scope_idx_ = input_scope_idx;
op_handle_->AddOutput(out_var_handle);
// add dummy var
vars_.emplace_back(new DummyVarHandle());
DummyVarHandle* dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
op_handle_->AddOutput(dummy_var_handle);
}
void TestGatherSelectedRows(size_t output_scope_idx) {
int height = kDims[0] * 2;
std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
2, 4, 6, 3, 1, 1, 1, 1, 3, 7};
std::vector<float> send_vector(f::product(kDims));
for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k;
}
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) {
auto in_var = local_scopes_[input_scope_idx]->Var("input");
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
in_selected_rows->set_height(height);
in_selected_rows->set_rows(rows);
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), value);
value->Resize(kDims);
}
auto out_var = local_scopes_[output_scope_idx]->Var("out");
auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
auto in_var = local_scopes_[output_scope_idx]->Var("input");
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
out_selected_rows->mutable_value()->ShareDataWith(
in_selected_rows->value());
op_handle_->Run(false);
WaitAll();
p::CPUPlace cpu_place;
auto& out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
}
f::Tensor result_tensor;
f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
float* ct = result_tensor.data<float>();
for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
}
}
};
TEST(GatherTester, TestCPUGatherTestSelectedRows) {
TestGatherOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(false);
test_op.InitGatherOp(input_scope_idx);
test_op.TestGatherSelectedRows(input_scope_idx);
}
#ifdef PADDLE_WITH_CUDA
TEST(GatherTester, TestGPUGatherTestSelectedRows) {
TestGatherOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(true);
test_op.InitGatherOp(input_scope_idx);
test_op.TestGatherSelectedRows(input_scope_idx);
}
#endif
} // namespace details
} // namespace framework
} // namespace paddle

@@ -14,6 +14,8 @@
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
+#include <algorithm>
namespace paddle {
namespace framework {
namespace details {
@@ -27,6 +29,32 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
  }
}
struct ReduceLoDTensor {
const std::vector<LoDTensor> &src_tensors_;
LoDTensor &dst_tensor_;
ReduceLoDTensor(const std::vector<LoDTensor> &src, LoDTensor *dst)
: src_tensors_(src), dst_tensor_(*dst) {}
template <typename T>
void operator()() const {
PADDLE_ENFORCE(!src_tensors_.empty());
auto &t0 = src_tensors_[0];
PADDLE_ENFORCE_NE(t0.numel(), 0);
dst_tensor_.Resize(t0.dims());
T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
for (size_t i = 1; i < src_tensors_.size(); ++i) {
auto &t = src_tensors_[i];
PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
PADDLE_ENFORCE_EQ(t.type(), t0.type());
std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
[](T a, T b) -> T { return a + b; });
}
}
};
void NCCLAllReduceOpHandle::RunImpl() {
  if (inputs_.size() == 1) {
    return;  // No need to all reduce when GPU count = 1;
@@ -41,37 +69,66 @@ void NCCLAllReduceOpHandle::RunImpl() {
    int dtype = -1;
    size_t numel = 0;
-    std::vector<std::function<void()>> all_reduce_calls;
-    for (size_t i = 0; i < local_scopes_.size(); ++i) {
-      auto &p = places_[i];
-      auto *s = local_scopes_[i];
-      int dev_id = boost::get<platform::CUDAPlace>(p).device;
-      auto &lod_tensor = s->FindVar(var_name)->Get<LoDTensor>();
-      void *buffer = const_cast<void *>(lod_tensor.data<void>());
-      if (dtype == -1) {
-        dtype = platform::ToNCCLDataType(lod_tensor.type());
-      }
-      if (numel == 0) {
-        numel = static_cast<size_t>(lod_tensor.numel());
-      }
-      auto &nccl_ctx = nccl_ctxs_.at(dev_id);
-      auto stream = nccl_ctx.stream();
-      auto comm = nccl_ctx.comm_;
-      all_reduce_calls.emplace_back([=] {
-        PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
-            buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
-            comm, stream));
-      });
-    }
-    platform::NCCLGroupGuard guard;
-    for (auto &call : all_reduce_calls) {
-      call();
+    std::vector<LoDTensor> lod_tensors;
+    for (size_t i = 0; i < local_scopes_.size(); ++i) {
+      auto *s = local_scopes_[i];
+      auto &lod_tensor = s->FindVar(var_name)->Get<LoDTensor>();
+      lod_tensors.emplace_back(lod_tensor);
+    }
+    if (platform::is_gpu_place(lod_tensors[0].place())) {
+      std::vector<std::function<void()>> all_reduce_calls;
+      for (size_t i = 0; i < local_scopes_.size(); ++i) {
+        auto &p = places_[i];
+        auto &lod_tensor = lod_tensors[i];
+        void *buffer = const_cast<void *>(lod_tensor.data<void>());
+        if (dtype == -1) {
+          dtype = platform::ToNCCLDataType(lod_tensor.type());
+        }
+        if (numel == 0) {
+          numel = static_cast<size_t>(lod_tensor.numel());
+        }
+        int dev_id = boost::get<platform::CUDAPlace>(p).device;
+        auto &nccl_ctx = nccl_ctxs_.at(dev_id);
+        auto stream = nccl_ctx.stream();
+        auto comm = nccl_ctx.comm_;
+        all_reduce_calls.emplace_back([=] {
+          PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+              buffer, buffer, numel, static_cast<ncclDataType_t>(dtype),
+              ncclSum, comm, stream));
+        });
+      }
+      this->RunAndRecordEvent([&] {
+        platform::NCCLGroupGuard guard;
+        for (auto &call : all_reduce_calls) {
+          call();
+        }
+      });
+    } else {  // Special handle CPU only Operator's gradient. Like CRF
+      auto &trg =
+          *this->local_scopes_[0]->Var()->GetMutable<framework::LoDTensor>();
+      // Reduce All Tensor to trg in CPU
+      ReduceLoDTensor func(lod_tensors, &trg);
+      VisitDataType(ToDataType(lod_tensors[0].type()), func);
+      for (size_t i = 0; i < local_scopes_.size(); ++i) {
+        auto &scope = local_scopes_[i];
+        auto &p = places_[i];
+        auto *var = scope->FindVar(var_name);
+        auto *dev_ctx = dev_ctxes_[p];
+        RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
+          auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
+          auto &tensor_cpu = trg;
+          TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu);
+        });
+      }
    }
  }
}

@@ -54,17 +54,6 @@ void OpHandleBase::Run(bool use_event) {
#endif
  RunImpl();
-#ifdef PADDLE_WITH_CUDA
-  if (use_event) {
-    for (auto &p : dev_ctxes_) {
-      int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
-      auto stream =
-          static_cast<platform::CUDADeviceContext *>(p.second)->stream();
-      PADDLE_ENFORCE(cudaEventRecord(events_.at(dev_id), stream));
-    }
-  }
-#endif
}
void OpHandleBase::Wait(platform::DeviceContext *waited_dev) {
@@ -97,6 +86,43 @@ void OpHandleBase::AddOutput(VarHandleBase *out) {
  out->generated_op_ = this;
}
void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
#ifdef PADDLE_WITH_CUDA
if (!events_.empty()) { // Use event
std::function<void()> method = callback;
for (auto &p : dev_ctxes_) {
method = [method, p, this]() {
static_cast<platform::CUDADeviceContext *>(p.second)->RecordEvent(
events_.at(boost::get<platform::CUDAPlace>(p.first).device),
method);
};
}
method();
} else {
#endif
callback();
#ifdef PADDLE_WITH_CUDA
}
#endif
}
void OpHandleBase::RunAndRecordEvent(platform::Place p,
const std::function<void()> &callback) {
#ifdef PADDLE_WITH_CUDA
if (platform::is_cpu_place(p) || events_.empty()) {
callback();
} else {
auto *ctx = dev_ctxes_.at(p);
auto *cuda_ctx = static_cast<platform::CUDADeviceContext *>(ctx);
cuda_ctx->RecordEvent(events_.at(boost::get<platform::CUDAPlace>(p).device),
callback);
}
#else
callback();
#endif
}
} // namespace details
} // namespace framework
} // namespace paddle
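
Note: the pattern introduced here is that each op handle now does its device work inside RunAndRecordEvent, so the per-device CUDA events are recorded by this wrapper rather than by OpHandleBase::Run (whose event-recording block is removed above). A minimal sketch of a subclass following this pattern, using a hypothetical DummyOpHandle purely for illustration:

// Sketch only: DummyOpHandle is not part of this commit.
struct DummyOpHandle : public OpHandleBase {
  std::string Name() const override { return "dummy"; }

 protected:
  void RunImpl() override {
    // All real work goes inside the callback; RunAndRecordEvent records the
    // CUDA event on each device context, or simply invokes the callback when
    // events are not in use.
    this->RunAndRecordEvent([this] {
      // ... launch kernels or run the wrapped operator here ...
    });
  }
};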

@@ -62,6 +62,11 @@ class OpHandleBase {
  virtual bool IsMultiDeviceTransfer() { return false; }

 protected:
+  void RunAndRecordEvent(const std::function<void()> &callback);
+  void RunAndRecordEvent(platform::Place p,
+                         const std::function<void()> &callback);
  virtual void RunImpl() = 0;
};

@@ -14,6 +14,8 @@
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
+#include <string>
namespace paddle {
namespace framework {
namespace details {
@@ -37,11 +39,13 @@ void ScaleLossGradOpHandle::RunImpl() {
    *tmp = coeff_;
  } else {
#ifdef PADDLE_WITH_CUDA
-    auto stream =
-        static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_])
-            ->stream();
-    memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
-                 platform::CPUPlace(), &coeff_, sizeof(float), stream);
+    this->RunAndRecordEvent([&] {
+      auto stream =
+          static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_])
+              ->stream();
+      memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
+                   platform::CPUPlace(), &coeff_, sizeof(float), stream);
+    });
#endif
  }
}

@@ -34,7 +34,7 @@ void SendOpHandle::RunImpl() {
    }
    in->generated_op_->Wait(dev_ctxes_[p]);
  }
-  op_->Run(*local_scope_, place_);
+  this->RunAndRecordEvent([&] { op_->Run(*local_scope_, place_); });
}
std::string SendOpHandle::Name() const { return "send"; }

@@ -196,10 +196,12 @@ void ThreadedSSAGraphExecutor::RunOp(
    BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
  auto op_run = [ready_var_q, op, this] {
    try {
-      VLOG(10) << op->Name() << " : " << op->DebugString();
+      VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
      op->Run(use_event_);
+      VLOG(10) << op << " " << op->Name() << " Done ";
      running_ops_--;
      ready_var_q->Extend(op->outputs_);
+      VLOG(10) << op << " " << op->Name() << "Signal posted";
    } catch (platform::EnforceNotMet ex) {
      exception_.reset(new platform::EnforceNotMet(ex));
    } catch (...) {

@@ -50,6 +50,7 @@ struct VarHandle : public VarHandleBase {
  // version field currently is not used, however, just store the version to
  // debug easily.
  size_t version_;
+  size_t scope_idx_;
  std::string name_;
  platform::Place place_;
};

@@ -83,8 +83,8 @@ static void CheckTensorNANOrInf(const std::string& name,
  if (tensor.memory_size() == 0) {
    return;
  }
-  if (tensor.type().hash_code() != typeid(float).hash_code() &&
-      tensor.type().hash_code() != typeid(double).hash_code()) {
+  if (tensor.type().hash_code() != typeid(float).hash_code() &&   // NOLINT
+      tensor.type().hash_code() != typeid(double).hash_code()) {  // NOLINT
    return;
  }
  PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
@@ -145,12 +145,13 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
// Return true if the block has feed operators and holder of matching info.
static bool has_feed_operators(
    const BlockDesc& block,
-    std::map<std::string, const LoDTensor*>& feed_targets,
+    const std::map<std::string, const LoDTensor*>& feed_targets,
    const std::string& feed_holder_name) {
  size_t feed_count = 0;
  for (auto* op : block.AllOps()) {
    if (op->Type() == kFeedOpType) {
      feed_count++;
+      // The input variable's name of feed_op should be feed_holder_name.
      PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name,
                        "Input to feed op should be '%s'", feed_holder_name);
      std::string feed_target_name = op->Output("Out")[0];
@@ -166,13 +167,15 @@ static bool has_feed_operators(
      feed_count, feed_targets.size(),
      "The number of feed operators should match 'feed_targets'");
-  // When feed operator are present, so should be feed_holder
-  auto var = block.FindVar(feed_holder_name);
-  PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
-                          feed_holder_name);
-  PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FEED_MINIBATCH,
-                    "'%s' variable should be 'FEED_MINIBATCH' type",
-                    feed_holder_name);
+  if (!feed_holder_name.empty()) {
+    // When feed operator are present, so should be feed_holder.
+    auto var = block.FindVar(feed_holder_name);
+    PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
+                            feed_holder_name);
+    PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FEED_MINIBATCH,
+                      "'%s' variable should be 'FEED_MINIBATCH' type",
+                      feed_holder_name);
+  }
  }

  return feed_count > 0;
@@ -185,12 +188,14 @@ static bool has_feed_operators(
// and fetch_holder_name. Raise exception when any mismatch is found.
// Return true if the block has fetch operators and holder of matching info.
static bool has_fetch_operators(
-    const BlockDesc& block, std::map<std::string, LoDTensor*>& fetch_targets,
+    const BlockDesc& block,
+    const std::map<std::string, LoDTensor*>& fetch_targets,
    const std::string& fetch_holder_name) {
  size_t fetch_count = 0;
  for (auto* op : block.AllOps()) {
    if (op->Type() == kFetchOpType) {
      fetch_count++;
+      // The output variable's name of fetch_op should be fetch_holder_name.
      PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name,
                        "Output of fetch op should be '%s'", fetch_holder_name);
      std::string fetch_target_name = op->Input("X")[0];
@@ -206,13 +211,15 @@ static bool has_fetch_operators(
      fetch_count, fetch_targets.size(),
      "The number of fetch operators should match 'fetch_targets'");
-  // When fetch operator are present, so should be fetch_holder
-  auto var = block.FindVar(fetch_holder_name);
-  PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
-                          fetch_holder_name);
-  PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FETCH_LIST,
-                    "'%s' variable should be 'FETCH_LIST' type",
-                    fetch_holder_name);
+  if (!fetch_holder_name.empty()) {
+    // When fetch operator are present, so should be fetch_holder.
+    auto var = block.FindVar(fetch_holder_name);
+    PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
+                            fetch_holder_name);
+    PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FETCH_LIST,
+                      "'%s' variable should be 'FETCH_LIST' type",
+                      fetch_holder_name);
+  }
  }

  return fetch_count > 0;
@@ -259,16 +266,6 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
    }
  }
-  // map the data of feed_targets to feed_holder
-  for (auto* op : global_block->AllOps()) {
-    if (op->Type() == kFeedOpType) {
-      std::string feed_target_name = op->Output("Out")[0];
-      int idx = boost::get<int>(op->GetAttr("col"));
-      SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
-                      idx);
-    }
-  }
  if (!has_fetch_ops) {
    // create fetch_holder variable
    auto* fetch_holder = global_block->Var(fetch_holder_name);
@@ -292,17 +289,9 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
    }
  }
-  Run(*copy_program, scope, 0, create_vars, create_vars);
-  // obtain the data of fetch_targets from fetch_holder
-  for (auto* op : global_block->AllOps()) {
-    if (op->Type() == kFetchOpType) {
-      std::string fetch_target_name = op->Input("X")[0];
-      int idx = boost::get<int>(op->GetAttr("col"));
-      *fetch_targets[fetch_target_name] =
-          GetFetchVariable(*scope, fetch_holder_name, idx);
-    }
-  }
+  auto ctx = Prepare(*copy_program, 0);
+  RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, create_vars,
+                     feed_holder_name, fetch_holder_name);
}

std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
@@ -370,5 +359,42 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  }
}
void Executor::RunPreparedContext(
ExecutorPrepareContext* ctx, Scope* scope,
std::map<std::string, const LoDTensor*>& feed_targets,
std::map<std::string, LoDTensor*>& fetch_targets, bool create_vars,
const std::string& feed_holder_name, const std::string& fetch_holder_name) {
auto& global_block = ctx->prog_.Block(ctx->block_id_);
PADDLE_ENFORCE(
    has_feed_operators(global_block, feed_targets, feed_holder_name),
    "Program in ExecutorPrepareContext should have feed_ops.");
PADDLE_ENFORCE(
    has_fetch_operators(global_block, fetch_targets, fetch_holder_name),
    "Program in the prepared context should have fetch_ops.");
// map the data of feed_targets to feed_holder
for (auto* op : global_block.AllOps()) {
if (op->Type() == kFeedOpType) {
std::string feed_target_name = op->Output("Out")[0];
int idx = boost::get<int>(op->GetAttr("col"));
SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
idx);
}
}
RunPreparedContext(ctx, scope, create_vars, create_vars);
// obtain the data of fetch_targets from fetch_holder
for (auto* op : global_block.AllOps()) {
if (op->Type() == kFetchOpType) {
std::string fetch_target_name = op->Input("X")[0];
int idx = boost::get<int>(op->GetAttr("col"));
*fetch_targets[fetch_target_name] =
GetFetchVariable(*scope, fetch_holder_name, idx);
}
}
}
} // namespace framework
} // namespace paddle

@@ -14,6 +14,9 @@ limitations under the License. */
#pragma once
+#include <map>
+#include <string>
+#include <vector>
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
@@ -70,6 +73,13 @@ class Executor {
           bool create_local_scope = true,
           bool create_vars = true);

+  void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
+                          std::map<std::string, const LoDTensor*>& feed_targets,
+                          std::map<std::string, LoDTensor*>& fetch_targets,
+                          bool create_vars = true,
+                          const std::string& feed_holder_name = "feed",
+                          const std::string& fetch_holder_name = "fetch");

 private:
  const platform::Place place_;
};
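
A minimal usage sketch of the new overload (the program, scope, place, input_tensor, and the "x"/"out" variable names below are assumed for illustration, not part of this diff): Prepare() builds the execution context once, and RunPreparedContext() then maps feed_targets into the feed holder, runs the block, and copies results back into fetch_targets.

// Sketch only: assumes `program`, `scope`, `place`, and `input_tensor` already exist.
paddle::framework::Executor executor(place);
auto ctx = executor.Prepare(program, /*block_id=*/0);

std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
paddle::framework::LoDTensor output;
feed_targets["x"] = &input_tensor;  // hypothetical feed variable name
fetch_targets["out"] = &output;     // hypothetical fetch variable name

executor.RunPreparedContext(ctx.get(), &scope, feed_targets, fetch_targets);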

@@ -11,8 +11,10 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
+#include <algorithm>
+#include <limits>
+#include <vector>

namespace paddle {
namespace framework {
@@ -65,8 +67,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
    memory::Copy(
        dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());

@@ -14,8 +14,12 @@
#include "paddle/fluid/framework/threadpool.h"
+#include "gflags/gflags.h"
#include "paddle/fluid/platform/enforce.h"

+DEFINE_int32(io_threadpool_size, 100,
+             "number of threads used for doing IO, default 100");

namespace paddle {
namespace framework {
@@ -91,5 +95,20 @@ void ThreadPool::TaskLoop() {
  }
}
std::unique_ptr<ThreadPool> ThreadPoolIO::io_threadpool_(nullptr);
std::once_flag ThreadPoolIO::io_init_flag_;
ThreadPool* ThreadPoolIO::GetInstanceIO() {
std::call_once(io_init_flag_, &ThreadPoolIO::InitIO);
return io_threadpool_.get();
}
void ThreadPoolIO::InitIO() {
if (io_threadpool_.get() == nullptr) {
// TODO(typhoonzero1986): make this configurable
io_threadpool_.reset(new ThreadPool(FLAGS_io_threadpool_size));
}
}
} // namespace framework
} // namespace paddle

@@ -14,12 +14,12 @@ limitations under the License. */
#pragma once

-#include <condition_variable>
+#include <condition_variable>  // NOLINT
#include <functional>
-#include <future>
+#include <future>  // NOLINT
-#include <mutex>
+#include <mutex>  // NOLINT
#include <queue>
-#include <thread>
+#include <thread>  // NOLINT
#include <vector>

#include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h"
@@ -28,6 +28,22 @@ limitations under the License. */
namespace paddle {
namespace framework {
struct ExceptionHandler {
mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
explicit ExceptionHandler(
std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
: future_(std::move(f)) {}
void operator()() const {
auto ex = this->future_.get();
if (ex != nullptr) {
LOG(FATAL) << "The exception is thrown inside the thread pool. You "
"should use RunAndGetException to handle the exception.\n"
"The default exception handler is LOG(FATAL)."
<< ex->what();
}
}
};
// ThreadPool maintains a queue of tasks, and runs them using a fixed // ThreadPool maintains a queue of tasks, and runs them using a fixed
// number of threads. // number of threads.
class ThreadPool { class ThreadPool {
@@ -87,22 +103,6 @@ class ThreadPool {
  void Wait();

 private:
-  struct ExceptionHandler {
-    mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
-    explicit ExceptionHandler(
-        std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
-        : future_(std::move(f)) {}
-    void operator()() const {
-      auto ex = this->future_.get();
-      if (ex != nullptr) {
-        LOG(FATAL) << "The exception is thrown inside the thread pool. You "
-                      "should use RunAndGetException to handle the exception.\n"
-                      "The default exception handler is LOG(FATAL)."
-                   << ex->what();
-      }
-    }
-  };
  DISABLE_COPY_AND_ASSIGN(ThreadPool);

  // If the task queue is empty and available is equal to the number of
@@ -135,6 +135,17 @@ class ThreadPool {
  std::condition_variable completed_;
};
class ThreadPoolIO : ThreadPool {
public:
static ThreadPool* GetInstanceIO();
static void InitIO();
private:
// NOTE: the thread pool in the base class will be inherited here.
static std::unique_ptr<ThreadPool> io_threadpool_;
static std::once_flag io_init_flag_;
};
// Run a function asynchronously.
// NOTE: The function must return void. If the function need to return a value,
// you can use lambda to capture a value pointer.
@@ -143,5 +154,10 @@ std::future<void> Async(Callback callback) {
  return ThreadPool::GetInstance()->Run(callback);
}
template <typename Callback>
std::future<void> AsyncIO(Callback callback) {
return ThreadPoolIO::GetInstanceIO()->Run(callback);
}
} // namespace framework
} // namespace paddle
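
A short usage sketch of the new AsyncIO helper (the lambda body is a placeholder): it schedules blocking, IO-bound work on the separate ThreadPoolIO singleton (sized by the new --io_threadpool_size flag) instead of the compute thread pool, and returns the same std::future<void> as framework::Async().

// Sketch only: the lambda body stands in for blocking IO such as an RPC call
// or reading a checkpoint from disk.
auto f = paddle::framework::AsyncIO([] {
  // ... perform blocking IO here ...
});
f.wait();  // block until the IO task has finished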

@@ -23,7 +23,7 @@ limitations under the License. */
namespace paddle {
namespace inference {

-// Temporarilly add this function for exposing framework::InitDevices() when
+// Temporarily add this function for exposing framework::InitDevices() when
// linking the inference shared library.
void Init(bool init_p2p) { framework::InitDevices(init_p2p); }

@@ -46,8 +46,8 @@ TEST(inference, image_classification) {
  // Run inference on CPU
  LOG(INFO) << "--- CPU Runs: ---";
-  TestInference<paddle::platform::CPUPlace, false>(dirname, cpu_feeds,
-                                                   cpu_fetchs1, FLAGS_repeat);
+  TestInference<paddle::platform::CPUPlace, false, true>(
+      dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
  LOG(INFO) << output1.dims();

#ifdef PADDLE_WITH_CUDA
@@ -57,8 +57,8 @@ TEST(inference, image_classification) {
  // Run inference on CUDA GPU
  LOG(INFO) << "--- GPU Runs: ---";
-  TestInference<paddle::platform::CUDAPlace, false>(dirname, cpu_feeds,
-                                                    cpu_fetchs2, FLAGS_repeat);
+  TestInference<paddle::platform::CUDAPlace, false, true>(
+      dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
  LOG(INFO) << output2.dims();

  CheckError<float>(output1, output2);

Some files were not shown because too many files have changed in this diff.