Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into imperative_lr_scheduler

revert-16555-model_data_cryption_link_all_lib
minqiyang 6 years ago
commit b5bbb13ac1

@ -24,6 +24,8 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
"${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
message(STATUS "AR tools: ${CMAKE_AR}")
if(WIN32)
set(CMAKE_SUPPRESS_REGENERATION ON)
set(CMAKE_STATIC_LIBRARY_PREFIX lib)

@ -179,7 +179,6 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
else:
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.AllReduce
build_strategy.fuse_broadcast_op = args.fuse_broadcast_op
avg_loss = train_args[0]

@ -24,7 +24,7 @@ set(BOOST_PROJECT "extern_boost")
# So we use 1.41.0 here.
set(BOOST_VER "1.41.0")
set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")

@ -44,7 +44,7 @@ ExternalProject_Add(
# 3. keep only zlib, cares, protobuf, boringssl under "third_party",
# checkout and clean other dirs under third_party
# 4. remove .git, and package the directory.
URL "http://paddlepaddledeps.cdn.bcebos.com/grpc-v1.10.x.tar.gz"
URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
URL_MD5 "1f268a2aff6759839dccd256adcc91cf"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""

@ -34,7 +34,7 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
SET(TIME_VERSION "2019.0.1.20181227")
IF(WIN32)
SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
SET(MKLML_URL "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
@ -43,7 +43,7 @@ ELSE()
#TODO(intel-huying):
# Now enable the Erf function in the mklml library temporarily; it will be updated in the official version later.
SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)

@ -110,7 +110,7 @@ function(op_library TARGET)
# Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op")
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()

File diff suppressed because it is too large

@ -174,7 +174,7 @@ else()
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
target_link_libraries(executor garbage_collector)
target_link_libraries(executor garbage_collector while_op_helper)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor

@ -9,6 +9,7 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place
cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper)
cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
@ -22,6 +23,8 @@ endif()
if(WITH_GPU)
nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor)
nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor)
if(WITH_DISTRIBUTE)
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim dynload_cuda selected_rows_functor sendrecvop_rpc)
@ -35,6 +38,8 @@ if(WITH_GPU)
else()
cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
variable_visitor)
cc_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
variable_visitor)
if(WITH_DISTRIBUTE)
cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim selected_rows_functor sendrecvop_rpc)
@ -46,9 +51,7 @@ else()
cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
endif()
cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor)
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
if(WITH_GPU)
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
@ -61,14 +64,17 @@ cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle)
cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass)
cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper)
cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle fused_broadcast_op_handle)
cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass)
if (WITH_GPU)
@ -97,5 +103,5 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass multi_batch_merge_pass
fuse_relu_depthwise_conv_pass
memory_optimize_pass lock_free_optimize_pass)
fuse_relu_depthwise_conv_pass
memory_optimize_pass lock_free_optimize_pass alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass)

@ -11,9 +11,8 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include <algorithm>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
@ -56,6 +55,7 @@ void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(

@ -57,7 +57,7 @@ struct BroadcastOpHandle : public OpHandleBase {
std::string Name() const override;
bool IsMultiDeviceTransfer() override { return false; };
bool IsMultiDeviceTransfer() override { return true; };
protected:
void RunImpl() override;

@ -16,6 +16,7 @@ limitations under the License. */
#include <glog/logging.h>
#include <memory>
#include <utility>
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
@ -45,12 +46,27 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
public:
explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
: ir::PassBuilder(), strategy_(strategy) {
// Add a graph viz pass to record a graph.
if (!strategy_.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
}
if (strategy_.enable_sequential_execution_) {
VLOG(10) << "Add sequential_execution_pass";
AppendPass("sequential_execution_pass");
}
// Add op fusion.
if (strategy.sync_batch_norm_) {
AppendPass("sync_batch_norm_pass");
}
// Add op fusion.
if (strategy.fuse_relu_depthwise_conv_) {
VLOG(10) << "Add fuse_relu_depthwise_conv_pass";
AppendPass("fuse_relu_depthwise_conv_pass");
}
@ -62,29 +78,30 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// Automatically add the inplace pass.
if (strategy_.enable_inplace_) {
VLOG(10) << "Add inplace_pass";
AppendPass("inplace_pass");
}
if (strategy.fuse_elewise_add_act_ops_) {
VLOG(10) << "Add fuse_elewise_add_act_pass";
AppendPass("fuse_elewise_add_act_pass");
}
// For single-card training, fuse_all_reduce_ops is unnecessary.
// alloc_continuous_space_for_grad_pass should be placed before MultiDevPass.
if (strategy.fuse_all_reduce_ops_) {
VLOG(10) << "Add alloc_continuous_space_for_grad_pass";
AppendPass("alloc_continuous_space_for_grad_pass");
}
// Add a graph viz pass to record a graph.
if (!strategy_.debug_graphviz_path_.empty()) {
if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph");
"%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
}
if (strategy.fuse_elewise_add_act_ops_) {
auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass");
// Add a graph viz pass to record a graph.
if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
viz_pass->Set<std::string>("graph_viz_path",
new std::string(graph_path));
}
}
CollectiveContext *context = CollectiveContext::GetInstance();
context->endpoints_ = strategy_.trainers_endpoints_;
context->trainer_id_ = strategy_.trainer_id_;
@ -102,11 +119,19 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// As a side-effect, memory optimize cannot foresee the fetched vars,
// so the fetch list should be set persistable before calling the Run interface.
if (strategy.memory_optimize_) {
auto memory_optimize_pass = AppendPass("memory_optimize_pass");
VLOG(10) << "Add memory_optimize_pass";
AppendPass("memory_optimize_pass");
}
AppendMultiDevPass(strategy);
if (strategy.fuse_all_reduce_ops_) {
// NOTE: fuse_all_reduce_ops first counts the number of all_reduce operators;
// if the number is zero, fuse_all_reduce_ops does nothing.
VLOG(10) << "Add fuse_all_reduce_op_pass";
AppendPass("fuse_all_reduce_op_pass");
}
// Add a graph print pass to record a graph with device info.
if (!strategy_.debug_graphviz_path_.empty()) {
auto multi_devices_print_pass = AppendPass("multi_devices_print_pass");
@ -122,28 +147,34 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// Verify that the graph is correct for multi-device executor.
AppendPass("multi_devices_check_pass");
if (VLOG_IS_ON(2)) {
AppendPass("all_reduce_deps_pass");
}
if (SeqOnlyAllReduceOps(strategy)) {
VLOG(10) << "Add all_reduce_deps_pass";
AppendPass("all_reduce_deps_pass");
}
if (strategy_.remove_unnecessary_lock_) {
VLOG(10) << "Add modify_op_lock_and_record_event_pass";
AppendPass("modify_op_lock_and_record_event_pass");
}
}
// Convert the graph to run on multiple devices.
void AppendMultiDevPass(const BuildStrategy &strategy) {
ir::Pass *multi_devices_pass;
ir::Pass *multi_devices_pass = nullptr;
if (strategy_.is_distribution_) {
VLOG(3) << "multi device parameter server mode";
VLOG(10) << "Add dist_multi_devices_pass";
multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
} else {
if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
VLOG(3) << "multi devices collective mode with allreduce";
VLOG(10) << "Add all_reduce_mode_multi_devices_pass";
multi_devices_pass =
AppendPass("allreduce_mode_multi_devices_pass").get();
AppendPass("all_reduce_mode_multi_devices_pass").get();
} else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
VLOG(3) << "multi deivces collective mode with reduce";
VLOG(10) << "Add reduce_mode_multi_devices_pass";
multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
} else {
PADDLE_THROW("Unknown reduce strategy.");
@ -200,9 +231,26 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase("nccl_ctxs");
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
#endif
} else if (pass->Type() == "fuse_all_reduce_op_pass") {
pass->Erase(kPlaces);
pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
pass->Erase(kLocalScopes);
pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
&local_scopes);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
#endif
} else if (pass->Type() == "alloc_continuous_space_for_grad_pass") {
pass->Erase(kPlaces);
pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
pass->Erase(kLocalScopes);
pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
&local_scopes);
} else if (pass->Type() == "sequential_execution_pass") {
LOG(INFO) << "set enable_sequential_execution:"
<< enable_sequential_execution_;
@ -227,12 +275,13 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
} // namespace framework
} // namespace paddle
USE_PASS(sync_batch_norm_pass);
USE_PASS(fuse_relu_depthwise_conv_pass);
USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass);
USE_PASS(multi_batch_merge_pass);
USE_PASS(reduce_mode_multi_devices_pass);
USE_PASS(allreduce_mode_multi_devices_pass);
USE_PASS(all_reduce_mode_multi_devices_pass);
USE_PASS(dist_multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
@ -242,4 +291,6 @@ USE_PASS(all_reduce_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass);
USE_PASS(inplace_pass);
USE_PASS(lock_free_optimize_pass);
USE_PASS(alloc_continuous_space_for_grad_pass);
USE_PASS(graph_to_program_pass);
USE_PASS(fuse_all_reduce_op_pass);
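The pass pipeline assembled above is easier to follow as a list of conditional appends: each BuildStrategy flag gates one AppendPass call, alloc_continuous_space_for_grad_pass has to come before the multi-device pass, and fuse_all_reduce_op_pass comes after it. A rough Python sketch of that pattern (PassBuilder and the dict-based strategy are hypothetical stand-ins, not Paddle's C++ classes):

# Illustrative sketch only; names here are stand-ins, not Paddle's classes.
class PassBuilder:
    def __init__(self):
        self.passes = []

    def append_pass(self, name):
        self.passes.append(name)
        return name


def build_passes(strategy):
    builder = PassBuilder()
    if strategy.get("sync_batch_norm"):
        builder.append_pass("sync_batch_norm_pass")
    if strategy.get("fuse_relu_depthwise_conv"):
        builder.append_pass("fuse_relu_depthwise_conv_pass")
    if strategy.get("fuse_elewise_add_act_ops"):
        builder.append_pass("fuse_elewise_add_act_pass")
    if strategy.get("fuse_all_reduce_ops"):
        # must run before the multi-device pass so the gradients get packed
        # into one contiguous buffer
        builder.append_pass("alloc_continuous_space_for_grad_pass")
    if strategy.get("memory_optimize"):
        builder.append_pass("memory_optimize_pass")
    # the multi-device pass is chosen by the reduce strategy
    if strategy.get("reduce") == "AllReduce":
        builder.append_pass("all_reduce_mode_multi_devices_pass")
    else:
        builder.append_pass("reduce_mode_multi_devices_pass")
    if strategy.get("fuse_all_reduce_ops"):
        builder.append_pass("fuse_all_reduce_op_pass")
    builder.append_pass("multi_devices_check_pass")
    return builder.passes


print(build_passes({"fuse_all_reduce_ops": True, "reduce": "AllReduce"}))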

@ -16,6 +16,7 @@
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/pass_builder.h"
@ -75,8 +76,12 @@ struct BuildStrategy {
bool fuse_elewise_add_act_ops_{false};
bool fuse_all_reduce_ops_{false};
bool fuse_relu_depthwise_conv_{false};
bool sync_batch_norm_{false};
bool memory_optimize_{true};
// TODO(dzhwinter):
// make enable_inplace, memory_optimize_
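The new fields above (fuse_all_reduce_ops_, sync_batch_norm_) sit next to the existing BuildStrategy options. Assuming the Python-side attribute names mirror these C++ fields (the pybind changes are not shown in this hunk), enabling them would look roughly like:

import paddle.fluid as fluid

build_strategy = fluid.BuildStrategy()
# Attribute names below are assumed to mirror the C++ fields added above;
# the Python binding is not part of this hunk.
build_strategy.fuse_all_reduce_ops = True   # fuse gradient all-reduce ops
build_strategy.sync_batch_norm = True       # synchronized batch normalization
build_strategy.memory_optimize = True
build_strategy.reduce_strategy = fluid.BuildStrategy().ReduceStrategy.AllReduce

# The strategy is then handed to the multi-device executor, e.g.
# fluid.CompiledProgram(main_prog).with_data_parallel(
#     loss_name=loss.name, build_strategy=build_strategy)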

@ -14,6 +14,7 @@
#pragma once
#include <memory>
#include <string>
#include <vector>
@ -31,6 +32,8 @@ class ComputationOpHandle : public OpHandleBase {
ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
size_t scope_idx);
OperatorBase *GetOp() { return op_.get(); }
std::string Name() const override;
const Scope *GetScope() const { return scope_; }

@ -1,154 +0,0 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/data_balance_op_handle.h"
#include <algorithm>
#include "paddle/fluid/framework/details/container_cast.h"
namespace paddle {
namespace framework {
namespace details {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
DataBalanceOpHandle::DataBalanceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *ctxs)
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
if (ctxs) {
for (auto &p : places_) {
this->SetDeviceContext(p, ctxs->DevCtx(p));
}
}
}
#else
DataBalanceOpHandle::DataBalanceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
#endif
std::string DataBalanceOpHandle::Name() const { return "data balance"; }
std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
const std::vector<int> &device_sizes) {
int device_num = device_sizes.size();
int total_size = 0;
int empty_num = 0;
std::vector<std::array<int, 2>> size_device_vec;
size_device_vec.reserve(device_num);
for (int i = 0; i < device_num; ++i) {
if (device_sizes[i] == 0) {
++empty_num;
}
total_size += device_sizes[i];
size_device_vec.push_back({{device_sizes[i], i}});
}
std::vector<std::array<int, 3>> res;
if (empty_num == 0) {
// No need to do data balance.
return res;
}
if (total_size < device_num) {
// Not enough data.
PADDLE_THROW_EOF();
}
std::sort(size_device_vec.begin(), size_device_vec.end(),
[](const std::array<int, 2> &a, const std::array<int, 2> &b) {
return a[0] > b[0];
});
int expected_device_size = total_size / device_num;
int src_idx = 0;
for (int dst_idx = device_num - empty_num; dst_idx < device_num; ++dst_idx) {
if (size_device_vec[src_idx][0] <= expected_device_size) {
++src_idx;
PADDLE_ENFORCE_LT(
src_idx, device_num - empty_num,
"In current srategy an empty tensor should not be copy source.");
}
size_device_vec[src_idx][0] -= expected_device_size;
size_device_vec[dst_idx][0] += expected_device_size;
res.push_back({{size_device_vec[src_idx][1], size_device_vec[dst_idx][1],
expected_device_size}});
}
return res;
}
void DataBalanceOpHandle::RunImpl() {
PADDLE_ENFORCE_GT(places_.size(), 1UL,
"Data balance can only be enabled when the number of "
"places to run larger than 1.");
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
int data_num = in_var_handles.size() / places_.size();
WaitInputVarGenerated();
std::vector<std::vector<LoDTensor *>> lod_tensors(data_num);
std::vector<int> device_sizes;
for (int i = 0; i < static_cast<int>(in_var_handles.size()); ++i) {
PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
"The name of input and output should be equal.");
int place_idx = i / data_num;
int data_idx = i % data_num;
auto *local_scope =
local_scopes_[place_idx]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name());
PADDLE_ENFORCE(tensor_var->IsType<LoDTensor>());
auto *tensor = tensor_var->GetMutable<LoDTensor>();
lod_tensors[data_idx].push_back(tensor);
int ins_size =
tensor->lod().empty() ? tensor->dims()[0] : tensor->NumElements();
if (data_idx == 0) {
device_sizes.emplace_back(ins_size);
} else {
PADDLE_ENFORCE_EQ(
ins_size, device_sizes.at(place_idx),
"All data on the same device shall have the same batch size.");
}
}
const auto &balance_plan = GetBalancePlan(device_sizes);
for (const auto &trans : balance_plan) {
for (int data_idx = 0; data_idx < data_num; ++data_idx) {
LoDTensor *src_tensor = lod_tensors[data_idx][trans[0]];
LoDTensor *dst_tensor = lod_tensors[data_idx][trans[1]];
int trans_ins_size = trans[2];
LoD src_lod = src_tensor->lod();
int src_ins_size =
src_lod.empty() ? src_tensor->dims()[0] : src_tensor->NumElements();
int cut_point = src_ins_size - trans_ins_size;
if (!src_lod.empty()) {
for (auto &level : src_lod) {
cut_point = level[cut_point];
}
}
TensorCopySync(src_tensor->Slice(cut_point, src_tensor->dims()[0]),
dst_tensor->place(), dst_tensor);
src_tensor->ShareDataWith(src_tensor->Slice(0, cut_point));
if (!src_lod.empty()) {
dst_tensor->set_lod(SliceInLevel(
src_lod, 0, src_ins_size - trans_ins_size, src_ins_size));
src_tensor->set_lod(
SliceInLevel(src_lod, 0, 0, src_ins_size - trans_ins_size));
}
}
}
}
} // namespace details
} // namespace framework
} // namespace paddle

@ -1,59 +0,0 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle {
namespace framework {
namespace details {
struct DataBalanceOpHandle : public OpHandleBase {
public:
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *ctxs);
#else
DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places);
#endif
std::string Name() const override;
bool IsMultiDeviceTransfer() override { return false; };
protected:
void RunImpl() override;
private:
// std::vector<(src_dev_id, dst_dev_id, trans_size)>
std::vector<std::array<int, 3>> GetBalancePlan(
const std::vector<int> &batch_size_per_device);
const std::vector<Scope *> local_scopes_;
const std::vector<platform::Place> places_;
};
} // namespace details
} // namespace framework
} // namespace paddle

@ -12,6 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
@ -45,6 +49,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
}
}
#endif
PADDLE_ENFORCE(!var_names_.empty(), "Var names cannot be empty");
}
EagerDeletionOpHandle::~EagerDeletionOpHandle() {
@ -60,15 +65,20 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; }
void EagerDeletionOpHandle::RunImpl() {
auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
Scope *exec_scope = nullptr;
std::deque<std::shared_ptr<memory::Allocation>> garbages;
for (auto &name : var_names_) {
auto it = ref_cnts_->find(name);
// Var not found, not reference count has not decreased to 0
// Reference count has not decreased to 0
if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) {
continue;
}
if (!exec_scope) {
exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
}
// Var not found
auto *var = exec_scope->FindVar(name);
if (var == nullptr) {
continue;

@ -12,20 +12,173 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <functional>
#include <queue>
#include <string>
#include <tuple>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/details/eager_deletion_pass.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
"Fraction of eager deletion. If less than 1.0, all variables in "
"the program would be sorted according to its memory size, and "
"only the FLAGS_memory_fraction_of_eager_deletion of the largest "
"variables would be deleted.");
namespace paddle {
namespace framework {
namespace details {
// op -> variables which can be deleted after op runs
using OpToVarNameSetMap =
std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>;
// Check whether the variable is LoDTensor based on static VarDesc info
static bool IsLoDTensor(VarDesc *var) {
return var->Proto()->type().type() == proto::VarType::LOD_TENSOR;
}
// Get memory size of LoDTensor
static int64_t GetMemorySize(
const std::unordered_map<std::string, std::vector<VarHandle *>> &vars,
const std::string &var_name) {
auto *var_desc = TryGetLatestVarDesc(vars.at(var_name));
PADDLE_ENFORCE_NOT_NULL(var_desc);
PADDLE_ENFORCE(IsLoDTensor(var_desc));
auto dims = var_desc->GetShape();
return SizeOfType(var_desc->GetDataType()) *
std::accumulate(dims.begin(), dims.end(), static_cast<int64_t>(1),
std::multiplies<int64_t>());
}
// Split all variables in the graph into LoDTensor and non-LoDTensor (e.g.
// SelectedRows, LoDTensorArray).
// Since partial GC is based on static analysis of the memory size of each
// variable, we should skip SelectedRows and LoDTensorArray here.
static void SplitIntoLoDTensorAndNonLoDTensorVars(
const OpToVarNameSetMap &m, const GraphVars &vars,
OpToVarNameSetMap *lod_tensors, OpToVarNameSetMap *other_vars) {
lod_tensors->clear();
other_vars->clear();
for (auto &op_vars_pair : m) {
for (auto &var_name : op_vars_pair.second) {
auto *var_desc = TryGetLatestVarDesc(
vars[op_vars_pair.first->GetScopeIdx()].at(var_name));
if (IsLoDTensor(var_desc)) {
(*lod_tensors)[op_vars_pair.first].insert(var_name);
} else {
(*other_vars)[op_vars_pair.first].insert(var_name);
}
}
}
}
struct GCVarInfo {
GCVarInfo(const std::string &name, int64_t memory_size,
ComputationOpHandle *op, size_t scope_idx)
: name_(name),
memory_size_(memory_size),
op_(op),
scope_idx_(scope_idx) {}
std::string name_; // variable name
int64_t memory_size_; // memory size
ComputationOpHandle *op_; // op after which the variable could be deleted
size_t scope_idx_; // scope index where the variable locates
int64_t AbsMemorySize() const { return std::abs(memory_size_); }
};
// NOTE: delete_lod_tensor_only is not used currently.
static OpToVarNameSetMap ShrinkGCVars(
const OpToVarNameSetMap &m, const GraphVars &vars,
const std::vector<platform::Place> &places, double fraction_of_memory_size,
bool delete_lod_tensor_only = false) {
// Do not perform gc when fraction_of_memory_size = 0
if (fraction_of_memory_size <= 0.0) return {};
/**
* Step 1: Split all variables into LoDTensor and Non-LoDTensor.
* We can only calculate memory size of LoDTensors
*/
OpToVarNameSetMap lod_tensors, other_vars;
SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars);
// Perform complete gc when fraction_of_memory_size >= 1
if (fraction_of_memory_size >= 1.0) {
return delete_lod_tensor_only ? lod_tensors : m;
}
/**
* Step 2: build GCVarInfos, and calculate total memory sizes of each device
*/
// place -> variable info (name, memory size, place, scope_idx)
std::map<platform::Place, std::vector<GCVarInfo>> place_to_vars;
// place -> total memory sizes
std::map<platform::Place, int64_t> place_to_size;
for (auto &op_vars_pair : lod_tensors) {
auto *op = op_vars_pair.first;
auto &var_names = op_vars_pair.second;
auto scope_idx = op->GetScopeIdx();
auto &place = places[scope_idx];
for (auto &var_name : var_names) {
auto var_size = GetMemorySize(vars[scope_idx], var_name);
GCVarInfo var_info(var_name, var_size, op, scope_idx);
place_to_size[place] += var_info.AbsMemorySize();
place_to_vars[place].emplace_back(std::move(var_info));
}
}
/**
* Step 3: sort GCVarInfos, and only delete the largest variables.
*/
OpToVarNameSetMap partial_vars;
for (auto &place_to_var_pair : place_to_vars) {
auto &place = place_to_var_pair.first;
auto &gc_vars = place_to_var_pair.second;
std::sort(gc_vars.begin(), gc_vars.end(),
[](const GCVarInfo &var1, const GCVarInfo &var2) {
return var1.AbsMemorySize() > var2.AbsMemorySize();
});
int64_t accumulated_size = 0;
int64_t size_threshold =
static_cast<int64_t>(fraction_of_memory_size * place_to_size[place]);
for (size_t i = 0; i < gc_vars.size() && accumulated_size < size_threshold;
++i) {
partial_vars[gc_vars[i].op_].insert(gc_vars[i].name_);
accumulated_size += gc_vars[i].AbsMemorySize();
}
}
/**
* Step 4: Combine other vars (SelectedRows, LoDTensorArray)
*/
if (!delete_lod_tensor_only) {
for (auto &op_vars_pair : other_vars) {
partial_vars[op_vars_pair.first].insert(op_vars_pair.second.begin(),
op_vars_pair.second.end());
}
}
return partial_vars;
}
class EagerDeletionPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto &ref_cnts =
@ -43,9 +196,7 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
// a reverse map of last_live_ops
// i.e., last op --> variable names which can be deleted.
std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>
op_vars_map;
OpToVarNameSetMap op_vars_map;
for (auto &var_ops_map : last_live_ops) {
for (auto &var_ops_pair : var_ops_map) {
const std::string &var_name = var_ops_pair.first;
@ -55,6 +206,9 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
}
}
op_vars_map = ShrinkGCVars(op_vars_map, vars, places,
FLAGS_memory_fraction_of_eager_deletion);
for (auto &pair : op_vars_map) {
auto *op = pair.first;
auto &var_names = pair.second;
@ -85,8 +239,13 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
eager_deletion_op->AddOutput(dummy_leaf);
}
VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = "
<< FLAGS_memory_fraction_of_eager_deletion;
VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)";
return graph;
auto while_op_eager_deletion_pass =
ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass");
return while_op_eager_deletion_pass->Apply(std::move(graph));
}
} // namespace details
@ -99,3 +258,5 @@ REGISTER_PASS(eager_deletion_pass,
.RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars)
.RequirePassAttr(paddle::framework::details::kAllPlaces)
.RequirePassAttr(paddle::framework::details::kGarbageCollector);
USE_PASS(while_op_eager_deletion_pass);
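The partial-GC logic added above deletes only the largest variables on each device until their accumulated size reaches FLAGS_memory_fraction_of_eager_deletion of that device's total; like other FLAGS_* gflags it is presumably settable from the environment, though the Python plumbing is not shown in this diff. A minimal standalone sketch of the selection rule in ShrinkGCVars (illustration only, not Paddle code):

def shrink_gc_vars(var_sizes, fraction):
    """var_sizes: {var_name: memory_size_in_bytes} for one device.
    Returns the subset of variables chosen for eager deletion."""
    if fraction <= 0.0:
        return set()                       # gc disabled
    if fraction >= 1.0:
        return set(var_sizes)              # delete everything eligible
    threshold = fraction * sum(var_sizes.values())
    chosen, accumulated = set(), 0
    # largest variables first, stop once the threshold is reached
    for name, size in sorted(var_sizes.items(), key=lambda kv: -kv[1]):
        if accumulated >= threshold:
            break
        chosen.add(name)
        accumulated += size
    return chosen


# e.g. with fraction=0.5 only the largest variable is selected (made-up names)
print(shrink_gc_vars({"fc_0.w_0@GRAD": 4096, "fc_0.b_0@GRAD": 64,
                      "relu_0.tmp_0": 1024}, 0.5))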

@ -82,6 +82,8 @@ void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
}
}
bool FetchOpHandle::IsMultiDeviceTransfer() { return true; }
std::string FetchOpHandle::Name() const { return "Fetch"; }
} // namespace details

@ -39,6 +39,8 @@ struct FetchOpHandle : public OpHandleBase {
std::string Name() const override;
bool IsMultiDeviceTransfer() override;
protected:
void RunImpl() override;

@ -0,0 +1,195 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
namespace framework {
namespace details {
class FuseAllReduceOpPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
ir::Graph &result = *graph;
auto &places = Get<const std::vector<platform::Place>>(kPlaces);
auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto *nccl_ctxs = &Get<platform::NCCLContextMap>(kNCCLCtxs);
#endif
std::unordered_set<std::string> grads;
auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
size_t num_of_all_reduce = params_grads.size();
grads.reserve(num_of_all_reduce);
for (auto p_g : params_grads) {
grads.insert(p_g.second);
}
size_t num_place = places.size();
std::unordered_map<std::string, ir::Node *> all_reduce_ops;
all_reduce_ops.reserve(grads.size());
for (auto &node : result.Nodes()) {
if (node->IsOp()) {
PADDLE_ENFORCE(node->IsWrappedBy<OpHandleBase>());
auto *all_reduce_op_handle =
dynamic_cast<AllReduceOpHandle *>(&node->Wrapper<OpHandleBase>());
if (all_reduce_op_handle) {
auto inputs = DynamicCast<VarHandle>(all_reduce_op_handle->Inputs());
PADDLE_ENFORCE_EQ(inputs.size(), num_place);
// The inputs' names should be the same.
auto &grad_name = inputs[0]->name();
for (size_t i = 1; i < inputs.size(); ++i) {
PADDLE_ENFORCE_EQ(inputs[i]->name(), grad_name,
"The input name should be the same.");
}
PADDLE_ENFORCE_NE(grads.count(grad_name), static_cast<size_t>(0));
all_reduce_ops.emplace(grad_name, node);
}
}
}
VLOG(10) << "Find all_reduce_ops: " << all_reduce_ops.size();
if (all_reduce_ops.size() == 0) {
return std::move(graph);
}
PADDLE_ENFORCE_EQ(all_reduce_ops.size(), grads.size(),
"The number of all_reduce OpHandle is not equal to the "
"number of grads. Maybe some gradients are sparse type, "
"it is not supported currently.");
VLOG(10) << "Insert fused_all_reduce";
auto &group_grads_params =
graph->Get<GroupGradsAndParams>(kGroupGradsAndParams);
for (auto &group_g_p : group_grads_params) {
size_t group_size = group_g_p.size();
PADDLE_ENFORCE_GT(group_size, static_cast<size_t>(0));
std::vector<ir::Node *> group_all_reduce_ops;
group_all_reduce_ops.reserve(group_size);
for (auto &g_p : group_g_p) {
group_all_reduce_ops.emplace_back(all_reduce_ops.at(g_p.first));
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
InsertFusedAllReduce(places, local_scopes, group_size,
group_all_reduce_ops, nccl_ctxs, &result);
#else
InsertFusedAllReduce(places, local_scopes, group_size,
group_all_reduce_ops, &result);
#endif
}
return std::move(graph);
}
void InsertFusedAllReduce(const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const size_t num_of_all_reduce,
const std::vector<ir::Node *> &all_reduce_ops,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const platform::NCCLContextMap *nccl_ctxs,
#endif
ir::Graph *result) const {
std::vector<VarHandleBase *> inputs;
std::vector<VarHandleBase *> outputs;
for (auto &op : all_reduce_ops) {
auto &op_handle = op->Wrapper<OpHandleBase>();
inputs.insert(inputs.end(), op_handle.Inputs().begin(),
op_handle.Inputs().end());
// Remove output
for_each(op_handle.Inputs().begin(), op_handle.Inputs().end(),
[&op_handle](VarHandleBase *var_handle) {
var_handle->RemoveOutput(&op_handle, op_handle.Node());
});
outputs.insert(outputs.end(), op_handle.Outputs().begin(),
op_handle.Outputs().end());
// Remove Input
for_each(
op_handle.Outputs().begin(), op_handle.Outputs().end(),
[](VarHandleBase *var_handle) { var_handle->ClearGeneratedOp(); });
result->RemoveNode(op_handle.Node());
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
local_scopes, nccl_ctxs, result);
#else
CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
local_scopes, result);
#endif
}
private:
void CreateFusedAllReduceOp(const std::vector<VarHandleBase *> &inputs,
const std::vector<VarHandleBase *> &outputs,
const size_t num_of_all_reduce,
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const platform::NCCLContextMap *nccl_ctxs,
#endif
ir::Graph *result) const {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto *op_handle = new FusedAllReduceOpHandle(
result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
local_scopes, places, num_of_all_reduce, nccl_ctxs);
#else
auto *op_handle = new FusedAllReduceOpHandle(
result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
local_scopes, places, num_of_all_reduce);
#endif
for (auto in : inputs) {
op_handle->AddInput(in);
}
for (auto out : outputs) {
op_handle->AddOutput(out);
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if (!nccl_ctxs) {
SetCommunicationContext(places, op_handle);
}
#else
SetCommunicationContext(places, op_handle);
#endif
}
void SetCommunicationContext(const std::vector<platform::Place> &places,
FusedAllReduceOpHandle *op_handle) const {
for (size_t i = 0; i < places.size(); ++i) {
op_handle->SetDeviceContext(
places[i], platform::DeviceContextPool::Instance().Get(places[i]));
}
}
};
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(fuse_all_reduce_op_pass,
paddle::framework::details::FuseAllReduceOpPass);
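Conceptually the pass replaces one all_reduce per gradient with a single all_reduce over the contiguous buffer prepared by alloc_continuous_space_for_grad_pass. A toy NumPy sketch of why the two are equivalent (plain sums stand in for NCCL all-reduce; the data is made up):

import numpy as np

# per-device gradients, two devices, three gradients each (toy data)
dev0 = [np.ones(4), np.full(2, 2.0), np.full(3, 3.0)]
dev1 = [np.ones(4) * 5, np.full(2, 6.0), np.full(3, 7.0)]

# unfused: one reduction per gradient
unfused = [g0 + g1 for g0, g1 in zip(dev0, dev1)]

# fused: concatenate once, reduce once, then split back by the known sizes
sizes = [g.size for g in dev0]
fused = np.concatenate(dev0) + np.concatenate(dev1)
offsets = np.cumsum(sizes)[:-1]
refused = np.split(fused, offsets)

assert all(np.allclose(a, b) for a, b in zip(unfused, refused))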

@ -1,51 +0,0 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fuse_vars_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
void FuseVarsOpHandle::RunImpl() {
WaitInputVarGenerated(place_);
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(in_var_handles.size(), 0UL);
PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto out_var_handle = out_var_handles[0];
auto out_var = scope->Var(out_var_handle->name());
auto out_tensor = out_var->GetMutable<LoDTensor>();
out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_);
int64_t s = 0;
for (size_t i = 1; i < out_var_handles.size(); ++i) {
auto out_name = out_var_handles[i]->name();
auto out_t = scope->Var(out_name)->GetMutable<LoDTensor>();
auto numel = this->inputs_numel_.at(out_name);
out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
s += numel;
}
this->RunAndRecordEvent([] {});
}
std::string FuseVarsOpHandle::Name() const { return "fuse vars"; }
} // namespace details
} // namespace framework
} // namespace paddle

@ -1,65 +0,0 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct FuseVarsOpHandle : public OpHandleBase {
public:
FuseVarsOpHandle(ir::Node *node, Scope *local_scope,
const platform::Place &place,
const std::unordered_map<std::string, int64_t> &inputs_numel,
const proto::VarType::Type var_type)
: OpHandleBase(node),
local_scope_(local_scope),
place_(place),
inputs_numel_(inputs_numel),
type_(var_type) {
total_numel_ = 0;
for (auto in_numel : inputs_numel) {
PADDLE_ENFORCE_GT(in_numel.second, 0);
total_numel_ += in_numel.second;
}
}
std::string Name() const override;
bool IsMultiDeviceTransfer() override { return false; };
protected:
void RunImpl() override;
private:
Scope *local_scope_;
const platform::Place place_;
const std::unordered_map<std::string, int64_t> inputs_numel_;
const proto::VarType::Type type_;
int64_t total_numel_;
};
} // namespace details
} // namespace framework
} // namespace paddle

@ -0,0 +1,249 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
#include <algorithm>
#include <utility>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool(skip_fused_all_reduce_check, false, "");
namespace paddle {
namespace framework {
namespace details {
typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
GradientAndLoDTensor;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
FusedAllReduceOpHandle::FusedAllReduceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
const platform::NCCLContextMap *ctxs)
: OpHandleBase(node),
local_scopes_(local_scopes),
places_(places),
num_of_all_reduce_(num_of_all_reduce),
nccl_ctxs_(ctxs) {
if (nccl_ctxs_) {
for (auto &p : places_) {
this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
}
}
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
}
#else
FusedAllReduceOpHandle::FusedAllReduceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const size_t num_of_all_reduce)
: OpHandleBase(node),
local_scopes_(local_scopes),
places_(places),
num_of_all_reduce_(num_of_all_reduce) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
}
#endif
void FusedAllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
VLOG(4) << this->DebugString();
WaitInputVarGenerated();
// The input: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)...
// The output: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)...
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
size_t place_num = places_.size();
PADDLE_ENFORCE_EQ(
in_var_handles.size(), place_num * num_of_all_reduce_,
"The NoDummyInputSize should be equal to the number of places.");
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
GradientAndLoDTensor grads_tensor;
grads_tensor.resize(place_num);
int64_t numel = -1;
auto dtype = static_cast<framework::proto::VarType::Type>(0);
for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) {
auto &g_tensor = grads_tensor.at(scope_idx);
g_tensor.reserve(num_of_all_reduce_);
GetGradLoDTensor(scope_idx, in_var_handles, out_var_handles, &g_tensor);
int64_t element_num = 0;
framework::proto::VarType::Type ele_dtype =
static_cast<framework::proto::VarType::Type>(0);
GetDTypeAndNumel(g_tensor, &ele_dtype, &element_num);
if (numel == -1) {
numel = element_num;
}
if (dtype == static_cast<framework::proto::VarType::Type>(0)) {
dtype = ele_dtype;
PADDLE_ENFORCE_NE(ele_dtype,
static_cast<framework::proto::VarType::Type>(0));
}
PADDLE_ENFORCE_EQ(ele_dtype, dtype);
// Check whether the address space is contiguous.
std::sort(
g_tensor.begin(), g_tensor.end(),
[](const std::pair<std::string, const LoDTensor *> &grad1,
const std::pair<std::string, const LoDTensor *> &grad2) -> bool {
return grad1.second->data<void>() < grad2.second->data<void>();
});
for (size_t k = 1; k < g_tensor.size(); ++k) {
const void *cur_address = g_tensor.at(k - 1).second->data<void>();
int64_t len = g_tensor.at(k - 1).second->numel();
auto offset = len * framework::SizeOfType(dtype);
void *infer_next_address = reinterpret_cast<void *>(
reinterpret_cast<uintptr_t>(cur_address) + offset);
const void *next_address = g_tensor.at(k).second->data<void>();
VLOG(10) << string::Sprintf(
"Input[%d](%s) address: 0X%02x, Input[%d](%s) address: 0X%02x, Infer "
"input[%d] address: 0X%02x. The offset: %d",
k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k,
next_address, k, infer_next_address, offset);
PADDLE_ENFORCE_EQ(infer_next_address, next_address,
"The address is not consistent.");
}
}
if (!FLAGS_skip_fused_all_reduce_check) {
for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
for (size_t j = 1; j < num_of_all_reduce_; ++j) {
PADDLE_ENFORCE_EQ(grads_tensor.at(0).at(j).first,
grads_tensor.at(scope_idx).at(j).first);
}
}
}
std::vector<const void *> lod_tensor_data;
for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
auto data = grads_tensor.at(scope_idx).at(0).second->data<void>();
lod_tensor_data.emplace_back(data);
}
if (platform::is_gpu_place(places_[0])) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
int nccl_dtype = platform::ToNCCLDataType(dtype);
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &p = places_[i];
void *buffer = const_cast<void *>(lod_tensor_data.at(i));
int dev_id = boost::get<platform::CUDAPlace>(p).device;
auto &nccl_ctx = nccl_ctxs_->at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
all_reduce_calls.emplace_back([=] {
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
buffer, buffer, numel, static_cast<ncclDataType_t>(nccl_dtype),
ncclSum, comm, stream));
});
}
this->RunAndRecordEvent([&] {
if (all_reduce_calls.size() == 1UL) {
// Do not use NCCLGroup when managing NCCL on a per-thread-per-device basis
all_reduce_calls[0]();
} else {
platform::NCCLGroupGuard guard;
for (auto &call : all_reduce_calls) {
call();
}
}
});
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
} else {
// Special handling for the gradients of CPU-only operators, like CRF.
auto grad_name = grads_tensor.at(0).at(0).first;
auto &trg = *this->local_scopes_[0]
->FindVar(kLocalExecScopeName)
->Get<Scope *>()
->FindVar(grad_name)
->GetMutable<framework::LoDTensor>();
// Reduce All data to trg in CPU
ReduceBufferData func(lod_tensor_data, trg.data<void>(), numel);
VisitDataType(trg.type(), func);
for (size_t i = 1; i < local_scopes_.size(); ++i) {
auto &scope =
*local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &p = places_[i];
auto *var = scope.FindVar(grad_name);
auto *dev_ctx = dev_ctxes_.at(p);
size_t size = numel * SizeOfType(trg.type());
RunAndRecordEvent(p, [&trg, var, dev_ctx, p, size] {
auto dst_ptr = var->GetMutable<framework::LoDTensor>()->data<void>();
platform::CPUPlace cpu_place;
memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data<void>(), size);
});
}
}
}
void FusedAllReduceOpHandle::GetGradLoDTensor(
const size_t &scope_idx, const std::vector<VarHandle *> &in_var_handles,
const std::vector<VarHandle *> &out_var_handles,
std::vector<std::pair<std::string, const LoDTensor *>> *grad_tensor) const {
auto *local_scope =
local_scopes_.at(scope_idx)->FindVar(kLocalExecScopeName)->Get<Scope *>();
size_t place_num = places_.size();
for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
auto var_name = in_var_handles[j]->name();
PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name());
auto &lod_tensor = local_scope->FindVar(var_name)->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(lod_tensor.place(), places_.at(scope_idx));
grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor));
}
}
void FusedAllReduceOpHandle::GetDTypeAndNumel(
const std::vector<std::pair<std::string, const LoDTensor *>> &grad_tensor,
proto::VarType::Type *dtype, int64_t *numel) const {
*numel = 0;
for (size_t i = 0; i < grad_tensor.size(); ++i) {
// Get element number
int64_t len = grad_tensor.at(i).second->numel();
PADDLE_ENFORCE_GT(len, 0);
*numel += len;
// Get dtype
auto ele_type = grad_tensor.at(i).second->type();
if (i == 0) {
*dtype = ele_type;
}
PADDLE_ENFORCE_EQ(ele_type, *dtype);
}
}
std::string FusedAllReduceOpHandle::Name() const { return "fused_all_reduce"; }
} // namespace details
} // namespace framework
} // namespace paddle
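FusedAllReduceOpHandle depends on the gradients already being packed back to back, and the check in RunImpl infers each tensor's expected start address as the previous start address plus numel * sizeof(dtype). A small sketch of that arithmetic with made-up addresses and sizes:

def check_contiguous(tensors, dtype_size):
    """tensors: list of (name, start_address, numel), sorted by address.
    Raises if the tensors do not form one contiguous buffer."""
    for prev, cur in zip(tensors, tensors[1:]):
        expected = prev[1] + prev[2] * dtype_size
        if cur[1] != expected:
            raise ValueError("The address is not consistent: %s" % cur[0])


# three fp32 gradients packed back to back starting at 0x1000
check_contiguous([("g0", 0x1000, 256),       # ends at 0x1000 + 256*4 = 0x1400
                  ("g1", 0x1400, 128),       # ends at 0x1400 + 128*4 = 0x1600
                  ("g2", 0x1600, 512)], 4)   # ok, no exception raised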

Some files were not shown because too many files have changed in this diff
