Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into rewrite_allocation

7 years ago · 057a682ee9
parent c01696f8c2 80132933b7
commit 057a682ee9
263 changed files with 8689 additions and 2983 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -62,13 +62,11 @@ option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
 option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 option(WITH_FAST_BUNDLE_TEST    "Bundle tests that can be run in a single process together to reduce launch overhead"   OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
 option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
 option(WITH_INFERENCE    "Compile fluid inference library"              ON)
 option(ON_INFER         "Turn on inference optimization."               OFF)
 option(WITH_INFERENCE_API_TEST   "Test fluid inference high-level api interface"  OFF)
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
@ -305,6 +303,9 @@ if(WITH_DOC)
 endif()
 if (ON_INFER)
-    message(WARNING "On inference mode, will take place some specific optimization.")
+    message(STATUS "On inference mode, will take place some specific optimization.")
    add_definitions(-DPADDLE_ON_INFERENCE)
 else()
    #TODO(luotao), combine this warning with `make inference_lib_dist` command.
    message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
 endif()
--- a/README.md
+++ b/README.md
@ -2,8 +2,8 @@
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.0/getstarted/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
-### Latest PaddlePaddle Release: [Fluid 1.0.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.0.0)
+### Latest PaddlePaddle Release: [Fluid 1.1.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.1)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
@ -27,9 +27,9 @@ pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.0.1.post87
+pip install paddlepaddle-gpu==1.1.0.post87
 # Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==1.0.1.post85
+pip install paddlepaddle-gpu==1.1.0.post85
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==1.0.1.post85
 ## Installation
-It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) on our website.
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) on our website.
 ## Documentation
-We provide [English](http://paddlepaddle.org/documentation/docs/en/1.0.0/getstarted/index_en.html) and
+We provide [English](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) and
-[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) documentation.
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) documentation.
 - [Deep Learning 101](https://github.com/PaddlePaddle/book)
  You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.0/user_guides/howto/training/cluster_howto.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.1/user_guides/howto/training/cluster_howto.html)
  You can run distributed training jobs on MPI clusters.
- [Python API](http://paddlepaddle.org/documentation/api/zh/1.0/fluid.html)
+- [Python API](http://paddlepaddle.org/documentation/api/zh/1.1/fluid.html)
   Our new API enables much shorter programs.
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.0/advanced_usage/development/contribute_to_paddle.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.1/advanced_usage/development/contribute_to_paddle.html)
   We appreciate your contributions!
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@ -50,11 +50,7 @@ if(NOT WITH_PROFILER)
 endif(NOT WITH_PROFILER)
 if(NOT CMAKE_CROSSCOMPILING)
-    if(WITH_AVX AND AVX512F_FOUND)
+    if(WITH_AVX AND AVX_FOUND)
        set(SIMD_FLAG ${AVX512F_FLAG})
    elseif(WITH_AVX AND AVX2_FOUND)
        set(SIMD_FLAG ${AVX2_FLAG})
    elseif(WITH_AVX AND AVX_FOUND)
        set(SIMD_FLAG ${AVX_FLAG})
    elseif(SSE3_FOUND)
        set(SIMD_FLAG ${SSE3_FLAG})
--- a/cmake/external/xxhash.cmake
+++ b/cmake/external/xxhash.cmake
@ -7,7 +7,11 @@ set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include")
 IF(WITH_STATIC_LIB)
  SET(BUILD_CMD make lib)
 ELSE()
-  SET(BUILD_CMD sed -i "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
+  IF(APPLE)
    SET(BUILD_CMD sed -i \"\" "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
  ELSE(APPLE)
    SET(BUILD_CMD sed -i "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
  ENDIF(APPLE)
 ENDIF()
 ExternalProject_Add(
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@ -14,9 +14,6 @@
 # make package for paddle fluid shared and static library
 function(copy TARGET)
    if (NOT ON_INFER)
      message(WARNING "Turn on the ON_INFER flag when building inference_lib only.")
    endif()
    set(options "")
    set(oneValueArgs "")
    set(multiValueArgs SRCS DSTS DEPS)
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@ -89,7 +89,9 @@ CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
 {
-    __m512i a = _mm512_undefined_epi32();
+    __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
                                  13, -5, 6, -7, 9, 2, -6, 3);
    __m512i result = _mm512_abs_epi32 (a);
    return 0;
 }" AVX512F_FOUND)
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@ -24,6 +24,7 @@ if(NOT WITH_FLUID_ONLY)
 endif()
 add_subdirectory(testing)
 set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
 if(NOT MOBILE_INFERENCE AND NOT RPI AND NOT WITH_C_API)
  add_subdirectory(fluid)
 endif()
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -64,11 +64,11 @@ paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', '
 paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
 paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
 paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
-paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
 paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
-paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
+paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
-paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
+paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
 paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
 paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
@ -103,7 +103,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
-paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
+paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode'], varargs=None, keywords=None, defaults=(False, -100, False))
 paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
@ -174,9 +174,13 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
 paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
 paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
 paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@ -9,8 +9,6 @@ add_subdirectory(pybind)
 add_subdirectory(recordio)
 endif(NOT WIN32)
-if(WITH_INFERENCE)
+# NOTE: please add subdirectory inference at last.
-  # NOTE: please add subdirectory inference at last.
+add_subdirectory(inference)
-  add_subdirectory(inference)
+add_subdirectory(train)
  add_subdirectory(train)
 endif()
--- a/paddle/fluid/framework/attribute.cc
+++ b/paddle/fluid/framework/attribute.cc
@ -64,6 +64,13 @@ Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) {
    case proto::AttrType::LONG: {
      return attr_desc.l();
    }
    case proto::AttrType::LONGS: {
      std::vector<int64_t> val(attr_desc.longs_size());
      for (int i = 0; i < attr_desc.longs_size(); ++i) {
        val[i] = attr_desc.longs(i);
      }
      return val;
    }
    default:
      PADDLE_THROW("Unsupport attr type %d", attr_desc.type());
  }
--- a/paddle/fluid/framework/attribute.h
+++ b/paddle/fluid/framework/attribute.h
@ -26,6 +26,113 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 template <typename T>
 struct ExtractAttribute {
  explicit ExtractAttribute(const std::string& attr_name)
      : attr_name_(attr_name) {}
  T* operator()(Attribute& attr) const {
    T* attr_value = nullptr;
    try {
      attr_value = &boost::get<T>(attr);
    } catch (boost::bad_get& bad_get) {
      PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s",
                   attr_name_, paddle::platform::demangle(typeid(T).name()),
                   paddle::platform::demangle(attr.type().name()));
    }
    return attr_value;
  }
  const std::string& attr_name_;
 };
 // special handle bool
 // FIXME(yuyang18): Currently we cast bool into int in python binding. It is
 // hard to change the logic there. In another way, we should correct handle
 // if the user set `some_flag=1`.
 //
 // FIX ME anytime if there is a better solution.
 template <>
 struct ExtractAttribute<bool> {
  explicit ExtractAttribute(const std::string& attr_name)
      : attr_name_(attr_name) {}
  bool* operator()(Attribute& attr) const {
    if (attr.type() == typeid(int)) {  // NOLINT
      int val = boost::get<int>(attr);
      attr = static_cast<bool>(val);
    } else if (attr.type() == typeid(float)) {  // NOLINT
      float val = boost::get<float>(attr);
      attr = static_cast<bool>(val);
    }
    bool* attr_value = nullptr;
    try {
      attr_value = &boost::get<bool>(attr);
    } catch (boost::bad_get& bad_get) {
      PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s",
                   attr_name_, paddle::platform::demangle(attr.type().name()));
    }
    return attr_value;
  }
  const std::string& attr_name_;
 };
 template <>
 struct ExtractAttribute<int64_t> {
  explicit ExtractAttribute(const std::string& attr_name)
      : attr_name_(attr_name) {}
  int64_t* operator()(Attribute& attr) const {
    if (attr.type() == typeid(int)) {  // NOLINT
      int val = boost::get<int>(attr);
      attr = static_cast<int64_t>(val);
    } else if (attr.type() == typeid(float)) {  // NOLINT
      int val = boost::get<float>(attr);
      attr = static_cast<int64_t>(val);
    }
    int64_t* attr_value = nullptr;
    try {
      attr_value = &boost::get<int64_t>(attr);
    } catch (boost::bad_get& bad_get) {
      PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
                   attr_name_, paddle::platform::demangle(attr.type().name()));
    }
    return attr_value;
  }
  const std::string& attr_name_;
 };
 template <>
 struct ExtractAttribute<std::vector<int64_t>> {
  explicit ExtractAttribute(const std::string& attr_name)
      : attr_name_(attr_name) {}
  std::vector<int64_t>* operator()(Attribute& attr) const {
    if (attr.type() == typeid(std::vector<int>)) {  // NOLINT
      std::vector<int> val = boost::get<std::vector<int>>(attr);
      std::vector<int64_t> vec(val.begin(), val.end());
      attr = vec;
    } else if (attr.type() == typeid(std::vector<float>)) {  // NOLINT
      std::vector<float> val = boost::get<std::vector<float>>(attr);
      std::vector<int64_t> vec(val.begin(), val.end());
      attr = vec;
    }
    std::vector<int64_t>* attr_value = nullptr;
    try {
      attr_value = &boost::get<std::vector<int64_t>>(attr);
    } catch (boost::bad_get& bad_get) {
      PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
                   attr_name_, paddle::platform::demangle(attr.type().name()));
    }
    return attr_value;
  }
  const std::string& attr_name_;
 };
 template <typename T>
 inline proto::AttrType AttrTypeID() {
  Attribute tmp = T();
@ -42,7 +149,11 @@ class AttrReader {
  inline const T& Get(const std::string& name) const {
    PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
                   name);
-    return boost::get<T>(attrs_.at(name));
+
    Attribute& attr = const_cast<Attribute&>(attrs_.at(name));
    ExtractAttribute<T> extract_attr(name);
    T* attr_value = extract_attr(attr);
    return *attr_value;
  }
 private:
@ -82,7 +193,7 @@ class DefaultValueSetter {
 public:
  explicit DefaultValueSetter(T default_value)
      : default_value_(default_value) {}
-  void operator()(T& value) const { value = default_value_; }
+  void operator()(T& value) const { value = default_value_; }  // NOLINT
 private:
  T default_value_;
@ -117,84 +228,6 @@ class EnumInContainer {
  std::unordered_set<T> container_;
 };
 template <typename T>
 struct ExtractAttribute {
  explicit ExtractAttribute(const std::string& attr_name)
      : attr_name_(attr_name) {}
  T* operator()(Attribute& attr) const {
    T* attr_value = nullptr;
    try {
      attr_value = &boost::get<T>(attr);
    } catch (boost::bad_get& bad_get) {
      PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s",
                   attr_name_, paddle::platform::demangle(typeid(T).name()),
                   paddle::platform::demangle(attr.type().name()));
    }
    return attr_value;
  }
  const std::string& attr_name_;
 };
 // special handle bool
 // FIXME(yuyang18): Currently we cast bool into int in python binding. It is
 // hard to change the logic there. In another way, we should correct handle
 // if the user set `some_flag=1`.
 //
 // FIX ME anytime if there is a better solution.
 template <>
 struct ExtractAttribute<bool> {
  explicit ExtractAttribute(const std::string& attr_name)
      : attr_name_(attr_name) {}
  bool* operator()(Attribute& attr) const {
    if (attr.type() == typeid(int)) {  // NOLINT
      int val = boost::get<int>(attr);
      attr = static_cast<bool>(val);
    } else if (attr.type() == typeid(float)) {  // NOLINT
      float val = boost::get<float>(attr);
      attr = static_cast<bool>(val);
    }
    bool* attr_value = nullptr;
    try {
      attr_value = &boost::get<bool>(attr);
    } catch (boost::bad_get& bad_get) {
      PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s",
                   attr_name_, paddle::platform::demangle(attr.type().name()));
    }
    return attr_value;
  }
  const std::string& attr_name_;
 };
 template <>
 struct ExtractAttribute<int64_t> {
  explicit ExtractAttribute(const std::string& attr_name)
      : attr_name_(attr_name) {}
  int64_t* operator()(Attribute& attr) const {
    if (attr.type() == typeid(int)) {  // NOLINT
      int val = boost::get<int>(attr);
      attr = static_cast<int64_t>(val);
    } else if (attr.type() == typeid(float)) {  // NOLINT
      int val = boost::get<float>(attr);
      attr = static_cast<int64_t>(val);
    }
    int64_t* attr_value = nullptr;
    try {
      attr_value = &boost::get<int64_t>(attr);
    } catch (boost::bad_get& bad_get) {
      PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
                   attr_name_, paddle::platform::demangle(attr.type().name()));
    }
    return attr_value;
  }
  const std::string& attr_name_;
 };
 // check whether a certain attribute fit its limits
 // an attribute can have more than one limits
 template <typename T>
@ -235,7 +268,7 @@ class TypedAttrChecker {
    return *this;
  }
-  void operator()(AttributeMap& attr_map) const {
+  void operator()(AttributeMap& attr_map) const {  // NOLINT
    if (!attr_map.count(attr_name_)) {
      // user do not set this attr
      PADDLE_ENFORCE(!default_value_setter_.empty(),
@ -271,7 +304,7 @@ class OpAttrChecker {
    return *(checker.target<TypedAttrChecker<T>>());
  }
-  void Check(AttributeMap& attr_map) const {
+  void Check(AttributeMap& attr_map) const {  // NOLINT
    for (const auto& checker : attr_checkers_) {
      checker(attr_map);
    }
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@ -1,5 +1,6 @@
 cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto node)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
 cc_library(op_graph_view SRCS op_graph_view.cc DEPS op_handle_base)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
@ -30,20 +31,25 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
-if(WITH_GPU)
+cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
 if (WITH_GPU)
  cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle rpc_op_handle
          all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
 endif()
 cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
-if(WITH_GPU)
+set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass) 
-  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass)
+if (WITH_GPU)
-else()
+  list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
 endif()
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
        simple_threadpool device_context)
@ -56,6 +62,7 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
 #        device_context reduce_op_handle )
 cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
        DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
 cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle)
 cc_library(build_strategy SRCS build_strategy.cc DEPS
        graph_viz_pass multi_devices_graph_pass
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@ -34,7 +34,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
      nccl_ctxs_(ctxs) {
  if (nccl_ctxs_) {
    for (auto &p : places_) {
-      this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p);
+      this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
    }
  }
 }
@ -46,7 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 #endif
 void AllReduceOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+  platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
  if (NoDummyInputSize() == 1) {
    return;  // No need to all reduce when GPU count = 1;
@ -127,7 +127,7 @@ void AllReduceOpHandle::RunImpl() {
            *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
        auto &p = places_[i];
        auto *var = scope.FindVar(out_var_handles[i]->name_);
-        auto *dev_ctx = dev_ctxes_[p];
+        auto *dev_ctx = dev_ctxes_.at(p);
        RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
          auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@ -59,6 +59,10 @@ void BroadcastOpHandle::BroadcastOneVar(
      var_scopes.at(in_var_handle.scope_idx_)->FindVar(in_var_handle.name_);
  PADDLE_ENFORCE_NOT_NULL(in_var);
  Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
  if (UNLIKELY(!in_tensor.IsInitialized())) {
    VLOG(3) << "in var " << in_var_handle.name_ << "not inited, return!";
    return;
  }
  InitOutputValue(in_var_handle, out_var_handles);
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@ -44,7 +44,8 @@ struct BroadcastOpHandle : public OpHandleBase {
        nccl_ctxs_(nccl_ctxs) {
    if (nccl_ctxs_) {
      for (auto &p_ctx : nccl_ctxs_->contexts_) {
-        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
+        this->SetDeviceContext(platform::CUDAPlace(p_ctx.first),
                               p_ctx.second.ctx_.get());
      }
    }
  }
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@ -12,232 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/details/broadcast_op_handle.h"
+#include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/platform/device_context.h"
 namespace paddle {
 namespace framework {
 namespace details {
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 // test data amount
 const f::DDim kDims = {20, 20};
 struct TestBroadcastOpHandle {
  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
  std::vector<Scope*> local_scopes_;
  std::vector<Scope*> param_scopes_;
  Scope g_scope_;
  std::unique_ptr<OpHandleBase> op_handle_;
  std::vector<std::unique_ptr<VarHandleBase>> vars_;
  std::vector<p::Place> gpu_list_;
  bool use_gpu_;
 #ifdef PADDLE_WITH_CUDA
  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
  void WaitAll() {
    for (size_t j = 0; j < ctxs_.size(); ++j) {
      ctxs_[j]->Wait();
    }
 #ifdef PADDLE_WITH_CUDA
    if (nccl_ctxs_) {
      nccl_ctxs_->WaitAll();
    }
 #endif
  }
  void InitCtxOnGpu(bool use_gpu) {
    use_gpu_ = use_gpu;
    if (use_gpu_) {
 #ifdef PADDLE_WITH_CUDA
      int count = p::GetCUDADeviceCount();
      if (count <= 1) {
        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
                        "device count is "
                     << count;
        exit(0);
      }
      for (int i = 0; i < count; ++i) {
        auto p = p::CUDAPlace(i);
        gpu_list_.push_back(p);
        ctxs_.emplace_back(new p::CUDADeviceContext(p));
      }
      nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
 #else
      PADDLE_THROW("CUDA is not support.");
 #endif
    } else {
      int count = 8;
      for (int i = 0; i < count; ++i) {
        auto p = p::CPUPlace();
        gpu_list_.push_back(p);
        ctxs_.emplace_back(new p::CPUDeviceContext(p));
      }
 #ifdef PADDLE_WITH_CUDA
      nccl_ctxs_.reset(nullptr);
 #endif
    }
  }
  void InitBroadcastOp(size_t input_scope_idx) {
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      local_scopes_.push_back(&(g_scope_.NewScope()));
      Scope& local_scope = local_scopes_.back()->NewScope();
      *local_scopes_.back()
           ->Var(details::kLocalExecScopeName)
           ->GetMutable<Scope*>() = &local_scope;
      local_scope.Var("out");
      param_scopes_.emplace_back(&local_scope);
    }
    param_scopes_[input_scope_idx]->Var("input");
    std::unique_ptr<ir::Node> n =
        ir::CreateNodeForTest("node0", ir::Node::Type::kOperation);
    if (use_gpu_) {
 #ifdef PADDLE_WITH_CUDA
      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
                                             nccl_ctxs_.get()));
 #else
      PADDLE_THROW("CUDA is not support.");
 #endif
    } else {
 #ifdef PADDLE_WITH_CUDA
      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
                                             nccl_ctxs_.get()));
 #else
      op_handle_.reset(
          new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_));
 #endif
    }
    std::unique_ptr<ir::Node> v =
        ir::CreateNodeForTest("node1", ir::Node::Type::kVariable);
    auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input",
                                        gpu_list_[input_scope_idx]);
    vars_.emplace_back(in_var_handle);
    op_handle_->AddInput(in_var_handle);
    // add dummy var
    std::unique_ptr<ir::Node> v2 =
        ir::CreateNodeForTest("node2", ir::Node::Type::kVariable);
    vars_.emplace_back(new DummyVarHandle(v2.get()));
    DummyVarHandle* dummy_var_handle =
        static_cast<DummyVarHandle*>(vars_.back().get());
    dummy_var_handle->ClearGeneratedOp();
    op_handle_->AddInput(dummy_var_handle);
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      if (!use_gpu_) {
        op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
      }
      std::unique_ptr<ir::Node> v3 =
          ir::CreateNodeForTest("node3", ir::Node::Type::kVariable);
      VarHandle* out_var_handle =
          new VarHandle(v3.get(), 2, j, "out", gpu_list_[j]);
      vars_.emplace_back(out_var_handle);
      op_handle_->AddOutput(out_var_handle);
    }
    // add dummy var
    std::unique_ptr<ir::Node> v4 =
        ir::CreateNodeForTest("node4", ir::Node::Type::kVariable);
    vars_.emplace_back(new DummyVarHandle(v4.get()));
    DummyVarHandle* out_dummy_var_handle =
        static_cast<DummyVarHandle*>(vars_.back().get());
    out_dummy_var_handle->ClearGeneratedOp();
    op_handle_->AddOutput(out_dummy_var_handle);
  }
  void TestBroadcastLodTensor(size_t input_scope_idx) {
    auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
    PADDLE_ENFORCE_NOT_NULL(in_var);
    auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
    in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
    for (size_t k = 0; k < send_vector.size(); ++k) {
      send_vector[k] = k;
    }
    f::LoD lod{{0, 10, 20}};
    paddle::framework::TensorFromVector<float>(
        send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
    in_lod_tensor->set_lod(lod);
    in_lod_tensor->Resize(kDims);
    op_handle_->Run(false);
    WaitAll();
    p::CPUPlace cpu_place;
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      auto out_var = param_scopes_[j]->FindVar("out");
      PADDLE_ENFORCE_NOT_NULL(out_var);
      auto out_tensor = out_var->Get<f::LoDTensor>();
      PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
      f::Tensor result_tensor;
      f::TensorCopySync(out_tensor, cpu_place, &result_tensor);
      float* ct = result_tensor.mutable_data<float>(cpu_place);
      for (int64_t i = 0; i < f::product(kDims); ++i) {
        ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
      }
    }
  }
  void TestBroadcastSelectedRows(size_t input_scope_idx) {
    auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
    PADDLE_ENFORCE_NOT_NULL(in_var);
    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
    auto value = in_selected_rows->mutable_value();
    value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
    int height = static_cast<int>(kDims[0]) * 2;
    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
    in_selected_rows->set_height(height);
    in_selected_rows->set_rows(rows);
    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
    for (size_t k = 0; k < send_vector.size(); ++k) {
      send_vector[k] = k;
    }
    paddle::framework::TensorFromVector<float>(
        send_vector, *(ctxs_[input_scope_idx]), value);
    op_handle_->Run(false);
    WaitAll();
    p::CPUPlace cpu_place;
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      auto out_var = param_scopes_[j]->FindVar("out");
      PADDLE_ENFORCE_NOT_NULL(out_var);
      auto& out_select_rows = out_var->Get<f::SelectedRows>();
      auto rt = out_select_rows.value();
      PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
                        "height is not equal.");
      for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
        PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
      }
      f::Tensor result_tensor;
      f::TensorCopySync(rt, cpu_place, &result_tensor);
      float* ct = result_tensor.data<float>();
      for (int64_t i = 0; i < f::product(kDims); ++i) {
        ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
      }
    }
  }
 };
 TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) {
  TestBroadcastOpHandle test_op;
  size_t input_scope_idx = 0;
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
 #include "paddle/fluid/framework/details/sequential_execution_pass.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
@ -27,6 +28,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
 public:
  explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
      : ir::PassBuilder(), strategy_(strategy) {
    if (strategy_.enable_sequential_execution_) {
      AppendPass("sequential_execution_pass");
    }
    // Add a graph viz pass to record a graph.
    if (!strategy_.debug_graphviz_path_.empty()) {
      auto viz_pass = AppendPass("graph_viz_pass");
@ -64,6 +69,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
    // Verify that the graph is correct for multi-device executor.
    AppendPass("multi_devices_check_pass");
    if (strategy_.remove_unnecessary_lock_) {
      AppendPass("modify_op_lock_and_record_event_pass");
    }
  }
 private:
@ -110,6 +119,11 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
      pass->Erase("nccl_ctxs");
      pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
 #endif
    } else if (pass->Type() == "sequential_execution_pass") {
      pass->Erase(kAllOpDescs);
      pass->Set<const std::vector<OpDesc *>>(
          kAllOpDescs,
          new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
    }
    graph = pass->Apply(std::move(graph));
  }
@ -125,3 +139,5 @@ USE_PASS(multi_batch_merge_pass);
 USE_PASS(multi_devices_pass);
 USE_PASS(multi_devices_check_pass);
 USE_PASS(multi_devices_print_pass);
 USE_PASS(sequential_execution_pass);
 USE_PASS(modify_op_lock_and_record_event_pass);
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@ -69,8 +69,12 @@ struct BuildStrategy {
  bool enable_data_balance_{false};
  bool enable_sequential_execution_{false};
  bool fuse_broadcast_op_{false};
  bool remove_unnecessary_lock_{false};
  // User normally doesn't need to call this API.
  // The PassBuilder allows for more customized insert, remove of passes
  // from python side.
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@ -29,15 +29,21 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
 void ComputationOpHandle::RunImpl() {
  WaitInputVarGenerated(place_);
-  this->RunAndRecordEvent([this] {
+  auto run_func = [this]() {
    op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
-  });
+  };
  if (is_lock_and_record_event_free_) {
    run_func();
  } else {
    this->RunAndRecordEvent(run_func);
  }
 }
 bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
  bool need_wait =
      in_var && in_var->GeneratedOp() &&
-      in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_[place_];
+      in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_.at(place_);
  return need_wait;
 }
--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@ -36,6 +36,8 @@ struct ComputationOpHandle : public OpHandleBase {
  const platform::Place &GetPlace() const { return place_; }
  void SetLockAndRecordEventFree(bool b) { is_lock_and_record_event_free_ = b; }
 protected:
  void RunImpl() override;
@ -45,6 +47,7 @@ struct ComputationOpHandle : public OpHandleBase {
  std::unique_ptr<OperatorBase> op_;
  Scope *scope_;
  platform::Place place_;
  bool is_lock_and_record_event_free_{false};
 };
 }  // namespace details
 }  // namespace framework
--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
@ -28,7 +28,7 @@ DataBalanceOpHandle::DataBalanceOpHandle(
    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
  if (ctxs) {
    for (auto &p : places_) {
-      this->dev_ctxes_[p] = ctxs->DevCtx(p);
+      this->SetDeviceContext(p, ctxs->DevCtx(p));
    }
  }
 }
@ -89,8 +89,8 @@ void DataBalanceOpHandle::RunImpl() {
  PADDLE_ENFORCE_GT(places_.size(), 1,
                    "Data balance can only be enabled when the number of "
                    "places to run larger than 1.");
-  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
+  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
-  auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
  PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
  PADDLE_ENFORCE_EQ(
      in_var_handles.size(), out_var_handles.size(),
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@ -92,13 +92,13 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
  size_t num_complete = 0;
  remaining_ = 0;
-  BlockingQueue<size_t> complete_q;
+  auto complete_q = std::make_shared<BlockingQueue<size_t>>();
  for (auto op : bootstrap_ops_) {
-    RunOpAsync(op_deps.get(), op, &complete_q);
+    RunOpAsync(op_deps.get(), op, complete_q);
  }
  while (num_complete != op_deps->size()) {
-    size_t num_comp = complete_q.Pop();
+    size_t num_comp = complete_q->Pop();
    if (num_comp == -1UL) {
      int remaining = 0;
      while (true) {
@ -107,7 +107,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
          break;
        }
        for (int i = 0; i < remaining; ++i) {
-          complete_q.Pop();
+          complete_q->Pop();
        }
      }
      exception_.ReThrow();
@ -120,7 +120,8 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
 }
 void FastThreadedSSAGraphExecutor::RunOpAsync(
    std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
-    OpHandleBase *op, BlockingQueue<size_t> *complete_q) {
+    OpHandleBase *op,
    const std::shared_ptr<BlockingQueue<size_t>> &complete_q) {
  ++remaining_;
  this->pool_.enqueue([=] {
    OpHandleBase *op_to_run = op;
@ -144,7 +145,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
            if (op_to_run == nullptr) {
              op_to_run = pending_op;
            } else {
-              this->RunOpAsync(op_deps, pending_op, complete_q);
+              RunOpAsync(op_deps, pending_op, complete_q);
            }
          }
        }
@ -156,8 +157,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
 }
 void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() {
  atomic_op_deps_ = pool_.enqueue([&] {
-    std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps =
+    auto *op_deps = new std::unordered_map<OpHandleBase *, std::atomic<int>>;
        new std::unordered_map<OpHandleBase *, std::atomic<int>>;
    for (auto &pair : op_deps_) {
      (*op_deps)[pair.first] = pair.second;
    }
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@ -50,7 +50,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
  std::atomic<int> remaining_;
  void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
-                  OpHandleBase *op, BlockingQueue<size_t> *complete_q);
+                  OpHandleBase *op,
                  const std::shared_ptr<BlockingQueue<size_t>> &complete_q);
  void PrepareAtomicOpDeps();
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
@ -0,0 +1,165 @@
 //   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
 namespace paddle {
 namespace framework {
 namespace details {
 struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
  std::vector<std::string> out_varnames_;
  void InitFusedBroadcastOp(std::vector<size_t> input_scope_idxes) {
    // initialize scope and var
    for (size_t i = 0; i < place_list_.size(); ++i) {
      local_scopes_.push_back(&(g_scope_.NewScope()));
      Scope& local_scope = local_scopes_.back()->NewScope();
      *local_scopes_.back()
           ->Var(details::kLocalExecScopeName)
           ->GetMutable<Scope*>() = &local_scope;
      for (size_t j = 0; j < input_scope_idxes.size(); ++j) {
        local_scope.Var("out_var" + j);
        if (i == j) local_scope.Var("in_var" + j);
      }
      param_scopes_.emplace_back(&local_scope);
    }
    // create op handle node
    std::unique_ptr<ir::Node> n =
        ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation);
    if (use_gpu_) {
 #ifdef PADDLE_WITH_CUDA
      op_handle_.reset(new FusedBroadcastOpHandle(
          n.get(), local_scopes_, place_list_, nccl_ctxs_.get()));
 #else
      PADDLE_THROW("CUDA is not supported.");
 #endif
    } else {
 #ifdef PADDLE_WITH_CUDA
      op_handle_.reset(new FusedBroadcastOpHandle(
          n.get(), local_scopes_, place_list_, nccl_ctxs_.get()));
 #else
      op_handle_.reset(
          new FusedBroadcastOpHandle(n.get(), local_scopes_, place_list_));
 #endif
    }
    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
      // add input var handle
      std::unique_ptr<ir::Node> in_node =
          ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable);
      VarHandle* in_var_handle =
          new VarHandle(in_node.get(), 1, input_scope_idxes[i], "in_var" + i,
                        place_list_[input_scope_idxes[i]]);
      vars_.emplace_back(in_var_handle);
      op_handle_->AddInput(in_var_handle);
      // add output var handle
      for (size_t j = 0; j < place_list_.size(); ++j) {
        std::unique_ptr<ir::Node> out_node =
            ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable);
        VarHandle* out_var_handle =
            new VarHandle(out_node.get(), 2, j, "out_var" + i, place_list_[j]);
        vars_.emplace_back(out_var_handle);
        op_handle_->AddOutput(out_var_handle);
      }
    }
  }
  void TestFusedBroadcastLoDTensor(std::vector<size_t> input_scope_idxes) {
    std::vector<std::vector<float>> send_vec;
    f::LoD lod{{0, 10, 20}};
    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
      const std::string varname("in_var" + i);
      float val_scalar = static_cast<float>(i);
      send_vec.push_back(
          InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar));
    }
    op_handle_->Run(false);
    WaitAll();
    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
      const std::string& varname("out_var" + i);
      for (size_t j = 0; j < place_list_.size(); ++j) {
        LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]);
      }
    }
  }
  void TestFusedBroadcastSelectedRows(std::vector<size_t> input_scope_idxes) {
    std::vector<std::vector<float>> send_vector;
    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
    int height = static_cast<int>(kDims[0] * 2);
    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
      const std::string varname("in_var" + i);
      float val_scalar = static_cast<float>(i);
      send_vector.push_back(InitSelectedRows(varname, input_scope_idxes[i],
                                             rows, height, val_scalar));
    }
    op_handle_->Run(false);
    WaitAll();
    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
      const std::string& varname("out_var" + i);
      for (size_t j = 0; j < place_list_.size(); ++j) {
        SelectedRowsEqual(varname, input_scope_idxes[i], send_vector[i], rows,
                          height);
      }
    }
  }
 };
 TEST(FusedBroadcastTester, CPULodTensor) {
  TestFusedBroadcastOpHandle test_op;
  std::vector<size_t> input_scope_idxes = {0, 1};
  test_op.InitCtxOnGpu(false);
  test_op.InitFusedBroadcastOp(input_scope_idxes);
  test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
 }
 TEST(FusedBroadcastTester, CPUSelectedRows) {
  TestFusedBroadcastOpHandle test_op;
  std::vector<size_t> input_scope_idxes = {0, 1};
  test_op.InitCtxOnGpu(false);
  test_op.InitFusedBroadcastOp(input_scope_idxes);
  test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
 }
 #ifdef PADDLE_WITH_CUDA
 TEST(FusedBroadcastTester, GPULodTensor) {
  TestFusedBroadcastOpHandle test_op;
  std::vector<size_t> input_scope_idxes = {0, 1};
  test_op.InitCtxOnGpu(true);
  test_op.InitFusedBroadcastOp(input_scope_idxes);
  test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
 }
 TEST(FusedBroadcastTester, GPUSelectedRows) {
  TestFusedBroadcastOpHandle test_op;
  std::vector<size_t> input_scope_idxes = {0, 1};
  test_op.InitCtxOnGpu(true);
  test_op.InitFusedBroadcastOp(input_scope_idxes);
  test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
 }
 #endif
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/Show More
+++ b/Show More