Merge remote-tracking branch 'ups/develop' into pool-int8

6 years ago · 086f892225
parent 8f051b36d5 3e8408429d
commit 086f892225
144 changed files with 3263 additions and 1424 deletions
--- a/76
+++ b/76
@ -94,52 +94,52 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # Pin sphinx to version 1.5.6 and drop the -U option from [pip install -U
 # sphinx-rtd-theme]: with -U, sphinx would be upgraded to the newest version
 # (1.7.1 for now), which causes the documentation build to fail.
-RUN pip3 install -U wheel && \
+RUN pip3 --no-cache-dir install -U wheel && \
-    pip3 install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
+    pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
-    pip3.6 install -U wheel && \
+    pip3.6 --no-cache-dir install -U wheel && \
-    pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \
+    pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
-    pip3.7 install -U wheel && \
+    pip3.7 --no-cache-dir install -U wheel && \
-    pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \
+    pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
    easy_install -U pip && \
-    pip install -U pip setuptools wheel && \
+    pip --no-cache-dir install -U pip setuptools wheel && \
-    pip install -U docopt PyYAML sphinx==1.5.6 && \
+    pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip install sphinx-rtd-theme==0.1.9 recommonmark
+    pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark
-
+
-RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3 install opencv-python && \
+    pip3 --no-cache-dir install opencv-python && \
-    pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.6 install opencv-python && \
+    pip3.6 --no-cache-dir install opencv-python && \
-    pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.7 install opencv-python && \
+    pip3.7 --no-cache-dir install opencv-python && \
-    pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip install opencv-python
+    pip --no-cache-dir install opencv-python
 #For docstring checker
-RUN pip3 install pylint pytest astroid isort
+RUN pip3 --no-cache-dir install pylint pytest astroid isort
-RUN pip3.6 install pylint pytest astroid isort
+RUN pip3.6 --no-cache-dir install pylint pytest astroid isort
-RUN pip3.7 install pylint pytest astroid isort
+RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
-RUN pip install pylint pytest astroid isort LinkChecker
+RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker
 COPY ./python/requirements.txt /root/
-RUN pip3 install -r /root/requirements.txt
+RUN pip3 --no-cache-dir install -r /root/requirements.txt
-RUN pip3.6 install -r /root/requirements.txt
+RUN pip3.6 --no-cache-dir install -r /root/requirements.txt
-RUN pip3.7 install -r /root/requirements.txt
+RUN pip3.7 --no-cache-dir install -r /root/requirements.txt
-RUN pip install -r /root/requirements.txt
+RUN pip --no-cache-dir install -r /root/requirements.txt
 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
-RUN apt-get install -y libssl-dev libffi-dev
+RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y
-RUN pip3 install certifi urllib3[secure]
+RUN pip3 --no-cache-dir install certifi urllib3[secure]
-RUN pip3.6 install certifi urllib3[secure]
+RUN pip3.6 --no-cache-dir install certifi urllib3[secure]
-RUN pip3.7 install certifi urllib3[secure]
+RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
-RUN pip install certifi urllib3[secure]
+RUN pip --no-cache-dir install certifi urllib3[secure]
 # Install woboq_codebrowser to /woboq
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@ -106,10 +106,10 @@ else(WIN32)
    SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
    ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
            COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
-            DEPENDS mkldnn)
+            DEPENDS mkldnn shared_mkldnn)
 endif(WIN32)
 ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
-
+ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn)
 IF(WITH_C_API)
  INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib)
 ENDIF()
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@ -136,7 +136,7 @@ if (WITH_MKLDNN)
    copy(mkldnn_lib
            SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
            DSTS ${dst_dir} ${dst_dir}/lib
-            DEPS mkldnn
+            DEPS mkldnn_shared_lib
            )
 endif ()
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@ -57,46 +57,43 @@ int main()
    return 0;
 }" SSE3_FOUND)
-# disable AVX by default on windows
+# Check AVX
-if(NOT WIN32)
+set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
-    # Check AVX
+set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-    set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
+CHECK_CXX_SOURCE_RUNS("
-    set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+#include <immintrin.h>
-    CHECK_CXX_SOURCE_RUNS("
+int main()
-    #include <immintrin.h>
+{
-    int main()
+    __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
-    {
+    __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
-        __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
+    __m256 result = _mm256_add_ps (a, b);
-        __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
+    return 0;
-        __m256 result = _mm256_add_ps (a, b);
+}" AVX_FOUND)
        return 0;
    }" AVX_FOUND)
-    # Check AVX 2
+# Check AVX 2
-    set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
+set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
-    set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-    CHECK_CXX_SOURCE_RUNS("
+CHECK_CXX_SOURCE_RUNS("
-    #include <immintrin.h>
+#include <immintrin.h>
-    int main()
+int main()
-    {
+{
-        __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
+    __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
-        __m256i result = _mm256_abs_epi32 (a);
+    __m256i result = _mm256_abs_epi32 (a);
-        return 0;
+    return 0;
-    }" AVX2_FOUND)
+}" AVX2_FOUND)
-    # Check AVX512F
+# Check AVX512F
-    set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
+set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
-    set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-    CHECK_CXX_SOURCE_RUNS("
+CHECK_CXX_SOURCE_RUNS("
-    #include <immintrin.h>
+#include <immintrin.h>
-    int main()
+int main()
-    {
+{
-        __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
+    __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
-                                      13, -5, 6, -7, 9, 2, -6, 3);
+                                  13, -5, 6, -7, 9, 2, -6, 3);
-        __m512i result = _mm512_abs_epi32 (a);
+    __m512i result = _mm512_abs_epi32 (a);
-        return 0;
+    return 0;
-    }" AVX512F_FOUND)
+}" AVX512F_FOUND)
 endif(NOT WIN32)
 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
 mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
--- a/paddle/contrib/float16/float16_transpiler.py
+++ b/paddle/contrib/float16/float16_transpiler.py
@ -60,7 +60,7 @@ class Float16Transpiler:
            raise TypeError("place should be as CPUPlace/CUDAPlace type")
        if scope is None:
            scope = global_scope()
-        if not isinstance(scope, core.Scope):
+        if not isinstance(scope, core._Scope):
            raise TypeError("scope should be as Scope type or None")
        self.scope = scope
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -464,11 +464,7 @@ paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, ke
 paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
 paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
-paddle.fluid.Scope.__init__ __init__(self: paddle.fluid.core.Scope) -> None
+paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
 paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None
 paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
 paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope
 paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
 paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None)
 paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None)
 paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None)
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -7,27 +7,17 @@ function(windows_symbolic TARGET)
  cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
  foreach(src ${windows_symbolic_SRCS})
-  get_filename_component(src ${src} NAME_WE)
+    get_filename_component(src ${src} NAME_WE)
-  if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu)
+    if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu)
-      message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.")
+        message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.")
-  endif()
+    endif()
-
+
-#only copy the xx.cu to.xx.cu when the content are modified
+    file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc)
-  set(copy_flag 1)
+
-  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu)
+    add_custom_command(OUTPUT ${final_path}/.${src}.cu
-  file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR)
+            COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu"
-  file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR)
+            COMMENT "create hidden file of ${src}.cu")
-  if (SOURCE_STR STREQUAL TARGET_STR)
+    add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
    set(copy_flag 0)
  endif()
  endif()
  if (copy_flag)
  add_custom_command(OUTPUT .${src}.cu
          COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu
          COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu"
          COMMENT "create hidden file of ${src}.cu")
  endif(copy_flag)
  add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
  endforeach()
 endfunction()
@ -48,10 +38,10 @@ if(WITH_GPU)
    nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
    add_dependencies(tensor tensor_util)
  else()
-    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
+    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
  endif(WIN32)
 else()
-  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
 endif()
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
@ -78,17 +68,23 @@ cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memor
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
 cc_test(reader_test SRCS reader_test.cc DEPS reader)
 cc_test(variable_test SRCS variable_test.cc)
 cc_library(threadpool SRCS threadpool.cc DEPS enforce)
 cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
-cc_library(scope SRCS scope.cc DEPS glog threadpool)
+cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto) 
 if (WITH_GPU)
  target_link_libraries(var_type_traits dynload_cuda)
 endif()
 cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
 cc_library(scope SRCS scope.cc DEPS glog threadpool var_type_traits)
 cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
 cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
 cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
 nv_test(data_device_transform_test SRCS data_device_transform_test.cu
-        DEPS operator op_registry device_context math_function)
+        DEPS operator op_registry device_context math_function scope)
 if(WITH_GPU)
  if (WIN32)
--- a/paddle/fluid/framework/attribute.h
+++ b/paddle/fluid/framework/attribute.h
@ -165,7 +165,7 @@ template <typename T>
 class GreaterThanChecker {
 public:
  explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
-  void operator()(T& value) const {
+  void operator()(const T& value) const {
    PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails.");
  }
@ -177,7 +177,7 @@ template <typename T>
 class EqualGreaterThanChecker {
 public:
  explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
-  void operator()(T& value) const {
+  void operator()(const T& value) const {
    PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails.");
  }
@ -193,7 +193,7 @@ class DefaultValueSetter {
 public:
  explicit DefaultValueSetter(T default_value)
      : default_value_(default_value) {}
-  void operator()(T& value) const { value = default_value_; }  // NOLINT
+  void operator()(T* value) const { *value = default_value_; }
 private:
  T default_value_;
@ -203,7 +203,7 @@ template <typename T>
 class EnumInContainer {
 public:
  explicit EnumInContainer(const std::unordered_set<T>& c) : container_(c) {}
-  void operator()(T& val) const {
+  void operator()(const T& val) const {
    PADDLE_ENFORCE(container_.find(val) != container_.end(),
                   "Value %s is not in enum container %s", val,
                   ContainerDebugString());
@ -232,7 +232,8 @@ class EnumInContainer {
 // an attribute can have more than one limit
 template <typename T>
 class TypedAttrChecker {
-  typedef std::function<void(T&)> ValueChecker;
+  typedef std::function<void(T*)> DefaultValueChecker;
  typedef std::function<void(const T&)> ValueChecker;
 public:
  explicit TypedAttrChecker(const std::string& attr_name)
@ -268,17 +269,17 @@ class TypedAttrChecker {
    return *this;
  }
-  void operator()(AttributeMap& attr_map) const {  // NOLINT
+  void operator()(AttributeMap* attr_map) const {
-    if (!attr_map.count(attr_name_)) {
+    if (!attr_map->count(attr_name_)) {
      // the user did not set this attribute
      PADDLE_ENFORCE(!default_value_setter_.empty(),
                     "Attribute '%s' is required!", attr_name_);
      // default_value_setter_ has no more than one element
      T val;
-      (default_value_setter_[0])(val);
+      (default_value_setter_[0])(&val);
-      attr_map[attr_name_] = val;
+      (*attr_map)[attr_name_] = val;
    }
-    Attribute& attr = attr_map.at(attr_name_);
+    Attribute& attr = attr_map->at(attr_name_);
    ExtractAttribute<T> extract_attr(attr_name_);
    T* attr_value = extract_attr(attr);
    for (const auto& checker : value_checkers_) {
@ -289,12 +290,12 @@ class TypedAttrChecker {
 private:
  std::string attr_name_;
  std::vector<ValueChecker> value_checkers_;
-  std::vector<ValueChecker> default_value_setter_;
+  std::vector<DefaultValueChecker> default_value_setter_;
 };
 // check whether all of an op's attributes satisfy their own limits
 class OpAttrChecker {
-  typedef std::function<void(AttributeMap&)> AttrChecker;
+  typedef std::function<void(AttributeMap*)> AttrChecker;
 public:
  template <typename T>
@ -304,7 +305,7 @@ class OpAttrChecker {
    return *(checker.target<TypedAttrChecker<T>>());
  }
-  void Check(AttributeMap& attr_map) const {  // NOLINT
+  void Check(AttributeMap* attr_map) const {
    for (const auto& checker : attr_checkers_) {
      checker(attr_map);
    }
--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
@ -88,7 +88,7 @@ void EagerDeletionOpHandle::RunImpl() {
      }
    } else {
      PADDLE_THROW("Type %s of %s is not supported eager deletion",
-                   var->Type().name(), name);
+                   framework::ToTypeName(var->Type()), name);
    }
  }
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@ -120,6 +120,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
  ClearFetchOp(graph_.get(), &fetch_ops);
  return fetches;
 }
 void FastThreadedSSAGraphExecutor::RunOpAsync(
    std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
    OpHandleBase *op,
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@ -45,7 +45,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
 #endif
  int GetVarDeviceID(
-      const ir::Graph &graph, const std::string &varname,
+      const std::string &varname,
      const std::unordered_map<std::string, int> &sharded_var_device) const;
  bool IsScaleLossOp(ir::Node *node) const;
@ -57,18 +57,13 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
      ir::Graph *result, ir::Node *node,
      std::unordered_map<std::string, int> *sharded_var_device) const;
  std::vector<std::string> FindDistTrainSendVars(
      const std::vector<ir::Node *> &nodes) const;
  std::vector<std::string> FindDistTrainRecvVars(
      const std::vector<ir::Node *> &nodes) const;
  void CreateComputationalOps(ir::Graph *result, ir::Node *node,
                              size_t num_places) const;
  void CreateScaleLossGradOp(ir::Graph *result,
                             const std::string &loss_grad_name,
-                             ir::Node *out_var_node) const;
+                             ir::Node *out_var_node,
                             proto::VarType::Type dtype) const;
  VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
                            int dst_dev_id) const;
@ -76,7 +71,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
                             int dev_id) const;
  int GetOpDeviceID(
-      const ir::Graph &graph, ir::Node *node,
+      ir::Node *node,
      const std::unordered_map<std::string, int> &sharded_var_device) const;
  void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
@ -99,6 +94,15 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
  void SetCommunicationContext(OpHandleBase *op_handle,
                               const platform::Place &p) const;
  std::vector<ir::Node *> SortForReduceMode(
      const std::vector<ir::Node *> &) const;
  int GetOpDeviceID(
      ir::Node *node,
      const std::unordered_map<std::string, int> &shared_var_device,
      std::unordered_map<std::string, std::vector<ir::Node *>> *delay_ops)
      const;
  mutable std::string loss_var_name_;
  mutable std::vector<platform::Place> places_;
  mutable std::vector<Scope *> local_scopes_;
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@ -22,39 +22,66 @@ namespace details {
 ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
                                             Scope *scope,
                                             platform::Place place,
-                                             platform::DeviceContext *dev_ctx)
+                                             platform::DeviceContext *dev_ctx,
                                             proto::VarType::Type dtype)
    : OpHandleBase(node),
      coeff_(static_cast<float>(1.0 / num_dev)),
      scope_(scope),
-      place_(place) {
+      place_(place),
      out_dtype_(dtype) {
  this->SetDeviceContext(place_, dev_ctx);
 }
 ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
 struct ScaleLossGradFunctor {
  float coeff_;
  Tensor *out_;
  platform::Place place_;
  OpHandleBase *op_handle_;
  proto::VarType::Type out_dtype_;
  platform::DeviceContext *ctx_;
  ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place,
                       OpHandleBase *op_handle, proto::VarType::Type dtype,
                       platform::DeviceContext *ctx)
      : coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {}
  template <typename OutT>
  void apply() const {
    auto *out_data = out_->mutable_data<OutT>(place_);
    if (platform::is_cpu_place(place_)) {
      *out_data = static_cast<OutT>(coeff_);
    } else {
 #ifdef PADDLE_WITH_CUDA
      OutT cast_coeff = static_cast<OutT>(coeff_);
      auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
      memory::Copy(boost::get<platform::CUDAPlace>(place_), out_data,
                   platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_),
                   stream);
      VLOG(10) << place_ << "RUN Scale loss grad op";
 #endif
    }
  }
 };
 void ScaleLossGradOpHandle::RunImpl() {
  // Doesn't wait for any event
  std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
  auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
-  float *tmp = local_scope.FindVar(var_name)
+  auto *tensor = local_scope.FindVar(var_name)->GetMutable<LoDTensor>();
-                   ->GetMutable<LoDTensor>()
+  tensor->Resize(make_ddim({1}));
                   ->mutable_data<float>(make_ddim({1}), place_);
  if (platform::is_cpu_place(place_)) {
    *tmp = coeff_;
  } else {
 #ifdef PADDLE_WITH_CUDA
-    this->RunAndRecordEvent([&] {
+  ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_,
-      auto stream = static_cast<platform::CUDADeviceContext *>(
+                            this->dev_ctxes_.at(place_));
-                        this->dev_ctxes_.at(place_))
+  this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
-                        ->stream();
+#else
-      memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
+  ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, nullptr);
-                   platform::CPUPlace(), &coeff_, sizeof(float), stream);
+  framework::VisitDataType(out_dtype_, func);
      VLOG(10) << place_ << "RUN Scale loss grad op";
    });
 #endif
  }
 }
 std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; }
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
@ -26,8 +26,8 @@ namespace details {
 struct ScaleLossGradOpHandle : public OpHandleBase {
  ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope,
-                        platform::Place place,
+                        platform::Place place, platform::DeviceContext *context,
-                        platform::DeviceContext *context);
+                        proto::VarType::Type dtype);
  ~ScaleLossGradOpHandle() final;
@ -40,6 +40,7 @@ struct ScaleLossGradOpHandle : public OpHandleBase {
  float coeff_;
  Scope *scope_;
  platform::Place place_;
  proto::VarType::Type out_dtype_;
 };
 }  // namespace details
--- a/paddle/fluid/framework/details/variable_visitor.cc
+++ b/paddle/fluid/framework/details/variable_visitor.cc
@ -24,7 +24,7 @@ static void VisitVariable(Variable* var, Func* func) {
  } else if (var->IsType<SelectedRows>()) {
    (*func)(var->GetMutable<SelectedRows>());
  } else {
-    PADDLE_THROW("Not supported type %s", var->Type().name());
+    PADDLE_THROW("Not supported type %s", ToTypeName(var->Type()));
  }
 }
@ -35,7 +35,7 @@ static void VisitVariable(const Variable& var, Func* func) {
  } else if (var.IsType<SelectedRows>()) {
    (*func)(var.Get<SelectedRows>());
  } else {
-    PADDLE_THROW("Not supported type %s", var.Type().name());
+    PADDLE_THROW("Not supported type %s", ToTypeName(var.Type()));
  }
 }
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@ -119,7 +119,7 @@ static void DeleteUnusedTensors(
          }
        } else {
          PADDLE_THROW("Type %s of %s is not supported eager deletion",
-                       var->Type().name(), name);
+                       framework::ToTypeName(var->Type()), name);
        }
      }
    }
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@ -45,6 +45,7 @@ pass_library(is_test_pass base)
 pass_library(conv_elementwise_add_act_fuse_pass inference)
 pass_library(conv_elementwise_add2_act_fuse_pass inference)
 pass_library(conv_elementwise_add_fuse_pass inference)
 pass_library(conv_affine_channel_fuse_pass inference)
 if(WITH_MKLDNN)
    pass_library(mkldnn_placement_pass base)
    pass_library(depthwise_conv_mkldnn_pass base)
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
@ -0,0 +1,222 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h"
 #include <functional>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
 namespace framework {
 namespace ir {
 // Declares local node variables (conv, affine_channel, conv_weight, conv_out,
 // ac_scale, ac_bias, ac_out) for the pattern `pattern_name` by invoking
 // GET_IR_NODE_FROM_SUBGRAPH once per node.
 // NOTE(review): despite the "CONV_BN" in the name, every node fetched here
 // belongs to the conv + affine_channel pattern.
 #define GET_CONV_BN_NODES(pattern_name)                                    \
  /* OPERATORS */                                                          \
  GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name);                     \
  GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \
  /* CONV inputs */                                                        \
  GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name);       \
  /* CONV outputs */                                                       \
  GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name);             \
  /* Affine Channel inputs */                                              \
  GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name);             \
  GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name);               \
  /* Affine channel outputs */                                             \
  GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */
 void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight,
                                const ir::Node& ac_scale,
                                const LoDTensor& ac_bias_tensor,
                                LoDTensor* eltwise_y_in_tensor) {
  using EigenVectorArrayMap =
      Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
  using ConstEigenVectorArrayMap =
      Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
  using EigenMatrixArrayMap = Eigen::Map<
      Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
  // Re-compute bias of conv2d from AffineChannel
  PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), ac_bias_tensor.dims());
  auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable<LoDTensor>();
  ConstEigenVectorArrayMap scale_array(scale_tensor->data<float>(),
                                       scale_tensor->numel(), 1);
  ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data<float>(),
                                         ac_bias_tensor.numel(), 1);
  EigenVectorArrayMap eltwise_y_in_array(
      eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
      eltwise_y_in_tensor->numel(), 1);
  eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array;
  // Re-compute weight of conv2d from AffineChannel
  auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
  auto weights_shape = weights->dims();
  auto weights_shape_2d = flatten_to_2d(weights_shape, 1);
  EigenMatrixArrayMap weights_array_2d(
      weights->mutable_data<float>(platform::CPUPlace()), weights_shape_2d[0],
      weights_shape_2d[1]);
  weights_array_2d.colwise() *= scale_array;
 }
 // Rewrites every conv2d -> affine_channel match in `graph` into
 // conv2d -> elementwise_add. The affine_channel scale is folded into the conv
 // filter and a new persistable bias variable carries the affine_channel bias.
 // Returns the (modified) graph.
 std::unique_ptr<ir::Graph> ConvAffineChannelFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  PADDLE_ENFORCE(graph.get());
  FusePassBase::Init(name_scope_, graph.get());

  auto* scope = param_scope();
  PADDLE_ENFORCE(scope);

  // Build the pattern: conv_input -> conv2d -> affine_channel.
  GraphPatternDetector gpd;
  auto* pattern = gpd.mutable_pattern();
  auto* conv_input =
      pattern->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
          ->AsInput()
          ->assert_is_op_input("conv2d", "Input");
  patterns::ConvAffineChannel conv_ac_pattern(pattern, name_scope_);
  conv_ac_pattern(conv_input, false /*with_eltwise_add*/);

  int fused_count = 0;
  // NOTE: the parameter must be called `subgraph`; GET_CONV_BN_NODES is
  // expanded inside this lambda.
  auto fuse_handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                          Graph* g) {
    VLOG(4) << "handle ConvAffineChannel fuse";
    GET_CONV_BN_NODES(conv_ac_pattern);

    // check if fuse can be done and if MKL-DNN should be used
    if (FindFuseOption(*conv, *affine_channel) == DO_NOT_FUSE) {
      VLOG(3) << "do not perform conv+affinechannel fuse";
      return;
    }

    // New persistable variable that will hold the conv bias.
    VarDesc bias_desc(patterns::PDNodeName(name_scope_, "eltwise_y_in"));
    bias_desc.SetPersistable(true);
    auto* bias_node = g->CreateVarNode(&bias_desc);
    auto* bias_tensor =
        scope->Var(bias_node->Name())->GetMutable<LoDTensor>();

    // affine_channel bias fixes the shape of the new bias.
    auto* ac_bias_tensor =
        scope->FindVar(ac_bias->Name())->GetMutable<LoDTensor>();

    // Start from an all-zero bias of that shape.
    bias_tensor->Resize(ac_bias_tensor->dims());
    float* bias_data = bias_tensor->mutable_data<float>(platform::CPUPlace());
    std::fill_n(bias_data, bias_tensor->numel(), 0.0f);

    // Fold scale/bias into the conv filter and the new bias.
    recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
                               bias_tensor);

    // Describe the elementwise_add that replaces affine_channel.
    OpDesc add_desc;
    add_desc.SetType("elementwise_add");
    add_desc.SetInput("X", std::vector<std::string>({conv_out->Name()}));
    add_desc.SetInput("Y", std::vector<std::string>({bias_node->Name()}));
    add_desc.SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
    add_desc.SetAttr("axis", 1);
    auto* add_op = g->CreateOpNode(&add_desc);  // OpDesc will be copied.

    // Drop the fused-away nodes, then wire the new op in.
    GraphSafeRemoveNodes(graph.get(), {ac_scale, ac_bias, affine_channel});

    IR_NODE_LINK_TO(conv_out, add_op);
    IR_NODE_LINK_TO(bias_node, add_op);
    IR_NODE_LINK_TO(add_op, ac_out);

    ++fused_count;
  };

  gpd(graph.get(), fuse_handler);
  AddStatis(fused_count);
  return graph;
 }
 // Rewrites every conv2d -> elementwise_add -> affine_channel match in `graph`
 // into conv2d -> elementwise_add. The affine_channel scale/bias are folded
 // into the conv filter and into the existing elementwise_add Y input, and the
 // elementwise_add is re-targeted to write the affine_channel output.
 // Returns the (modified) graph.
 std::unique_ptr<ir::Graph> ConvEltwiseAddAffineChannelFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  PADDLE_ENFORCE(graph.get());
  FusePassBase::Init(name_scope_, graph.get());
  auto* scope = param_scope();
  PADDLE_ENFORCE(scope);
  GraphPatternDetector gpd;
  auto* conv_input =
      gpd.mutable_pattern()
          ->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
          ->AsInput()
          ->assert_is_op_input("conv2d", "Input");
  patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(),
                                              name_scope_);
  conv_ac_pattern(conv_input, true /*with_eltwise_add*/);
  int found_conv_ac_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    VLOG(4) << "handle ConvEltwiseAddAffineChannel fuse";
    GET_CONV_BN_NODES(conv_ac_pattern);
    // OPERATORS
    GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern);
    // BIAS inputs
    GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern);
    // BIAS outputs
    GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern);
    // Get eltwise_y (conv bias) variable. The pattern does not require it to
    // be persistable, so it may be absent from the parameter scope; nothing
    // has been modified yet, so skipping the match is safe.
    auto* eltwise_y_in_var = scope->FindVar(eltwise_y_in->Name());
    if (eltwise_y_in_var == nullptr) {
      VLOG(3) << "do not perform conv+eltwiseadd+affinechannel fuse: "
              << eltwise_y_in->Name() << " is not found in the param scope";
      return;
    }
    auto* eltwise_y_in_tensor = eltwise_y_in_var->GetMutable<LoDTensor>();
    // Get affine_channel bias
    auto* ac_bias_var = scope->FindVar(ac_bias->Name());
    PADDLE_ENFORCE(ac_bias_var != nullptr,
                   "Bias variable %s of affine_channel is not found in scope.",
                   ac_bias->Name());
    auto* ac_bias_tensor = ac_bias_var->GetMutable<LoDTensor>();
    recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
                               eltwise_y_in_tensor);
    // Update the elementwise_add node
    eltwise->Op()->SetAttr("axis", 1);
    eltwise->Op()->SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
    GraphSafeRemoveNodes(graph.get(),
                         {ac_scale, ac_bias, affine_channel, eltwise_out});
    IR_NODE_LINK_TO(eltwise, ac_out);
    ++found_conv_ac_count;
  };
  gpd(graph.get(), handler);
  AddStatis(found_conv_ac_count);
  return graph;
 }
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
 REGISTER_PASS(conv_affine_channel_fuse_pass,
              paddle::framework::ir::ConvAffineChannelFusePass);
 REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass,
              paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass);
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
@ -0,0 +1,49 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <string>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 namespace paddle {
 namespace framework {
 namespace ir {
 /*
 * Fuse the Conv and ConvAffineChannel.
 */
 class ConvAffineChannelFusePass : public FusePassBase {
 public:
  virtual ~ConvAffineChannelFusePass() {}
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
  const std::string name_scope_{"conv_affine_channel_fuse"};
 };
 class ConvEltwiseAddAffineChannelFusePass : public FusePassBase {
 public:
  virtual ~ConvEltwiseAddAffineChannelFusePass() {}
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
  const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"};
 };
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
@ -40,18 +40,20 @@ framework::proto::OpDesc PrepareOpDesc(
    const std::string& output) {
  auto proto = base_desc;
  framework::OpDesc desc(proto, nullptr);
  desc.SetType("conv2d_fusion");
  desc.SetInput("Bias", {bias});
  desc.SetInput("ResidualData", {bias1});
  desc.SetAttr("activation", activation);
  desc.SetOutput("Output", {output});
  desc.SetAttr("is_test", true);
-
+  desc.SetAttr("use_cudnn", false);
  desc.Flush();
  return *desc.Proto();
 }
 std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
-  const std::string pattern_name = "conv_elementwise_add_act_fuse";
+  const std::string pattern_name = "conv_elementwise_add2_act_fuse";
  FusePassBase::Init(pattern_name, graph.get());
  GraphPatternDetector gpd;
@ -76,22 +78,23 @@ std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
    framework::OpDesc new_op_desc(new_op_proto, nullptr);
    // Create a new node for the fused op.
-    graph->CreateOpNode(&new_op_desc);
+    auto* new_conv_op = graph->CreateOpNode(&new_op_desc);
    // Link inputs and outputs.
    PADDLE_ENFORCE(subgraph.count(x));
    auto* conv_in_node = subgraph.at(x);
-    IR_NODE_LINK_TO(conv_in_node, conv_op);            // Input
+    IR_NODE_LINK_TO(conv_in_node, new_conv_op);            // Input
-    IR_NODE_LINK_TO(conv_filter, conv_op);             // Filter
+    IR_NODE_LINK_TO(conv_filter, new_conv_op);             // Filter
-    IR_NODE_LINK_TO(conv_op, conv_out);                // Output
+    IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op);    // Bias
-    IR_NODE_LINK_TO(elementwise_add_in_y, conv_op);    // Bias
+    IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op);  // Bias
-    IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op);  // Bias
+    IR_NODE_LINK_TO(new_conv_op, act_out);                 // Output
    // Delete the unneeded nodes.
-    GraphSafeRemoveNodes(graph.get(),
+    GraphSafeRemoveNodes(
-                         {conv_op, elementwise_add_op, elementwise_add_op_1,
+        graph.get(),
-                          elementwise_add_out});
+        {conv_op, conv_out, elementwise_add_op, elementwise_add_op_1,
         elementwise_add_out, elementwise_add_out_1, act_op});
  };
  gpd(graph.get(), handler);
  return graph;
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@ -23,66 +23,8 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
 namespace {
 // Sanity-checks the order of operator roles in block 0 of `program`.
 //
 // Ops are visited in order; each op's role id is recorded in `visit` and then
 // compared against the roles already seen:
 //   - forward:                 logs an error if a backward op was seen.
 //   - backward, backward|loss: enforces that no optimize op was seen.
 //   - forward|loss:            enforces that neither a backward|loss op nor an
 //                              optimize op was seen.
 //   - optimize, optimize|lr:   enforces that a backward op was seen.
 //   - lr-sched, dist, rpc, not-specified: no constraint.
 // Any other role id is treated as fatal. Ops without a role attribute are
 // skipped.
 void CheckProgram(const ProgramDesc &program) {
 // Shorthand so OpRole values can be used as integer case labels / map keys.
 #define _INT(role) static_cast<int>(role)
  // Role ids encountered so far; only key presence is ever queried.
  std::map<int, bool> visit;
  for (OpDesc *op : program.Block(0).AllOps()) {
    // For backward compatibility, some program doesn't have role added.
    if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue;
    int role_id =
        boost::get<int>(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
    // Recorded before the checks, so the current op's own role counts as seen.
    visit[role_id] = true;
    switch (role_id) {
      case _INT(OpRole::kForward):
        // Only reported, not enforced.
        if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
          LOG(ERROR) << "Cannot add backward operator before forward operator "
                     << op->Type();
        }
        break;
      case _INT(OpRole::kBackward):
      case _INT(OpRole::kBackward) | _INT(OpRole::kLoss):
        PADDLE_ENFORCE(
            visit.find(_INT(OpRole::kOptimize)) == visit.end(),
            "Cannot add backward operator %s after optimize operator.",
            op->Type());
        break;
      case _INT(OpRole::kForward) | _INT(OpRole::kLoss):
        PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
                                  _INT(OpRole::kLoss)) == visit.end(),
                       "Cannot add backward|loss operator before "
                       "forward|loss operator %s.",
                       op->Type());
        PADDLE_ENFORCE(
            visit.find(_INT(OpRole::kOptimize)) == visit.end(),
            "Cannot add forward|loss operator %s after optimize operator.",
            op->Type());
        break;
      case _INT(OpRole::kOptimize):
      case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched):
        PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
                       "Optimize operators %s must follow backward operator.",
                       op->Type());
        break;
      case _INT(OpRole::kLRSched):
      case _INT(OpRole::kDist):
      case _INT(OpRole::kRPC):
      case _INT(OpRole::kNotSpecified):
        // No ordering constraint for these roles.
        break;
      default:
        LOG(FATAL) << "Unknown operator role. Don't add new role because "
                      "you don't know what you are doing.";
    }
  }
 #undef _INT
 }
 }  // namespace
 Graph::Graph(const ProgramDesc &program) : program_(program) {
  CheckProgram(program_);
  auto var_nodes = InitFromProgram(program_);
  ResolveHazard(var_nodes);
 }
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@ -1101,9 +1101,7 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
  return out_var;
 }
-std::unordered_set<std::string> conv_act_set({"identity", "sigmoid", "relu",
+std::unordered_set<std::string> conv_act_set({"identity", "relu"});
                                              "relu6", "relux", "tanh",
                                              "band_pass"});
 PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) {
  conv_in->AsInput();
@ -1169,13 +1167,13 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
                                  ->AsInput();
  auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr())
                                 ->assert_is_op_output("elementwise_add")
-                                 ->assert_is_op_input("elementwise_add", "X")
+                                 ->assert_is_op_input("elementwise_add", "Y")
                                 ->AsIntermediate();
  auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr())
                                  ->assert_is_op("elementwise_add");
  auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr())
-                                    ->assert_is_op_input("elementwise_add", "Y")
+                                    ->assert_is_op_input("elementwise_add", "X")
                                    ->AsInput();
  auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr())
                                   ->assert_is_op_output("elementwise_add")
@ -1203,8 +1201,8 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
  conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out});
  elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y})
      .LinksTo({elementwise_add_out});
-  elementwise_add_op_1->LinksFrom(
+  elementwise_add_op_1->LinksFrom({elementwise_add_out, elementwise_add_in_y_1})
-      {elementwise_add_out, elementwise_add_in_y_1});
+      .LinksTo({elementwise_add_out_1});
  act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out});
  return act_out;
 }
@ -1236,6 +1234,78 @@ PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) {
  return elementwise_add_out;
 }
 // Builds the pattern
 //   conv_input -> conv2d -> [elementwise_add ->] affine_channel -> ac_out
 // on top of `conv_input`. The elementwise_add stage is only part of the
 // pattern when `with_eltwise_add` is true. Returns the affine_channel output
 // node.
 PDNode *patterns::ConvAffineChannel::operator()(
    paddle::framework::ir::PDNode *conv_input, bool with_eltwise_add) {
  // ---- Operator nodes ----
  conv_input->assert_is_op_input("conv2d", "Input");
  auto *conv_node = pattern->NewNode(conv_repr())->assert_is_op("conv2d");

  PDNode *add_node = nullptr;
  if (with_eltwise_add) {
    add_node =
        pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");
  }

  auto *ac_node =
      pattern->NewNode(affine_channel_repr())->assert_is_op("affine_channel");

  // ---- Variable nodes ----
  // conv2d filter
  auto *filter_node = pattern->NewNode(conv_weight_repr())
                          ->AsInput()
                          ->assert_is_persistable_var()
                          ->assert_is_op_input("conv2d", "Filter");

  // conv2d output
  auto *conv_result = pattern->NewNode(conv_out_repr())
                          ->AsIntermediate()
                          ->assert_is_only_output_of_op("conv2d");

  PDNode *add_y_node = nullptr;
  PDNode *add_result = nullptr;
  if (!with_eltwise_add) {
    // conv output goes straight into affine_channel
    conv_result->assert_is_op_input("affine_channel", "X");
  } else {
    // conv output is the X input of the bias add
    conv_result->assert_is_op_input("elementwise_add", "X");
    // bias (Y input of the add)
    add_y_node = pattern->NewNode(eltwise_y_in_repr())
                     ->assert_is_op_input("elementwise_add", "Y")
                     ->AsInput();
    // output of the add
    add_result = pattern->NewNode(eltwise_out_repr())
                     ->AsIntermediate()
                     ->assert_is_only_output_of_op("elementwise_add");
  }

  // affine_channel scale
  auto *scale_node = pattern->NewNode(ac_scale_repr())
                         ->AsInput()
                         ->assert_is_persistable_var()
                         ->assert_is_op_input("affine_channel", "Scale");
  // affine_channel bias
  auto *bias_node = pattern->NewNode(ac_bias_repr())
                        ->AsInput()
                        ->assert_is_persistable_var()
                        ->assert_is_op_input("affine_channel", "Bias");
  // affine_channel output
  auto *result_node = pattern->NewNode(ac_out_repr())
                          ->AsOutput()
                          ->assert_is_op_output("affine_channel");

  // ---- Edges ----
  conv_node->LinksFrom({conv_input, filter_node}).LinksTo({conv_result});

  // The node feeding affine_channel is the add output when the add stage is
  // present, otherwise the conv output.
  PDNode *ac_input = conv_result;
  if (with_eltwise_add) {
    add_node->LinksFrom({conv_result, add_y_node}).LinksTo({add_result});
    ac_input = add_result;
  }
  ac_node->LinksFrom({ac_input, scale_node, bias_node})
      .LinksTo({result_node});

  return result_node;
 }
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@ -734,6 +734,38 @@ struct ConvElementwiseadd : public PatternBase {
  PATTERN_DECL_NODE(elementwise_add_out);
 };
 // Conv with affine_channel
 // op: conv + (elementwise_add +) affine_channel
 // named nodes:
 // conv, affine_channel, eltwise,
 // conv_weight, conv_out,
 // eltwise_y_in, eltwise_out,
 // ac_scale, ac_bias, ac_out
 struct ConvAffineChannel : public PatternBase {
  ConvAffineChannel(PDPattern* pattern, const std::string& name_scope)
      : PatternBase(pattern, name_scope, "conv_affine_channel") {}
  // Builds the pattern starting at `conv_input`; `with_eltwise_add` selects
  // whether an elementwise_add sits between conv and affine_channel.
  PDNode* operator()(PDNode* conv_input, bool with_eltwise_add);
  // declare operator node's name
  PATTERN_DECL_NODE(conv);
  PATTERN_DECL_NODE(affine_channel);
  PATTERN_DECL_NODE(eltwise);  // ELEMENTWISE_ADD
  // CONV inputs
  PATTERN_DECL_NODE(conv_weight);  // Filter
  // CONV outputs
  PATTERN_DECL_NODE(conv_out);  // tmp
  // ELTWISE inputs
  PATTERN_DECL_NODE(eltwise_y_in);
  // ELTWISE outputs
  PATTERN_DECL_NODE(eltwise_out);  // tmp
  // AC(Affine_Channel) inputs
  PATTERN_DECL_NODE(ac_scale);
  PATTERN_DECL_NODE(ac_bias);
  // AC outputs
  PATTERN_DECL_NODE(ac_out);  // Out
 };
 }  // namespace patterns
 // Link two ir::Nodes from each other.
--- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
+++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
@ -75,6 +75,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
  std::vector<Node*> optimize_ops;
  std::vector<Node*> lr_ops;  // ops other than forward/backward/optimize
  std::unordered_set<std::string> grad_names;
  std::unordered_map<std::string, std::string> gradname2paramname;
  std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
  auto origin_nodes = graph->ReleaseNodes();
@ -99,6 +100,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
      auto op_role_vars = boost::get<std::vector<std::string>>(op_role_var);
      for (size_t i = 0; i < op_role_vars.size(); i += 2) {
        grad_names.insert(op_role_vars[i + 1]);
        gradname2paramname[op_role_vars[i + 1]] = op_role_vars[i];
      }
    } else if (op_role & static_cast<int>(framework::OpRole::kLRSched)) {
      lr_ops.push_back(node);
@ -109,7 +111,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
  // 2. copy forward backward
  ir::Node* prev_repeat_last_op_node = nullptr;
-  // record origin_grad -> repeated grad list map.
+  // record origin_grad -> repeated_grad_list map.
  std::map<ir::Node*, std::vector<ir::Node*>> grad_repeated_map;
  std::map<std::string, std::vector<ir::Node*>> created;
  std::unordered_set<std::string> bn_vars_need_rename;
@ -124,10 +126,16 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
        if (grad_names.find(outname) != grad_names.end()) {
          std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i);
          repeated_op.RenameOutput(outname, new_gname);
          // remove op_role_var for backward ops that outputs grad for a
          // parameter.
          repeated_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(),
                              std::vector<std::string>());
        }
      }
      // 3.5 let batch_norm ops use independent vars, note batch_norm_grad do
-      // not need this update
+      // not need this update, because only moving mean and variance should be
      // differ, trainable parameter scale and bias is the same as other
      // parameters.
      if (node->Name() == "batch_norm") {
        // NOTE: assume bn op created by layers use save var as output mean and
        // variance
@ -224,16 +232,25 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
        var->inputs.push_back(repeated_node);
      }
    }
-  }
+  }  // end copy forward backward
-  // 5. create GRAD merge op node
+  // 5. create GRAD merge op node: sum(repeat.0...repeat.n) ->
  // scale(1/num_repeats)
  for (auto kv : grad_repeated_map) {
    OpDesc sum_op;
    sum_op.SetType("sum");
    std::vector<std::string> repeated_grad_names;
    std::vector<std::string> param_grad_op_role_var;
    for (auto r : kv.second) {
      repeated_grad_names.push_back(r->Var()->Name());
    }
    // NOTE: use op_role_var to control allreduce op appending in
    //       multi_devices_graph_pass, we want to append op_role_var
    //       only once for the merged gradient, so break after first call.
    param_grad_op_role_var.push_back(
        gradname2paramname.at(kv.first->Var()->Name()));        // param
    param_grad_op_role_var.push_back(kv.first->Var()->Name());  // grad
    sum_op.SetInput("X", repeated_grad_names);
    sum_op.SetOutput("Out", {kv.first->Var()->Name()});
    sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
@ -256,6 +273,10 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
    scale_op.SetAttr("scale", static_cast<float>(1.0f / num_repeats));
    scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
                     static_cast<int>(OpRole::kBackward));
    scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(),
                     param_grad_op_role_var);
    auto scale_op_node = result.CreateOpNode(&scale_op);
    scale_op_node->inputs.push_back(sum_out_var_node);
    sum_out_var_node->outputs.push_back(scale_op_node);
--- a/Show More
+++ b/Show More