Merge branch 'develop' of github.com:PaddlePaddle/Paddle into parallel_graph_mode

revert-15207-remove_op_handle_lock_and_fix_var
Yancey1989 6 years ago
commit 4743c9cd5d

@@ -94,52 +94,52 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
 # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
 # version(1.7.1 for now), which causes building documentation failed.
-RUN pip3 install -U wheel && \
-pip3 install -U docopt PyYAML sphinx==1.5.6 && \
-pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
-pip3.6 install -U wheel && \
-pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \
-pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \
-pip3.7 install -U wheel && \
-pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \
-pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \
+RUN pip3 --no-cache-dir install -U wheel && \
+pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
+pip3.6 --no-cache-dir install -U wheel && \
+pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
+pip3.7 --no-cache-dir install -U wheel && \
+pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
 easy_install -U pip && \
-pip install -U pip setuptools wheel && \
-pip install -U docopt PyYAML sphinx==1.5.6 && \
-pip install sphinx-rtd-theme==0.1.9 recommonmark
+pip --no-cache-dir install -U pip setuptools wheel && \
+pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark
-RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-pip3 install opencv-python && \
-pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-pip3.6 install opencv-python && \
-pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-pip3.7 install opencv-python && \
-pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-pip install opencv-python
+RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+pip3 --no-cache-dir install opencv-python && \
+pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+pip3.6 --no-cache-dir install opencv-python && \
+pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+pip3.7 --no-cache-dir install opencv-python && \
+pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+pip --no-cache-dir install opencv-python
 #For docstring checker
-RUN pip3 install pylint pytest astroid isort
-RUN pip3.6 install pylint pytest astroid isort
-RUN pip3.7 install pylint pytest astroid isort
-RUN pip install pylint pytest astroid isort LinkChecker
+RUN pip3 --no-cache-dir install pylint pytest astroid isort
+RUN pip3.6 --no-cache-dir install pylint pytest astroid isort
+RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
+RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker
 COPY ./python/requirements.txt /root/
-RUN pip3 install -r /root/requirements.txt
-RUN pip3.6 install -r /root/requirements.txt
-RUN pip3.7 install -r /root/requirements.txt
-RUN pip install -r /root/requirements.txt
+RUN pip3 --no-cache-dir install -r /root/requirements.txt
+RUN pip3.6 --no-cache-dir install -r /root/requirements.txt
+RUN pip3.7 --no-cache-dir install -r /root/requirements.txt
+RUN pip --no-cache-dir install -r /root/requirements.txt
 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
-RUN apt-get install -y libssl-dev libffi-dev
-RUN pip3 install certifi urllib3[secure]
-RUN pip3.6 install certifi urllib3[secure]
-RUN pip3.7 install certifi urllib3[secure]
-RUN pip install certifi urllib3[secure]
+RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y
+RUN pip3 --no-cache-dir install certifi urllib3[secure]
+RUN pip3.6 --no-cache-dir install certifi urllib3[secure]
+RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
+RUN pip --no-cache-dir install certifi urllib3[secure]
 # Install woboq_codebrowser to /woboq

@@ -106,10 +106,10 @@ else(WIN32)
 SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
 ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
 COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
-DEPENDS mkldnn)
+DEPENDS mkldnn shared_mkldnn)
 endif(WIN32)
 ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
+ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn)
 IF(WITH_C_API)
 INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib)
 ENDIF()

@@ -136,7 +136,7 @@ if (WITH_MKLDNN)
 copy(mkldnn_lib
 SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
 DSTS ${dst_dir} ${dst_dir}/lib
-DEPS mkldnn
+DEPS mkldnn_shared_lib
 )
 endif ()

@@ -57,8 +57,6 @@ int main()
 return 0;
 }" SSE3_FOUND)
-# disable AVX by default on windows
-if(NOT WIN32)
 # Check AVX
 set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
 set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
@@ -96,7 +94,6 @@ if(NOT WIN32)
 __m512i result = _mm512_abs_epi32 (a);
 return 0;
 }" AVX512F_FOUND)
-endif(NOT WIN32)
 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
 mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
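This hunk drops the `if(NOT WIN32)` guard (and its "disable AVX by default on windows" comment) so the AVX-family checks also run on Windows. Each check compiles and runs a tiny probe program under the matching flag, and an exit code of 0 flips the corresponding `*_FOUND` result. For reference, a self-contained sketch of such a probe using only standard AVX intrinsics (this exact program is illustrative, not part of the commit):

// Minimal TRY_RUN-style probe: compiles only with AVX enabled and
// returns 0 (success) only if an AVX instruction executes correctly.
#include <immintrin.h>

int main() {
  __m256 a = _mm256_set1_ps(1.0f);   // broadcast 1.0f into a 256-bit register
  __m256 b = _mm256_add_ps(a, a);    // executes a real AVX add at run time
  float out[8];
  _mm256_storeu_ps(out, b);          // unaligned store back to memory
  return out[0] == 2.0f ? 0 : 1;     // exit code 0 => CMake sets AVX_FOUND
}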

@@ -60,7 +60,7 @@ class Float16Transpiler:
 raise TypeError("place should be as CPUPlace/CUDAPlace type")
 if scope is None:
 scope = global_scope()
-if not isinstance(scope, core.Scope):
+if not isinstance(scope, core._Scope):
 raise TypeError("scope should be as Scope type or None")
 self.scope = scope

@@ -464,11 +464,7 @@ paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, ke
 paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
 paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
-paddle.fluid.Scope.__init__ __init__(self: paddle.fluid.core.Scope) -> None
-paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None
-paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
-paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope
-paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
+paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
 paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None)
 paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None)
 paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None)
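The spec change above records the pybind-level rename behind this series: the C++ Scope class is now exposed to Python as the "private" type core._Scope plus a Scope() factory, which is why the Float16Transpiler check earlier in this diff switched to isinstance(scope, core._Scope). A minimal pybind11 sketch of that binding shape; the Python-visible names mirror the spec lines, everything else is illustrative:

// Sketch: bind a C++ class under a leading-underscore Python name and
// expose a factory function, matching the new API-spec lines above.
#include <pybind11/pybind11.h>

namespace py = pybind11;

struct Scope {};  // stand-in for paddle::framework::Scope

PYBIND11_MODULE(core, m) {
  py::class_<Scope>(m, "_Scope");  // Python sees core._Scope
  m.def("Scope", []() { return new Scope(); },
        py::return_value_policy::take_ownership);  // Scope() -> core._Scope
}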

@@ -8,25 +8,15 @@ function(windows_symbolic TARGET)
 set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
 foreach(src ${windows_symbolic_SRCS})
 get_filename_component(src ${src} NAME_WE)
-if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu)
+if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu)
 message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.")
 endif()
-#only copy the xx.cu to.xx.cu when the content are modified
-set(copy_flag 1)
-if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu)
-file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR)
-file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR)
-if (SOURCE_STR STREQUAL TARGET_STR)
-set(copy_flag 0)
-endif()
-endif()
-if (copy_flag)
-add_custom_command(OUTPUT .${src}.cu
-COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu
-COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu"
+file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc)
+add_custom_command(OUTPUT ${final_path}/.${src}.cu
+COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu"
 COMMENT "create hidden file of ${src}.cu")
-endif(copy_flag)
 add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
 endforeach()
 endfunction()
@@ -78,17 +68,23 @@ cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memor
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
 cc_test(reader_test SRCS reader_test.cc DEPS reader)
-cc_test(variable_test SRCS variable_test.cc)
 cc_library(threadpool SRCS threadpool.cc DEPS enforce)
 cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
-cc_library(scope SRCS scope.cc DEPS glog threadpool)
+cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto)
+if (WITH_GPU)
+target_link_libraries(var_type_traits dynload_cuda)
+endif()
+cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
+cc_library(scope SRCS scope.cc DEPS glog threadpool var_type_traits)
+cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
+cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
 cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
 nv_test(data_device_transform_test SRCS data_device_transform_test.cu
-DEPS operator op_registry device_context math_function)
+DEPS operator op_registry device_context math_function scope)
 if(WITH_GPU)
 if (WIN32)

@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"

@@ -88,7 +88,7 @@ void EagerDeletionOpHandle::RunImpl() {
 }
 } else {
 PADDLE_THROW("Type %s of %s is not supported eager deletion",
-var->Type().name(), name);
+framework::ToTypeName(var->Type()), name);
 }
 }
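This is one of several call sites in the diff updated for the same reason: Variable::Type() no longer returns a std::type_index, whose name() would be a compiler-mangled string, but a registered type id under the new var_type_traits machinery, and framework::ToTypeName maps that id back to a stable, readable name. A minimal sketch of the lookup idea (the ids and table here are hypothetical, not Paddle's actual registration):

// Sketch of an id -> readable-name mapping of the kind ToTypeName provides.
#include <unordered_map>

enum VarTypeId { kLoDTensor = 0, kSelectedRows = 1 };  // hypothetical ids

inline const char* ToTypeName(int id) {
  static const std::unordered_map<int, const char*> names{
      {kLoDTensor, "LoDTensor"}, {kSelectedRows, "SelectedRows"}};
  auto it = names.find(id);
  return it == names.end() ? "UnknownType" : it->second;
}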

@@ -120,6 +120,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
 ClearFetchOp(graph_.get(), &fetch_ops);
 return fetches;
 }
 void FastThreadedSSAGraphExecutor::RunOpAsync(
 std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
+OpHandleBase *op,

File diff suppressed because it is too large

@@ -45,7 +45,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
 #endif
 int GetVarDeviceID(
-const ir::Graph &graph, const std::string &varname,
+const std::string &varname,
 const std::unordered_map<std::string, int> &sharded_var_device) const;
 bool IsScaleLossOp(ir::Node *node) const;
@@ -57,12 +57,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
 ir::Graph *result, ir::Node *node,
 std::unordered_map<std::string, int> *sharded_var_device) const;
-std::vector<std::string> FindDistTrainSendVars(
-const std::vector<ir::Node *> &nodes) const;
-std::vector<std::string> FindDistTrainRecvVars(
-const std::vector<ir::Node *> &nodes) const;
 void CreateComputationalOps(ir::Graph *result, ir::Node *node,
 size_t num_places) const;
@@ -77,7 +71,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
 int dev_id) const;
 int GetOpDeviceID(
-const ir::Graph &graph, ir::Node *node,
+ir::Node *node,
 const std::unordered_map<std::string, int> &sharded_var_device) const;
 void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
@@ -100,6 +94,15 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
 void SetCommunicationContext(OpHandleBase *op_handle,
 const platform::Place &p) const;
+std::vector<ir::Node *> SortForReduceMode(
+const std::vector<ir::Node *> &) const;
+int GetOpDeviceID(
+ir::Node *node,
+const std::unordered_map<std::string, int> &shared_var_device,
+std::unordered_map<std::string, std::vector<ir::Node *>> *delay_ops)
+const;
 mutable std::string loss_var_name_;
 mutable std::vector<platform::Place> places_;
 mutable std::vector<Scope *> local_scopes_;

@@ -24,7 +24,7 @@ static void VisitVariable(Variable* var, Func* func) {
 } else if (var->IsType<SelectedRows>()) {
 (*func)(var->GetMutable<SelectedRows>());
 } else {
-PADDLE_THROW("Not supported type %s", var->Type().name());
+PADDLE_THROW("Not supported type %s", ToTypeName(var->Type()));
 }
 }
@@ -35,7 +35,7 @@ static void VisitVariable(const Variable& var, Func* func) {
 } else if (var.IsType<SelectedRows>()) {
 (*func)(var.Get<SelectedRows>());
 } else {
-PADDLE_THROW("Not supported type %s", var.Type().name());
+PADDLE_THROW("Not supported type %s", ToTypeName(var.Type()));
 }
 }

@@ -119,7 +119,7 @@ static void DeleteUnusedTensors(
 }
 } else {
 PADDLE_THROW("Type %s of %s is not supported eager deletion",
-var->Type().name(), name);
+framework::ToTypeName(var->Type()), name);
 }
 }
 }

@@ -45,6 +45,7 @@ pass_library(is_test_pass base)
 pass_library(conv_elementwise_add_act_fuse_pass inference)
 pass_library(conv_elementwise_add2_act_fuse_pass inference)
 pass_library(conv_elementwise_add_fuse_pass inference)
+pass_library(conv_affine_channel_fuse_pass inference)
 if(WITH_MKLDNN)
 pass_library(mkldnn_placement_pass base)
 pass_library(depthwise_conv_mkldnn_pass base)

@@ -0,0 +1,222 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h"
#include <functional>
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
#define GET_CONV_BN_NODES(pattern_name) \
/* OPERATORS */ \
GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \
/* CONV inputs */ \
GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \
/* CONV outputs */ \
GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \
/* Affine Channel inputs */ \
GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \
/* Affine channel outputs */ \
GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */
void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight,
const ir::Node& ac_scale,
const LoDTensor& ac_bias_tensor,
LoDTensor* eltwise_y_in_tensor) {
using EigenVectorArrayMap =
Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
using EigenMatrixArrayMap = Eigen::Map<
Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
// Re-compute bias of conv2d from AffineChannel
PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), ac_bias_tensor.dims());
auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable<LoDTensor>();
ConstEigenVectorArrayMap scale_array(scale_tensor->data<float>(),
scale_tensor->numel(), 1);
ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data<float>(),
ac_bias_tensor.numel(), 1);
EigenVectorArrayMap eltwise_y_in_array(
eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
eltwise_y_in_tensor->numel(), 1);
eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array;
// Re-compute weight of conv2d from AffineChannel
auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
auto weights_shape = weights->dims();
auto weights_shape_2d = flatten_to_2d(weights_shape, 1);
EigenMatrixArrayMap weights_array_2d(
weights->mutable_data<float>(platform::CPUPlace()), weights_shape_2d[0],
weights_shape_2d[1]);
weights_array_2d.colwise() *= scale_array;
}
std::unique_ptr<ir::Graph> ConvAffineChannelFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init(name_scope_, graph.get());
auto* scope = param_scope();
PADDLE_ENFORCE(scope);
GraphPatternDetector gpd;
auto* conv_input =
gpd.mutable_pattern()
->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
->AsInput()
->assert_is_op_input("conv2d", "Input");
patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(),
name_scope_);
conv_ac_pattern(conv_input, false /*with_eltwise_add*/);
int found_conv_ac_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle ConvAffineChannel fuse";
GET_CONV_BN_NODES(conv_ac_pattern);
// check if fuse can be done and if MKL-DNN should be used
FuseOptions fuse_option = FindFuseOption(*conv, *affine_channel);
if (fuse_option == DO_NOT_FUSE) {
VLOG(3) << "do not perform conv+affinechannel fuse";
return;
}
// Create eltwise_y (conv bias) variable
VarDesc eltwise_y_in_desc(
patterns::PDNodeName(name_scope_, "eltwise_y_in"));
eltwise_y_in_desc.SetPersistable(true);
auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc);
auto* eltwise_y_in_tensor =
scope->Var(eltwise_y_in_node->Name())->GetMutable<LoDTensor>();
// Get affine_channel bias
auto* ac_bias_tensor =
scope->FindVar(ac_bias->Name())->GetMutable<LoDTensor>();
// Initialize eltwise_y
eltwise_y_in_tensor->Resize(ac_bias_tensor->dims());
std::fill_n(eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
eltwise_y_in_tensor->numel(), 0.0f);
// update weights and biases
recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
eltwise_y_in_tensor);
// create an elementwise add node.
OpDesc desc;
desc.SetInput("X", std::vector<std::string>({conv_out->Name()}));
desc.SetInput("Y", std::vector<std::string>({eltwise_y_in_node->Name()}));
desc.SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
desc.SetType("elementwise_add");
desc.SetAttr("axis", 1);
auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied.
GraphSafeRemoveNodes(graph.get(), {ac_scale, ac_bias, affine_channel});
IR_NODE_LINK_TO(conv_out, eltwise_op);
IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op);
IR_NODE_LINK_TO(eltwise_op, ac_out);
found_conv_ac_count++;
};
gpd(graph.get(), handler);
AddStatis(found_conv_ac_count);
return graph;
}
std::unique_ptr<ir::Graph> ConvEltwiseAddAffineChannelFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init(name_scope_, graph.get());
auto* scope = param_scope();
PADDLE_ENFORCE(scope);
GraphPatternDetector gpd;
auto* conv_input =
gpd.mutable_pattern()
->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
->AsInput()
->assert_is_op_input("conv2d", "Input");
patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(),
name_scope_);
conv_ac_pattern(conv_input, true /*with_eltwise_add*/);
int found_conv_ac_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle ConvBN fuse";
GET_CONV_BN_NODES(conv_ac_pattern);
// OPERATORS
GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern);
// BIAS inputs
GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern);
// BIAS outputs
GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern);
// Get eltwise_y (conv bias) variable
auto* eltwise_y_in_tensor =
scope->FindVar(eltwise_y_in->Name())->GetMutable<LoDTensor>();
// Get batch norm bias
auto* ac_bias_tensor =
scope->FindVar(ac_bias->Name())->GetMutable<LoDTensor>();
recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
eltwise_y_in_tensor);
// Update the elementwise_add node
eltwise->Op()->SetAttr("axis", 1);
eltwise->Op()->SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
GraphSafeRemoveNodes(graph.get(),
{ac_scale, ac_bias, affine_channel, eltwise_out});
IR_NODE_LINK_TO(eltwise, ac_out);
found_conv_ac_count++;
};
gpd(graph.get(), handler);
AddStatis(found_conv_ac_count);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(conv_affine_channel_fuse_pass,
paddle::framework::ir::ConvAffineChannelFusePass);
REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass,
paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass);
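The arithmetic in recompute_bias_and_weights above is the whole trick of this pass: affine_channel computes scale[oc] * y[oc] + bias[oc] per output channel, so scaling each output-channel slice of the conv filter by scale[oc] and folding the bias into one elementwise_add reproduces it exactly. A plain-array sketch of the same folding, independent of Paddle's LoDTensor/Eigen types (the function name and layout here are illustrative):

// Fold affine_channel(scale, ac_bias) into conv weights W[oc][k] and an
// elementwise bias b[oc]:  W'[oc][k] = W[oc][k] * scale[oc]
//                          b'[oc]    = b[oc] * scale[oc] + ac_bias[oc]
#include <vector>

void FoldAffineChannel(std::vector<std::vector<float>>* weights,  // [oc][k]
                       std::vector<float>* eltwise_bias,          // [oc]
                       const std::vector<float>& ac_scale,
                       const std::vector<float>& ac_bias) {
  for (size_t oc = 0; oc < weights->size(); ++oc) {
    for (float& w : (*weights)[oc]) w *= ac_scale[oc];  // colwise *= scale
    (*eltwise_bias)[oc] = (*eltwise_bias)[oc] * ac_scale[oc] + ac_bias[oc];
  }
}

In the plain conv+affine_channel variant, eltwise_y_in starts zero-filled (the std::fill_n above), so the folded bias reduces to ac_bias; in the eltwise_add variant the existing bias is rescaled first, exactly as in the sketch.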

@@ -0,0 +1,49 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
/*
* Fuse the Conv and ConvAffineChannel.
*/
class ConvAffineChannelFusePass : public FusePassBase {
public:
virtual ~ConvAffineChannelFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"conv_affine_channel_fuse"};
};
class ConvEltwiseAddAffineChannelFusePass : public FusePassBase {
public:
virtual ~ConvEltwiseAddAffineChannelFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"};
};
} // namespace ir
} // namespace framework
} // namespace paddle

@@ -40,18 +40,20 @@ framework::proto::OpDesc PrepareOpDesc(
 const std::string& output) {
 auto proto = base_desc;
 framework::OpDesc desc(proto, nullptr);
+desc.SetType("conv2d_fusion");
 desc.SetInput("Bias", {bias});
 desc.SetInput("ResidualData", {bias1});
 desc.SetAttr("activation", activation);
 desc.SetOutput("Output", {output});
 desc.SetAttr("is_test", true);
+desc.SetAttr("use_cudnn", false);
+desc.Flush();
 return *desc.Proto();
 }
 std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
 std::unique_ptr<ir::Graph> graph) const {
-const std::string pattern_name = "conv_elementwise_add_act_fuse";
+const std::string pattern_name = "conv_elementwise_add2_act_fuse";
 FusePassBase::Init(pattern_name, graph.get());
 GraphPatternDetector gpd;
@@ -76,22 +78,23 @@ std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
 framework::OpDesc new_op_desc(new_op_proto, nullptr);
 // Create a new node for the fused op.
-graph->CreateOpNode(&new_op_desc);
+auto* new_conv_op = graph->CreateOpNode(&new_op_desc);
 // Link inputs and outputs.
 PADDLE_ENFORCE(subgraph.count(x));
 auto* conv_in_node = subgraph.at(x);
-IR_NODE_LINK_TO(conv_in_node, conv_op); // Input
-IR_NODE_LINK_TO(conv_filter, conv_op); // Filter
-IR_NODE_LINK_TO(conv_op, conv_out); // Output
-IR_NODE_LINK_TO(elementwise_add_in_y, conv_op); // Bias
-IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op); // Bias
+IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input
+IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter
+IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias
+IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // Bias
+IR_NODE_LINK_TO(new_conv_op, act_out); // Output
 // Delete the unneeded nodes.
-GraphSafeRemoveNodes(graph.get(),
-{conv_op, elementwise_add_op, elementwise_add_op_1,
-elementwise_add_out});
+GraphSafeRemoveNodes(
+graph.get(),
+{conv_op, conv_out, elementwise_add_op, elementwise_add_op_1,
+elementwise_add_out, elementwise_add_out_1, act_op});
 };
 gpd(graph.get(), handler);
 return graph;
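The rewiring above replaces the four-op chain conv2d -> elementwise_add -> elementwise_add -> act with a single conv2d_fusion node whose Bias and ResidualData inputs absorb the two additions; the fix also routes the fused op's output to act_out and removes every intermediate node rather than only some of them. A scalar toy showing the algebraic identity the fusion relies on (stand-in functions, not Paddle kernels):

// Toy check that act(conv(x,w) + bias + residual) equals the unfused chain.
#include <algorithm>
#include <cassert>

float conv(float x, float w) { return x * w; }     // stand-in for conv2d
float relu(float v) { return std::max(v, 0.0f); }  // the fused activation

float Unfused(float x, float w, float bias, float res) {
  float t1 = conv(x, w);  // conv2d
  float t2 = t1 + bias;   // first elementwise_add
  float t3 = res + t2;    // second elementwise_add
  return relu(t3);        // act
}

float Conv2dFusionLike(float x, float w, float bias, float res) {
  return relu(conv(x, w) + bias + res);  // one fused evaluation
}

int main() {
  assert(Unfused(2, 3, 1, -4) == Conv2dFusionLike(2, 3, 1, -4));
  return 0;
}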

@@ -23,66 +23,8 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
-namespace {
-void CheckProgram(const ProgramDesc &program) {
-#define _INT(role) static_cast<int>(role)
-std::map<int, bool> visit;
-for (OpDesc *op : program.Block(0).AllOps()) {
-// For backward compatibility, some program doesn't have role added.
-if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue;
-int role_id =
-boost::get<int>(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
-visit[role_id] = true;
-switch (role_id) {
-case _INT(OpRole::kForward):
-if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
-LOG(ERROR) << "Cannot add backward operator before forward operator "
-<< op->Type();
-}
-break;
-case _INT(OpRole::kBackward):
-case _INT(OpRole::kBackward) | _INT(OpRole::kLoss):
-PADDLE_ENFORCE(
-visit.find(_INT(OpRole::kOptimize)) == visit.end(),
-"Cannot add backward operator %s after optimize operator.",
-op->Type());
-break;
-case _INT(OpRole::kForward) | _INT(OpRole::kLoss):
-PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
-_INT(OpRole::kLoss)) == visit.end(),
-"Cannot add backward|loss operator before "
-"forward|loss operator %s.",
-op->Type());
-PADDLE_ENFORCE(
-visit.find(_INT(OpRole::kOptimize)) == visit.end(),
-"Cannot add forward|loss operator %s after optimize operator.",
-op->Type());
-break;
-case _INT(OpRole::kOptimize):
-case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched):
-PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
-"Optimize operators %s must follow backward operator.",
-op->Type());
-break;
-case _INT(OpRole::kLRSched):
-case _INT(OpRole::kDist):
-case _INT(OpRole::kRPC):
-case _INT(OpRole::kNotSpecified):
-break;
-default:
-LOG(FATAL) << "Unknown operator role. Don't add new role because "
-"you don't know what you are doing.";
-}
-}
-#undef _INT
-}
-} // namespace
 Graph::Graph(const ProgramDesc &program) : program_(program) {
-CheckProgram(program_);
 auto var_nodes = InitFromProgram(program_);
 ResolveHazard(var_nodes);
 }

@@ -1101,9 +1101,7 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
 return out_var;
 }
-std::unordered_set<std::string> conv_act_set({"identity", "sigmoid", "relu",
-"relu6", "relux", "tanh",
-"band_pass"});
+std::unordered_set<std::string> conv_act_set({"identity", "relu"});
 PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) {
 conv_in->AsInput();
@@ -1169,13 +1167,13 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
 ->AsInput();
 auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr())
 ->assert_is_op_output("elementwise_add")
-->assert_is_op_input("elementwise_add", "X")
+->assert_is_op_input("elementwise_add", "Y")
 ->AsIntermediate();
 auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr())
 ->assert_is_op("elementwise_add");
 auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr())
-->assert_is_op_input("elementwise_add", "Y")
+->assert_is_op_input("elementwise_add", "X")
 ->AsInput();
 auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr())
 ->assert_is_op_output("elementwise_add")
@@ -1203,8 +1201,8 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
 conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out});
 elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y})
 .LinksTo({elementwise_add_out});
-elementwise_add_op_1->LinksFrom(
-{elementwise_add_out, elementwise_add_in_y_1});
+elementwise_add_op_1->LinksFrom({elementwise_add_out, elementwise_add_in_y_1})
+.LinksTo({elementwise_add_out_1});
 act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out});
 return act_out;
 }
@@ -1236,6 +1234,78 @@ PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) {
 return elementwise_add_out;
 }
+PDNode *patterns::ConvAffineChannel::operator()(
+paddle::framework::ir::PDNode *conv_input, bool with_eltwise_add) {
+// Create Operators
+conv_input->assert_is_op_input("conv2d", "Input");
+auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
+PDNode *eltwise_op = nullptr;
+if (with_eltwise_add) {
+eltwise_op =
+pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");
+}
+auto *affine_channel_op =
+pattern->NewNode(affine_channel_repr())->assert_is_op("affine_channel");
+// Create variables
+// Conv Filter
+auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
+->AsInput()
+->assert_is_persistable_var()
+->assert_is_op_input("conv2d", "Filter");
+auto *conv_out_var = pattern->NewNode(conv_out_repr())
+->AsIntermediate()
+->assert_is_only_output_of_op("conv2d");
+PDNode *eltwise_y_in_var = nullptr;
+PDNode *eltwise_out_var = nullptr;
+if (with_eltwise_add) {
+// Conv output as Bias input
+conv_out_var->assert_is_op_input("elementwise_add", "X");
+// Bias
+eltwise_y_in_var = pattern->NewNode(eltwise_y_in_repr())
+->assert_is_op_input("elementwise_add", "Y")
+->AsInput();
+eltwise_out_var = pattern->NewNode(eltwise_out_repr())
+->AsIntermediate()
+->assert_is_only_output_of_op("elementwise_add");
+} else {
+// Conv output as AffineChannel input
+conv_out_var->assert_is_op_input("affine_channel", "X");
+}
+// AC Scale
+auto *ac_scale_var = pattern->NewNode(ac_scale_repr())
+->AsInput()
+->assert_is_persistable_var()
+->assert_is_op_input("affine_channel", "Scale");
+// AC Bias
+auto *ac_bias_var = pattern->NewNode(ac_bias_repr())
+->AsInput()
+->assert_is_persistable_var()
+->assert_is_op_input("affine_channel", "Bias");
+// AC output
+auto *ac_out_var = pattern->NewNode(ac_out_repr())
+->AsOutput()
+->assert_is_op_output("affine_channel");
+conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
+if (with_eltwise_add) {
+eltwise_op->LinksFrom({conv_out_var, eltwise_y_in_var})
+.LinksTo({eltwise_out_var});
+affine_channel_op->LinksFrom({eltwise_out_var, ac_scale_var, ac_bias_var})
+.LinksTo({ac_out_var});
+} else {
+affine_channel_op->LinksFrom({conv_out_var, ac_scale_var, ac_bias_var})
+.LinksTo({ac_out_var});
+}
+return ac_out_var;
+}
 } // namespace ir
 } // namespace framework
 } // namespace paddle

@@ -734,6 +734,38 @@ struct ConvElementwiseadd : public PatternBase {
 PATTERN_DECL_NODE(elementwise_add_out);
 };
+// Conv with affine_channel
+// op: conv + (elementwise_add +) affine_channel
+// named nodes:
+// conv_weight, conv_out, conv,
+// ac_x, ac_scale, ac_bias
+// affine_channel, ac_out
+struct ConvAffineChannel : public PatternBase {
+ConvAffineChannel(PDPattern* pattern, const std::string& name_scope)
+: PatternBase(pattern, name_scope, "conv_affine_channel") {}
+PDNode* operator()(PDNode* conv_input, bool with_eltwise_add);
+// declare operator node's name
+PATTERN_DECL_NODE(conv);
+PATTERN_DECL_NODE(affine_channel);
+PATTERN_DECL_NODE(eltwise); // ELEMENTWISE_ADD
+// CONV inputs
+PATTERN_DECL_NODE(conv_weight); // Filter
+// CONV outputs
+PATTERN_DECL_NODE(conv_out); // tmp
+// ELTWISE inputs
+PATTERN_DECL_NODE(eltwise_y_in);
+// ELTWISE outputs
+PATTERN_DECL_NODE(eltwise_out); // tmp
+// AC(Affine_Channel) inputs
+PATTERN_DECL_NODE(ac_scale);
+PATTERN_DECL_NODE(ac_bias);
+// AC outputs
+PATTERN_DECL_NODE(ac_out); // Out
+};
 } // namespace patterns
 // Link two ir::Nodes from each other.

@@ -215,7 +215,7 @@ class Vector {
 auto stream = dev_ctx->stream();
 void *src = gpu_->ptr();
 void *dst = cpu_.data();
-memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
+paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
 gpu_->size(), stream);
 dev_ctx->Wait();
 }
@@ -261,7 +261,7 @@ class Vector {
 auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
 platform::DeviceContextPool::Instance().Get(place));
 auto stream = dev_ctx->stream();
-memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
+paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
 gpu_->size(), stream);
 }
@@ -284,7 +284,7 @@ class Vector {
 bool IsInCPU() const { return flag_ & kDataInCPU; }
 mutable std::vector<T> cpu_;
-mutable memory::AllocationPtr gpu_;
+mutable paddle::memory::AllocationPtr gpu_;
 mutable int flag_;
 mutable std::mutex mtx_;

@@ -82,10 +82,6 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
 AddAttr<std::string>(OpNamescopeAttrName(), "Operator name with namesope.")
 .SetDefault("");
-AddAttr<std::vector<std::string>>(OpCreationCallstackAttrName(),
-"Callstack for Op Creatation.")
-.SetDefault({});
 Validate();
 }

@@ -47,7 +47,6 @@ class OpProtoAndCheckerMaker {
 static const char *OpRoleAttrName() { return "op_role"; }
 static const char *OpRoleVarAttrName() { return "op_role_var"; }
 static const char *OpNamescopeAttrName() { return "op_namescope"; }
-static const char *OpCreationCallstackAttrName() { return "op_callstack"; }
 void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);

@@ -23,6 +23,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <unordered_set>
+#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
 #include "glog/logging.h" // For VLOG()
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/details/op_registry.h"

Some files were not shown because too many files have changed in this diff
