diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 414e92eb27..5be7be6413 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -139,10 +139,12 @@ endfunction()
 message(STATUS "CUDA detected: " ${CUDA_VERSION})
 if (${CUDA_VERSION} LESS 7.0)
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
+  add_definitions("-DPADDLE_CUDA_BINVER=\"60\"")
 elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
   list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
   list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+  add_definitions("-DPADDLE_CUDA_BINVER=\"70\"")
 elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
   list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
@@ -150,6 +152,7 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
   # CUDA 8 may complain that sm_20 is no longer supported. Suppress the
   # warning for now.
   list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
+  add_definitions("-DPADDLE_CUDA_BINVER=\"80\"")
 endif()
 
 include_directories(${CUDA_INCLUDE_DIRS})
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index fb899e3d7c..fff1980637 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -89,6 +89,7 @@ if(CUDNN_FOUND)
         if(NOT CUDNN_MAJOR_VERSION)
             set(CUDNN_VERSION "???")
         else()
+            add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"")
             math(EXPR CUDNN_VERSION
                 "${CUDNN_MAJOR_VERSION} * 1000 +
                  ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake
index c94849cf4b..f06728de91 100644
--- a/cmake/external/cub.cmake
+++ b/cmake/external/cub.cmake
@@ -32,4 +32,4 @@ endif()
 
 add_dependencies(cub extern_cub)
 
-LIST(APPEND externl_project_dependencies cub)
+LIST(APPEND external_project_dependencies cub)
diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake
index 94d8fcc668..4587475d79 100644
--- a/cmake/external/dlpack.cmake
+++ b/cmake/external/dlpack.cmake
@@ -28,4 +28,4 @@ endif()
 
 add_dependencies(dlpack extern_dlpack)
 
-LIST(APPEND externl_project_dependencies dlpack)
+LIST(APPEND external_project_dependencies dlpack)
diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake
index e66459fa3a..9da657b7d7 100644
--- a/cmake/external/ngraph.cmake
+++ b/cmake/external/ngraph.cmake
@@ -37,13 +37,12 @@ INCLUDE(GNUInstallDirs)
 INCLUDE(ExternalProject)
 
 SET(NGRAPH_PROJECT         "extern_ngraph")
-SET(NGRAPH_VERSION         "0.9")
-SET(NGRAPH_GIT_TAG         "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")
+SET(NGRAPH_GIT_TAG         "v0.10.1")
 SET(NGRAPH_SOURCES_DIR     ${THIRD_PARTY_PATH}/ngraph)
 SET(NGRAPH_INSTALL_DIR     ${THIRD_PARTY_PATH}/install/ngraph)
 SET(NGRAPH_INC_DIR         ${NGRAPH_INSTALL_DIR}/include)
 SET(NGRAPH_LIB_DIR         ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
-SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION})
+SET(NGRAPH_SHARED_LIB_NAME libngraph.so)
 SET(NGRAPH_CPU_LIB_NAME    libcpu_backend.so)
 SET(NGRAPH_TBB_LIB_NAME    libtbb.so.2)
 SET(NGRAPH_GIT_REPO        "https://github.com/NervanaSystems/ngraph.git")
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 70d159b4f3..59c40a0e5d 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -110,7 +110,7 @@ function(op_library TARGET)
     # Define operators that don't need pybind here.
     foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
 "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
-"fusion_transpose_flatten_concat_op")
+"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op")
         if ("${TARGET}" STREQUAL "${manual_pybind_op}")
             set(pybind_flag 1)
         endif()
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index f9c90e05c3..a167511160 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -27,9 +27,10 @@ add_subdirectory(details)
 proto_library(framework_proto SRCS framework.proto)
 proto_library(async_executor_proto SRCS data_feed.proto)
 
-cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
+cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
+cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
 cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
 if(WITH_GPU)
@@ -71,13 +72,13 @@ cc_test(reader_test SRCS reader_test.cc DEPS reader)
 cc_library(threadpool SRCS threadpool.cc DEPS enforce)
 cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
 
-cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto) 
+cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto)
 if (WITH_GPU)
   target_link_libraries(var_type_traits dynload_cuda)
 endif()
 cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
 
-cc_library(scope SRCS scope.cc DEPS glog threadpool var_type_traits)
+cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits)
 cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
 cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
@@ -129,11 +130,9 @@ cc_test(version_test SRCS version_test.cc DEPS version)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
 
 if(WITH_NGRAPH)
-  if(NOT WIN32)
-    cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
-    cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
-      shape_inference data_transform lod_tensor profiler ngraph)
-  endif(NOT WIN32)
+  cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
+  cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
+             shape_inference data_transform lod_tensor profiler)
 endif(WITH_NGRAPH)
 
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
@@ -175,11 +174,7 @@ if(WITH_DISTRIBUTE)
 
 else()
   if(WITH_NGRAPH)
-    if(NOT WIN32)
-      cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph ngraph_operator variable_helper)
-    else(NOT WIN32)
-      cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
-    endif(NOT WIN32)
+    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
   else(WITH_NGRAPH)
     cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
   endif(WITH_NGRAPH)
@@ -194,9 +189,9 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS
         fast_threaded_ssa_graph_executor variable_helper)
 
 if(WITH_PSLIB)
-    cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib)
+    cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib timer)
 else()
-    cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper)
+    cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper timer)
 endif(WITH_PSLIB)
 
 
diff --git a/paddle/fluid/framework/array.h b/paddle/fluid/framework/array.h
index be9efcd749..b530829868 100644
--- a/paddle/fluid/framework/array.h
+++ b/paddle/fluid/framework/array.h
@@ -15,34 +15,123 @@
 #pragma once
 
 #include <cstdint>
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/fluid/framework/unroll_array_ops.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
+
 template <typename T, size_t N>
 class Array {
-  static_assert(N > 0, "The size of array must be larger than 0");
-
  public:
-  HOSTDEVICE Array() {}
+  static constexpr size_t kSize = N;
+
+  HOSTDEVICE inline Array() {}
 
-  HOSTDEVICE explicit Array(const T &val) {
-    for (size_t i = 0; i < N; ++i) data_[i] = val;
+  template <typename... Args>
+  HOSTDEVICE inline explicit Array(const T &val, Args... args) {
+    static_assert(N == sizeof...(Args) + 1, "Invalid argument");
+    UnrollVarArgsAssign<T>::Run(data_, val, args...);
   }
 
-  HOSTDEVICE const T *Get() const { return data_; }
+  HOSTDEVICE inline void Fill(const T &val) {
+    UnrollFillConstant<N>::Run(data_, val);
+  }
 
-  HOSTDEVICE T *GetMutable() { return data_; }
+  HOSTDEVICE inline const T *Get() const { return data_; }
 
-  HOSTDEVICE T &operator[](size_t index) { return data_[index]; }
+  HOSTDEVICE inline T *GetMutable() { return data_; }
 
-  HOSTDEVICE const T &operator[](size_t index) const { return data_[index]; }
+  HOSTDEVICE inline T &operator[](size_t i) { return *advance(data_, i); }
+
+  // Writing "return data_[i]" would cause compilation warning/error:
+  // "array subscript is above array bound" in Python 35 CI.
+  // It seems that it is a false warning of GCC if we do not check the bounds
+  // of array index. But for better performance, we do not check in operator[]
+  // like what is in STL. If users want to check the bounds, use at() instead
+  HOSTDEVICE inline const T &operator[](size_t i) const {
+    return *advance(data_, i);
+  }
+
+  HOSTDEVICE inline T &at(size_t i) {
+#ifndef __CUDA_ARCH__
+    PADDLE_ENFORCE_LT(i, N, "Array index out of bounds");
+#endif
+    return (*this)[i];
+  }
+
+  HOSTDEVICE inline const T &at(size_t i) const {
+#ifndef __CUDA_ARCH__
+    PADDLE_ENFORCE_LT(i, N, "Array index out of bounds");
+#endif
+    return (*this)[i];
+  }
 
   HOSTDEVICE constexpr size_t size() const { return N; }
 
+  HOSTDEVICE inline bool operator==(const Array<T, N> &other) const {
+    return UnrollCompare<N>::Run(data_, other.data_);
+  }
+
+  HOSTDEVICE inline bool operator!=(const Array<T, N> &other) const {
+    return !(*this == other);
+  }
+
  private:
+  template <typename U>
+  HOSTDEVICE static inline U *advance(U *ptr, size_t i) {
+    return ptr + i;
+  }
+
   T data_[N];
 };
 
+template <typename T>
+class Array<T, 0> {
+ public:
+  static constexpr size_t kSize = 0;
+
+  HOSTDEVICE inline Array() {}
+
+  HOSTDEVICE inline void Fill(const T &val) {}
+
+  HOSTDEVICE inline constexpr T *Get() const { return nullptr; }
+
+  // Add constexpr to GetMutable() cause warning in MAC
+  HOSTDEVICE inline T *GetMutable() { return nullptr; }
+
+  HOSTDEVICE inline T &operator[](size_t) {
+#ifdef __CUDA_ARCH__
+    static T obj();
+    return obj;
+#else
+    PADDLE_THROW("Array<T, 0> has no element");
+#endif
+  }
+
+  HOSTDEVICE inline const T &operator[](size_t) const {
+#ifdef __CUDA_ARCH__
+    static const T obj();
+    return obj;
+#else
+    PADDLE_THROW("Array<T, 0> has no element");
+#endif
+  }
+
+  HOSTDEVICE inline T &at(size_t i) { return (*this)[i]; }
+
+  HOSTDEVICE inline const T &at(size_t i) const { return (*this)[i]; }
+
+  HOSTDEVICE constexpr size_t size() const { return 0; }
+
+  HOSTDEVICE constexpr bool operator==(const Array<T, 0> &other) const {
+    return true;
+  }
+
+  HOSTDEVICE constexpr bool operator!=(const Array<T, 0> &other) const {
+    return false;
+  }
+};
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc
index ee3c5e01f8..1d9678a1ba 100644
--- a/paddle/fluid/framework/async_executor.cc
+++ b/paddle/fluid/framework/async_executor.cc
@@ -304,8 +304,13 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
 
   // start executing ops in multiple threads
   for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
-    threads.push_back(
-        std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
+    if (debug) {
+      threads.push_back(std::thread(&ExecutorThreadWorker::TrainFilesWithTimer,
+                                    workers[thidx].get()));
+    } else {
+      threads.push_back(
+          std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
+    }
   }
 
   for (auto& th : threads) {
diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc
index 05e423b8a5..e7a6df57e5 100644
--- a/paddle/fluid/framework/ddim.cc
+++ b/paddle/fluid/framework/ddim.cc
@@ -18,312 +18,159 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-/// @cond HIDDEN
-
-template <int i>
-Dim<i> make_dim(const int64_t* d) {
-  return Dim<i>(*d, make_dim<i - 1>(d + 1));
-}
-
-template <>
-Dim<0> make_dim<0>(const int64_t* d) {
-  return Dim<0>(*d);
-}
-
-void make_ddim(DDim& ddim, const int64_t* dims, int n) {
-  switch (n) {
-    case 0:
-      ddim = make_dim<0>(dims);
-      break;
-    case 1:
-      ddim = make_dim<1>(dims);
-      break;
-    case 2:
-      ddim = make_dim<2>(dims);
-      break;
-    case 3:
-      ddim = make_dim<3>(dims);
-      break;
-    case 4:
-      ddim = make_dim<4>(dims);
-      break;
-    case 5:
-      ddim = make_dim<5>(dims);
-      break;
-    case 6:
-      ddim = make_dim<6>(dims);
-      break;
-    case 7:
-      ddim = make_dim<7>(dims);
-      break;
-    case 8:
-      ddim = make_dim<8>(dims);
-      break;
-    case 9:
-      ddim = make_dim<9>(dims);
-      break;
-    default:
-      PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions.");
-  }
-}
-
-/// @endcond
-
 DDim make_ddim(std::initializer_list<int64_t> dims) {
-  DDim result(make_dim(0));
-  make_ddim(result, dims.begin(), dims.size());
-  return result;
+  return DDim(dims.begin(), dims.size());
 }
 
 DDim make_ddim(const std::vector<int64_t>& dims) {
-  DDim result(make_dim(0));
-  make_ddim(result, &dims[0], dims.size());
-  return result;
+  return DDim(dims.data(), dims.size());
 }
 
 DDim make_ddim(const std::vector<int>& dims) {
-  std::vector<int64_t> res(dims.size());
-  std::transform(dims.begin(), dims.end(), res.begin(),
-                 [](int d) { return static_cast<int64_t>(d); });
-  return make_ddim(res);
+  return DDim(dims.data(), dims.size());
 }
 
-/// @cond HIDDEN
-// XXX For some reason, putting this in an anonymous namespace causes errors
-class DynamicMutableIndexer : public boost::static_visitor<int64_t&> {
- public:
-  explicit DynamicMutableIndexer(int idx) : idx_(idx) {}
+struct DDimEqualityVisitor {
+  explicit DDimEqualityVisitor(const int64_t* d) : d_(d) {}
 
   template <int D>
-  int64_t& operator()(Dim<D>& dim) const {
-    return dim[idx_];
+  inline bool operator()(const Dim<D>& self) const {
+    return UnrollCompare<D>::Run(self.Get(), d_);
   }
 
- private:
-  int idx_;
+  const int64_t* d_;
 };
 
-class DynamicConstIndexer : public boost::static_visitor<int64_t> {
- public:
-  explicit DynamicConstIndexer(int idx) : idx_(idx) {}
-
-  template <int D>
-  int64_t operator()(const Dim<D>& dim) const {
-    return dim[idx_];
-  }
-
- private:
-  int idx_;
-};
-
-/// @endcond
-
-int64_t& DDim::operator[](int idx) {
-  return boost::apply_visitor(DynamicMutableIndexer(idx), var);
+bool DDim::operator==(const DDim& d) const {
+  return size() == d.size() &&
+         this->apply_visitor(DDimEqualityVisitor(d.Get()));
 }
 
-int64_t DDim::operator[](int idx) const {
-  return boost::apply_visitor(DynamicConstIndexer(idx), var);
-}
+bool DDim::operator!=(const DDim& d) const { return !(*this == d); }
 
-int DDim::size() const { return arity(*this); }
+struct DDimPlusVisitor {
+  explicit DDimPlusVisitor(const int64_t* d1, const int64_t* d2)
+      : d1_(d1), d2_(d2) {}
 
-bool DDim::operator==(DDim d) const {
-  if (var.which() != d.getVar().which()) {
-    return false;
-  } else {
-    std::vector<int64_t> v1 = vectorize(*this);
-    std::vector<int64_t> v2 = vectorize(d);
-
-    for (unsigned int i = 0; i < v1.size(); i++) {
-      if (v1[i] != v2[i]) {
-        return false;
-      }
-    }
-
-    return true;
+  template <int D>
+  inline void operator()(Dim<D>& self) const {
+    UnrollAdd<D>::Run(d1_, d2_, self.GetMutable());
   }
-}
-
-bool DDim::operator!=(DDim d) const { return !(*this == d); }
-
-DDim DDim::operator+(DDim d) const {
-  std::vector<int64_t> v1 = vectorize(*this);
-  std::vector<int64_t> v2 = vectorize(d);
-
-  std::vector<int64_t> v3;
 
-  assert(v1.size() == v2.size());
-
-  for (unsigned int i = 0; i < v1.size(); i++) {
-    v3.push_back(v1[i] + v2[i]);
-  }
+  const int64_t* d1_;
+  const int64_t* d2_;
+};
 
-  return make_ddim(v3);
+DDim DDim::operator+(const DDim& d) const {
+  PADDLE_ENFORCE(size() == d.size());
+  DDim ret;
+  ret.rank_ = rank_;
+  ret.apply_visitor(DDimPlusVisitor(Get(), d.Get()));
+  return ret;
 }
 
-DDim DDim::operator*(DDim d) const {
-  std::vector<int64_t> v1 = vectorize(*this);
-  std::vector<int64_t> v2 = vectorize(d);
+struct DDimMulVisitor {
+  explicit DDimMulVisitor(const int64_t* d1, const int64_t* d2)
+      : d1_(d1), d2_(d2) {}
 
-  std::vector<int64_t> v3;
-
-  assert(v1.size() == v2.size());
-
-  for (unsigned int i = 0; i < v1.size(); i++) {
-    v3.push_back(v1[i] * v2[i]);
+  template <int D>
+  inline void operator()(Dim<D>& self) const {
+    UnrollMul<D>::Run(d1_, d2_, self.GetMutable());
   }
 
-  return make_ddim(v3);
+  const int64_t* d1_;
+  const int64_t* d2_;
+};
+
+DDim DDim::operator*(const DDim& d) const {
+  PADDLE_ENFORCE(size() == d.size());
+  DDim ret;
+  ret.rank_ = rank_;
+  ret.apply_visitor(DDimMulVisitor(Get(), d.Get()));
+  return ret;
 }
 
 int64_t get(const DDim& ddim, int idx) { return ddim[idx]; }
 
-void set(DDim& ddim, int idx, int value) { ddim[idx] = value; }
-
-/// @cond HIDDEN
-struct VectorizeVisitor : public boost::static_visitor<> {
-  std::vector<int64_t>& vector;
-
-  explicit VectorizeVisitor(std::vector<int64_t>& v) : vector(v) {}
-
-  template <typename T>
-  void operator()(const T& t) {
-    vector.push_back(t.head);
-    this->operator()(t.tail);
-  }
-
-  void operator()(const Dim<0>& t) {}
-};
-/// @endcond
+void set(DDim& ddim, int idx, int value) { ddim[idx] = value; }  // NOLINT
 
 std::vector<int64_t> vectorize(const DDim& ddim) {
-  std::vector<int64_t> result;
-  VectorizeVisitor visitor(result);
-  boost::apply_visitor(visitor, ddim);
+  std::vector<int64_t> result(DDim::kMaxRank);
+  dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
+  result.resize(ddim.size());
   return result;
 }
 
 // NOTE: framework::vectorize converts to type int64_t
 //       which does not fit cudnn inputs.
 std::vector<int> vectorize2int(const DDim& ddim) {
-  std::vector<int64_t> temp = vectorize(ddim);
-  std::vector<int> result(temp.begin(), temp.end());
+  std::vector<int> result(DDim::kMaxRank);
+  dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
+  result.resize(ddim.size());
   return result;
 }
 
-struct ProductVisitor : public boost::static_visitor<int64_t> {
+struct ProductVisitor {
   template <int D>
-  int64_t operator()(const Dim<D>& dim) {
+  inline int64_t operator()(const Dim<D>& dim) {
     return product(dim);
   }
 };
 
 int64_t product(const DDim& ddim) {
-  ProductVisitor visitor;
-  return boost::apply_visitor(visitor, ddim);
+  return ddim.apply_visitor(ProductVisitor());
 }
 
-struct SliceVectorizeVisitor : public boost::static_visitor<> {
-  std::vector<int64_t>& vector;
-  int begin;
-  int end;
-
-  SliceVectorizeVisitor(std::vector<int64_t>& v, int b, int e)
-      : vector(v), begin(b), end(e) {
-    PADDLE_ENFORCE(begin < end,
-                   "Begin index must be less than end index in ddim slice.");
-    PADDLE_ENFORCE(begin >= 0,
-                   "Begin index can't be less than zero in ddim slice.");
-  }
-
-  template <int S>
-  void operator()(const Dim<S>& dim) {
-    if (begin == 0) {
-      vector.push_back(dim.head);
-    } else {
-      --begin;
-    }
-    --end;
-    if (end > 0) {
-      this->operator()(dim.tail);
-    }
-  }
-
-  void operator()(const Dim<0>& dim) {
-    PADDLE_ENFORCE(end == 0, "End index in ddim slice is out of bound.");
-  }
-};
-
 DDim slice_ddim(const DDim& dim, int begin, int end) {
-  std::vector<int64_t> vec;
-  vec.reserve(end - begin);
-  SliceVectorizeVisitor visitor(vec, begin, end);
-  boost::apply_visitor(visitor, dim);
-  return make_ddim(vec);
+  PADDLE_ENFORCE(begin >= 0 && end <= dim.size(),
+                 "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.",
+                 begin, end, dim.size());
+  // Constructor of DDim would check whether end - begin is valid
+  return DDim(dim.Get() + begin, end - begin);
 }
 
-/// \cond HIDDEN
-
-struct ArityVisitor : boost::static_visitor<int> {
-  template <int D>
-  int operator()(Dim<D>) const {
-    return D;
-  }
-};
-
-/// \endcond
-
-int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); }
+int arity(const DDim& d) { return d.size(); }
 
-/// \cond HIDDEN
-
-struct DDimPrinter : boost::static_visitor<void> {
+struct DDimPrinter {
   std::ostream& os;
   explicit DDimPrinter(std::ostream& os_) : os(os_) {}
 
-  template <typename T>
-  void operator()(const T& t) {
+  template <int D>
+  void operator()(const Dim<D>& t) {
     os << t;
   }
 };
 
-/// \endcond
-
 std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
-  DDimPrinter printer(os);
-  boost::apply_visitor(printer, ddim);
+  ddim.apply_visitor(DDimPrinter(os));
   return os;
 }
 
-DDim::DDim(std::initializer_list<int64_t> init_list) {
-  *this = make_ddim(init_list);
-}
-
 DDim flatten_to_2d(const DDim& src, int num_col_dims) {
-  int rank = src.size();
-  return make_ddim({product(slice_ddim(src, 0, num_col_dims)),
-                    product(slice_ddim(src, num_col_dims, rank))});
+  return DDim({product(slice_ddim(src, 0, num_col_dims)),
+               product(slice_ddim(src, num_col_dims, src.size()))});
 }
 
-DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); }
+DDim flatten_to_1d(const DDim& src) { return DDim({product(src)}); }
 
 DDim stride(const DDim& ddim) {
-  std::vector<int64_t> strides(ddim.size());
+  DDim strides;
+  strides.rank_ = ddim.size();
   strides[ddim.size() - 1] = 1;
   for (int i = ddim.size() - 2; i >= 0; --i) {
     strides[i] = strides[i + 1] * ddim[i + 1];
   }
-  return framework::make_ddim(strides);
+  return strides;
 }
 
-DDim stride_numel(const framework::DDim& ddim) {
-  std::vector<int64_t> strides(ddim.size());
+DDim stride_numel(const DDim& ddim) {
+  DDim strides;
+  strides.rank_ = ddim.size();
   strides[ddim.size() - 1] = ddim[ddim.size() - 1];
   for (int i = ddim.size() - 2; i >= 0; --i) {
     strides[i] = strides[i + 1] * ddim[i];
   }
-  return framework::make_ddim(strides);
+  return strides;
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h
index f05b5ee3fa..31a41dab2a 100644
--- a/paddle/fluid/framework/ddim.h
+++ b/paddle/fluid/framework/ddim.h
@@ -18,62 +18,145 @@ limitations under the License. */
 #include <stdexcept>
 #include <vector>
 #include "paddle/fluid/framework/dim.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/variant.h"
 
 namespace paddle {
 namespace framework {
 
+#define PADDLE_VISIT_DDIM_BASE(rank, callback) \
+  case (rank): {                               \
+    constexpr auto kRank = (rank);             \
+    return (callback);                         \
+  }
+
+#define PADDLE_VISIT_DDIM(rank, callback)    \
+  switch (rank) {                            \
+    PADDLE_VISIT_DDIM_BASE(0, callback);     \
+    PADDLE_VISIT_DDIM_BASE(1, callback);     \
+    PADDLE_VISIT_DDIM_BASE(2, callback);     \
+    PADDLE_VISIT_DDIM_BASE(3, callback);     \
+    PADDLE_VISIT_DDIM_BASE(4, callback);     \
+    PADDLE_VISIT_DDIM_BASE(5, callback);     \
+    PADDLE_VISIT_DDIM_BASE(6, callback);     \
+    PADDLE_VISIT_DDIM_BASE(7, callback);     \
+    PADDLE_VISIT_DDIM_BASE(8, callback);     \
+    PADDLE_VISIT_DDIM_BASE(9, callback);     \
+    default:                                 \
+      PADDLE_THROW("Invalid rank %d", rank); \
+  }
+
+template <typename T1, typename T2>
+inline void dynamic_dim_assign(const T1* in, T2* out, int n) {
+  PADDLE_VISIT_DDIM(n, (static_dim_assign<kRank, T1, T2>(in, out)));
+}
+
 /**
  * \brief A dynamically sized dimension.
  *
  * The number of dimensions must be between [1, 9].
  */
-struct DDim {
-  typedef boost::variant<Dim<0>, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>,
-                         Dim<7>, Dim<8>, Dim<9>>
-      DDimVar;
-  DDimVar var;
+class DDim {
+ public:
+  constexpr static int kMaxRank = 9;
+
+  DDim() : rank_(1) { dim_[0] = 0; }
 
-  DDim() : var(Dim<1>()) {}
+  DDim(const DDim& ddim) : dim_() { CopyFrom(ddim); }
+
+  DDim(const int* d, int n) : rank_(n) {
+    dynamic_dim_assign(d, dim_.GetMutable(), n);
+  }
+
+  DDim(const int64_t* d, int n) : rank_(n) {
+    dynamic_dim_assign(d, dim_.GetMutable(), n);
+  }
 
   template <int D>
-  explicit DDim(const Dim<D>& in) : var(in) {}
+  /*implicit*/ DDim(const Dim<D>& in) : rank_(D) {  // NOLINT
+    UnsafeCast<D>() = in;
+  }
+
+  /*implicit*/ DDim(std::initializer_list<int64_t> init_list)
+      : DDim(init_list.begin(), init_list.size()) {}
 
-  /*implicit*/ DDim(std::initializer_list<int64_t> init_list);
+  inline DDim& operator=(const DDim& ddim) { return CopyFrom(ddim); }
 
   template <int D>
-  DDim& operator=(const Dim<D>& in) {
-    var = in;
+  inline DDim& operator=(const Dim<D>& dim) {
+    rank_ = D;
+    UnsafeCast<D>() = dim;
     return *this;
   }
 
-  int64_t& operator[](int idx);
-  int64_t operator[](int idx) const;
+  inline int64_t& operator[](int idx) { return dim_[idx]; }
+
+  inline int64_t operator[](int idx) const { return dim_[idx]; }
+
+  inline int64_t& at(int idx) {
+    PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx);
+    return dim_[idx];
+  }
+
+  inline int64_t at(int idx) const {
+    PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx);
+    return dim_[idx];
+  }
 
   template <typename Visitor>
-  typename Visitor::result_type apply_visitor(Visitor& visitor) {
-    return var.apply_visitor(visitor);
+  typename std::result_of<Visitor(Dim<0>&)>::type apply_visitor(
+      Visitor&& visitor) {
+    PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
   }
 
   template <typename Visitor>
-  typename Visitor::result_type apply_visitor(Visitor& visitor) const {
-    return var.apply_visitor(visitor);
+  typename std::result_of<Visitor(const Dim<0>&)>::type apply_visitor(
+      Visitor&& visitor) const {
+    PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
   }
 
-  DDimVar getVar() { return var; }
+  bool operator==(const DDim& d) const;
+
+  bool operator!=(const DDim& d) const;
+
+  DDim operator+(const DDim& d) const;
 
-  bool operator==(DDim d) const;
+  DDim operator*(const DDim& d) const;
 
-  bool operator!=(DDim d) const;
+  inline const int64_t* Get() const { return dim_.Get(); }
 
-  DDim operator+(DDim d) const;
+  inline int64_t* GetMutable() { return dim_.GetMutable(); }
 
-  DDim operator*(DDim d) const;
+  inline int size() const { return rank_; }
+
+ private:
+  template <int D>
+  inline Dim<D>& UnsafeCast() {
+    static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
+    auto* p = static_cast<void*>(&dim_);
+    return *reinterpret_cast<Dim<D>*>(p);
+  }
+
+  template <int D>
+  inline const Dim<D>& UnsafeCast() const {
+    static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
+    auto* p = static_cast<const void*>(&dim_);
+    return *reinterpret_cast<const Dim<D>*>(p);
+  }
 
-  int size() const;
+  inline DDim& CopyFrom(const DDim& ddim) {
+    PADDLE_VISIT_DDIM(ddim.rank_, (*this = ddim.UnsafeCast<kRank>()));
+  }
+
+  friend DDim stride(const DDim& ddim);
+  friend DDim stride_numel(const DDim& ddim);
+
+ private:
+  Dim<kMaxRank> dim_;
+  int rank_;
 };
 
+#undef PADDLE_VISIT_DDIM_BASE
+#undef PADDLE_VISIT_DDIM
+
 /**
  * \brief Make a DDim from std::vector<int64_t>
  *
@@ -92,7 +175,7 @@ DDim make_ddim(const std::vector<int>& dims);
 DDim make_ddim(std::initializer_list<int64_t> dims);
 
 int64_t get(const DDim& dim, int idx);
-void set(DDim& dim, int idx, int val);
+void set(DDim& dim, int idx, int val);  // NOLINT
 
 std::vector<int64_t> vectorize(const DDim& ddim);
 std::vector<int> vectorize2int(const DDim& ddim);
@@ -129,12 +212,3 @@ DDim stride(const DDim& ddim);
 DDim stride_numel(const DDim& ddim);
 }  // namespace framework
 }  // namespace paddle
-
-namespace boost {
-
-template <typename T>
-T get(const paddle::framework::DDim& in) {
-  return boost::get<T>(in.var);
-}
-
-}  // namespace boost
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 47872a9f2a..6f8409d8fc 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -55,9 +55,6 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 void AllReduceOpHandle::RunImpl() {
   platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
 
-  // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR,
-  // this is a distributed or inter-process call, find a better way.
-  // Wait input done
   WaitInputVarGenerated();
   auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
   auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h
index 15c496130c..37b07e5736 100644
--- a/paddle/fluid/framework/details/execution_strategy.h
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -25,7 +25,7 @@ struct ExecutionStrategy {
   size_t num_threads_{0};
   bool use_cuda_{true};
   bool allow_op_delay_{false};
-  size_t num_iteration_per_drop_scope_{100};
+  size_t num_iteration_per_drop_scope_{1};
   ExecutorType type_{kDefault};
   bool dry_run_{false};
 };
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index f432079087..91e4f9adb4 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -64,20 +64,26 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
   }
 
   platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
-  drop_scope_counter_ += 1;
+  ++drop_scope_counter_;
 
-  if (!fetch_tensors.empty() ||
-      drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
-    drop_scope_counter_ = 0;
-    // Wait All computational streams
-    for (auto p : places_) {
-      platform::DeviceContextPool::Instance().Get(p)->Wait();
+  bool stream_end = false;
+  if (!fetch_tensors.empty()) {
+    WaitComputationalStreams();
+    stream_end = true;
+  }
+
+  if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
+    if (!stream_end) {
+      WaitComputationalStreams();
     }
+
     for (auto &scope : local_scopes_) {
       auto &local_scope =
           *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
       scope->DeleteScope(local_scope);
     }
+
+    drop_scope_counter_ = 0;
   }
   if (eptr) {
     std::rethrow_exception(eptr);
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
index 5e87e0bf50..0f6340213d 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -47,6 +47,14 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
 
   FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
 
+ private:
+  inline void WaitComputationalStreams() {
+    // Wait All computational streams
+    for (auto p : places_) {
+      platform::DeviceContextPool::Instance().Get(p)->Wait();
+    }
+  }
+
  private:
   size_t drop_scope_counter_{0};
 
diff --git a/paddle/fluid/framework/dim.h b/paddle/fluid/framework/dim.h
index 73f92fa389..88aee8379d 100644
--- a/paddle/fluid/framework/dim.h
+++ b/paddle/fluid/framework/dim.h
@@ -16,332 +16,184 @@
 #include <iostream>
 #include <sstream>
 #include <stdexcept>
+#include <string>
 #include <type_traits>
 
+#include "paddle/fluid/framework/array.h"
 #include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
 namespace paddle {
 namespace framework {
 
 // Statically sized, statically indexed dimension
-template <int i>
-struct Dim {
-  static constexpr int dimensions = i;
+template <int D>
+class Dim : public Array<int64_t, D> {
+ public:
+  static_assert(D >= 0, "D must be not less than 0");
 
-  template <typename... Args>
-  HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) {
-    static_assert(sizeof...(_tail) == i - 1,
-                  "Dim initialized with the wrong number of parameters");
-  }
+  static constexpr int kRank = D;
+  using BaseClass = Array<int64_t, D>;
 
-  HOSTDEVICE
-  Dim(int64_t _head, const Dim<i - 1>& _tail) : head(_head), tail(_tail) {}
+  inline Dim(int64_t head, const Dim<D - 1>& tail) {
+    (*this)[0] = head;
+    new (this->GetMutable() + 1) Dim<D - 1>(tail);
+  }
 
-  HOSTDEVICE
-  Dim() : head(0), tail() {}
+  template <typename... Args>
+  HOSTDEVICE explicit Dim(int64_t head, Args... args)
+      : BaseClass(head, args...) {}
 
   /** Construct a Dim from a linear index and size.  Uses Fortran order
    * indexing. */
-  HOSTDEVICE
-  Dim(int64_t idx, const Dim<i>& size)
-      : head(idx % size.head), tail(idx / size.head, size.tail) {}
+  HOSTDEVICE Dim(int64_t idx, const Dim<D>& size);
 
   /** Construct a Dim with each dimension set to the given index */
-  HOSTDEVICE
-  Dim(int64_t idx) : head(idx), tail(idx) {}
+  HOSTDEVICE explicit Dim(int64_t idx) { this->Fill(idx); }
 
-  HOSTDEVICE
-  bool operator==(const Dim<i>& o) const {
-    return (head == o.head) && (tail == o.tail);
-  }
-
-  HOSTDEVICE
-  bool operator!=(const Dim<i>& o) const { return !(*this == o); }
-
-  HOSTDEVICE
-  int64_t& operator[](int idx);
-  HOSTDEVICE
-  int64_t operator[](int idx) const;
+  HOSTDEVICE Dim() = default;
 
   HOST std::string to_string() const;
-
-  int64_t head;
-  Dim<i - 1> tail;
 };
 
-// Base case specialization
-template <>
-struct Dim<0> {
-  static constexpr int dimensions = 0;
-
-  HOSTDEVICE
-  Dim(int64_t _head) {}
-
-  HOSTDEVICE
-  Dim() {}
-
-  HOSTDEVICE
-  Dim(int idx, const Dim<0>& size) {
-#ifndef __CUDA_ARCH__
-    if (idx > 0) {
-      throw std::invalid_argument("Index out of range.");
-    }
-#else
-    PADDLE_ASSERT(idx == 0);
-#endif
-  }
-
-  HOSTDEVICE
-  bool operator==(const Dim<0>& o) const { return true; }
-
-  HOSTDEVICE
-  bool operator!=(const Dim<0>& o) const { return false; }
-
-  HOSTDEVICE
-  int64_t& operator[](int idx);
-  HOSTDEVICE
-  int64_t operator[](int idx) const;
-};
-
-namespace {
-
-// Helper for accessing Dim classes
-template <int i>
-struct DimGetter {
-  // Return a copy if Dim is const
-  template <typename D>
-  HOSTDEVICE static int64_t impl(const D& d) {
-    return DimGetter<i - 1>::impl(d.tail);
-  }
-  // Return a reference if Dim is mutable
-  template <typename D>
-  HOSTDEVICE static int64_t& impl(D& d) {
-    return DimGetter<i - 1>::impl(d.tail);
+namespace detail {
+template <int kStart, int kEnd, bool kStop>
+struct FortranOrderIndexingConstructorFunctor {
+  HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx,
+                                    int64_t* out) {
+    out[kStart] = (*idx) % in[kStart];
+    (*idx) /= in[kStart];
+    FortranOrderIndexingConstructorFunctor<kStart + 1, kEnd,
+                                           kStart + 1 == kEnd>::Run(in, idx,
+                                                                    out);
   }
 };
 
-// Eureka! We found the element!
-template <>
-struct DimGetter<0> {
-  // Return a copy if Dim is const
-  template <typename D>
-  HOSTDEVICE static int64_t impl(const D& d) {
-    return d.head;
-  }
-  // Return a reference if Dim is mutable
-  template <typename D>
-  HOSTDEVICE static int64_t& impl(D& d) {
-    return d.head;
-  }
+template <int kStart, int kEnd>
+struct FortranOrderIndexingConstructorFunctor<kStart, kEnd, true> {
+  HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx,
+                                    int64_t* out) {}
 };
+}  // namespace detail
 
 template <int D>
-HOSTDEVICE int64_t& indexer(Dim<D>& dim, int idx) {
-#ifndef __CUDA_ARCH__
-  if (idx < 0) {
-    throw std::invalid_argument("Tried to access a negative dimension");
-  }
-#else
-  PADDLE_ASSERT(idx >= 0);
-#endif
-  if (idx == 0) {
-    return dim.head;
-  }
-  return indexer(dim.tail, idx - 1);
-}
-
-template <>
-HOSTDEVICE int64_t& indexer<0>(Dim<0>& dim, int idx) {
-#ifndef __CUDA_ARCH__
-  throw std::invalid_argument("Invalid index");
-#else
-  PADDLE_ASSERT(false);
-#if CUDA_VERSION < 8000
-  // On CUDA versions previous to 8.0, only __shared__ variables
-  // could be declared as static in the device code.
-  int64_t head = 0;
-#else
-  static int64_t head = 0;
-#endif
-  return head;
-#endif
-}
-
-template <int D>
-HOSTDEVICE int64_t indexer(const Dim<D>& dim, int idx) {
-#ifndef __CUDA_ARCH__
-  if (idx < 0) {
-    throw std::invalid_argument("Tried to access a negative dimension");
-  }
-#else
-  PADDLE_ASSERT(idx >= 0);
-#endif
-  if (idx == 0) {
-    return dim.head;
-  }
-  return indexer(dim.tail, idx - 1);
-}
-
-template <>
-HOSTDEVICE int64_t indexer<0>(const Dim<0>& dim, int idx) {
-#ifndef __CUDA_ARCH__
-  throw std::invalid_argument("Invalid index");
-#else
-  PADDLE_ASSERT(false);
-#if CUDA_VERSION < 8000
-  // On CUDA versions previous to 8.0, only __shared__ variables
-  // could be declared as static in the device code.
-  int64_t head = 0;
-#else
-  static int64_t head = 0;
-#endif
-  return head;
-#endif
-}
-
-}  // namespace
-// Static access to constant Dim
-template <int i, int l>
-HOSTDEVICE int64_t get(const Dim<l>& d) {
-  return DimGetter<i>::impl(d);
+HOSTDEVICE Dim<D>::Dim(int64_t idx, const Dim<D>& size) {
+  detail::FortranOrderIndexingConstructorFunctor<0, D, D == 0>::Run(
+      size.Get(), &idx, this->GetMutable());
 }
 
-// Static access to mutable Dim
-template <int i, int l>
-HOSTDEVICE int64_t& get(Dim<l>& d) {
-  return DimGetter<i>::impl(d);
+template <int idx, int D>
+HOSTDEVICE inline int64_t get(const Dim<D>& dim) {
+  return dim[idx];
 }
 
-// Dynamic access to constant Dim
-template <int l>
-HOSTDEVICE int64_t Dim<l>::operator[](int i) const {
-  return indexer(*this, i);
+template <int idx, int D>
+HOSTDEVICE inline int64_t& get(Dim<D>& dim) {  // NOLINT
+  return dim[idx];
 }
 
-// Dynamic access to mutable Dim
-template <int l>
-HOSTDEVICE int64_t& Dim<l>::operator[](int i) {
-  return indexer(*this, i);
-}
-
-// Dynamic access to constant Dim
-inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const {
-  return indexer(*this, i);
-}
-
-// Dynamic access to mutable Dim
-inline HOSTDEVICE int64_t& Dim<0>::operator[](int i) {
-  return indexer(*this, i);
-}
-
-// Dynamic access to constant Dim
-// without std::enable_if will try to instantiate this on get<0>(d)
-template <int l>
-HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim<l>& d,
-                                                               int i) {
-  return d[i];
+template <int D>
+HOSTDEVICE inline int64_t get(const Dim<D>& dim, int idx) {
+  return dim[idx];
 }
 
-// Dynamic access to mutable Dim
-template <int l>
-HOSTDEVICE typename std::enable_if<(l > 0), int64_t&>::type get(Dim<l>& d,
-                                                                int i) {
-  return d[i];
+template <int D>
+HOSTDEVICE inline int64_t& get(Dim<D>& dim, int idx) {  // NOLINT
+  return dim[idx];
 }
 
 // Dot product of two dims
-template <int i>
-HOSTDEVICE int64_t linearize(const Dim<i>& a, const Dim<i>& b) {
-  return a.head * b.head + linearize(a.tail, b.tail);
-}
-
-// Base case dot product of two Dims
-// Notice it is inline because it is no longer a template
-template <>
-HOSTDEVICE inline int64_t linearize(const Dim<0>& a, const Dim<0>& b) {
-  return 0;
+template <int D>
+HOSTDEVICE inline int64_t linearize(const Dim<D>& a, const Dim<D>& b) {
+  return UnrollProduct<D>::Run(a.Get(), b.Get());
 }
 
 // Product of a Dim
-template <int i>
-HOSTDEVICE int64_t product(const Dim<i>& a, int prod = 1) {
-  return prod * a.head * product(a.tail);
-}
-
-// Base case product of a Dim
-// Notice it is inline because it is no longer a template
-template <>
-HOSTDEVICE inline int64_t product(const Dim<0>& a, int prod) {
-  return prod;
+template <int D>
+HOSTDEVICE inline int64_t product(const Dim<D>& a) {
+  return UnrollProduct<D>::Run(a.Get());
 }
 
 // Is 0 <= idx_i < size_i for all i?
-template <int i>
-HOSTDEVICE bool contained(const Dim<i>& idx, const Dim<i>& size) {
-  return ((0 <= idx.head) && (idx.head < size.head) &&
-          contained(idx.tail, size.tail));
-}
+namespace detail {
+template <int kStart, int kEnd, bool kStop>
+struct ContainedFunctor {
+  HOSTDEVICE static inline bool Run(const int64_t* idx, const int64_t* size) {
+    return (idx[kStart] >= 0 && idx[kStart] < size[kStart]) &&
+           ContainedFunctor<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(idx,
+                                                                       size);
+  }
+};
 
-// Base case of is 0 <= idx_i < size_i ?
-// Notice it is inline because it is no longer a template
-template <>
-HOSTDEVICE inline bool contained(const Dim<0>& idx, const Dim<0>& size) {
-  return true;
+template <int kStart, int kEnd>
+struct ContainedFunctor<kStart, kEnd, true> {
+  HOSTDEVICE static constexpr inline bool Run(const int64_t* idx,
+                                              const int64_t* size) {
+    return true;
+  }
+};
+}  // namespace detail
+
+template <int D>
+HOSTDEVICE inline bool contained(const Dim<D>& idx, const Dim<D>& size) {
+  return detail::ContainedFunctor<0, D, D == 0>::Run(idx.Get(), size.Get());
 }
 
 /**
  * \brief Compute exclusive prefix-multiply of a Dim.
  */
-template <int i>
-HOSTDEVICE Dim<i> ex_prefix_mul(const Dim<i>& src, int mul = 1) {
-  return Dim<i>(mul, ex_prefix_mul(src.tail, mul * src.head));
-}
+namespace detail {
+template <int kStart, int kEnd, bool kStop>
+struct ExPrefixMulFunctor {
+  HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) {
+    kStart == 0 ? out[kStart] = 1 : out[kStart] =
+                                        out[kStart - 1] * in[kStart - 1];
+    detail::ExPrefixMulFunctor<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(in,
+                                                                          out);
+  }
+};
+
+template <int kStart, int kEnd>
+struct ExPrefixMulFunctor<kStart, kEnd, true> {
+  HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) {}
+};
+}  // namespace detail
 
-///\cond HIDDEN
-// Base case of ex_prefix_mul
-// Notice it is inline because it is no longer a template
-template <>
-HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0>& src, int mul) {
-  return Dim<0>();
+template <int D>
+HOSTDEVICE inline Dim<D> ex_prefix_mul(const Dim<D>& src) {
+  Dim<D> ret;
+  detail::ExPrefixMulFunctor<0, D, D == 0>::Run(src.Get(), ret.GetMutable());
+  return ret;
 }
-///\endcond
 
 /**
  * Add two dimensions together
  */
-template <int i>
-HOSTDEVICE Dim<i> dim_plus(const Dim<i>& a, const Dim<i>& b) {
-  return Dim<i>(a.head + b.head, dim_plus(a.tail, b.tail));
-}
-
-// Base case
-template <>
-HOSTDEVICE inline Dim<0> dim_plus(const Dim<0>& a, const Dim<0>& b) {
-  return Dim<0>();
+template <int D>
+HOSTDEVICE inline Dim<D> dim_plus(const Dim<D>& a, const Dim<D>& b) {
+  Dim<D> ret;
+  UnrollAdd<D>::Run(a.Get(), b.Get(), ret.GetMutable());
+  return ret;
 }
 
-template <int i>
-HOSTDEVICE Dim<i> operator+(const Dim<i>& lhs, const Dim<i>& rhs) {
+template <int D>
+HOSTDEVICE inline Dim<D> operator+(const Dim<D>& lhs, const Dim<D>& rhs) {
   return dim_plus(lhs, rhs);
 }
 
 /**
  * Multiply two dimensions together
  */
-template <int i>
-HOSTDEVICE Dim<i> dim_mult(const Dim<i>& a, const Dim<i>& b) {
-  return Dim<i>(a.head * b.head, dim_mult(a.tail, b.tail));
-}
-
-// Base case
-template <>
-HOSTDEVICE inline Dim<0> dim_mult(const Dim<0>& a, const Dim<0>& b) {
-  return Dim<0>();
+template <int D>
+HOSTDEVICE inline Dim<D> dim_mult(const Dim<D>& a, const Dim<D>& b) {
+  Dim<D> ret;
+  UnrollMul<D>::Run(a.Get(), b.Get(), ret.GetMutable());
+  return ret;
 }
 
-template <int i>
-HOSTDEVICE Dim<i> operator*(const Dim<i>& lhs, const Dim<i>& rhs) {
+template <int D>
+HOSTDEVICE Dim<D> operator*(const Dim<D>& lhs, const Dim<D>& rhs) {
   return dim_mult(lhs, rhs);
 }
 
@@ -354,23 +206,32 @@ HOSTDEVICE Dim<i> operator*(const Dim<i>& lhs, const Dim<i>& rhs) {
  * \return Dim object the same size as \p size with normalized strides
  *
  */
+namespace detail {
+template <int kStart, int kEnd, bool kStop>
+struct NormalizeStridesFunctor {
+  HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride,
+                             int64_t* ret) {
+    ret[kStart] = (size[kStart] == 1 ? 0 : stride[kStart]);
+    NormalizeStridesFunctor<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(
+        size, stride, ret);
+  }
+};
 
-template <int i>
-HOSTDEVICE Dim<i> normalize_strides(const Dim<i>& size, const Dim<i>& stride) {
-  int norm_stride = size.head == 1 ? 0 : stride.head;
-  return Dim<i>(norm_stride, normalize_strides(size.tail, stride.tail));
-}
-
-///\cond HIDDEN
+template <int kStart, int kEnd>
+struct NormalizeStridesFunctor<kStart, kEnd, true> {
+  HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride,
+                             int64_t* ret) {}
+};
+}  // namespace detail
 
-template <>
-HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size,
-                                           const Dim<0>& stride) {
-  return Dim<0>();
+template <int D>
+HOSTDEVICE Dim<D> normalize_strides(const Dim<D>& size, const Dim<D>& stride) {
+  Dim<D> ret;
+  detail::NormalizeStridesFunctor<0, D, D == 0>::Run(size.Get(), stride.Get(),
+                                                     ret.GetMutable());
+  return ret;
 }
 
-///\endcond
-
 /**
  * Helper function to create a Dim
  *
@@ -379,25 +240,17 @@ HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size,
  */
 
 template <typename... Args>
-HOSTDEVICE Dim<sizeof...(Args)> make_dim(Args... idxes) {
+HOSTDEVICE inline Dim<sizeof...(Args)> make_dim(Args... idxes) {
   return Dim<sizeof...(Args)>(idxes...);
 }
 
 // Allows us to output a Dim
-// XXX For some reason, overloading fails to resolve this correctly
-template <int i>
-typename std::enable_if<(i > 1), std::ostream&>::type operator<<(
-    std::ostream& os, const Dim<i>& d) {
-  os << d.head << ", " << d.tail;
-  return os;
-}
-
-// Base case that allows us to output a Dim
-// XXX I wish this could be an overload instead of a template
-template <int i>
-typename std::enable_if<(i == 1), std::ostream&>::type operator<<(
-    std::ostream& os, const Dim<i>& d) {
-  os << d.head;
+template <int D>
+inline std::ostream& operator<<(std::ostream& os, const Dim<D>& d) {
+  os << d[0];
+  for (int i = 1; i < D; ++i) {
+    os << ", " << d[i];
+  }
   return os;
 }
 
@@ -405,17 +258,15 @@ inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) {
   return os;
 }
 
-template <int i>
-HOST std::string Dim<i>::to_string() const {
+template <int D>
+HOST std::string Dim<D>::to_string() const {
   std::stringstream stream;
-
   stream << *this;
-
   return stream.str();
 }
 
 template <int D>
-HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) {
+HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, const Dim<D>& extents) {
   Dim<D> result;
 
   for (int i = 0; i < D - 1; ++i) {
@@ -428,5 +279,10 @@ HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) {
   return result;
 }
 
+template <int D, typename T1, typename T2>
+inline void static_dim_assign(const T1* in, T2* out) {
+  UnrollAssign<D>::Run(in, out);
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc
index eaef093ed3..39652706c4 100644
--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ b/paddle/fluid/framework/dlpack_tensor.cc
@@ -59,7 +59,7 @@ static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) {
 
 struct DLContextVisitor : public boost::static_visitor<::DLContext> {
   inline ::DLContext operator()(const platform::CPUPlace &place) const {
-    DLContext ctx;
+    ::DLContext ctx;
     ctx.device_type = kDLCPU;
     ctx.device_id = 0;
     return ctx;
@@ -67,7 +67,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
 
   inline ::DLContext operator()(const platform::CUDAPlace &place) const {
 #ifdef PADDLE_WITH_CUDA
-    DLContext ctx;
+    ::DLContext ctx;
     ctx.device_type = kDLGPU;
     ctx.device_id = place.device;
     return ctx;
@@ -78,7 +78,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
 
   inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const {
 #ifdef PADDLE_WITH_CUDA
-    DLContext ctx;
+    ::DLContext ctx;
     ctx.device_type = kDLCPUPinned;
     ctx.device_id = 0;
     return ctx;
diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h
index 0c52bce1ef..e48b0d5c88 100644
--- a/paddle/fluid/framework/dlpack_tensor.h
+++ b/paddle/fluid/framework/dlpack_tensor.h
@@ -38,7 +38,7 @@ class DLPackTensor {
 
   // The shape in DLTensor is defined as int64_t*
   // Add this member to make TVMTensor init without heap allocation
-  ShapeType shape_[9];
+  ShapeType shape_[DDim::kMaxRank];
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 594fbb48a6..c93bbe7cee 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -22,7 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/variable_helper.h"
-#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 
diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc
index 2eb9e564f8..4972bc7ec3 100644
--- a/paddle/fluid/framework/executor_thread_worker.cc
+++ b/paddle/fluid/framework/executor_thread_worker.cc
@@ -29,6 +29,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/timer.h"
 #include "paddle/fluid/pybind/pybind.h"
 namespace paddle {
 namespace framework {
@@ -180,6 +181,7 @@ void ExecutorThreadWorker::SetDevice() {
   return;
 #else
   static unsigned concurrency_cap = std::thread::hardware_concurrency();
+  LOG(WARNING) << "concurrency capacity " << concurrency_cap;
   int thread_id = this->thread_id_;
 
   if (static_cast<unsigned>(thread_id) < concurrency_cap) {
@@ -238,6 +240,55 @@ static void print_fetch_var(Scope* scope, const std::string& var_name) {
   VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type();
 }
 
+void ExecutorThreadWorker::TrainFilesWithTimer() {
+  platform::SetNumThreads(1);
+  SetDevice();
+  thread_reader_->Start();
+  std::vector<double> op_total_time;
+  std::vector<std::string> op_name;
+  for (auto& op : ops_) {
+    op_name.push_back(op->Type());
+  }
+  op_total_time.resize(ops_.size());
+  for (size_t i = 0; i < op_total_time.size(); ++i) {
+    op_total_time[i] = 0.0;
+  }
+  platform::Timer timeline;
+  double total_time = 0.0;
+  double read_time = 0.0;
+  int cur_batch;
+  int batch_cnt = 0;
+  timeline.Start();
+  while ((cur_batch = thread_reader_->Next()) > 0) {
+    timeline.Pause();
+    read_time += timeline.ElapsedSec();
+    total_time += timeline.ElapsedSec();
+    for (size_t i = 0; i < ops_.size(); ++i) {
+      timeline.Start();
+      ops_[i]->Run(*thread_scope_, place_);
+      timeline.Pause();
+      op_total_time[i] += timeline.ElapsedSec();
+      total_time += timeline.ElapsedSec();
+    }
+    ++batch_cnt;
+    thread_scope_->DropKids();
+    if (thread_id_ == 0) {
+      if (batch_cnt > 0 && batch_cnt % 1000 == 0) {
+        for (size_t i = 0; i < ops_.size(); ++i) {
+          fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i,
+                  op_name[i].c_str(), op_total_time[i] / batch_cnt);
+        }
+        fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt);
+        int fetch_var_num = fetch_var_names_.size();
+        for (int i = 0; i < fetch_var_num; ++i) {
+          print_fetch_var(thread_scope_, fetch_var_names_[i]);
+        }
+      }
+    }
+    timeline.Start();
+  }
+}
+
 void ExecutorThreadWorker::TrainFiles() {
   platform::SetNumThreads(1);
 
@@ -320,10 +371,12 @@ void AsyncExecutorThreadWorker::SetPSlibPtr(
     std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) {
   _pslib_ptr = pslib_ptr;
 }
+
 void AsyncExecutorThreadWorker::SetPullDenseThread(
     std::shared_ptr<DensePullThread> dpt) {
   _pull_dense_thread = dpt;
 }
+
 void AsyncExecutorThreadWorker::TrainOneNetwork() {
   PrepareParams();
 
diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h
index 30b81ad880..524922b032 100644
--- a/paddle/fluid/framework/executor_thread_worker.h
+++ b/paddle/fluid/framework/executor_thread_worker.h
@@ -155,6 +155,8 @@ class ExecutorThreadWorker {
   void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
   // A multi-thread training function
   virtual void TrainFiles();
+  // with timer log
+  virtual void TrainFilesWithTimer();
   // set fetch variable names from python interface assigned by users
   void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
 #ifdef PADDLE_WITH_PSLIB
diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
index bd5b76426e..9e77f98e9e 100644
--- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
+++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
@@ -75,6 +75,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
   std::vector<Node*> optimize_ops;
   std::vector<Node*> lr_ops;  // ops other than forward/backward/optimize
   std::unordered_set<std::string> grad_names;
+  std::unordered_map<std::string, std::string> gradname2paramname;
 
   std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
   auto origin_nodes = graph->ReleaseNodes();
@@ -99,6 +100,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
       auto op_role_vars = boost::get<std::vector<std::string>>(op_role_var);
       for (size_t i = 0; i < op_role_vars.size(); i += 2) {
         grad_names.insert(op_role_vars[i + 1]);
+        gradname2paramname[op_role_vars[i + 1]] = op_role_vars[i];
       }
     } else if (op_role & static_cast<int>(framework::OpRole::kLRSched)) {
       lr_ops.push_back(node);
@@ -109,7 +111,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
 
   // 2. copy forward backward
   ir::Node* prev_repeat_last_op_node = nullptr;
-  // record origin_grad -> repeated grad list map.
+  // record origin_grad -> repeated_grad_list map.
   std::map<ir::Node*, std::vector<ir::Node*>> grad_repeated_map;
   std::map<std::string, std::vector<ir::Node*>> created;
   std::unordered_set<std::string> bn_vars_need_rename;
@@ -124,10 +126,16 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
         if (grad_names.find(outname) != grad_names.end()) {
           std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i);
           repeated_op.RenameOutput(outname, new_gname);
+          // remove op_role_var for backward ops that outputs grad for a
+          // parameter.
+          repeated_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(),
+                              std::vector<std::string>());
         }
       }
       // 3.5 let batch_norm ops use independent vars, note batch_norm_grad do
-      // not need this update
+      // not need this update, because only moving mean and variance should be
+      // differ, trainable parameter scale and bias is the same as other
+      // parameters.
       if (node->Name() == "batch_norm") {
         // NOTE: assume bn op created by layers use save var as output mean and
         // variance
@@ -224,16 +232,25 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
         var->inputs.push_back(repeated_node);
       }
     }
-  }
+  }  // end copy forward backward
 
-  // 5. create GRAD merge op node
+  // 5. create GRAD merge op node: sum(repeat.0...repeat.n) ->
+  // scale(1/num_repeats)
   for (auto kv : grad_repeated_map) {
     OpDesc sum_op;
     sum_op.SetType("sum");
     std::vector<std::string> repeated_grad_names;
+    std::vector<std::string> param_grad_op_role_var;
     for (auto r : kv.second) {
       repeated_grad_names.push_back(r->Var()->Name());
     }
+    // NOTE: use op_role_var to control allreduce op appending in
+    //       multi_devices_graph_pass, we want to append op_role_var
+    //       only once for the merged gradient, so break after first call.
+    param_grad_op_role_var.push_back(
+        gradname2paramname.at(kv.first->Var()->Name()));        // param
+    param_grad_op_role_var.push_back(kv.first->Var()->Name());  // grad
+
     sum_op.SetInput("X", repeated_grad_names);
     sum_op.SetOutput("Out", {kv.first->Var()->Name()});
     sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
@@ -256,6 +273,10 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
     scale_op.SetAttr("scale", static_cast<float>(1.0f / num_repeats));
     scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
                      static_cast<int>(OpRole::kBackward));
+
+    scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(),
+                     param_grad_op_role_var);
+
     auto scale_op_node = result.CreateOpNode(&scale_op);
     scale_op_node->inputs.push_back(sum_out_var_node);
     sum_out_var_node->outputs.push_back(scale_op_node);
diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc
index 23f681ce88..57345f12cc 100644
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -399,7 +399,7 @@ void NgraphEngine::BuildNgFunction() {
   BuildNgNodes();
   ngraph_function_ = nullptr;
   ngraph::NodeVector func_outputs;
-  ngraph::op::ParameterVector func_inputs;
+  ngraph::ParameterVector func_inputs;
 
   for (auto& vo : var_out_) {
     func_outputs.push_back(var_node_map_->at(vo));
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index f659a7b93f..f10da22aec 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -16,7 +16,6 @@ limitations under the License. */
 #include <glog/logging.h>
 
 #include <algorithm>
-
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -1041,12 +1040,11 @@ Scope* OperatorWithKernel::PrepareData(
 
 proto::VarType::Type OperatorWithKernel::IndicateDataType(
     const ExecutionContext& ctx) const {
-  auto& scope = ctx.scope();
   int data_type = -1;
-  std::string last_input_name;
   for (auto& input : this->inputs_) {
-    for (auto& ipt_name : input.second) {
-      auto* var = scope.FindVar(ipt_name);
+    const std::vector<const Variable*> vars = ctx.MultiInputVar(input.first);
+    for (size_t i = 0; i < vars.size(); ++i) {
+      const Variable* var = vars[i];
       if (var != nullptr) {
         const Tensor* t = nullptr;
         if (var->IsType<Tensor>()) {
@@ -1057,15 +1055,14 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
           t = &(var->Get<SelectedRows>().value());
         }
         if (t != nullptr) {
-          PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized",
-                         ipt_name);
+          PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu)is not initialized",
+                         input.first, i);
           int tmp = static_cast<int>(t->type());
           PADDLE_ENFORCE(
               tmp == data_type || data_type == -1,
-              "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)",
-              Type(), last_input_name, data_type, ipt_name, tmp);
+              "DataType of Paddle Op %s must be the same. Get (%d) != (%d)",
+              Type(), data_type, tmp);
           data_type = tmp;
-          last_input_name = ipt_name;
         }
       }
     }
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index bad9716e8b..e2bedc60d2 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -81,6 +81,10 @@ class RuntimeContext {
   RuntimeContext(const VariableNameMap& innames,
                  const VariableNameMap& outnames, const Scope& scope);
 
+  RuntimeContext(const VariableValueMap& invars,
+                 const VariableValueMap& outvars)
+      : inputs(invars), outputs(outvars) {}
+
   VariableValueMap inputs;
   VariableValueMap outputs;
 };
@@ -447,8 +451,9 @@ class OperatorWithKernel : public OperatorBase {
   void RuntimeInferShape(const Scope& scope, const platform::Place& place,
                          const RuntimeContext& ctx) const override;
 
- protected:
   virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
+
+ protected:
   virtual OpKernelType GetKernelTypeForVar(
       const std::string& var_name, const Tensor& tensor,
       const OpKernelType& expected_kernel_type) const;
diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h
index dbf00f3a79..f8aa87519a 100644
--- a/paddle/fluid/framework/rw_lock.h
+++ b/paddle/fluid/framework/rw_lock.h
@@ -16,7 +16,9 @@ limitations under the License. */
 
 #if !defined(_WIN32)
 #include <pthread.h>
-#endif  // !_WIN32
+#else
+#include <mutex>  // NOLINT
+#endif            // !_WIN32
 
 #include "paddle/fluid/platform/enforce.h"
 
@@ -29,17 +31,17 @@ struct RWLock {
 
   ~RWLock() { pthread_rwlock_destroy(&lock_); }
 
-  void RDLock() {
+  inline void RDLock() {
     PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0,
                       "acquire read lock failed");
   }
 
-  void WRLock() {
+  inline void WRLock() {
     PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0,
                       "acquire write lock failed");
   }
 
-  void UNLock() {
+  inline void UNLock() {
     PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
   }
 
@@ -51,81 +53,46 @@ struct RWLock {
 // https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
 // In windows, rw_lock seems like a hack. Use empty object and do nothing.
 struct RWLock {
-  void RDLock() {}
-  void WRLock() {}
-  void UNLock() {}
+  // FIXME(minqiyang): use mutex here to do fake lock
+  inline void RDLock() { mutex_.lock(); }
+
+  inline void WRLock() { mutex_.lock(); }
+
+  inline void UNLock() { mutex_.unlock(); }
+
+ private:
+  std::mutex mutex_;
 };
 #endif
 
-class RWLockGuard {
+class AutoWRLock {
  public:
-  enum Status { kUnLock, kWRLock, kRDLock };
-
-  RWLockGuard(RWLock* rw_lock, Status init_status)
-      : lock_(rw_lock), status_(Status::kUnLock) {
-    switch (init_status) {
-      case Status::kRDLock: {
-        RDLock();
-        break;
-      }
-      case Status::kWRLock: {
-        WRLock();
-        break;
-      }
-      case Status::kUnLock: {
-        break;
-      }
-    }
-  }
+  explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
 
-  void WRLock() {
-    switch (status_) {
-      case Status::kUnLock: {
-        lock_->WRLock();
-        status_ = Status::kWRLock;
-        break;
-      }
-      case Status::kWRLock: {
-        break;
-      }
-      case Status::kRDLock: {
-        PADDLE_THROW(
-            "Please unlock read lock first before invoking write lock.");
-        break;
-      }
-    }
-  }
+  ~AutoWRLock() { UnLock(); }
 
-  void RDLock() {
-    switch (status_) {
-      case Status::kUnLock: {
-        lock_->RDLock();
-        status_ = Status::kRDLock;
-        break;
-      }
-      case Status::kRDLock: {
-        break;
-      }
-      case Status::kWRLock: {
-        PADDLE_THROW(
-            "Please unlock write lock first before invoking read lock.");
-        break;
-      }
-    }
-  }
+ private:
+  inline void Lock() { lock_->WRLock(); }
 
-  void UnLock() {
-    if (status_ != Status::kUnLock) {
-      lock_->UNLock();
-      status_ = Status::kUnLock;
-    }
-  }
+  inline void UnLock() { lock_->UNLock(); }
+
+ private:
+  RWLock* lock_;
+};
+
+class AutoRDLock {
+ public:
+  explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
+
+  ~AutoRDLock() { UnLock(); }
+
+ private:
+  inline void Lock() { lock_->RDLock(); }
 
-  ~RWLockGuard() { UnLock(); }
+  inline void UnLock() { lock_->UNLock(); }
 
  private:
   RWLock* lock_;
-  Status status_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 750b626603..a5742dbd3d 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -47,9 +47,15 @@ DEFINE_bool(fast_eager_deletion_mode, false,
 // the mutex will cause serious performance issue.
 // So the mutex is disabled when `ON_INFER`.
 #ifdef PADDLE_ON_INFERENCE
-#define SCOPE_LOCK_GUARD
+#define SCOPE_KIDS_READER_LOCK
+#define SCOPE_KIDS_WRITER_LOCK
+#define SCOPE_VARS_READER_LOCK
+#define SCOPE_VARS_WRITER_LOCK
 #else
-#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);
+#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_);
+#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_);
+#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_);
+#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_);
 #endif
 
 namespace paddle {
@@ -67,64 +73,69 @@ bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
 Scope::~Scope() { DropKids(); }
 
 Scope& Scope::NewScope() const {
-  SCOPE_LOCK_GUARD
-  kids_.push_back(new Scope(this));
-  return *kids_.back();
+  Scope* child = new Scope(this);
+  {
+    SCOPE_KIDS_WRITER_LOCK
+    kids_.push_back(child);
+  }
+  return *child;
 }
 
 Variable* Scope::Var(const std::string& name) {
-  SCOPE_LOCK_GUARD
+  SCOPE_VARS_WRITER_LOCK
   return VarInternal(name);
 }
 
 Variable* Scope::Var(std::string* name) {
-  SCOPE_LOCK_GUARD
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   if (name != nullptr) {
     *name = new_name;
   }
+  SCOPE_VARS_WRITER_LOCK
   return VarInternal(new_name);
 }
 
 Variable* Scope::FindVar(const std::string& name) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_VARS_READER_LOCK
   return FindVarInternal(name);
 }
 
 Variable* Scope::FindLocalVar(const std::string& name) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_VARS_READER_LOCK
   return FindVarLocally(name);
 }
 
 const Scope* Scope::FindScope(const Variable* var) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_VARS_READER_LOCK
   return FindScopeInternal(var);
 }
 
 void Scope::DropKids() {
-  SCOPE_LOCK_GUARD
+  SCOPE_KIDS_WRITER_LOCK
   for (Scope* s : kids_) delete s;
   kids_.clear();
 }
 
 bool Scope::HasKid(const Scope* scope) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_KIDS_READER_LOCK
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   return it != this->kids_.end();
 }
 
 std::vector<std::string> Scope::LocalVarNames() const {
-  SCOPE_LOCK_GUARD
   std::vector<std::string> known_vars;
-  known_vars.reserve(this->vars_.size());
-  for (auto& p : vars_) {
-    known_vars.emplace_back(p.first);
+  {
+    SCOPE_VARS_READER_LOCK
+    known_vars.reserve(this->vars_.size());
+    for (auto& p : vars_) {
+      known_vars.emplace_back(p.first);
+    }
   }
   return known_vars;
 }
 
 void Scope::DeleteScope(Scope* scope) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_KIDS_WRITER_LOCK
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope",
                  this, scope);
@@ -138,8 +149,8 @@ void Scope::DeleteScope(Scope* scope) const {
 }
 
 void Scope::EraseVars(const std::vector<std::string>& var_names) {
-  SCOPE_LOCK_GUARD
   std::set<std::string> var_set(var_names.begin(), var_names.end());
+  SCOPE_VARS_WRITER_LOCK
   for (auto it = vars_.begin(); it != vars_.end();) {
     if (var_set.find(it->first) != var_set.end()) {
       it = vars_.erase(it);
@@ -151,12 +162,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
 
 void Scope::Rename(const std::string& origin_name,
                    const std::string& new_name) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_VARS_WRITER_LOCK
   RenameInternal(origin_name, new_name);
 }
 
 std::string Scope::Rename(const std::string& origin_name) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_VARS_WRITER_LOCK
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   RenameInternal(origin_name, new_name);
   return new_name;
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index aded1f771c..f0915d2eee 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -14,12 +14,18 @@ limitations under the License. */
 
 #pragma once
 
+extern "C" {
+#include <xxhash.h>
+}
+
 #include <list>
-#include <mutex>  // NOLINT
+#include <memory>
 #include <string>
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
+#include "paddle/fluid/framework/rw_lock.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/macros.h"
 
@@ -95,7 +101,14 @@ class Scope {
   std::string Rename(const std::string& origin_name) const;
 
  protected:
-  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+  struct KeyHasher {
+    std::size_t operator()(const std::string& key) const {
+      return XXH32(key.c_str(), key.size(), 1);
+    }
+  };
+
+  mutable std::unordered_map<std::string, std::unique_ptr<Variable>, KeyHasher>
+      vars_;
 
  private:
   // Call Scope::NewScope for a sub-scope.
@@ -124,7 +137,8 @@ class Scope {
   DISABLE_COPY_AND_ASSIGN(Scope);
 
  private:
-  mutable std::mutex mutex_;
+  mutable RWLock kids_lock_;
+  mutable RWLock vars_lock_;
 };
 
 // Generate some debug string about the inherience structure of scope, quite
diff --git a/paddle/fluid/framework/unroll_array_ops.h b/paddle/fluid/framework/unroll_array_ops.h
new file mode 100644
index 0000000000..731da74eff
--- /dev/null
+++ b/paddle/fluid/framework/unroll_array_ops.h
@@ -0,0 +1,179 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <cstddef>
+#include <type_traits>
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace framework {
+
+namespace detail {
+
+template <size_t kStart, size_t kEnd, bool kStop>
+struct UnrollFillConstant {
+  template <typename T>
+  HOSTDEVICE inline static void Run(T *data, T val) {
+    data[kStart] = val;
+    UnrollFillConstant<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(data, val);
+  }
+};
+
+template <size_t kStart, size_t kEnd>
+struct UnrollFillConstant<kStart, kEnd, true> {
+  template <typename T>
+  HOSTDEVICE inline static void Run(T *data, T val) {}
+};
+
+template <size_t kStart, size_t kEnd, bool kStop>
+struct UnrollAssign {
+  template <typename Tin, typename Tout>
+  HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {
+    d2[kStart] = static_cast<Tout>(d1[kStart]);
+    UnrollAssign<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);
+  }
+};
+
+template <size_t kStart, size_t kEnd>
+struct UnrollAssign<kStart, kEnd, true> {
+  template <typename Tin, typename Tout>
+  HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {}
+};
+
+template <typename T, size_t kStart, size_t kEnd, bool kStop>
+struct UnrollVarArgsAssignImpl {
+  template <typename... Args>
+  HOSTDEVICE inline static void Run(T *d, T val, Args... args) {
+    static_assert(sizeof...(args) + 1 == kEnd - kStart, "Wrong argument");
+    d[kStart] = val;
+    UnrollVarArgsAssignImpl<T, kStart + 1, kEnd, kStart + 1 == kEnd>::Run(
+        d, args...);
+  }
+};
+
+template <typename T, size_t kStart, size_t kEnd>
+struct UnrollVarArgsAssignImpl<T, kStart, kEnd, true> {
+  HOSTDEVICE inline static void Run(T *d) {}
+};
+
+template <typename T>
+struct UnrollVarArgsAssign {
+  template <typename... Args>
+  HOSTDEVICE inline static void Run(T *d, Args... args) {
+    UnrollVarArgsAssignImpl<T, 0, sizeof...(Args), sizeof...(Args) == 0>::Run(
+        d, args...);
+  }
+};
+
+template <size_t kStart, size_t kEnd, bool kStop>
+struct UnrollCompare {
+  template <typename T>
+  HOSTDEVICE inline static bool Run(const T *d1, const T *d2) {
+    return d1[kStart] == d2[kStart] &&
+           UnrollCompare<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);
+  }
+};
+
+template <size_t kStart, size_t kEnd>
+struct UnrollCompare<kStart, kEnd, true> {
+  template <typename T>
+  HOSTDEVICE inline constexpr static bool Run(const T *d1, const T *d2) {
+    return true;
+  }
+};
+
+template <size_t kStart, size_t kEnd, bool kStop>
+struct UnrollAdd {
+  template <typename T>
+  HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {
+    d3[kStart] = d1[kStart] + d2[kStart];
+    UnrollAdd<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2, d3);
+  }
+};
+
+template <size_t kStart, size_t kEnd>
+struct UnrollAdd<kStart, kEnd, true> {
+  template <typename T>
+  HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {}
+};
+
+template <size_t kStart, size_t kEnd, bool kStop>
+struct UnrollMul {
+  template <typename T>
+  HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {
+    d3[kStart] = d1[kStart] * d2[kStart];
+    UnrollMul<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2, d3);
+  }
+};
+
+template <size_t kStart, size_t kEnd>
+struct UnrollMul<kStart, kEnd, true> {
+  template <typename T>
+  HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {}
+};
+
+template <size_t kStart, size_t kEnd, bool kStop>
+struct UnrollProduct {
+  template <typename T>
+  HOSTDEVICE inline static T Run(const T *d) {
+    return d[kStart] *
+           UnrollProduct<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d);
+  }
+
+  template <typename T>
+  HOSTDEVICE inline static T Run(const T *d1, const T *d2) {
+    return d1[kStart] * d2[kStart] +
+           UnrollProduct<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);
+  }
+};
+
+template <size_t kStart, size_t kEnd>
+struct UnrollProduct<kStart, kEnd, true> {
+  template <typename T>
+  HOSTDEVICE inline constexpr static T Run(const T *d) {
+    return 1;
+  }
+
+  template <typename T>
+  HOSTDEVICE inline constexpr static T Run(const T *d1, const T *d2) {
+    return 0;
+  }
+};
+
+}  // namespace detail
+
+template <size_t N>
+using UnrollFillConstant = detail::UnrollFillConstant<0, N, N == 0>;
+
+template <size_t N>
+using UnrollAssign = detail::UnrollAssign<0, N, N == 0>;
+
+template <typename T>
+using UnrollVarArgsAssign = detail::UnrollVarArgsAssign<T>;
+
+template <size_t N>
+using UnrollCompare = detail::UnrollCompare<0, N, N == 0>;
+
+template <size_t N>
+using UnrollAdd = detail::UnrollAdd<0, N, N == 0>;
+
+template <size_t N>
+using UnrollMul = detail::UnrollMul<0, N, N == 0>;
+
+template <size_t N>
+using UnrollProduct = detail::UnrollProduct<0, N, N == 0>;
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/unroll_array_ops_test.cc b/paddle/fluid/framework/unroll_array_ops_test.cc
new file mode 100644
index 0000000000..51433c83c8
--- /dev/null
+++ b/paddle/fluid/framework/unroll_array_ops_test.cc
@@ -0,0 +1,108 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/unroll_array_ops.h"
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <array>
+#include <cstdint>
+
+namespace paddle {
+namespace framework {
+
+template <typename T>
+bool CheckEquality(const T* p, size_t n, T val) {
+  return std::all_of(p, p + n, [val](const T& v) { return v == val; });
+}
+
+template <int D1, int D2>
+bool FillConstantTestMain() {
+  static_assert(D1 >= D2, "");
+  std::array<int, D1> arr;
+  arr.fill(0);
+
+  UnrollFillConstant<D2>::Run(arr.data(), 1);
+  return CheckEquality(arr.data(), D2, 1) &&
+         CheckEquality(arr.data() + D2, arr.size() - D2, 0);
+}
+
+TEST(unroll_ops, fill_constant) {
+  EXPECT_TRUE((FillConstantTestMain<9, 0>()));
+  EXPECT_TRUE((FillConstantTestMain<9, 1>()));
+  EXPECT_TRUE((FillConstantTestMain<9, 4>()));
+  EXPECT_TRUE((FillConstantTestMain<9, 9>()));
+}
+
+TEST(unroll_ops, assign) {
+  const int a[] = {1, 2, 3, 4, 5};
+  int b[] = {0, 0, 0, 0, 0};
+  UnrollAssign<3>::Run(a, b);
+  EXPECT_EQ(b[0], 1);
+  EXPECT_EQ(b[1], 2);
+  EXPECT_EQ(b[2], 3);
+  EXPECT_EQ(b[3], 0);
+  EXPECT_EQ(b[4], 0);
+}
+
+TEST(unroll_ops, var_args_assign) {
+  int a[] = {0, 0, 0};
+  UnrollVarArgsAssign<int>::Run(a, 1, 2);
+  EXPECT_EQ(a[0], 1);
+  EXPECT_EQ(a[1], 2);
+  EXPECT_EQ(a[2], 0);
+}
+
+TEST(unroll_ops, compare) {
+  int a[] = {1, 2, 3};
+  int b[] = {1, 2, 4};
+  EXPECT_TRUE(UnrollCompare<2>::Run(a, b));
+  EXPECT_FALSE(UnrollCompare<3>::Run(a, b));
+
+  b[0] = -1;
+  EXPECT_TRUE(UnrollCompare<0>::Run(a, b));
+  EXPECT_FALSE(UnrollCompare<1>::Run(a, b));
+}
+
+TEST(unroll_ops, add) {
+  int a[] = {2, 3, 4};
+  int b[] = {5, 10, 102};
+  int c[] = {0, 0, 0};
+  UnrollAdd<2>::Run(a, b, c);
+  EXPECT_EQ(a[0] + b[0], c[0]);
+  EXPECT_EQ(a[1] + b[1], c[1]);
+  EXPECT_EQ(c[2], 0);
+}
+
+TEST(unroll_ops, mul) {
+  int a[] = {2, 3, 4};
+  int b[] = {5, 10, 102};
+  int c[] = {0, 0, 0};
+  UnrollMul<2>::Run(a, b, c);
+  EXPECT_EQ(a[0] * b[0], c[0]);
+  EXPECT_EQ(a[1] * b[1], c[1]);
+  EXPECT_EQ(c[2], 0);
+}
+
+TEST(unroll_ops, product) {
+  int a[] = {2, 3, 4};
+  int b[] = {5, 10, 102};
+
+  EXPECT_EQ(UnrollProduct<3>::Run(a), a[0] * a[1] * a[2]);
+
+  EXPECT_EQ(UnrollProduct<3>::Run(a, b),
+            a[0] * b[0] + a[1] * b[1] + a[2] * b[2]);
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 342cb68ab2..26e7830265 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -42,13 +42,9 @@ void AddTo(Variable* src, Variable* dst) {
 
 class Autograd {
  public:
-  explicit Autograd(framework::Scope* scope) : scope_(scope) {}
+  Autograd() {}
 
   void RunBackward(VarBase* var) {
-    PADDLE_ENFORCE(var->pre_op_->op_desc_);
-    // TODO(panyx0718): Only create for vars that "require_grad"
-    (*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_;
-
     std::deque<OpBase*> ready;
     ready.push_back(var->pre_op_);
 
@@ -57,18 +53,22 @@ class Autograd {
     while (!ready.empty()) {
       OpBase* ready_op = ready.front();
       ready.pop_front();
-      std::vector<Variable*> input_grads = ready_op->ApplyGrad(scope_);
-
-      for (size_t i = 0; i < input_grads.size(); ++i) {
-        if (!input_grads[i]) continue;
-        OpBase* pre_op = ready_op->pre_ops_->at(i);
-        if (!pre_op) continue;
-
-        dep_counts[pre_op] -= 1;
-        PADDLE_ENFORCE(dep_counts[pre_op] >= 0);
-        bool pre_op_ready = dep_counts[pre_op] == 0;
-        if (pre_op_ready) {
-          ready.push_back(pre_op);
+      std::map<std::string, std::vector<VarBase*>> input_grads =
+          ready_op->ApplyGrad();
+
+      for (auto it : input_grads) {
+        const std::vector<VarBase*>& ingrads = it.second;
+        for (size_t i = 0; i < ingrads.size(); ++i) {
+          if (!ingrads[i]) continue;
+          OpBase* pre_op = ready_op->pre_ops_[it.first][i];
+          if (!pre_op) continue;
+
+          dep_counts[pre_op] -= 1;
+          PADDLE_ENFORCE(dep_counts[pre_op] >= 0);
+          bool pre_op_ready = dep_counts[pre_op] == 0;
+          if (pre_op_ready) {
+            ready.push_back(pre_op);
+          }
         }
       }
     }
@@ -85,138 +85,84 @@ class Autograd {
     while (!queue.empty()) {
       OpBase* candidate = queue.front();
       queue.pop_front();
-      for (OpBase* pre_op : *(candidate->pre_ops_)) {
-        if (!pre_op) continue;
-        if (visited.find(pre_op) == visited.end()) {
-          visited.insert(pre_op);
-          queue.push_back(pre_op);
+      for (auto it : candidate->pre_ops_) {
+        for (OpBase* pre_op : it.second) {
+          if (!pre_op) continue;
+          if (visited.find(pre_op) == visited.end()) {
+            visited.insert(pre_op);
+            queue.push_back(pre_op);
+          }
+          ret[pre_op] += 1;
         }
-        ret[pre_op] += 1;
       }
     }
-
     return ret;
   }
-
-  framework::Scope* scope_;
 };
 
-framework::Variable* CreateVariable(const std::string& name,
-                                    const framework::DDim& dim, float val,
-                                    framework::Scope* scope,
-                                    bool random_name = true) {
-  std::string varname = name;
-  if (random_name) {
-    std::mt19937 rng;
-    rng.seed(std::random_device()());
-    std::uniform_int_distribution<std::mt19937::result_type> dist6(
-        1, std::numeric_limits<int>::max());
-    int id = dist6(rng);
-    varname = string::Sprintf("%s@%d", varname, id);
-  }
-
-  VLOG(3) << "creating var " << varname;
-  framework::Variable* var = scope->Var(varname);
-  framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
-
-  float* data = tensor->mutable_data<float>(dim, platform::CPUPlace());
-  std::fill(data, data + tensor->numel(), val);
-  return var;
-}
-
 framework::LoDTensor& VarBase::Grad() {
   VLOG(3) << "get var grad " << var_desc_->Name();
   return *grads_->GetMutable<framework::LoDTensor>();
 }
 
-void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) {
-  VLOG(3) << "apply var grad " << var_desc_->Name() << " "
-          << grad->Get<framework::LoDTensor>().data<float>()[0];
-  if (!grads_) {
-    grads_ =
-        CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()),
-                       var_->Get<framework::LoDTensor>().dims(), 0.0, scope);
+std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
+  if (!grad_op_desc_) {
+    VLOG(3) << "op with no grad: " << op_desc_->Type();
+    return {};
   }
-  AddTo(grad, grads_);
-  VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " "
-          << grads_->Get<framework::LoDTensor>().data<float>()[0];
-}
-
-std::vector<Variable*> OpBase::ApplyGrad(framework::Scope* scope) {
   VLOG(3) << "op grad " << grad_op_desc_->Type();
 
-  for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) {
-    if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) {
-      // grad op inputs can be forward inputs, so not in grad_to_var.
-      continue;
-    }
-    VLOG(3) << "op grad in var " << grad_invar;
-    block_->FindRecursiveOrCreateVar(grad_invar);
-    framework::Variable* var = scope->Var(grad_invar);
-    const std::string& invar = grad_to_var_->at(grad_invar);
-    for (VarBase* varbase : *output_vars_) {
-      // Use the accumulated grads_ by sharing the input with grads_.
-      if (varbase->var_desc_->Name() == invar) {
-        var->GetMutable<framework::LoDTensor>()->ShareDataWith(
-            varbase->grads_->Get<framework::LoDTensor>());
-        break;
-      }
+  std::vector<std::unique_ptr<framework::Variable>> tmp_vars;
+  std::map<std::string, std::vector<framework::Variable*>> grad_outputs;
+  for (auto it : grad_output_vars_) {
+    auto& outputs = grad_outputs[it.first];
+    for (size_t i = 0; i < it.second.size(); ++i) {
+      tmp_vars.emplace_back(new framework::Variable());
+      outputs.push_back(tmp_vars.back().get());
+      outputs.back()->GetMutable<framework::LoDTensor>();
     }
   }
 
-  for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
-    VLOG(3) << "grad outvar " << outvar;
-    block_->FindRecursiveOrCreateVar(outvar);
-    framework::Variable* var = scope->Var(outvar);
-    if (!var->IsInitialized()) {
-      framework::VarDesc* var_desc = block_->FindVar(outvar);
-      if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
-        var->GetMutable<framework::LoDTensor>();
-      } else {
-        LOG(ERROR) << "tracer doesn't support yet";
-      }
-    }
-  }
-  grad_op_desc_->InferShape(*block_);
+  framework::RuntimeContext ctx(grad_input_vars_, grad_outputs);
+
+  // No need to do static infer shape here.
+  // grad_op_desc_->InferShape(*block_);
   grad_op_desc_->InferVarType(block_);
+
   std::unique_ptr<framework::OperatorBase> opbase =
       framework::OpRegistry::CreateOp(*grad_op_desc_);
-
-  opbase->Run(*scope, platform::CPUPlace());
-
-  // `ret` matches exactly with `input_vars_` of forward op.
-  std::vector<Variable*> ret;
-  for (size_t i = 0; i < input_vars_->size(); ++i) {
-    bool found = false;
-    VarBase* origin_var = (*input_vars_)[i];
-    for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
-      Variable* var = scope->FindVar(outvar);
-      std::string orig_var = grad_to_var_->at(outvar);
-      if (origin_var->var_desc_->Name() != orig_var) {
-        continue;
-      }
-      VLOG(3) << "apply grad " << outvar << " with origin " << orig_var;
-      origin_var->ApplyGrad(scope, var);
-      found = true;
-      ret.push_back(var);
-      // TODO(panyx0718): There might be another outvar with the same name.
-      // In that case, it doesn't matter the first one or the second one is
-      // used.
-      break;
-    }
-    if (!found) {
-      ret.push_back(nullptr);
+  framework::OperatorWithKernel* op_kernel =
+      dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
+  PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
+
+  framework::Scope scope;
+  platform::CPUPlace place;
+  PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
+  p.op.RuntimeInferShape(scope, place, ctx);
+  p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+
+  for (auto it : grad_output_vars_) {
+    auto& outputs = grad_outputs[it.first];
+    auto& origin_outputs = it.second;
+    for (size_t i = 0; i < outputs.size(); ++i) {
+      framework::Variable* orig_grad = origin_outputs[i];
+      AddTo(outputs[i], orig_grad);
     }
   }
-  return ret;
+  return input_vars_;
 }
 
-void VarBase::RunBackward(framework::Scope* scope) {
-  grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()),
-                          var_->Get<framework::LoDTensor>().dims(), 1.0, scope,
-                          false);
+void VarBase::RunBackward() {
   if (!pre_op_) return;
-  Autograd(scope).RunBackward(this);
+
+  auto grads_t = grads_->GetMutable<framework::LoDTensor>();
+  float* data = grads_t->mutable_data<float>(platform::CPUPlace());
+  std::fill(data, data + grads_t->numel(), 1.0);
+
+  PADDLE_ENFORCE(
+      grads_ ==
+      pre_op_->output_vars_[pre_op_out_name_][pre_op_out_idx_]->grads_);
+  Autograd().RunBackward(this);
 }
 
 }  // namespace imperative
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 85a71ca83d..ae4e8e0f8a 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -14,17 +14,69 @@
 
 #pragma once
 
+#include <map>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace imperative {
 
+class PreparedOp {
+ public:
+  PreparedOp(const framework::OperatorBase& op,
+             const framework::RuntimeContext& ctx,
+             framework::OperatorWithKernel::OpKernelFunc func,
+             platform::DeviceContext* dev_ctx)
+      : op(op), ctx(ctx), func(func), dev_ctx(dev_ctx) {}
+
+  static PreparedOp Prepare(const framework::RuntimeContext& ctx,
+                            const framework::OperatorWithKernel& op,
+                            const platform::Place& place) {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto* dev_ctx = pool.Get(place);
+
+    // check if op[type] has kernel registered.
+    auto& all_op_kernels = op.AllOpKernels();
+    auto kernels_iter = all_op_kernels.find(op.Type());
+    if (kernels_iter == all_op_kernels.end()) {
+      PADDLE_THROW(
+          "There are no kernels which are registered in the %s operator.",
+          op.Type());
+    }
+
+    framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second;
+
+    auto expected_kernel_key = op.GetExpectedKernelType(
+        framework::ExecutionContext(op, framework::Scope(), *dev_ctx, ctx));
+    VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+
+    auto kernel_iter = kernels.find(expected_kernel_key);
+#ifdef PADDLE_WITH_MKLDNN
+    // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
+    if (kernel_iter == kernels.end() &&
+        expected_kernel_key.library_type_ == framework::LibraryType::kMKLDNN) {
+      VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
+      expected_kernel_key.library_type_ = framework::LibraryType::kPlain;
+      expected_kernel_key.data_layout_ = framework::DataLayout::kAnyLayout;
+      kernel_iter = kernels.find(expected_kernel_key);
+    }
+#endif
+    if (kernel_iter == kernels.end()) {
+      PADDLE_THROW("op %s does not have kernel for %s", op.Type(),
+                   KernelTypeToString(expected_kernel_key));
+    }
+    return PreparedOp(op, ctx, kernel_iter->second, dev_ctx);
+  }
+
+  const framework::OperatorBase& op;
+  const framework::RuntimeContext& ctx;
+  framework::OperatorWithKernel::OpKernelFunc func;
+  platform::DeviceContext* dev_ctx;
+};
 class OpBase;
 
 class VarBase {
@@ -33,18 +85,26 @@ class VarBase {
       : pre_op_(nullptr),
         pre_op_out_idx_(-1),
         var_desc_(nullptr),
-        var_(nullptr),
-        grads_(nullptr) {}
-
-  virtual ~VarBase() {}
-
-  void ApplyGrad(framework::Scope* scope, framework::Variable* grad);
+        var_(new framework::Variable()),
+        grads_(new framework::Variable()) {}
+
+  virtual ~VarBase() {
+    if (var_) {
+      delete var_;
+      var_ = nullptr;
+    }
+    if (grads_) {
+      delete grads_;
+      grads_ = nullptr;
+    }
+  }
 
-  void RunBackward(framework::Scope* scope);
+  void RunBackward();
 
   framework::LoDTensor& Grad();
 
   OpBase* pre_op_;
+  std::string pre_op_out_name_;
   int pre_op_out_idx_;
 
   framework::VarDesc* var_desc_;
@@ -54,35 +114,24 @@ class VarBase {
 
 class OpBase {
  public:
-  OpBase()
-      : input_vars_(new std::vector<VarBase*>()),
-        output_vars_(new std::vector<VarBase*>()),
-        pre_ops_(new std::vector<OpBase*>()),
-        pre_ops_out_idx_(new std::vector<int>()),
-        op_desc_(nullptr),
-        grad_op_desc_(nullptr) {}
+  OpBase() : op_desc_(nullptr), grad_op_desc_(nullptr) {}
 
   virtual ~OpBase() {
-    delete input_vars_;
-    delete output_vars_;
-
-    delete pre_ops_;
-    delete pre_ops_out_idx_;
-
     if (grad_op_desc_) delete grad_op_desc_;
-    if (grad_to_var_) delete grad_to_var_;
   }
 
-  std::vector<framework::Variable*> ApplyGrad(framework::Scope* scope);
+  std::map<std::string, std::vector<VarBase*>> ApplyGrad();
 
-  std::vector<VarBase*>* input_vars_;
-  std::vector<VarBase*>* output_vars_;
-  std::vector<OpBase*>* pre_ops_;
-  std::vector<int>* pre_ops_out_idx_;
   framework::OpDesc* op_desc_;
-
   framework::OpDesc* grad_op_desc_;
-  std::unordered_map<std::string, std::string>* grad_to_var_;
+
+  std::map<std::string, std::vector<VarBase*>> input_vars_;
+  std::map<std::string, std::vector<VarBase*>> output_vars_;
+  std::map<std::string, std::vector<OpBase*>> pre_ops_;
+  std::map<std::string, std::vector<int>> pre_ops_out_idx_;
+
+  std::map<std::string, std::vector<framework::Variable*>> grad_input_vars_;
+  std::map<std::string, std::vector<framework::Variable*>> grad_output_vars_;
   framework::BlockDesc* block_;
 };
 
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
index 97772dc110..c814da9853 100644
--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
@@ -20,7 +20,6 @@
 
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/imperative/engine.h"
 #include "paddle/fluid/imperative/layer.h"
 
@@ -41,22 +40,28 @@ void CreateGradOp(const framework::OpDesc& op_desc,
   *grad_op_desc = grad_op_descs[0].release();
 }
 
+void InitVar(framework::Variable* var, framework::Variable* grad_var) {
+  auto& var_t = var->Get<framework::LoDTensor>();
+  float* data =
+      grad_var->GetMutable<framework::LoDTensor>()->mutable_data<float>(
+          var_t.dims(), platform::CPUPlace());
+  std::fill(data, data + var_t.numel(), 0.0);
+}
+
 class Tracer {
  public:
   explicit Tracer(framework::BlockDesc* root_block,
                   framework::BlockDesc* startup_block)
-      : root_block_(root_block), startup_block_(startup_block) {
-    root_scope_ = new framework::Scope();
-    scopes_[root_block_] = root_scope_;
-    scopes_[startup_block_] = root_scope_;
-  }
+      : root_block_(root_block), startup_block_(startup_block) {}
 
-  virtual ~Tracer() { delete root_scope_; }
+  virtual ~Tracer() {}
 
-  void Trace(OpBase* op, const std::vector<VarBase*>& inputs,
-             const std::vector<VarBase*>& outputs,
+  void Trace(OpBase* op,
+             const std::map<std::string, std::vector<VarBase*>>& inputs,
+             const std::map<std::string, std::vector<VarBase*>>& outputs,
              framework::BlockDesc* block) {
-    framework::Scope* scope = GetScope(block);
+    std::map<std::string, VarBase*> vars;
+
     framework::OpDesc* op_desc = op->op_desc_;
     VLOG(3) << "tracer tracing " << op_desc->Type();
     op_desc->InferShape(*block);
@@ -64,77 +69,113 @@ class Tracer {
     std::unique_ptr<framework::OperatorBase> op_base =
         framework::OpRegistry::CreateOp(*op_desc);
 
-    *op->input_vars_ = inputs;
-    for (VarBase* input : inputs) {
-      const std::string vname = input->var_desc_->Name();
-      framework::Variable* var = scope->Var(vname);
-      input->var_ = var;
-      if (!var->IsInitialized()) {
-        framework::VarDesc* var_desc = block->FindVar(vname);
-        if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
-          var->GetMutable<framework::LoDTensor>();
+    framework::VariableValueMap invars_map;
+    framework::VariableValueMap outvars_map;
+
+    op->input_vars_ = inputs;
+    for (auto it : op->input_vars_) {
+      auto& invars = invars_map[it.first];
+      for (VarBase* inp : it.second) {
+        PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr",
+                                op->op_desc_->Type(), inp->var_desc_->Name());
+
+        invars.push_back(inp->var_);
+        vars[inp->var_desc_->Name()] = inp;
+        if (inp->pre_op_) {
+          op->pre_ops_[it.first].push_back(inp->pre_op_);
+          op->pre_ops_out_idx_[it.first].push_back(inp->pre_op_out_idx_);
         } else {
-          LOG(ERROR) << "tracer doesn't support yet";
+          op->pre_ops_[it.first].push_back(nullptr);
         }
+        VLOG(3) << "input vname " << inp->var_desc_->Name() << " "
+                << inp->var_->IsInitialized();
       }
-      if (input->pre_op_) {
-        op->pre_ops_->push_back(input->pre_op_);
-        op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_);
-      } else {
-        op->pre_ops_->push_back(nullptr);
-      }
-      VLOG(3) << "input vname " << vname << " "
-              << var->Get<framework::LoDTensor>().dims().size();
     }
 
-    *op->output_vars_ = outputs;
-    for (size_t i = 0; i < outputs.size(); ++i) {
-      const std::string vname = outputs[i]->var_desc_->Name();
-      framework::Variable* var = scope->Var(vname);
-      if (!var->IsInitialized()) {
-        framework::VarDesc* var_desc = block->FindVar(vname);
+    op->output_vars_ = outputs;
+    for (auto it : op->output_vars_) {
+      auto& outvars = outvars_map[it.first];
+      const std::vector<VarBase*>& outputs = it.second;
+      for (size_t i = 0; i < outputs.size(); ++i) {
+        VarBase* out = outputs[i];
+        outvars.push_back(out->var_);
+        vars[out->var_desc_->Name()] = out;
+
+        framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name());
         if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
-          var->GetMutable<framework::LoDTensor>();
+          out->var_->GetMutable<framework::LoDTensor>();
         } else {
           LOG(ERROR) << "tracer doesn't support yet";
         }
+        out->pre_op_ = op;
+        out->pre_op_out_name_ = it.first;
+        out->pre_op_out_idx_ = i;
+
+        VLOG(3) << "output vname " << out->var_desc_->Name() << " "
+                << out->var_->IsInitialized();
       }
-      outputs[i]->var_ = var;
-      outputs[i]->pre_op_ = op;
-      outputs[i]->pre_op_out_idx_ = i;
     }
 
     VLOG(3) << "tracer running " << op_desc->Type();
-    op_base->Run(*scope, platform::CPUPlace());
+    framework::RuntimeContext ctx(invars_map, outvars_map);
+
+    // TODO(panyx0718): Cache p.
+    framework::OperatorWithKernel* op_kernel =
+        dynamic_cast<framework::OperatorWithKernel*>(op_base.get());
+    PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
+
+    framework::Scope scope;
+    platform::CPUPlace place;
+    PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
+    p.op.RuntimeInferShape(scope, place, ctx);
+    p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+
     if (block == startup_block_) {
       op->grad_op_desc_ = nullptr;
-      op->grad_to_var_ = nullptr;
     } else {
       framework::OpDesc* grad_op_desc;
       auto grad_to_var = new std::unordered_map<std::string, std::string>();
       CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
       op->grad_op_desc_ = grad_op_desc;
-      op->grad_to_var_ = grad_to_var;
-    }
-    op->block_ = block;
-  }
 
-  framework::Scope* GetScope(framework::BlockDesc* block) {
-    if (scopes_.find(block) != scopes_.end()) {
-      return scopes_.at(block);
+      for (auto it : grad_op_desc->Inputs()) {
+        auto& grad_in_vars = op->grad_input_vars_[it.first];
+        for (const std::string& grad_invar : it.second) {
+          block->FindRecursiveOrCreateVar(grad_invar);
+          auto var_it = grad_to_var->find(grad_invar);
+          if (var_it == grad_to_var->end()) {
+            auto fwd_var_it = vars.find(grad_invar);
+            PADDLE_ENFORCE(fwd_var_it != vars.end());
+            grad_in_vars.push_back(fwd_var_it->second->var_);
+          } else {
+            VarBase* var = vars[var_it->second];
+            if (!var->grads_->IsInitialized()) {
+              InitVar(var->var_, var->grads_);
+            }
+            grad_in_vars.push_back(var->grads_);
+          }
+        }
+      }
+      for (auto it : grad_op_desc->Outputs()) {
+        auto& grad_out_vars = op->grad_output_vars_[it.first];
+        for (const std::string& grad_outvar : it.second) {
+          block->FindRecursiveOrCreateVar(grad_outvar);
+          auto var_it = grad_to_var->find(grad_outvar);
+          PADDLE_ENFORCE(var_it != grad_to_var->end());
+          VarBase* var = vars[var_it->second];
+          if (!var->grads_->IsInitialized()) {
+            InitVar(var->var_, var->grads_);
+          }
+          grad_out_vars.push_back(var->grads_);
+        }
+      }
     }
-    framework::BlockDesc* parent_block = block->ParentBlock();
-    PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end());
-    framework::Scope* scope = &scopes_[parent_block]->NewScope();
-    scopes_[block] = scope;
-    return scope;
+    op->block_ = block;
   }
 
  private:
-  std::map<framework::BlockDesc*, framework::Scope*> scopes_;
   framework::BlockDesc* root_block_;
   framework::BlockDesc* startup_block_;
-  framework::Scope* root_scope_;
 };
 
 }  // namespace imperative
diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc
index 6446cab5ec..2e7f3edd55 100644
--- a/paddle/fluid/operators/controlflow/logical_op.cc
+++ b/paddle/fluid/operators/controlflow/logical_op.cc
@@ -86,8 +86,6 @@ class UnaryLogicalOpInferShape : public framework::InferShapeBase {
     OpComment comment;
     PADDLE_ENFORCE(context->HasInput("X"),
                    "Input(X) of %s operator must not be null", comment.type);
-    auto dim_x = context->GetInputDim("X");
-
     context->SetOutputDim("Out", context->GetInputDim("X"));
     context->ShareLoD("X", "Out");
   }
diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h
index 92d394eb3c..f172431e48 100644
--- a/paddle/fluid/operators/conv_cudnn_op_cache.h
+++ b/paddle/fluid/operators/conv_cudnn_op_cache.h
@@ -19,6 +19,10 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/platform/cudnn_helper.h"
 
+DECLARE_uint64(conv_workspace_size_limit);
+DECLARE_bool(cudnn_exhaustive_search);
+DECLARE_int64(cudnn_exhaustive_search_times);
+
 namespace paddle {
 namespace operators {
 
@@ -45,6 +49,7 @@ static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
 template <typename TAlgorithm>
 class AlgorithmsCache {
  public:
+  AlgorithmsCache() : search_times_(0) { hash_.clear(); }
   // Caches the best algorithm for a given
   // combination of tensor dimensions & compute data type.
   TAlgorithm GetAlgorithm(
@@ -54,9 +59,14 @@ class AlgorithmsCache {
       int algorithmFlags,  // can set for different data type
       std::function<TAlgorithm()> gen_func);
 
+  TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags,
+                          std::function<TAlgorithm()> gen_func);
+
  private:
   std::unordered_map<int64_t, TAlgorithm> hash_;
   std::mutex mutex_;
+
+  int search_times_;
 };
 
 template <typename TAlgorithm>
@@ -107,5 +117,29 @@ TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
   return hash_[seed];
 }
 
+template <typename TAlgorithm>
+TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
+    int64_t area, int search_times, int algorithmFlags,
+    std::function<TAlgorithm()> gen_func) {
+  if (hash_.find(area) != hash_.end()) {
+    return hash_[area];
+  }
+  if (search_times_ < search_times) {
+    auto algo = gen_func();
+    hash_[area] = algo;
+    ++search_times_;
+    return algo;
+  }
+  TAlgorithm algo;
+  int64_t min = static_cast<uint64_t>(INT_MAX);
+  for (const auto& m : hash_) {
+    if (m.first < min) {
+      min = m.first;
+      algo = m.second;
+    }
+  }
+  return algo;
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/conv_fusion_op.cc b/paddle/fluid/operators/conv_fusion_op.cc
index 9bdedb10e0..23b8087e78 100644
--- a/paddle/fluid/operators/conv_fusion_op.cc
+++ b/paddle/fluid/operators/conv_fusion_op.cc
@@ -28,6 +28,8 @@ namespace operators {
 //         x is Input,
 //         z is ResidualData,
 //         bias is Bias
+// When `split_channels` is set, y will be splitted into multiple outputs,
+// each output has split_channels[i] number of channels.
 class Conv2DFusionOpMaker : public Conv2DOpMaker {
  protected:
   void Apply() override {
@@ -36,8 +38,65 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker {
         "The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' "
         "'relux' , 'tanh', 'band_pass'")
         .SetDefault("relu");
+    AddAttr<std::vector<int>>(
+        "split_channels",
+        "When `split_channels` are set, there will be multiple outputs, the "
+        "output size is equal to the number of `split_channels`.")
+        .SetDefault({});
+    AddOutput("Outputs",
+              "This Outputs is used when setting `split_channels`."
+              "Usually used to fuse conv with same input and same filter size, "
+              "padding, stride, dilation size.")
+        .AsDuplicable()
+        .AsDispensable();
+    AddInput("AlgoCache",
+             "The cache of convolution algorithm, a RAW type variable.")
+        .AsDispensable();
+    AddAttr<int>(
+        "search_times",
+        "The number of exhaustive search times for convolution algorithm.")
+        .SetDefault(-1);
   }
 };
+
+class Conv2DFusionOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of ConvOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Filter"),
+                   "Input(Filter) of ConvOp should not be null.");
+    auto in_dims = ctx->GetInputDim("Input");
+    auto filter_dims = ctx->GetInputDim("Filter");
+
+    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    std::vector<int> dilations =
+        ctx->Attrs().Get<std::vector<int>>("dilations");
+
+    std::vector<int64_t> oshape({in_dims[0], filter_dims[0]});
+    for (size_t i = 0; i < strides.size(); ++i) {
+      oshape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                      dilations[i], paddings[i], strides[i]));
+    }
+    PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                   "Output(Output) of ConvOp should not be null.");
+    ctx->SetOutputDim("Output", framework::make_ddim(oshape));
+    std::vector<int> channels =
+        ctx->Attrs().Get<std::vector<int>>("split_channels");
+    if (channels.size()) {
+      PADDLE_ENFORCE(ctx->HasOutputs("Outputs"),
+                     "Output(Outputs) of ConvOp should not be null.");
+      std::vector<framework::DDim> oshapes;
+      oshapes.reserve(channels.size());
+      for (size_t i = 0; i < channels.size(); ++i) {
+        oshapes.push_back({oshape[0], channels[i], oshape[2], oshape[3]});
+      }
+      ctx->SetOutputsDim("Outputs", oshapes);
+    }
+  }
+};
+
 // TODO(qingqing): add gradient operator for conv2d_fusion
 
 }  // namespace operators
@@ -45,4 +104,5 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(conv2d_fusion, ops::ConvOp, ops::Conv2DFusionOpMaker,
-                  ops::ConvOpInferVarType, paddle::framework::EmptyGradOpMaker);
+                  ops::Conv2DFusionOpInferShape, ops::ConvOpInferVarType,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc
index e73762f5fb..d8b997cca6 100644
--- a/paddle/fluid/operators/conv_fusion_op.cu.cc
+++ b/paddle/fluid/operators/conv_fusion_op.cu.cc
@@ -16,8 +16,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 
-DECLARE_uint64(conv_workspace_size_limit);
-DECLARE_bool(cudnn_exhaustive_search);
+DEFINE_int64(cudnn_exhaustive_search_times, -1,
+             "Exhaustive search times for cuDNN convolution, "
+             "defalut is 1, only search once.");
 
 namespace paddle {
 namespace operators {
@@ -117,41 +118,60 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
           workspace_size_limit, &algo));
       VLOG(3) << "cuDNN forward algo " << algo;
     } else {
+      auto search_func = [&]() {
+        int returned_algo_count;
+        std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
+            fwd_perf_stat;
+        auto cudnn_find_func = [&](void* cudnn_workspace) {
+          CUDNN_ENFORCE(
+              platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
+                  handle, cudnn_input_desc, input_data, cudnn_filter_desc,
+                  filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
+                  kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
+                  fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit));
+        };
+        workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
+        VLOG(3) << "Perf result: (algo: stat, time, memory)";
+        for (int i = 0; i < returned_algo_count; ++i) {
+          const auto& stat = fwd_perf_stat[i];
+          VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time << " "
+                  << stat.memory;
+        }
+        return fwd_perf_stat[0].algo;
+      };
       AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
-      if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
+      int search_times = ctx.Attr<int>("search_times");
+      search_times = std::max(
+          static_cast<int>(FLAGS_cudnn_exhaustive_search_times), search_times);
+      if (search_times > 0) {
+        // The searched algo will be cached by `search_times` times for
+        // different input dimension. For other dimensions, select the algo
+        // of closest area.
+        auto var_name = ctx.Inputs("AlgoCache")[0];
         algo_cache =
             ctx.scope()
-                .FindVar(kCUDNNFwdAlgoCache)
+                .FindVar(var_name)
                 ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
+        algo = algo_cache->GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0,
+                                        search_func);
       } else {
-        algo_cache =
-            const_cast<framework::Scope&>(ctx.scope())
-                .Var(kCUDNNFwdAlgoCache)
-                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
+        // Cache searched algo in Var(kCUDNNFwdAlgoCache).
+        // all conv ops use the same kCUDNNFwdAlgoCache variable.
+        if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
+          algo_cache =
+              ctx.scope()
+                  .FindVar(kCUDNNFwdAlgoCache)
+                  ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
+        } else {
+          // TODO(qingqing) remove const_cast
+          algo_cache =
+              const_cast<framework::Scope*>(ctx.scope().parent())
+                  ->Var(kCUDNNFwdAlgoCache)
+                  ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
+        }
+        algo = algo_cache->GetAlgorithm(x_dims, f_dims, strides, paddings,
+                                        dilations, 0, search_func);
       }
-      algo = algo_cache->GetAlgorithm(
-          x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
-            int returned_algo_count;
-            std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
-                fwd_perf_stat;
-            auto cudnn_find_func = [&](void* cudnn_workspace) {
-              CUDNN_ENFORCE(
-                  platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
-                      handle, cudnn_input_desc, input_data, cudnn_filter_desc,
-                      filter_data, cudnn_conv_desc, cudnn_output_desc,
-                      output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
-                      fwd_perf_stat.data(), cudnn_workspace,
-                      workspace_size_limit));
-            };
-            workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
-            VLOG(3) << "Perf result: (algo: stat, time, memory)";
-            for (int i = 0; i < returned_algo_count; ++i) {
-              const auto& stat = fwd_perf_stat[i];
-              VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
-                      << " " << stat.memory;
-            }
-            return fwd_perf_stat[0].algo;
-          });
       VLOG(3) << "choose algo " << algo;
     }
 
@@ -195,6 +215,27 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
       };
       workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
     }
+    std::vector<int> channels = ctx.Attr<std::vector<int>>("split_channels");
+    if (channels.size()) {
+      auto outs = ctx.MultiOutput<framework::Tensor>("Outputs");
+      if (x_dims[0] == 1) {
+        // share data with Output
+        framework::Tensor t;
+        t.ShareDataWith(*output);
+        auto y_dims = output->dims();
+        t.Resize({y_dims[1], y_dims[2], y_dims[3]});
+        int s = 0;
+        for (size_t i = 0; i < channels.size(); ++i) {
+          int e = s + channels[i];
+          outs[i]->ShareDataWith(t.Slice(s, e));
+          outs[i]->Resize({x_dims[0], channels[i], y_dims[2], y_dims[3]});
+          s = e;
+        }
+      } else {
+        // TODO(qingiqng): do copy when batch size large than 1
+        PADDLE_THROW("Batch size greater than 1 is Unsupported");
+      }
+    }
   }
 };
 #endif
diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h
index 2d7d33bd4f..cfc2cac7be 100644
--- a/paddle/fluid/operators/crop_op.h
+++ b/paddle/fluid/operators/crop_op.h
@@ -68,7 +68,6 @@ void CropFunction(const framework::ExecutionContext& context) {
   }
   out->mutable_data<T>(out_dims, context.GetPlace());
   auto x_stride = framework::stride(x->dims());
-  auto out_stride = framework::stride(out->dims());
   auto offsets = GetOffsets(context);
   int64_t offset = 0;
   for (size_t i = 0; i < offsets.size(); ++i) {
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
index fae0925149..1bf41ed948 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -147,7 +147,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
             ->GetMutable<CudnnRNNCache>();
 
     auto input_dims = input->dims();
-    auto weight_dims = weight->dims();
     auto init_h_dims = init_h->dims();
     auto init_c_dims = init_c->dims();
     in_grad->mutable_data<T>(ctx.GetPlace());
diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h
index 0b7c470fe7..94419d1f9a 100644
--- a/paddle/fluid/operators/detail/strided_memcpy.h
+++ b/paddle/fluid/operators/detail/strided_memcpy.h
@@ -27,8 +27,8 @@ struct StridedMemcpyFunctor;
 template <typename T>
 struct StridedMemcpyFunctor<T, 0> {
   void operator()(const platform::DeviceContext& dev_ctx, const T* src,
-                  framework::Dim<0> src_stride, framework::Dim<0> dst_dim,
-                  framework::Dim<0> dst_stride, T* dst) const {
+                  const int64_t* src_stride, const int64_t* dst_dim,
+                  const int64_t* dst_stride, T* dst) const {
     auto place = dev_ctx.GetPlace();
     if (platform::is_cpu_place(place)) {
       auto& cpu_place = boost::get<platform::CPUPlace>(place);
@@ -50,18 +50,18 @@ struct StridedMemcpyFunctor<T, 0> {
 template <typename T>
 struct StridedMemcpyFunctor<T, 1> {
   void operator()(const platform::DeviceContext& dev_ctx, const T* src,
-                  framework::Dim<1> src_stride, framework::Dim<1> dst_dim,
-                  framework::Dim<1> dst_stride, T* dst) const {
+                  const int64_t* src_stride, const int64_t* dst_dim,
+                  const int64_t* dst_stride, T* dst) const {
     auto place = dev_ctx.GetPlace();
     if (platform::is_cpu_place(place)) {
       auto& cpu_place = boost::get<platform::CPUPlace>(place);
-      memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head);
+      memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]);
     } else {
 #ifdef PADDLE_WITH_CUDA
       auto& gpu_place = boost::get<platform::CUDAPlace>(place);
       auto& cuda_ctx =
           reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
-      memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head,
+      memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim[0],
                    cuda_ctx.stream());
 #else
       PADDLE_THROW("Paddle is not compiled with GPU");
@@ -73,19 +73,19 @@ struct StridedMemcpyFunctor<T, 1> {
 template <typename T, int Rank>
 struct StridedMemcpyFunctor {
   void operator()(const platform::DeviceContext& dev_ctx, const T* src,
-                  framework::Dim<Rank> src_stride, framework::Dim<Rank> dst_dim,
-                  framework::Dim<Rank> dst_stride, T* dst) const {
-    for (int64_t i = 0; i < dst_dim.head; ++i) {
+                  const int64_t* src_stride, const int64_t* dst_dim,
+                  const int64_t* dst_stride, T* dst) const {
+    for (int64_t i = 0; i < dst_dim[0]; ++i) {
       StridedMemcpyFunctor<T, Rank - 1> func;
-      func(dev_ctx, src, src_stride.tail, dst_dim.tail, dst_stride.tail, dst);
-      src += src_stride.head;
-      dst += dst_stride.head;
+      func(dev_ctx, src, src_stride + 1, dst_dim + 1, dst_stride + 1, dst);
+      src += src_stride[0];
+      dst += dst_stride[0];
     }
   }
 };
 
 template <typename T>
-struct StridedCopyDimVisitor : public boost::static_visitor<void> {
+struct StridedCopyDimVisitor {
   StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src,
                         const framework::DDim& src_stride,
                         const framework::DDim& dst_stride, T* dst)
@@ -95,13 +95,11 @@ struct StridedCopyDimVisitor : public boost::static_visitor<void> {
         dst_stride_(dst_stride),
         dst_(dst) {}
 
-  template <typename Dim>
-  void operator()(Dim dst_dim) const {
-    Dim src_stride = boost::get<Dim>(src_stride_);
-    Dim dst_stride = boost::get<Dim>(dst_stride_);
-    constexpr int dim = Dim::dimensions;
-    StridedMemcpyFunctor<T, dim> functor;
-    functor(dev_ctx_, src_, src_stride, dst_dim, dst_stride, dst_);
+  template <int D>
+  void operator()(const framework::Dim<D>& dst_dim) const {
+    StridedMemcpyFunctor<T, D> functor;
+    functor(dev_ctx_, src_, src_stride_.Get(), dst_dim.Get(), dst_stride_.Get(),
+            dst_);
   }
 
   const platform::DeviceContext& dev_ctx_;
diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
index fddd688401..a652d4d957 100644
--- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
@@ -64,8 +64,6 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
         "Output(BboxOutsideWeights) of RpnTargetAssignOp should not be null");
 
     auto rpn_rois_dims = ctx->GetInputDim("RpnRois");
-    auto gt_classes_dims = ctx->GetInputDim("GtClasses");
-    auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
     auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
     auto im_info_dims = ctx->GetInputDim("ImInfo");
 
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc
index 2c46803fd0..06e48f1262 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -53,12 +53,6 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput("Variances"),
                    "Input(Variances) shouldn't be null.");
 
-    auto scores_dims = ctx->GetInputDim("Scores");
-    auto bbox_deltas_dims = ctx->GetInputDim("BboxDeltas");
-    auto im_info_dims = ctx->GetInputDim("ImInfo");
-    auto anchors_dims = ctx->GetInputDim("Anchors");
-    auto variances_dims = ctx->GetInputDim("Variances");
-
     ctx->SetOutputDim("RpnRois", {-1, 4});
     ctx->SetOutputDim("RpnRoiProbs", {-1, 1});
   }
diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
index dc6c3d5a66..0b8053e8d0 100644
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -58,7 +58,6 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
 
     auto anchor_dims = ctx->GetInputDim("Anchor");
     auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
-    auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
     auto im_info_dims = ctx->GetInputDim("ImInfo");
     PADDLE_ENFORCE_EQ(anchor_dims.size(), 2,
                       "The rank of Input(Anchor) must be 2.");
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index eab4297c73..8a25d57e61 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -7,56 +7,52 @@ if(WITH_GRPC)
 else()
     set(cc_generic_services "true")
 endif()
-configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY)
+configure_file(send_recv.proto.in ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto @ONLY)
 
+# FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files
 set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-
 if(WITH_GRPC)
-  grpc_library(sendrecvop_rpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
-        request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc
-      PROTO send_recv.proto 
+  set(GRPC_SRCS grpc/grpc_client.cc grpc/grpc_server.cc grpc/grpc_serde.cc grpc/grpc_bytebuffer_stream.cc grpc/grpc_variable_response.cc)
+  grpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc
+        request_handler_impl.cc rpc_client.cc rpc_server.cc
+        variable_response.cc
+        collective_client.cc collective_server.cc
+        ${GRPC_SRCS}
+      PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto 
       DEPS lod_tensor selected_rows_functor memory)
 
   set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set(RPC_DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
 
-  cc_test(grpc_serde_test SRCS grpc_serde_test.cc 
-    DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_rpc scope profiler math_function SERIAL)
-
-  cc_test(rpc_server_test SRCS rpc_server_test.cc
-    DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor  proto_desc lookup_sparse_table_op SERIAL)
-
-  cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
-
-  if(WITH_GPU)
-  cc_test(collective_server_test SRCS collective_server_test.cc 
-      DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
-      selected_rows_functor  scope math_function SERIAL)
-  endif()
+  cc_test(grpc_serde_test SRCS grpc/grpc_serde_test.cc 
+    DEPS ${RPC_DEPS} scope profiler math_function SERIAL)
 
-  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
 else()
   set_source_files_properties(brpc_server.cc parameter_prefetch.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
       brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc collective_server.cc collective_server_test.cc
       collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 
-  brpc_library(sendrecvop_rpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc
-      brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc collective_client.cc collective_server.cc
-    PROTO send_recv.proto
+  set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc/server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc)
+  brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc
+      request_handler_impl.cc rpc_client.cc rpc_server.cc
+      variable_response.cc
+      collective_client.cc collective_server.cc
+      ${BRPC_SRCS}
+    PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto
     DEPS lod_tensor selected_rows memory)
 
-  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
-
-  set(brpc_test_depends sendrecvop_rpc brpc ssl crypto protobuf leveldb gflags glog executor
-      proto_desc lookup_sparse_table_op snappystream snappy zlib)
-
-  cc_test(rpc_server_test SRCS rpc_server_test.cc
-      DEPS ${brpc_test_depends} SERIAL)
+  set(RPC_DEPS sendrecvop_rpc brpc ssl crypto protobuf leveldb snappystream snappy zlib)
+  cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc
+      DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op SERIAL)
+endif()
 
-  cc_test(brpc_serde_test SRCS brpc_serde_test.cc
-      DEPS ${brpc_test_depends} SERIAL)
 
-  if(WITH_GPU)
-  cc_test(collective_server_test SRCS collective_server_test.cc 
-      DEPS ${brpc_test_depends} selected_rows_functor  scope math_function SERIAL)
-  endif()
+cc_test(rpc_server_test SRCS rpc_server_test.cc
+    DEPS ${RPC_DEPS} executor proto_desc lookup_sparse_table_op SERIAL)
+cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
+cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
+if(WITH_GPU)
+    cc_test(collective_server_test SRCS collective_server_test.cc 
+        DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
+        selected_rows_functor  scope math_function SERIAL)
 endif()
diff --git a/paddle/fluid/operators/distributed/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc
similarity index 99%
rename from paddle/fluid/operators/distributed/brpc_client.cc
rename to paddle/fluid/operators/distributed/brpc/brpc_client.cc
index 62e32977b8..87bdb83503 100644
--- a/paddle/fluid/operators/distributed/brpc_client.cc
+++ b/paddle/fluid/operators/distributed/brpc/brpc_client.cc
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/distributed/brpc_client.h"
+#include "paddle/fluid/operators/distributed/brpc/brpc_client.h"
 #include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h
similarity index 97%
rename from paddle/fluid/operators/distributed/brpc_client.h
rename to paddle/fluid/operators/distributed/brpc/brpc_client.h
index 80cc81bff3..2066ade8a5 100644
--- a/paddle/fluid/operators/distributed/brpc_client.h
+++ b/paddle/fluid/operators/distributed/brpc/brpc_client.h
@@ -31,10 +31,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/distributed_pb.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/operators/distributed/rpc_client.h"
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.cc b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc
similarity index 97%
rename from paddle/fluid/operators/distributed/brpc_rdma_pool.cc
rename to paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc
index e1be5673df..d5c614001e 100644
--- a/paddle/fluid/operators/distributed/brpc_rdma_pool.cc
+++ b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc
@@ -14,7 +14,7 @@
 
 #ifdef PADDLE_WITH_BRPC_RDMA
 
-#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h"
+#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h"
 #include "brpc/channel.h"
 #include "brpc/rdma/rdma_helper.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.h b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h
similarity index 100%
rename from paddle/fluid/operators/distributed/brpc_rdma_pool.h
rename to paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h
diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc
similarity index 96%
rename from paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc
rename to paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc
index e4604db3a3..49e048f07a 100644
--- a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc
@@ -20,10 +20,10 @@ limitations under the License. */
 #include <thread>  // NOLINT
 
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h"
-#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
-#include "paddle/fluid/operators/distributed/brpc_variable_response.h"
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h"
+#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h"
+#include "paddle/fluid/operators/distributed/distributed_pb.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h
similarity index 96%
rename from paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h
rename to paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h
index ffaf442224..a5bdc331eb 100644
--- a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h
+++ b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h
@@ -26,7 +26,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+#include "paddle/fluid/operators/distributed/distributed_pb.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc
similarity index 97%
rename from paddle/fluid/operators/distributed/brpc_serde_test.cc
rename to paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc
index 2a2dc72150..b902d3db48 100644
--- a/paddle/fluid/operators/distributed/brpc_serde_test.cc
+++ b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc
@@ -22,8 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
-#include "paddle/fluid/operators/distributed/brpc_variable_response.h"
+#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/operators/distributed/variable_response.h"
 #include "paddle/fluid/operators/math/math_function.h"
diff --git a/paddle/fluid/operators/distributed/brpc_server.cc b/paddle/fluid/operators/distributed/brpc/brpc_server.cc
similarity index 98%
rename from paddle/fluid/operators/distributed/brpc_server.cc
rename to paddle/fluid/operators/distributed/brpc/brpc_server.cc
index 78d41aeac5..cbe0bd09c7 100644
--- a/paddle/fluid/operators/distributed/brpc_server.cc
+++ b/paddle/fluid/operators/distributed/brpc/brpc_server.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/distributed/brpc_server.h"
+#include "paddle/fluid/operators/distributed/brpc/brpc_server.h"
 #include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
-#include "paddle/fluid/operators/distributed/brpc_variable_response.h"
+#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 
 namespace sendrecv {
diff --git a/paddle/fluid/operators/distributed/brpc_server.h b/paddle/fluid/operators/distributed/brpc/brpc_server.h
similarity index 95%
rename from paddle/fluid/operators/distributed/brpc_server.h
rename to paddle/fluid/operators/distributed/brpc/brpc_server.h
index 85a7ad0dfe..78bbe5adc0 100644
--- a/paddle/fluid/operators/distributed/brpc_server.h
+++ b/paddle/fluid/operators/distributed/brpc/brpc_server.h
@@ -19,8 +19,8 @@ limitations under the License. */
 #include <string>
 
 #include "brpc/server.h"
+#include "paddle/fluid/operators/distributed/distributed_pb.h"
 #include "paddle/fluid/operators/distributed/rpc_server.h"
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.cc b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc
similarity index 96%
rename from paddle/fluid/operators/distributed/brpc_variable_response.cc
rename to paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc
index 75306d7233..eb78917ad2 100644
--- a/paddle/fluid/operators/distributed/brpc_variable_response.cc
+++ b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 //
 
-#include "paddle/fluid/operators/distributed/brpc_variable_response.h"
+#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h"
 #include "paddle/fluid/operators/distributed/send_recv.pb.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.h b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h
similarity index 97%
rename from paddle/fluid/operators/distributed/brpc_variable_response.h
rename to paddle/fluid/operators/distributed/brpc/brpc_variable_response.h
index b0b91a42a0..6282f08a72 100644
--- a/paddle/fluid/operators/distributed/brpc_variable_response.h
+++ b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h
@@ -23,7 +23,7 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
 
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+#include "paddle/fluid/operators/distributed/distributed_pb.h"
 
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h
index 53b03c531a..6a3a450a1f 100644
--- a/paddle/fluid/operators/distributed/collective_client.h
+++ b/paddle/fluid/operators/distributed/collective_client.h
@@ -22,7 +22,7 @@
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 
 DECLARE_int32(rpc_deadline);
diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h
index a23dc18b4d..03c688a78e 100644
--- a/paddle/fluid/operators/distributed/collective_server.h
+++ b/paddle/fluid/operators/distributed/collective_server.h
@@ -23,7 +23,7 @@ limitations under the License. */
 
 #include "gflags/gflags.h"
 
-#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/distributed/rpc_server.h"
diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc
index 0a9c69e393..46c761000c 100644
--- a/paddle/fluid/operators/distributed/collective_server_test.cc
+++ b/paddle/fluid/operators/distributed/collective_server_test.cc
@@ -21,9 +21,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 
-#include "paddle/fluid/operators/detail/macros.h"
 #include "paddle/fluid/operators/distributed/collective_client.h"
 #include "paddle/fluid/operators/distributed/collective_server.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
@@ -52,12 +52,12 @@ std::unique_ptr<framework::Scope> GenerateVars(platform::Place place) {
   framework::Scope* scope = new framework::Scope();
   framework::Variable* var = scope->Var("var1");
   auto* slr = var->GetMutable<framework::SelectedRows>();
-  slr->set_height(1000);
+  slr->set_height(20000);
 
   auto* tensor = slr->mutable_value();
   auto* rows = slr->mutable_rows();
 
-  tensor->Resize(framework::make_ddim({3, 5}));
+  tensor->Resize(framework::make_ddim({20000, 1024}));
   tensor->mutable_data<float>(place);
 
   paddle::operators::math::set_constant(ctx, tensor, 32.7);
@@ -83,6 +83,7 @@ void Gather(const std::vector<distributed::RemoteVar>& vars,
 }
 
 TEST(PREFETCH, GPU) {
+  setenv("FLAGS_max_body_size", "2147483647", 1);
   platform::CUDAPlace place;
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto& ctx = *pool.Get(place);
diff --git a/paddle/fluid/operators/detail/macros.h b/paddle/fluid/operators/distributed/distributed.h
similarity index 80%
rename from paddle/fluid/operators/detail/macros.h
rename to paddle/fluid/operators/distributed/distributed.h
index 6f4a15caa5..3a9f922598 100644
--- a/paddle/fluid/operators/detail/macros.h
+++ b/paddle/fluid/operators/distributed/distributed.h
@@ -18,15 +18,15 @@
 
 #ifdef PADDLE_WITH_GRPC
 
-#include "paddle/fluid/operators/distributed/grpc_client.h"
-#include "paddle/fluid/operators/distributed/grpc_server.h"
+#include "paddle/fluid/operators/distributed/grpc/grpc_client.h"
+#include "paddle/fluid/operators/distributed/grpc/grpc_server.h"
 #define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer
 #define RPCCLIENT_T paddle::operators::distributed::GRPCClient
 
 #else  // PADDLE_WITH_GRPC
 
-#include "paddle/fluid/operators/distributed/brpc_client.h"
-#include "paddle/fluid/operators/distributed/brpc_server.h"
+#include "paddle/fluid/operators/distributed/brpc/brpc_client.h"
+#include "paddle/fluid/operators/distributed/brpc/brpc_server.h"
 #define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer
 #define RPCCLIENT_T paddle::operators::distributed::BRPCClient
 
diff --git a/paddle/fluid/operators/distributed/distributed_pb.h b/paddle/fluid/operators/distributed/distributed_pb.h
new file mode 100644
index 0000000000..f1c662be9a
--- /dev/null
+++ b/paddle/fluid/operators/distributed/distributed_pb.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifdef PADDLE_WITH_DISTRIBUTE
+
+#ifdef PADDLE_WITH_GRPC
+
+#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+
+#else  // PADDLE_WITH_GRPC
+
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+
+#endif  // PADDLE_WITH_GRPC
+
+#endif  // PADDLE_WITH_DISTRIBUTE
diff --git a/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc
similarity index 96%
rename from paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc
rename to paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc
index d192f54ee0..c2cb0d7f04 100644
--- a/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 //       file and did some modifications so that we can send gRPC
 //       requests without too much copying of the tensor data.
 
-#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h
similarity index 100%
rename from paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h
rename to paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
similarity index 99%
rename from paddle/fluid/operators/distributed/grpc_client.cc
rename to paddle/fluid/operators/distributed/grpc/grpc_client.cc
index 8c54159a41..7875c16c3c 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
@@ -17,8 +17,8 @@ limitations under the License. */
 
 #include "glog/logging.h"  // For VLOG
 #include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/distributed/grpc_client.h"
-#include "paddle/fluid/operators/distributed/grpc_serde.h"
+#include "paddle/fluid/operators/distributed/grpc/grpc_client.h"
+#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.h"
diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h
similarity index 98%
rename from paddle/fluid/operators/distributed/grpc_client.h
rename to paddle/fluid/operators/distributed/grpc/grpc_client.h
index 01bf46cc31..fa77d21257 100644
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h
@@ -39,10 +39,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/distributed/distributed_pb.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/operators/distributed/rpc_client.h"
-#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 
diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
similarity index 96%
rename from paddle/fluid/operators/distributed/grpc_serde.cc
rename to paddle/fluid/operators/distributed/grpc/grpc_serde.cc
index a9dea9cfd2..6df4fd36f9 100644
--- a/paddle/fluid/operators/distributed/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
@@ -21,9 +21,9 @@ limitations under the License. */
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h"
-#include "paddle/fluid/operators/distributed/grpc_serde.h"
-#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
+#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h"
+#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h"
 #include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/platform/port.h"
diff --git a/paddle/fluid/operators/distributed/grpc_serde.h b/paddle/fluid/operators/distributed/grpc/grpc_serde.h
similarity index 93%
rename from paddle/fluid/operators/distributed/grpc_serde.h
rename to paddle/fluid/operators/distributed/grpc/grpc_serde.h
index 16f5293b0e..c9a57beb3a 100644
--- a/paddle/fluid/operators/distributed/grpc_serde.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.h
@@ -27,8 +27,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/platform/port.h"
 
-#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+#include "paddle/fluid/operators/distributed/distributed_pb.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/distributed/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc
similarity index 97%
rename from paddle/fluid/operators/distributed/grpc_serde_test.cc
rename to paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc
index 1936c2c623..749c1bf39a 100644
--- a/paddle/fluid/operators/distributed/grpc_serde_test.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc
@@ -21,9 +21,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/operators/detail/macros.h"
-#include "paddle/fluid/operators/distributed/grpc_serde.h"
-#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
+#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h"
+#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc
similarity index 99%
rename from paddle/fluid/operators/distributed/grpc_server.cc
rename to paddle/fluid/operators/distributed/grpc/grpc_server.cc
index cda102e78d..08f777e279 100644
--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc
@@ -15,8 +15,8 @@ limitations under the License. */
 #include <limits>
 #include <string>
 
-#include "paddle/fluid/operators/distributed/grpc_serde.h"
-#include "paddle/fluid/operators/distributed/grpc_server.h"
+#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h"
+#include "paddle/fluid/operators/distributed/grpc/grpc_server.h"
 
 using ::grpc::ServerAsyncResponseWriter;
 
diff --git a/paddle/fluid/operators/distributed/grpc_server.h b/paddle/fluid/operators/distributed/grpc/grpc_server.h
similarity index 93%
rename from paddle/fluid/operators/distributed/grpc_server.h
rename to paddle/fluid/operators/distributed/grpc/grpc_server.h
index d2524f5e65..2fd3a7a740 100644
--- a/paddle/fluid/operators/distributed/grpc_server.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_server.h
@@ -29,11 +29,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/distributed/grpc_service.h"
+#include "paddle/fluid/operators/distributed/distributed_pb.h"
+#include "paddle/fluid/operators/distributed/grpc/grpc_service.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/operators/distributed/rpc_server.h"
-#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/platform/profiler.h"
 
diff --git a/paddle/fluid/operators/distributed/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h
similarity index 98%
rename from paddle/fluid/operators/distributed/grpc_service.h
rename to paddle/fluid/operators/distributed/grpc/grpc_service.h
index 537429b5fe..0b5c5151e6 100644
--- a/paddle/fluid/operators/distributed/grpc_service.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_service.h
@@ -23,7 +23,7 @@
 #include <grpc++/impl/codegen/stub_options.h>
 #include <grpc++/impl/codegen/sync_stream.h>
 #include <grpc++/support/byte_buffer.h>
-#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
+#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h"
 #include "paddle/fluid/platform/profiler.h"
 
 // NOTE: This method was originally created by tensorflow
diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc
similarity index 99%
rename from paddle/fluid/operators/distributed/grpc_variable_response.cc
rename to paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc
index 76ad02b030..87e83ca53b 100644
--- a/paddle/fluid/operators/distributed/grpc_variable_response.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc
@@ -19,7 +19,7 @@
 #include <nccl.h>
 #endif
 
-#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
+#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.h b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h
similarity index 89%
rename from paddle/fluid/operators/distributed/grpc_variable_response.h
rename to paddle/fluid/operators/distributed/grpc/grpc_variable_response.h
index 89df07c92c..3ca1d89f75 100644
--- a/paddle/fluid/operators/distributed/grpc_variable_response.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h
@@ -22,13 +22,11 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
 
-#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
-
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/distributed_pb.h"
+#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
 #include "paddle/fluid/operators/distributed/variable_response.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index cf14538b1c..a96dec1086 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -23,7 +23,7 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/tensor.h"
 
-#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/rpc_client.h"
 #include "paddle/fluid/operators/distributed/variable_response.h"
 #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc
index 122619d41b..cc5b9c29a1 100644
--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/operators/distributed/rpc_server.h"
+
 #include <fstream>
 #include <iostream>
 #include <limits>
 #include <string>
-
-#include "paddle/fluid/operators/distributed/rpc_server.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc
index c3dd459fc4..089ea623f1 100644
--- a/paddle/fluid/operators/distributed/rpc_server_test.cc
+++ b/paddle/fluid/operators/distributed/rpc_server_test.cc
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 
-#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/distributed/rpc_client.h"
 #include "paddle/fluid/operators/distributed/rpc_server.h"
diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in
index 2637619f30..b39eef04d8 100644
--- a/paddle/fluid/operators/distributed/send_recv.proto.in
+++ b/paddle/fluid/operators/distributed/send_recv.proto.in
@@ -1,4 +1,3 @@
-
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under
 the Apache License, Version 2.0 (the "License"); you may not use this file
 except in compliance with the License.
@@ -18,13 +17,8 @@ package sendrecv;
 option cc_generic_services = @cc_generic_services@;
 
 service SendRecvService {
-  // For parameter server round-robin like hashing, do not split tensors.
-  // Send and recv only one tensor
-  // TODO(typhoonzero): add streaming API
   rpc SendVariable(VariableMessage) returns (VoidMessage) {}
-  // Argument VariableMessage for GetVariable should only contain varname.
   rpc GetVariable(VariableMessage) returns (VariableMessage) {}
-  // pre-fetch variable by given variable name and Ids
   rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
 
   rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
@@ -33,19 +27,12 @@ service SendRecvService {
   rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {}
 }
 
-// VariableMessage is serialized paddle variable message.
-// It can be:
-// LoDTensor
-// SelectedRows
 enum VarType {
   LOD_TENSOR = 0;
   SELECTED_ROWS = 1;
   NCCL_ID = 2;
 }
 
-// NOTICE(gongwb):don't modify this proto if you are not
-//   not familar with how we serialize in sendrecvop_utils.h
-//   and deserilize it in  variable_response.h.
 message VariableMessage {
   enum Type {
     // Pod Types
@@ -62,21 +49,14 @@ message VariableMessage {
   string varname = 1;
   // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
   VarType type = 2;
-  // bool persistable is not needed for sending.
-  // tensor info:
   Type data_type = 3;
   repeated int64 dims = 4;
 
-  // lod details:
   int64 lod_level = 5;
   repeated LodData lod = 6;
-  // selected_rows height, aka. original dim0
   int64 slr_height = 7;
-  // tensor data
   bytes serialized = 8;
-  // selected_rows data
   bytes rows = 9;
-  // Look up table block execution output variable name.
   string out_varname = 10;
   // If 1, the ps server will start profiling, the ps
   // server stops profiling and generates a profile to /tmp/profile_ps_*
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
index 25e2f77fb7..e5c96507e9 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
@@ -18,7 +18,6 @@ limitations under the License. */
 #include <thread>  // NOLINT
 
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/operators/distributed/variable_response.h"
 #include "paddle/fluid/platform/port.h"
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h
index 6a87178be5..5457101a5c 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.h
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h
@@ -24,7 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+#include "paddle/fluid/operators/distributed/distributed_pb.h"
 #include "paddle/fluid/platform/port.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h
index a4324f67bb..294cae5f44 100644
--- a/paddle/fluid/operators/distributed/variable_response.h
+++ b/paddle/fluid/operators/distributed/variable_response.h
@@ -25,7 +25,7 @@
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+#include "paddle/fluid/operators/distributed/distributed_pb.h"
 
 DECLARE_string(rpc_server_profile_path);
 
diff --git a/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc b/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc
index a3b5ff8d17..a09bff351f 100644
--- a/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc
+++ b/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
 #include "paddle/fluid/string/printf.h"
 
diff --git a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
index 8754856e14..7275ab201f 100644
--- a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
index ef574ccdf4..80d712a0e0 100644
--- a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
index 20870ea07e..629f364d71 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
@@ -21,7 +21,7 @@ limitations under the License. */
 
 #include "gflags/gflags.h"
 
-#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
diff --git a/paddle/fluid/operators/distributed_ops/prefetch_op.cc b/paddle/fluid/operators/distributed_ops/prefetch_op.cc
index 86425aba8c..52b96d5f8e 100644
--- a/paddle/fluid/operators/distributed_ops/prefetch_op.cc
+++ b/paddle/fluid/operators/distributed_ops/prefetch_op.cc
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc
index 0399ff4100..48065437e3 100644
--- a/paddle/fluid/operators/distributed_ops/recv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/recv_op.cc
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc
index 8ca2877d8a..ae1b10c3b6 100644
--- a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc
+++ b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
 
 #include "paddle/fluid/platform/profiler.h"
 
diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc
index 0bf4bebbc9..e2c2147ab5 100644
--- a/paddle/fluid/operators/distributed_ops/send_op.cc
+++ b/paddle/fluid/operators/distributed_ops/send_op.cc
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
 #include "paddle/fluid/platform/profiler.h"
 
diff --git a/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc b/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc
index a73cb08eca..1598e1d0a4 100644
--- a/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc
+++ b/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index 41644d8cc1..fd2a98cb45 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -178,7 +178,6 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
 
     auto x_dims = ctx->GetInputDim("X");
     auto y_dims = ctx->GetInputDim("Y");
-    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
 
     PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
                       "Rank of first input must >= rank of second input.");
diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h
index 75dbf1d8bf..3394082497 100644
--- a/paddle/fluid/operators/expand_op.h
+++ b/paddle/fluid/operators/expand_op.h
@@ -77,7 +77,6 @@ class ExpandKernel : public framework::OpKernel<T> {
     auto& expand_times = context.Attr<std::vector<int>>("expand_times");
     auto* out0 = context.Output<Tensor>("Out");
     Eigen::DSizes<int, Rank> bcast_dims;
-    auto x_dims = in0->dims();
     for (size_t i = 0; i < expand_times.size(); ++i) {
       bcast_dims[i] = expand_times[i];
     }
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index 1ed8a2ddd1..38e57a41ed 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -146,7 +146,6 @@ class FCOpKernel : public framework::OpKernel<T> {
     auto w = ctx.Input<Tensor>("W");
     auto bias = ctx.Input<Tensor>("Bias");
     auto output = ctx.Output<Tensor>("Out");
-    auto in_dims = input->dims();
     auto w_dims = w->dims();
     auto out_dims = output->dims();
     int M = framework::product(out_dims) / out_dims[out_dims.size() - 1];
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index 38cb33e790..c86430524e 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -12,68 +12,40 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/fill_constant_op.h"
 
 namespace paddle {
 namespace operators {
 
-class FillConstantInferShape : public framework::InferShapeBase {
+class FillConstantOp : public framework::OperatorWithKernel {
  public:
-  void operator()(framework::InferShapeContext *ctx) const override {
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of FillConstantOp should not be null.");
-    auto &shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
+    auto& shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
     ctx->SetOutputDim("Out", framework::make_ddim(shape));
   }
-};
-
-class FillConstantOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    auto data_type =
-        static_cast<framework::proto::VarType::Type>(Attr<int>("dtype"));
-    auto value = Attr<float>("value");
-    auto force_cpu = Attr<bool>("force_cpu");
-
-    framework::Tensor *tensor = nullptr;
 
-    auto &out_var = *scope.FindVar(Output("Out"));
-
-    if (out_var.IsType<framework::LoDTensor>()) {
-      tensor = out_var.GetMutable<framework::LoDTensor>();
-      tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
-    } else if (out_var.IsType<framework::SelectedRows>()) {
-      tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
-      tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
-    } else {
-      PADDLE_THROW(
-          "fill constant op's output only"
-          "supports SelectedRows and LoDTensor");
-    }
-
-    if (force_cpu) {
-      auto cpu = platform::CPUPlace();
-      tensor->mutable_data(cpu, data_type);
-    } else {
-      tensor->mutable_data(dev_place, data_type);
-    }
-
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(dev_place);
-    math::set_constant(dev_ctx, tensor, value);
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
+        ctx.GetPlace());
   }
 };
 
 class FillConstantOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {}
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    auto data_type = static_cast<framework::proto::VarType::Type>(
+        boost::get<int>(op_desc.GetAttr("dtype")));
+    auto& out_var_name = op_desc.Output("Out").front();
+    block->Var(out_var_name)->SetDataType(data_type);
+  }
 };
 
 class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -107,7 +79,13 @@ Fill up a variable with specified constant value.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
-                  ops::FillConstantInferShape, ops::FillConstantOpMaker,
-                  paddle::framework::EmptyGradOpMaker,
-                  ops::FillConstantOpVarTypeInference);
+
+REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker,
+                  ops::FillConstantOpVarTypeInference,
+                  paddle::framework::EmptyGradOpMaker);
+
+REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel<float>,
+                       ops::FillConstantKernel<double>,
+                       ops::FillConstantKernel<int64_t>,
+                       ops::FillConstantKernel<int>,
+                       ops::FillConstantKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/fill_constant_op.cu.cc b/paddle/fluid/operators/fill_constant_op.cu.cc
new file mode 100644
index 0000000000..77027b5a87
--- /dev/null
+++ b/paddle/fluid/operators/fill_constant_op.cu.cc
@@ -0,0 +1,22 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fill_constant_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(fill_constant, ops::FillConstantKernel<float>,
+                        ops::FillConstantKernel<double>,
+                        ops::FillConstantKernel<int64_t>,
+                        ops::FillConstantKernel<int>,
+                        ops::FillConstantKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h
new file mode 100644
index 0000000000..417c5b4da6
--- /dev/null
+++ b/paddle/fluid/operators/fill_constant_op.h
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+template <typename T>
+class FillConstantKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext &ctx) const override {
+    auto data_type =
+        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
+    auto value = ctx.Attr<float>("value");
+    auto force_cpu = ctx.Attr<bool>("force_cpu");
+
+    framework::Tensor *tensor = nullptr;
+
+    framework::Variable *out_var = ctx.OutputVar("Out");
+
+    if (out_var->IsType<framework::LoDTensor>()) {
+      tensor = out_var->GetMutable<framework::LoDTensor>();
+      tensor->Resize(
+          framework::make_ddim(ctx.Attr<std::vector<int64_t>>("shape")));
+    } else if (out_var->IsType<framework::SelectedRows>()) {
+      tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value();
+      tensor->Resize(
+          framework::make_ddim(ctx.Attr<std::vector<int64_t>>("shape")));
+    } else {
+      PADDLE_THROW(
+          "fill constant op's output only"
+          "supports SelectedRows and LoDTensor");
+    }
+
+    if (force_cpu) {
+      tensor->mutable_data(platform::CPUPlace(), data_type);
+    } else {
+      tensor->mutable_data(ctx.GetPlace(), data_type);
+    }
+
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(ctx.GetPlace());
+    math::set_constant(dev_ctx, tensor, value);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt
index a0397acab1..2bddba7db2 100644
--- a/paddle/fluid/operators/fused/CMakeLists.txt
+++ b/paddle/fluid/operators/fused/CMakeLists.txt
@@ -1,6 +1,8 @@
 include(operators)
-register_operators(EXCLUDES fusion_transpose_flatten_concat_op)
+register_operators(EXCLUDES fusion_transpose_flatten_concat_op fusion_conv_inception_op)
 if (WITH_GPU)
   op_library(fusion_transpose_flatten_concat_op)
+  op_library(fusion_conv_inception_op)
   file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n")
+  file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_inception_fusion);\n")
 endif()
diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
index f1466f17fe..c8282aefe4 100644
--- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
@@ -241,15 +241,15 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
   bool is_reverse = ctx.Attr<bool>("is_reverse");     \
   bool use_peepholes = ctx.Attr<bool>("use_peepholes");
 
-#define INIT_BASE_SIZES                       \
-  auto ids_dims = ids->dims();   /* T x M*/   \
-  auto ids_numel = ids->numel(); /* T x 1*/   \
-  auto wh_dims = wh->dims();     /* D x 4D*/  \
-  const int D = wh_dims[0];                   \
-  const int D2 = D * 2;                       \
-  const int D3 = D * 3;                       \
-  int64_t row_number = embeddings->dims()[0]; \
-  int64_t row_width = embeddings->dims()[1];  \
+#define INIT_BASE_SIZES                                      \
+  auto ids_dims = ids->dims();                   /* T x M*/  \
+  auto ids_numel = framework::product(ids_dims); /* T x 1*/  \
+  auto wh_dims = wh->dims();                     /* D x 4D*/ \
+  const int D = wh_dims[0];                                  \
+  const int D2 = D * 2;                                      \
+  const int D3 = D * 3;                                      \
+  int64_t row_number = embeddings->dims()[0];                \
+  int64_t row_width = embeddings->dims()[1];                 \
   const int D4 = wh_dims[1];
 
 #define INIT_BASE_INPUT_DATAS                                        \
diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc
new file mode 100644
index 0000000000..4690bd766d
--- /dev/null
+++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc
@@ -0,0 +1,110 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+class ConvInceptionFusionOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    // 1 x
+    auto in_dims = ctx->GetInputDim("Input");
+    // 4 filters
+    auto w_dims = ctx->GetInputsDim("Filter");
+
+    PADDLE_ENFORCE(in_dims.size(), 4, "Conv intput should be 4-D tensor.");
+    PADDLE_ENFORCE_EQ(w_dims.size(), 4, "There should be 4 filters");
+    PADDLE_ENFORCE_EQ(w_dims[0][1], in_dims[1]);
+    PADDLE_ENFORCE_EQ(w_dims[1][1], in_dims[1]);
+
+    int n = in_dims[0];
+    // compute output channel
+    // 1st channel
+    int c = w_dims[0][0];
+    // add 2nd channel
+    c += (w_dims[1][0] - w_dims[2][1] * 2);
+    // add 3rd channel
+    c += (w_dims[2][0] - w_dims[3][1]);
+    // add 4-th channel
+    c += w_dims[3][0];
+
+    int h = in_dims[2];
+    int w = in_dims[3];
+
+    ctx->SetOutputDim("Output", {n, c, h, w});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        ctx.Input<framework::LoDTensor>("Input")->type(), ctx.device_context());
+  }
+};
+
+class ConvInceptionFusionOpMaker : public framework::OpProtoAndCheckerMaker {
+ protected:
+  void Make() override {
+    AddInput("Input", "(Tensor) NCHW layout.");
+    AddInput("Filter", "(vector<Tensor>) 4 aggregated filters").AsDuplicable();
+    AddInput("Bias", "(vector<Tensor>) it's lenght is equal to Filter")
+        .AsDuplicable();
+    AddOutput("Output",
+              "(Tensor) The output tensor of convolution operator. "
+              "The format of output tensor is also NCHW.");
+    AddOutput("TempOutput", "").AsDuplicable();
+    AddAttr<std::string>(
+        "pooling_type",
+        "(string), pooling type, can be \"max\" for max-pooling "
+        "and \"avg\" for average-pooling.")
+        .InEnum({"max", "avg"});
+    AddAttr<bool>(
+        "exclusive",
+        "(bool, default True) When true, will exclude the zero-padding in the "
+        "averaging calculating, otherwise, include the zero-padding. Note, it "
+        "is only used when pooling_type is avg. The defalut is True.")
+        .SetDefault(true);
+    AddAttr<std::string>(
+        "activation",
+        "The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' "
+        "'relux' , 'tanh', 'band_pass'")
+        .SetDefault("relu");
+    AddAttr<int>("workspace_size_MB",
+                 "Only used in cudnn kernel. Need set use_cudnn to true."
+                 "workspace size for cudnn, in MB, "
+                 "workspace is a section of GPU memory which will be "
+                 "allocated/freed each time the operator runs, larger "
+                 "workspace size can increase performance but also requires "
+                 "better hardware. This size should be chosen carefully.")
+        .SetDefault(4096);
+    AddComment(R"DOC(
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(conv2d_inception_fusion, ops::ConvInceptionFusionOp,
+                  ops::ConvInceptionFusionOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu
new file mode 100644
index 0000000000..3349b0b31e
--- /dev/null
+++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu
@@ -0,0 +1,272 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
+
+DECLARE_uint64(conv_workspace_size_limit);
+
+namespace paddle {
+namespace operators {
+
+#if CUDNN_VERSION >= 7001
+using Tensor = framework::Tensor;
+using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
+using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
+using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
+using ScopedActivationDescriptor = platform::ScopedActivationDescriptor;
+using DataLayout = platform::DataLayout;
+
+using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor;
+using PoolingMode = platform::PoolingMode;
+template <typename T>
+using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
+
+template <typename T>
+using CudnnDataType = platform::CudnnDataType<T>;
+
+template <typename T>
+class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto* input = ctx.Input<Tensor>("Input");
+    auto filters = ctx.MultiInput<framework::Tensor>("Filter");
+    auto bias = ctx.MultiInput<framework::Tensor>("Bias");
+
+    auto* output = ctx.Output<Tensor>("Output");
+    auto temp_outs = ctx.MultiOutput<framework::Tensor>("TempOutput");
+
+    const std::string pool_type = ctx.Attr<std::string>("pooling_type");
+    const std::string activation = ctx.Attr<std::string>("activation");
+    const bool exclusive = ctx.Attr<bool>("exclusive");
+
+    int64_t user_workspace_size =
+        static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
+
+    const T* input_data = input->data<T>();
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    T* temp_data = temp_outs[0]->mutable_data<T>(input->dims(), ctx.GetPlace());
+
+    DataLayout layout = DataLayout::kNCHW;
+    std::vector<int> in_dim = framework::vectorize2int(input->dims());
+
+    // ------------------- cudnn descriptors ---------------------
+    PoolingMode pooling_mode;
+    if (pool_type == "max") {
+      pooling_mode = PoolingMode::kMaximum;
+    } else {
+      pooling_mode = exclusive ? PoolingMode::kAverageExclusive
+                               : (PoolingMode::kAverageInclusive);
+    }
+    std::vector<int> k0x0 = {0, 0};
+    std::vector<int> k1x1 = {1, 1};
+    std::vector<int> k1x1_2 = {1, 1};
+    std::vector<int> k3x3 = {3, 3};
+    ScopedPoolingDescriptor pool_desc;
+    ScopedActivationDescriptor act_desc;
+    ScopedTensorDescriptor out_pool_desc;
+    ScopedTensorDescriptor input_desc;
+    cudnnPoolingDescriptor_t cudnn_pool_desc =
+        pool_desc.descriptor(pooling_mode, k3x3, k1x1, k1x1);
+
+    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
+        layout, framework::vectorize2int(input->dims()));
+    cudnnTensorDescriptor_t pool_out_desc = out_pool_desc.descriptor<T>(
+        layout, framework::vectorize2int(input->dims()));
+
+    cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
+    cudnnTensorDescriptor_t* out_desc = new cudnnTensorDescriptor_t[4];
+    cudnnFilterDescriptor_t* filter_desc = new cudnnFilterDescriptor_t[4];
+    cudnnTensorDescriptor_t* bias_desc = new cudnnTensorDescriptor_t[4];
+    cudnnTensorDescriptor_t* in_desc = new cudnnTensorDescriptor_t[4];
+    cudnnConvolutionDescriptor_t* conv_desc =
+        new cudnnConvolutionDescriptor_t[4];
+    for (int i = 0; i < 4; ++i) {
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnCreateFilterDescriptor(&filter_desc[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnCreateTensorDescriptor(&bias_desc[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnCreateTensorDescriptor(&in_desc[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnCreateTensorDescriptor(&out_desc[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnCreateConvolutionDescriptor(&conv_desc[i]));
+    }
+
+    std::vector<std::vector<int>> filter_dims;
+    std::vector<std::vector<int>> bias_dims;
+    std::vector<std::vector<int>> in_dims;
+    std::vector<std::vector<int>> out_dims;
+    std::vector<std::vector<int>> in_strides;
+    std::vector<std::vector<int>> out_strides;
+    std::vector<std::vector<int>> bias_strides;
+
+    cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;
+    int n = in_dim[0];
+    int h = in_dim[2];
+    int w = in_dim[3];
+    int oc = output->dims()[1];
+
+    cudnnDataType_t compute_type = (cudnn_dtype == CUDNN_DATA_DOUBLE)
+                                       ? CUDNN_DATA_DOUBLE
+                                       : CUDNN_DATA_FLOAT;
+
+    for (int i = 0; i < 4; ++i) {
+      filter_dims.push_back(framework::vectorize2int(filters[i]->dims()));
+      CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
+          filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data()));
+      bias_dims.push_back({1, filter_dims[i][0], 1, 1});
+      bias_strides.push_back({filter_dims[i][0], 1, 1, 1});
+      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+          bias_desc[i], cudnn_dtype, 4, bias_dims[i].data(),
+          bias_strides[i].data()));
+      in_dims.push_back({n, filter_dims[i][1], h, w});
+      out_dims.push_back({n, filter_dims[i][0], h, w});
+      in_strides.push_back({filter_dims[i][1] * h * w, h * w, w, 1});
+      out_strides.push_back({oc * h * w, h * w, w, 1});
+
+      if (i < 2) {
+        CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor(
+            conv_desc[i], 2, k0x0.data(), k1x1.data(), k1x1.data(),
+            CUDNN_CROSS_CORRELATION, compute_type));
+      } else {
+        CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor(
+            conv_desc[i], 2, k1x1.data(), k1x1.data(), k1x1.data(),
+            CUDNN_CROSS_CORRELATION, compute_type));
+      }
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          conv_desc[i], CUDNN_DEFAULT_MATH));
+    }
+    in_dims[2][1] *= 2;
+    in_strides[2][0] = oc * h * w;
+    out_strides[2][0] = filter_dims[2][0] * h * w;  // this out is continuous.
+    in_strides[3][0] = filter_dims[2][0] * h * w;
+    CUDNN_ENFORCE(
+        platform::dynload::cudnnSetConvolutionGroupCount(conv_desc[2], 2));
+
+    cudnnConvolutionFwdAlgo_t algo[4];
+    auto handle = dev_ctx.cudnn_handle();
+    size_t workspace_size_in_bytes = 0;  // final workspace to allocate.
+
+    size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
+    if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
+      int64_t max_user_size =
+          std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
+                   user_workspace_size);
+      workspace_size_limit = max_user_size * 1024 * 1024;
+    }
+
+    for (int i = 0; i < 4; ++i) {
+      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+          in_desc[i], cudnn_dtype, 4, in_dims[i].data(), in_strides[i].data()));
+      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+          out_desc[i], cudnn_dtype, 4, out_dims[i].data(),
+          out_strides[i].data()));
+      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+          handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
+          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit,
+          &algo[i]));
+      size_t tmp_size = 0;
+      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+          handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
+          algo[i], &tmp_size));
+      workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
+    }
+    cudnnActivationDescriptor_t cudnn_act_desc =
+        act_desc.descriptor<T>(activation);
+
+    int oc0 = filter_dims[0][0];
+    int oc1 = filter_dims[1][0] - filter_dims[2][1] * 2;
+    int oc3 = filter_dims[3][0];
+    int oc2 = oc - oc0 - oc1 - oc3;
+
+    // branch1: pool + 1x1 conv
+    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
+    CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward(
+        handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta,
+        pool_out_desc, temp_data));
+
+    std::vector<const void*> in_datas;
+    in_datas.push_back(static_cast<const void*>(temp_data));
+    in_datas.push_back(static_cast<const void*>(input_data));
+    in_datas.push_back(
+        static_cast<const void*>(output_data + (oc0 + oc1) * h * w));
+    T* temp2_data = temp_outs[1]->mutable_data<T>(
+        framework::make_ddim(out_dims[2]), ctx.GetPlace());
+    in_datas.push_back(static_cast<const void*>(temp2_data + oc2 * h * w));
+
+    std::vector<void*> out_datas;
+    out_datas.push_back(static_cast<void*>(output_data));
+    out_datas.push_back(static_cast<void*>(output_data + oc0 * h * w));
+    out_datas.push_back(static_cast<void*>(temp2_data));
+    out_datas.push_back(
+        static_cast<void*>(output_data + (oc0 + oc1 + oc2) * h * w));
+
+    for (int i = 0; i < 4; ++i) {
+      auto func = [&](void* cudnn_workspace) {
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
+            handle, &alpha, in_desc[i], in_datas[i], filter_desc[i],
+            static_cast<const void*>(filters[i]->data<T>()), conv_desc[i],
+            algo[i], cudnn_workspace, workspace_size_in_bytes, &beta,
+            out_desc[i], out_datas[i], bias_desc[i],
+            static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc,
+            out_desc[i], out_datas[i]));
+      };
+      auto workspace_handle = dev_ctx.cudnn_workspace_handle();
+      workspace_handle.RunFunc(func, workspace_size_in_bytes);
+    }
+
+    cudnnTensorDescriptor_t x_desc;
+    cudnnTensorDescriptor_t y_desc;
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&x_desc));
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&y_desc));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        x_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[2].data()));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        y_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[3].data()));
+    CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor(
+        handle, CudnnDataType<T>::kOne(), x_desc,
+        static_cast<const void*>(out_datas[2]), CudnnDataType<T>::kZero(),
+        y_desc, static_cast<void*>(output_data + (oc0 + oc1) * h * w)));
+
+    for (int i = 0; i < 4; ++i) {
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnDestroyTensorDescriptor(in_desc[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnDestroyTensorDescriptor(out_desc[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnDestroyFilterDescriptor(filter_desc[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnDestroyTensorDescriptor(bias_desc[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnDestroyConvolutionDescriptor(conv_desc[i]));
+    }
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(x_desc));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(y_desc));
+  }
+};
+#endif
+
+}  // namespace operators
+}  // namespace paddle
+
+#if CUDNN_VERSION >= 7001
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(conv2d_inception_fusion,
+                        ops::CUDNNConvInceptionFusionOpKernel<float>,
+                        ops::CUDNNConvInceptionFusionOpKernel<double>);
+#endif
diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc
index 69e7fa4490..f458ce6c83 100644
--- a/paddle/fluid/operators/hinge_loss_op.cc
+++ b/paddle/fluid/operators/hinge_loss_op.cc
@@ -88,7 +88,6 @@ class HingeLossGradOp : public framework::OperatorWithKernel {
                    "Input(Logits@GRAD) should not be null.");
 
     auto pred_dims = ctx->GetInputDim("Logits");
-    auto lab_dims = ctx->GetInputDim("Labels");
     auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
 
     PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc
index 9d248e0321..ef1fb83aa6 100644
--- a/paddle/fluid/operators/log_loss_op.cc
+++ b/paddle/fluid/operators/log_loss_op.cc
@@ -92,7 +92,6 @@ class LogLossGradOp : public framework::OperatorWithKernel {
                    "Output(Predicted@GRAD) should not be null.");
 
     auto pred_dims = ctx->GetInputDim("Predicted");
-    auto label_dims = ctx->GetInputDim("Labels");
     auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
     PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
 
diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h
index 895a7019aa..d1127ce4a2 100644
--- a/paddle/fluid/operators/math/math_function_impl.h
+++ b/paddle/fluid/operators/math/math_function_impl.h
@@ -37,9 +37,6 @@ void Transpose<DeviceContext, T, Rank>::operator()(
   for (int i = 0; i < Rank; i++) {
     permute[i] = axis[i];
   }
-  auto in_dim = in.dims();
-  auto out_dim = out->dims();
-
   auto eigen_in = framework::EigenTensor<T, Rank>::From(in);
   auto eigen_out = framework::EigenTensor<T, Rank>::From(*out);
   auto* dev = context.eigen_device();
diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h
index 9e99e44822..1d9d98b106 100644
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -76,7 +76,6 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
   void operator()(const DeviceContext& context, const framework::Tensor* X,
                   framework::Tensor* Y) {
     auto in_dims = X->dims();
-    auto out_dims = Y->dims();
     const float* in_data = X->data<float>();
     float* out_data = Y->data<float>();
     const int kBatchDim = 0;
diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc
index 35db4c1ad1..9954e51083 100644
--- a/paddle/fluid/operators/modified_huber_loss_op.cc
+++ b/paddle/fluid/operators/modified_huber_loss_op.cc
@@ -87,7 +87,6 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
                    "Input(Out@Grad) must not be null.");
 
     auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
     auto intermediate_dims = ctx->GetInputDim("IntermediateVal");
     auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
 
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index 271428408c..05afdf5324 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -147,12 +147,6 @@ class MulGradOp : public framework::OperatorWithKernel {
                    "Input(Out@GRAD) should not be null");
     auto x_dims = ctx->GetInputDim("X");
     auto y_dims = ctx->GetInputDim("Y");
-    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-
-    auto x_mat_dims = framework::flatten_to_2d(
-        x_dims, ctx->Attrs().Get<int>("x_num_col_dims"));
-    auto y_mat_dims = framework::flatten_to_2d(
-        y_dims, ctx->Attrs().Get<int>("y_num_col_dims"));
 
     auto x_grad_name = framework::GradVarName("X");
     auto y_grad_name = framework::GradVarName("Y");
diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc
index 06c35c789f..784e07b5bd 100644
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
@@ -36,7 +36,6 @@ class NCEOp : public framework::OperatorWithKernel {
 
     auto x_dims = ctx->GetInputDim("Input");
     auto label_dims = ctx->GetInputDim("Label");
-    auto w_dims = ctx->GetInputDim("Weight");
     PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]);
     int num_true_classes = label_dims.size() == 2 ? label_dims[1] : 1;
     if (ctx->HasInput("Bias")) {
diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h
index d0224177ec..6c95d3f3bf 100644
--- a/paddle/fluid/operators/norm_op.h
+++ b/paddle/fluid/operators/norm_op.h
@@ -43,7 +43,6 @@ class NormKernel : public framework::OpKernel<T> {
     out_norm->mutable_data<T>(ctx.GetPlace());
 
     auto xdim = in_x->dims();
-    auto ndim = out_norm->dims();
     T eps = static_cast<T>(ctx.Attr<float>("epsilon"));
     int axis = ctx.Attr<int>("axis");
     if (axis < 0) axis = xdim.size() + axis;
diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h
index 1a424728f7..5666613f6e 100644
--- a/paddle/fluid/operators/psroi_pool_op.h
+++ b/paddle/fluid/operators/psroi_pool_op.h
@@ -41,7 +41,6 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel<T> {
     int rois_num = rois->dims()[0];
 
     auto in_stride = framework::stride(in_dims);
-    auto roi_stride = framework::stride(rois->dims());
     auto out_stride = framework::stride(out->dims());
 
     const T* input_data = in->data<T>();
diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h
index 03b59d71cc..4bded0efb9 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h
@@ -143,8 +143,6 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
       set_zero(ctx.template device_context<DeviceContext>(), x_grad,
                static_cast<T>(0));
 
-      auto out_grad_stride = framework::stride(out_grad->dims());
-
       for (size_t i = 0; i < out_lod[0].size() - 1; ++i) {
         Tensor out_grad_t =
             out_grad->Slice(static_cast<int>(out_lod[0][i]),
diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h
index c3d83a06f2..6a99ad9a90 100644
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
@@ -40,7 +40,7 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
                           const framework::DDim& dst_stride, T* dst) {
   paddle::operators::detail::StridedCopyDimVisitor<T> func(
       dev_ctx, src, src_stride, dst_stride, dst);
-  boost::apply_visitor(func, dst_dim);
+  dst_dim.apply_visitor(func);
 }
 
 // Strided numel memory copy from src to dst by the specified axis
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index d1dff16ddd..05a0f14440 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -84,6 +84,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
 
+cc_library(timer SRCS timer.cc)
+cc_test(timer_test SRCS timer_test.cc DEPS timer)
+
 cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc
index f3cd3b2bbe..91d9a1ef01 100644
--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
@@ -38,6 +38,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
 CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
 #endif
 
+#ifdef CUDNN_DNN_ROUTINE_EACH_R6
+CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
+#endif
+
 #ifdef CUDNN_DNN_ROUTINE_EACH_R7
 CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
 #endif
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index 990e44cd21..15d5168366 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -53,6 +53,12 @@ namespace platform {
 namespace dynload {
 static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;
 
+#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll";
+static constexpr char* win_curand_lib = "curand64_" PADDLE_CUDA_BINVER ".dll";
+static constexpr char* win_cudnn_lib = "cudnn64_" PADDLE_CUDNN_BINVER ".dll";
+#endif
+
 static inline std::string join(const std::string& part1,
                                const std::string& part2) {
   // directory separator
@@ -165,6 +171,8 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
 void* GetCublasDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib);
 #else
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
 #endif
@@ -173,6 +181,8 @@ void* GetCublasDsoHandle() {
 void* GetCUDNNDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false);
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib);
 #else
   return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false);
 #endif
@@ -193,6 +203,8 @@ void* GetCUPTIDsoHandle() {
 void* GetCurandDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib);
 #else
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
 #endif
diff --git a/paddle/fluid/platform/timer.cc b/paddle/fluid/platform/timer.cc
new file mode 100644
index 0000000000..75d4e5cbf9
--- /dev/null
+++ b/paddle/fluid/platform/timer.cc
@@ -0,0 +1,63 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/timer.h"
+
+namespace paddle {
+namespace platform {
+
+void Timer::Reset() {
+  _start.tv_sec = 0;
+  _start.tv_usec = 0;
+
+  _count = 0;
+  _elapsed = 0;
+  _paused = true;
+}
+
+void Timer::Start() {
+  Reset();
+  Resume();
+}
+
+void Timer::Pause() {
+  if (_paused) {
+    return;
+  }
+  _elapsed += Tickus();
+  ++_count;
+  _paused = true;
+}
+
+void Timer::Resume() {
+  gettimeofday(&_start, NULL);
+  _paused = false;
+}
+
+int Timer::Count() { return _count; }
+
+double Timer::ElapsedUS() { return static_cast<double>(_elapsed); }
+
+double Timer::ElapsedMS() { return _elapsed / 1000.0; }
+
+double Timer::ElapsedSec() { return _elapsed / 1000000.0; }
+
+int64_t Timer::Tickus() {
+  gettimeofday(&_now, NULL);
+  return (_now.tv_sec - _start.tv_sec) * 1000 * 1000L +
+         (_now.tv_usec - _start.tv_usec);
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/timer.h b/paddle/fluid/platform/timer.h
new file mode 100644
index 0000000000..9bb66eb97a
--- /dev/null
+++ b/paddle/fluid/platform/timer.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <stdlib.h>
+#include "paddle/fluid/platform/port.h"
+
+namespace paddle {
+namespace platform {
+
+// A Standard Timer implementation for debugging
+class Timer {
+ public:
+  // a timer class for profiling
+  // Reset() will be called during initialization
+  // all timing variables will be set 0 in Reset()
+  Timer() { Reset(); }
+  void Reset();
+  void Start();
+  void Pause();
+  // Resume will get current system time
+  void Resume();
+  int Count();
+  // return elapsed time in us
+  double ElapsedUS();
+  // return elapsed time in ms
+  double ElapsedMS();
+  // return elapsed time in sec
+  double ElapsedSec();
+
+ private:
+  struct timeval _start;
+  struct timeval _now;
+  int _count;
+  int _elapsed;
+  bool _paused;
+
+  // get us difference between start and now
+  int64_t Tickus();
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/timer_test.cc b/paddle/fluid/platform/timer_test.cc
new file mode 100644
index 0000000000..09edf8131f
--- /dev/null
+++ b/paddle/fluid/platform/timer_test.cc
@@ -0,0 +1,45 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/platform/timer.h"
+#include "gtest/gtest.h"
+
+TEST(Timer, Reset) {
+  paddle::platform::Timer timeline;
+  timeline.Start();
+  sleep(3);
+  timeline.Pause();
+  timeline.Reset();
+}
+
+TEST(Timer, Start) {
+  paddle::platform::Timer timeline;
+  timeline.Start();
+  sleep(3);
+  timeline.Pause();
+}
+
+TEST(Timer, Pause) {
+  paddle::platform::Timer timeline;
+  timeline.Start();
+  sleep(3);
+  timeline.Pause();
+}
+
+TEST(Timer, Resume) {
+  paddle::platform::Timer timeline;
+  timeline.Start();
+  sleep(3);
+  timeline.Pause();
+  timeline.Resume();
+}
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index be63fb8778..819943508b 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/pybind/imperative.h"
 #include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/imperative/tracer.h"
 
 namespace paddle {
@@ -28,9 +27,7 @@ void BindTracer(pybind11::module *m) {
               framework::BlockDesc *startup_block) {
              new (&self) imperative::Tracer(root_block, startup_block);
            })
-      .def("trace", &imperative::Tracer::Trace)
-      .def("get_scope", &imperative::Tracer::GetScope,
-           pybind11::return_value_policy::reference);
+      .def("trace", &imperative::Tracer::Trace);
 }
 
 }  // namespace pybind
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 81d63aace0..d664107d57 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -84,11 +84,15 @@ bool IsCompiledWithCUDA() {
 }
 
 bool IsCompiledWithBrpc() {
-#if defined(PADDLE_WITH_BRPC) || defined(PADDLE_WITH_BRPC_RDMA)
-  return true;
-#else
+#ifndef PADDLE_WITH_DISTRIBUTE
+  return false;
+#endif
+
+#ifdef PADDLE_WITH_GRPC
   return false;
 #endif
+
+  return true;
 }
 
 bool IsCompiledWithDIST() {
@@ -124,9 +128,7 @@ PYBIND11_MODULE(core, m) {
   py::class_<imperative::VarBase, PyVarBase>(m, "VarBase", R"DOC()DOC")
       .def(py::init<>())
       .def("_run_backward",
-           [](imperative::VarBase &self, framework::Scope *scope) {
-             self.RunBackward(scope);
-           })
+           [](imperative::VarBase &self) { self.RunBackward(); })
       .def("_grad", &imperative::VarBase::Grad)
       .def_property(
           "desc",
@@ -134,7 +136,13 @@ PYBIND11_MODULE(core, m) {
           [](imperative::VarBase &self, framework::VarDesc *var_desc) {
             self.var_desc_ = var_desc;
           },
-          py::return_value_policy::reference);
+          py::return_value_policy::reference)
+      .def_property("var",
+                    [](const imperative::VarBase &self) { return self.var_; },
+                    [](imperative::VarBase &self, framework::Variable *var) {
+                      self.var_ = var;
+                    },
+                    py::return_value_policy::reference);
 
   py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC")
       .def(py::init<>())
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index ef43d13e18..47c5248b57 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -28,20 +28,53 @@ int main(int argc, char** argv) {
   for (int i = 0; i < argc; ++i) {
     new_argv.push_back(argv[i]);
   }
+
+  std::vector<std::string> envs;
+  std::vector<std::string> undefok;
+#if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_GRPC)
+  envs.push_back("max_body_size");
+#endif
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  new_argv.push_back(
-      strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy"));
+  envs.push_back("fraction_of_gpu_memory_to_use");
+  envs.push_back("allocator_strategy");
 #elif __clang__
-  new_argv.push_back(
-      strdup("--tryfromenv=use_mkldnn,initial_cpu_memory_in_"
-             "mb,allocator_strategy"));
-  new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb"));
+  envs.push_back("use_mkldnn");
+  envs.push_back("initial_cpu_memory_in_mb");
+  envs.push_back("allocator_strategy");
+
+  undefok.push_back("use_mkldnn");
+  undefok.push_back("initial_cpu_memory_in_mb");
 #else
-  new_argv.push_back(
-      strdup("--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_"
-             "mb,allocator_strategy"));
-  new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb"));
+  envs.push_back("use_pinned_memory");
+  envs.push_back("use_mkldnn");
+  envs.push_back("initial_cpu_memory_in_mb");
+  envs.push_back("allocator_strategy");
+
+  undefok.push_back("use_mkldnn");
+  undefok.push_back("initial_cpu_memory_in_mb");
 #endif
+
+  if (envs.size() > 0) {
+    std::string env_string = "--tryfromenv=";
+    for (auto t : envs) {
+      env_string += t + ",";
+    }
+    env_string = env_string.substr(0, env_string.length() - 1);
+    new_argv.push_back(strdup(env_string.c_str()));
+    VLOG(1) << "gtest env_string:" << env_string;
+  }
+
+  if (undefok.size() > 0) {
+    std::string undefok_string = "--undefok=";
+    for (auto t : undefok) {
+      undefok_string += t + ",";
+    }
+    undefok_string = undefok_string.substr(0, undefok_string.length() - 1);
+    new_argv.push_back(strdup(undefok_string.c_str()));
+    VLOG(1) << "gtest undefok_string:" << undefok_string;
+  }
+
   int new_argc = static_cast<int>(new_argv.size());
   char** new_argv_address = new_argv.data();
   google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index cdc631860f..999065f8aa 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -151,13 +151,17 @@ def __bootstrap__():
         read_env_flags.append('rpc_get_thread_num')
         read_env_flags.append('rpc_prefetch_thread_num')
         read_env_flags.append('rpc_disable_reuse_port')
+        if core.is_compiled_with_brpc():
+            read_env_flags.append('max_body_size')
+            #set brpc max body size
+            os.environ['FLAGS_max_body_size'] = "2147483647"
 
     if core.is_compiled_with_cuda():
         read_env_flags += [
             'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
             'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
             'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',
-            'sync_nccl_allreduce'
+            'cudnn_exhaustive_search_times', 'sync_nccl_allreduce'
         ]
 
     core.init_gflags([sys.argv[0]] +
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index af02721eb7..c280ff21ee 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -272,8 +272,7 @@ class DataFeeder(object):
             dict: the result of conversion.
 
         Raises:
-            ValueError: If drop_last is False and the data batch which cannot
-            fit for devices.
+            ValueError: If drop_last is False and the data batch which cannot fit for devices.
         """
 
         def __reader_creator__():
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index de30ed2fc5..921d59158f 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import collections
+from collections import defaultdict
 import contextlib
 import os
 import re
@@ -369,13 +370,11 @@ class Variable(object):
             self._ivar.desc = self.desc
 
     def _numpy(self):
-        scope = _imperative_tracer().get_scope(self.block.desc)
-        tensor = core.get_variable_tensor(scope, self.desc.name())
+        tensor = self._ivar.var.get_tensor()
         return np.array(tensor)
 
     def _backward(self):
-        scope = _imperative_tracer().get_scope(self.block.desc)
-        self._ivar._run_backward(scope)
+        self._ivar._run_backward()
 
     def _gradient(self):
         return np.array(self._ivar._grad())
@@ -648,20 +647,16 @@ class Operator(object):
                     self.desc.set_input(in_proto.name, [])
 
         if outputs is not None:
-            given = set()
-            need = set()
-            for n in outputs:
-                given.add(n)
             for m in proto.outputs:
-                need.add(m.name)
-            if not given == need:
-                raise ValueError(("Incorrect setting for output(s) of "
-                                  "operator \"%s\". Need: [%s] Given: [%s]") %
-                                 (type,
-                                  ", ".join(six.binary_type(e) for e in need),
-                                  ", ".join(six.binary_type(e) for e in given)))
-
+                if (m.name not in outputs) and m.dispensable:
+                    continue
+                if not ((m.name in outputs) or m.dispensable):
+                    raise ValueError(
+                        ("Incorrect setting for output(s) of "
+                         "operator \"%s\", should set: [%s].") % (type, m.name))
             for out_proto in proto.outputs:
+                if out_proto.name not in outputs:
+                    continue
                 out_args = outputs[out_proto.name]
                 if not isinstance(out_args, list):
                     out_args = [out_args]
@@ -692,20 +687,20 @@ class Operator(object):
         if _in_imperative_mode():
             self.iop = core.OpBase()
             self.iop.desc = self.desc
-            self.inputs = []
+            self.inputs = defaultdict(list)
             if inputs is not None:
-                for inp in inputs.values():
-                    if isinstance(inp, Variable):
-                        self.inputs.append(inp)
-                    elif isinstance(inp, list) or isinstance(inp, tuple):
-                        self.inputs.extend(inp[:])
-            self.outputs = []
+                for k, v in six.iteritems(inputs):
+                    if isinstance(v, Variable):
+                        self.inputs[k].append(v._ivar)
+                    elif isinstance(v, list) or isinstance(v, tuple):
+                        self.inputs[k].extend([var._ivar for var in v])
+            self.outputs = defaultdict(list)
             if outputs is not None:
-                for out in outputs.values():
-                    if isinstance(out, Variable):
-                        self.outputs.append(out)
-                    elif isinstance(out, list) or isinstance(out, tuple):
-                        self.outputs.extend(out[:])
+                for k, v in six.iteritems(outputs):
+                    if isinstance(v, Variable):
+                        self.outputs[k].append(v._ivar)
+                    elif isinstance(v, list) or isinstance(v, tuple):
+                        self.outputs[k].extend([var._ivar for var in v])
 
     def _has_kernel(self, op_type):
         return op_type not in self.OP_WITHOUT_KERNEL_SET
@@ -1273,8 +1268,7 @@ class Block(object):
         op_desc = self.desc.append_op()
         op = Operator(block=self, desc=op_desc, *args, **kwargs)
         if _in_imperative_mode():
-            _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs],
-                                       [v._ivar for v in op.outputs], self.desc)
+            _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc)
         self.ops.append(op)
         return op
 
@@ -1325,8 +1319,7 @@ class Block(object):
         op_desc = self.desc._prepend_op()
         op = Operator(self, op_desc, *args, **kwargs)
         if _in_imperative_mode():
-            _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs],
-                                       [v._ivar for v in op.outputs], self.desc)
+            _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc)
         self.ops.insert(0, op)
         return op
 
@@ -1641,8 +1634,8 @@ class Program(object):
                 parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need
                 to print.
 
-        Returns
-            (str): The debug string.
+        Returns:
+            str : The debug string.
 
         Raises:
             ValueError: If any of required fields is not set and throw_on_error is
diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py
index aa48ef71aa..61e243e288 100644
--- a/python/paddle/fluid/imperative/base.py
+++ b/python/paddle/fluid/imperative/base.py
@@ -46,8 +46,7 @@ def to_variable(value, block=None):
             name=None,
             shape=value.shape,
             dtype=value.dtype)
-        scope = framework._imperative_tracer().get_scope(block.desc)
-        var = scope.var(py_var.name)
+        var = py_var._ivar.var
         tensor = var.get_tensor()
         tensor.set(value, core.CPUPlace())
         return py_var
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index 74b4a977db..8543cb847d 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -20,7 +20,7 @@ import six
 import sys
 import numpy as np
 
-from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating
+from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating, _in_imperative_mode
 from . import unique_name
 from paddle.fluid.initializer import Constant, Xavier
 from paddle.fluid.imperative import base
@@ -313,11 +313,22 @@ class LayerHelper(object):
             param = self._create_weight_normalize(attr, shape, dtype)
             WeightNormParamAttr.params_with_weight_norm.append(param)
             return param
-
-        self.startup_program.global_block().create_parameter(
-            dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True))
-        return self.main_program.global_block().create_parameter(
-            dtype=dtype, shape=shape, **attr._to_kwargs())
+        if _in_imperative_mode():
+            self.main_program.global_block().create_parameter(
+                dtype=dtype, shape=shape, **attr._to_kwargs())
+            # In imperative mode, we want the returned parameter to be
+            # initialized so that it can be used imperatively.
+            return self.startup_program.global_block().create_parameter(
+                dtype=dtype,
+                shape=shape,
+                **attr._to_kwargs(with_initializer=True))
+        else:
+            self.startup_program.global_block().create_parameter(
+                dtype=dtype,
+                shape=shape,
+                **attr._to_kwargs(with_initializer=True))
+            return self.main_program.global_block().create_parameter(
+                dtype=dtype, shape=shape, **attr._to_kwargs())
 
     def get_parameter(self, name):
         param = self.main_program.global_block().var(name)
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 9d98e8333b..a7494aacea 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -1452,6 +1452,7 @@ class DynamicRNN(object):
     def step_input(self, x):
         """
         Mark a sequence as a dynamic RNN input.
+
         Args:
             x(Variable): The input sequence.
 
@@ -1505,6 +1506,7 @@ class DynamicRNN(object):
         """
         Mark a variable as a RNN input. The input will not be scattered into
         time steps.
+
         Args:
             x(Variable): The input variable.
 
@@ -1629,13 +1631,11 @@ class DynamicRNN(object):
         Args:
             init(Variable|None): The initialized variable.
 
-            shape(list|tuple): The memory shape. NOTE the shape does not contain
-            batch_size.
+            shape(list|tuple): The memory shape. NOTE the shape does not contain batch_size.
 
             value(float): the initalized value.
 
-            need_reorder(bool): True if the initialized memory depends on the
-            input sample.
+            need_reorder(bool): True if the initialized memory depends on the input sample.
 
             dtype(str|numpy.dtype): The data type of the initialized memory.
 
@@ -1714,6 +1714,7 @@ class DynamicRNN(object):
         """
         Update the memory from ex_mem to new_mem. NOTE that the shape and data
         type of :code:`ex_mem` and :code:`new_mem` must be same.
+        
         Args:
             ex_mem(Variable): the memory variable.
             new_mem(Variable): the plain variable generated in RNN block.
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index ce731f39ea..8aed97dc59 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -65,7 +65,7 @@ def rpn_target_assign(bbox_pred,
                       rpn_negative_overlap=0.3,
                       use_random=True):
     """
-    ** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. **
+    **Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection.**
 
     This layer can be, for given the  Intersection-over-Union (IoU) overlap
     between anchors and ground truth boxes, to assign classification and
@@ -135,19 +135,20 @@ def rpn_target_assign(bbox_pred,
     Examples:
         .. code-block:: python
 
-        bbox_pred = layers.data(name='bbox_pred', shape=[100, 4],
-                          append_batch_size=False, dtype='float32')
-        cls_logits = layers.data(name='cls_logits', shape=[100, 1],
-                          append_batch_size=False, dtype='float32')
-        anchor_box = layers.data(name='anchor_box', shape=[20, 4],
-                          append_batch_size=False, dtype='float32')
-        gt_boxes = layers.data(name='gt_boxes', shape=[10, 4],
-                         append_batch_size=False, dtype='float32')
-        loc_pred, score_pred, loc_target, score_target, bbox_inside_weight =
-            fluid.layers.rpn_target_assign(bbox_pred=bbox_pred,
-                                          cls_logits=cls_logits,
-                                          anchor_box=anchor_box,
-                                          gt_boxes=gt_boxes)
+            bbox_pred = layers.data(name='bbox_pred', shape=[100, 4],
+                              append_batch_size=False, dtype='float32')
+            cls_logits = layers.data(name='cls_logits', shape=[100, 1],
+                              append_batch_size=False, dtype='float32')
+            anchor_box = layers.data(name='anchor_box', shape=[20, 4],
+                              append_batch_size=False, dtype='float32')
+            gt_boxes = layers.data(name='gt_boxes', shape=[10, 4],
+                             append_batch_size=False, dtype='float32')
+            loc_pred, score_pred, loc_target, score_target, bbox_inside_weight =
+                fluid.layers.rpn_target_assign(bbox_pred=bbox_pred,
+                                              cls_logits=cls_logits,
+                                              anchor_box=anchor_box,
+                                              gt_boxes=gt_boxes)
+
     """
 
     helper = LayerHelper('rpn_target_assign', **locals())
@@ -1519,27 +1520,30 @@ def anchor_generator(input,
     Args:
        input(Variable): The input feature map, the format is NCHW.
        anchor_sizes(list|tuple|float): The anchor sizes of generated anchors,
-       given in absolute pixels e.g. [64., 128., 256., 512.].
-       For instance, the anchor size of 64 means the area of this anchor equals to 64**2.
+                                       given in absolute pixels e.g. [64., 128., 256., 512.].
+                                       For instance, the anchor size of 64 means the area of this anchor equals to 64**2.
        aspect_ratios(list|tuple|float): The height / width ratios of generated
-            anchors, e.g. [0.5, 1.0, 2.0].
+                                        anchors, e.g. [0.5, 1.0, 2.0].
        variance(list|tuple): The variances to be used in box regression deltas.
-            Default:[0.1, 0.1, 0.2, 0.2].
-       stride(list|turple): The anchors stride across width and height,
-            e.g. [16.0, 16.0]
+                             Default:[0.1, 0.1, 0.2, 0.2].
+       stride(list|turple): The anchors stride across width and height,e.g. [16.0, 16.0]
        offset(float): Prior boxes center offset. Default: 0.5
        name(str): Name of the prior box op. Default: None.
 
     Returns:
-        Anchors(Variable):  The output anchors with a layout of [H, W, num_anchors, 4].
-              H is the height of input, W is the width of input,
-              num_anchors is the box count of each position.
-              Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized.
-        Variances(Variable): The expanded variances of anchors
-              with a layout of [H, W, num_priors, 4].
-              H is the height of input, W is the width of input
-              num_anchors is the box count of each position.
-              Each variance is in (xcenter, ycenter, w, h) format.
+        Anchors(Variable),Variances(Variable):  
+        
+              two variables:
+        
+              - Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. \
+                H is the height of input, W is the width of input, \
+                num_anchors is the box count of each position.  \
+                Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. 
+              - Variances(Variable): The expanded variances of anchors \
+                with a layout of [H, W, num_priors, 4]. \
+                H is the height of input, W is the width of input \
+                num_anchors is the box count of each position. \
+                Each variance is in (xcenter, ycenter, w, h) format.
 
 
     Examples:
@@ -1748,35 +1752,35 @@ def generate_proposals(scores,
                        eta=1.0,
                        name=None):
     """
-    ** Generate proposal Faster-RCNN **
-	
-	This operation proposes RoIs according to each box with their probability to be a foreground object and 
-	the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals
-	could be used to train detection net.
-
-	For generating proposals, this operation performs following steps:
-
-	1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4)
- 	2. Calculate box locations as proposals candidates. 
-	3. Clip boxes to image
-	4. Remove predicted boxes with small area. 
-	5. Apply NMS to get final proposals as output.
-	
-      
-	Args:
-		scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object.
-			N is batch size, A is number of anchors, H and W are height and width of the feature map.
-		bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the differece between predicted box locatoin and anchor location. 
-		im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale
-			between origin image size and the size of feature map.
-		anchors(Variable):   A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map,
-              		num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized.
-		variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format.
-		pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default.
-		post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000 by default.
-		nms_thresh(float): Threshold in NMS, 0.5 by default.
-		min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default.
-		eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration.
+    **Generate proposal Faster-RCNN**
+
+    This operation proposes RoIs according to each box with their probability to be a foreground object and 
+    the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals
+    could be used to train detection net.
+
+    For generating proposals, this operation performs following steps:
+
+    1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4)
+    2. Calculate box locations as proposals candidates. 
+    3. Clip boxes to image
+    4. Remove predicted boxes with small area. 
+    5. Apply NMS to get final proposals as output.
+
+    Args:
+        scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object.
+            N is batch size, A is number of anchors, H and W are height and width of the feature map.
+        bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the differece between predicted box locatoin and anchor location. 
+        im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale
+            between origin image size and the size of feature map.
+        anchors(Variable):   A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map,
+                    num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized.
+        variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format.
+        pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default.
+        post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000 by default.
+        nms_thresh(float): Threshold in NMS, 0.5 by default.
+        min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default.
+        eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration.
+
     """
     helper = LayerHelper('generate_proposals', **locals())
 
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 42f4959a83..9a29b25093 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -949,12 +949,11 @@ def shuffle(reader, buffer_size):
     is determined by argument buf_size.
 
     Args:
-        param reader: the original reader whose output will be shuffled.
-        type reader: callable
-        param buf_size: shuffle buffer size.
-        type buf_size: int
-        return: the new reader whose output is shuffled.
-        rtype: callable
+        reader(callable): the original reader whose output will be shuffled.
+        buf_size(int): shuffle buffer size.
+
+    Returns:
+        callable: the new reader whose output is shuffled.
     """
     return __create_unshared_decorated_reader__(
         'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index cc1fdbd285..1c97fe6c19 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -233,7 +233,7 @@ def fc(input,
             dimensions will be flatten to form the first dimension of the final matrix (height of
             the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
             form the second dimension of the final matrix (width of the matrix). For example, suppose
-            `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
+            `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
             Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
         param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
             parameters/weights of this layer.
@@ -502,46 +502,48 @@ def lstm(input,
     If Device is GPU, This op will use cudnn LSTM implementation
 
     A four-gate Long Short-Term Memory network with no peephole connections.
-    In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
+    In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, 
     the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
 
-    $$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
-
-    $$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$
-
-    $$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$
-
-    $$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$
-
-    $$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
-
-    $$ h_t = o_t \\odot tanh(c_t) $$
-
-    - W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
+    .. math::
+    
+       i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) 
+       
+       f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) 
+       
+       o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) 
+       
+       \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c)
+       
+       c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t} 
+       
+       h_t &= o_t \odot tanh(c_t) 
+
+    - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix
       of weights from the input gate to the input)
     - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
     - sigmoid is the logistic sigmoid function.
     - $i, f, o$ and $c$ are the input gate, forget gate, output gate,
       and cell activation vectors, respectively, all of which have the same size as
       the cell output activation vector $h$.
-    - The $\odot$ is the element-wise product of the vectors.
-    - `tanh` is the activation functions.
-    - $\tilde{c_t}$ is also called candidate hidden state,
+    - The :math:`\odot` is the element-wise product of the vectors.
+    - :math:`tanh` is the activation functions.
+    - :math:`\\tilde{c_t}` is also called candidate hidden state,
       which is computed based on the current input and the previous hidden state.
 
-    Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
+    Where sigmoid is the sigmoid operator: :math:`sigmoid(x) = 1 / (1 + e^{-x})` , * represents a point-wise multiplication, 
     X represensts a matrix multiplication
 
 
     Args:
         input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size )
-        init_h(Variable): The initial hidden state of the LSTM
+        init_h(Variable): The initial hidden state of the LSTM                       
                        This is a tensor with shape ( num_layers x batch_size x hidden_size)
                        if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
         init_c(Variable): The initial cell state of the LSTM.
                        This is a tensor with shape ( num_layers x batch_size x hidden_size )
                        if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
-        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len
+        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len 
         hidden_size (int): hidden size of the LSTM
         num_layers (int): total layers number of the LSTM
         dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps
@@ -556,14 +558,18 @@ def lstm(input,
 
 
     Returns:
-        rnn_out(Tensor): result of LSTM hidden, shape is (seq_len x batch_size x hidden_size)
-                         if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2)
-        last_h(Tensor): the hidden state of the last step of LSTM
-                        shape is ( num_layers x batch_size x hidden_size )
-                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
-        last_c(Tensor): the cell state of the last step of LSTM
-                        shape is ( num_layers x batch_size x hidden_size )
-                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
+        rnn_out(Tensor),last_h(Tensor),last_c(Tensor):  
+                        
+                        Three tensors, rnn_out, last_h, last_c:
+                        
+                        - rnn_out is result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) \
+                          if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2)
+                        - last_h is the hidden state of the last step of LSTM \
+                          shape is ( num_layers x batch_size x hidden_size ) \
+                          if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)                     
+                        - last_c(Tensor): the cell state of the last step of LSTM \
+                          shape is ( num_layers x batch_size x hidden_size ) \
+                          if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)                     
 
 
     Examples:
@@ -1220,6 +1226,8 @@ def dropout(x,
     probability) the outputs of some units to zero, while others are remain
     unchanged.
 
+    dropout op can be removed from the program to make the program more efficient.
+
     Args:
         x (Variable): The input tensor variable.
         dropout_prob (float): Probability of setting units to zero.
@@ -1230,22 +1238,24 @@ def dropout(x,
                     units will be dropped. DO NOT use a fixed seed in training.
         name (str|None): A name for this layer(optional). If set None, the layer
                          will be named automatically.
-        dropout_implementation(string): ['downgrade_in_infer'(defauld)|'upscale_in_train']
+        dropout_implementation(string): ['downgrade_in_infer'(default)|'upscale_in_train']
+
                                         1. downgrade_in_infer(default), downgrade the outcome at inference
-                                           train: out = input * mask
-                                           inference: out = input * dropout_prob
-                                           (make is a tensor same shape with input, value is 0 or 1
-                                            ratio of 0 is dropout_prob)
+
+                                           - train: out = input * mask
+                                           - inference: out = input * dropout_prob
+
+                                           (mask is a tensor same shape with input, value is 0 or 1
+                                           ratio of 0 is dropout_prob)
                                         2. upscale_in_train, upscale the outcome at training time
-                                           train: out = input * mask / ( 1.0 - dropout_prob )
-                                           inference: out = input
-                                           (make is a tensor same shape with input, value is 0 or 1
-                                            ratio of 0 is dropout_prob)
-                                           dropout op can be removed from the program.
-                                           the program will be efficient
 
+                                           - train: out = input * mask / ( 1.0 - dropout_prob )
+                                           - inference: out = input
 
+                                           (mask is a tensor same shape with input, value is 0 or 1
+                                           ratio of 0 is dropout_prob)
 
+                                        
     Returns:
         Variable: A tensor variable is the shape with `x`.
 
@@ -1333,11 +1343,15 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
          A 2-D tensor with shape [N x 1], the cross entropy loss.
 
     Raises:
-        `ValueError`: 1) the 1st dimension of `input` and `label` are not equal.
-                      2) when `soft_label == True`, and the 2nd dimension of
-                         `input` and `label` are not equal.
-                      3) when `soft_label == False`, and the 2nd dimension of
-                         `label` is not 1.
+         ValueError:
+
+                      1. the 1st dimension of ``input`` and ``label`` are not equal.
+                      
+                      2. when ``soft_label == True``, and the 2nd dimension of
+                         ``input`` and ``label`` are not equal.
+                         
+                      3. when ``soft_label == False``, and the 2nd dimension of
+                         ``label`` is not 1.
 
     Examples:
         .. code-block:: python
@@ -1457,8 +1471,8 @@ def chunk_eval(input,
     This function computes and outputs the precision, recall and
     F1-score of chunk detection.
 
-    For some basics of chunking, please refer to
-    'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.
+    For some basics of chunking, please refer to 
+    `Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>`_ .
 
     ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
     and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
@@ -1823,7 +1837,7 @@ def conv2d(input,
             of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
             will create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
-             and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
+            and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
         bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
             If it is set to False, no bias will be added to the output units.
             If it is set to None or one attribute of ParamAttr, conv2d
@@ -2276,7 +2290,7 @@ def sequence_slice(input, offset, length, name=None):
 
     .. code-block:: text
 
-	- Case:
+              - Case:
 
             Given the input Variable **input**:
 
@@ -2292,7 +2306,8 @@ def sequence_slice(input, offset, length, name=None):
                 out.lod = [[2, 1]],
                 out.dims = (3, 2).
 
-    NOTE: The first dimension size of **input**, **offset** and **length**
+    Note: 
+          The first dimension size of **input**, **offset** and **length**
           should be equal. The **offset** should start from 0.
 
     Args:
@@ -2570,12 +2585,7 @@ def adaptive_pool2d(input,
         raise ValueError(
             "invalid setting 'require_index' true when 'pool_type' is 'avg'.")
 
-    def _is_list_or_tuple_(data):
-        return (isinstance(data, list) or isinstance(data, tuple))
-
-    if not _is_list_or_tuple_(pool_size) or len(pool_size) != 2:
-        raise ValueError(
-            "'pool_size' should be a list or tuple with length as 2.")
+    pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
 
     if pool_type == "max":
         l_type = 'max_pool2d_with_index'
@@ -2671,12 +2681,7 @@ def adaptive_pool3d(input,
         raise ValueError(
             "invalid setting 'require_index' true when 'pool_type' is 'avg'.")
 
-    def _is_list_or_tuple_(data):
-        return (isinstance(data, list) or isinstance(data, tuple))
-
-    if not _is_list_or_tuple_(pool_size) or len(pool_size) != 3:
-        raise ValueError(
-            "'pool_size' should be a list or tuple with length as 3.")
+    pool_size = utils.convert_to_list(pool_size, 3, 'pool_size')
 
     if pool_type == "max":
         l_type = 'max_pool3d_with_index'
@@ -3013,7 +3018,7 @@ def group_norm(input,
     """
     **Group Normalization Layer**
 
-    Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`
+    Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_ .
 
     Args:
         input(Variable): The input tensor variable.
@@ -3140,8 +3145,8 @@ def conv2d_transpose(input,
 
            H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
            W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
-           H_{out} \in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
-           W_{out} \in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
+           H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
+           W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
 
     Args:
         input(Variable): The input image with [N, C, H, W] format.
@@ -4673,7 +4678,7 @@ def ctc_greedy_decoder(input, blank, name=None):
                       [0.5, 0.1, 0.3, 0.1]]
 
         input.lod = [[4, 4]]
-
+      
         Computation:
 
         step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get:
@@ -4704,10 +4709,10 @@ def ctc_greedy_decoder(input, blank, name=None):
         name (str): The name of this layer. It is optional.
 
     Returns:
-        Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1].
-                  'Lp' is the sum if all output sequences' length. If all the sequences
-                  in result were empty, the result LoDTensor will be [-1] with
-                  LoD [[]] and dims [1, 1].
+        Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. \
+                  'Lp' is the sum if all output sequences' length. If all the sequences \
+                  in result were empty, the result LoDTensor will be [-1] with  \
+                  LoD [[]] and dims [1, 1]. 
 
     Examples:
         .. code-block:: python
@@ -5060,7 +5065,7 @@ def hsigmoid(input,
     """
     The hierarchical sigmoid operator is used to accelerate the training
     process of language model. This operator organizes the classes into a
-    complete binary tree, or you can use is_custom to pass your own tree to
+    complete binary tree, or you can use is_custom to pass your own tree to 
     implement hierarchical. Each leaf node represents a class(a word) and each
     internal node acts as a binary classifier. For each word there's a unique
     path from root to it's leaf node, hsigmoid calculate the cost for each
@@ -5072,13 +5077,13 @@ def hsigmoid(input,
     <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_
 
     And if you want to use the costumed tree by set 'is_custom' as true you may need to do following things first:
-        1. using your word dict to build a binary tree, each leaf node should be an word of your word dict
-        2. build a dict to store word_id -> word's leaf to root path, we call it path_table.
-        3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code
-         means label of each binary classification, using 1 indicate true, 0 indicate false.
-        4. now, each word should has its path and code along the path, you can pass a batch of path and code
-        related to the same batch of inputs.
 
+    1. using your word dict to build a binary tree, each leaf node should be an word of your word dict
+    2. build a dict to store word_id -> word's leaf to root path, we call it path_table.
+    3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code
+       means label of each binary classification, using 1 indicate true, 0 indicate false.
+    4. now, each word should has its path and code along the path, you can pass a batch of path and code 
+       related to the same batch of inputs.
 
     Args:
         input (Variable): The input tensor variable with shape
@@ -5086,8 +5091,8 @@ def hsigmoid(input,
             and :math:`D` is the feature size.
         label (Variable): The tensor variable contains labels of training data.
             It's a tensor with shape is :math:`[N \\times 1]`.
-        num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set,
-            it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num
+        num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set, 
+            it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num 
             which indicates the num of classes using by binary classify.
         param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
              of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid
@@ -5100,15 +5105,15 @@ def hsigmoid(input,
              is not set, the bias is initialized zero. Default: None.
         name (str|None): A name for this layer(optional). If set None, the layer
              will be named automatically. Default: None.
-        path_table: (Variable|None) this variable can store each batch of samples' path to root,
+        path_table: (Variable|None) this variable can store each batch of samples' path to root, 
             it should be in leaf -> root order
-            path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like
-            structure and each element in this array is indexes in parent nodes' Weight Matrix.
-        path_code:  (Variable|None) this variable can store each batch of samples' code,
+            path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like 
+            structure and each element in this array is indexes in parent nodes' Weight Matrix. 
+        path_code:  (Variable|None) this variable can store each batch of samples' code, 
             each code consist with every code of parent nodes. it should be in leaf -> root order
-        is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is
+        is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is 
              set you need to set path_table/path_code/num_classes, otherwise num_classes should be set
-        is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient
+        is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient 
              of W and input will be sparse.
 
     Returns:
@@ -5485,11 +5490,11 @@ def softmax_with_cross_entropy(logits,
 
     .. math::
 
-        max_j = \\max_{i=0}^{K}{\\text{logit}_i}
+        max_j &= \\max_{i=0}^{K}{\\text{logit}_i}
 
-        log\\_max\\_sum_j = \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j)
+        log\\_max\\_sum_j &= \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j)
 
-        softmax_j = \\exp(logit_j - max_j - {log\\_max\\_sum}_j)
+        softmax_j &= \\exp(logit_j - max_j - {log\\_max\\_sum}_j)
 
     and then cross entropy loss is calculated by softmax and label.
 
@@ -5515,11 +5520,11 @@ def softmax_with_cross_entropy(logits,
                                along with the cross entropy loss. Default: False
 
     Returns:
-        Variable or Tuple of two Variables: Return the cross entropy loss if
-                              `return_softmax` is False, otherwise the tuple
-                              (loss, softmax), where the cross entropy loss is
-                              a 2-D tensor with shape [N x 1], and softmax is a
-                              2-D tensor with shape [N x K].
+        Variable or Tuple of two Variables: Return the cross entropy loss if \
+                                            `return_softmax` is False, otherwise the tuple \
+                                            (loss, softmax), where the cross entropy loss is \
+                                            a 2-D tensor with shape [N x 1], and softmax is a \
+                                            2-D tensor with shape [N x K].
 
     Examples:
         .. code-block:: python
@@ -5792,21 +5797,27 @@ def squeeze(input, axes, name=None):
     the single dimensions will be removed from the shape. If an axis is
     selected with shape entry not equal to one, an error is raised.
 
-    Examples:
-    Case 1:
-      Given
-        X.shape = (1, 3, 1, 5)
-      and
-        axes = [0]
-      we get:
-        Out.shape = (3, 1, 5)
-      Case 2:
-        Given
-          X.shape = (1, 3, 1, 5)
-        and
-          axes = []
-        we get:
-          Out.shape = (3, 5)
+    For example:
+
+    .. code-block:: text
+
+        Case 1:
+
+          Given
+            X.shape = (1, 3, 1, 5)
+          and
+            axes = [0]
+          we get:
+            Out.shape = (3, 1, 5)
+
+        Case 2:
+
+          Given
+            X.shape = (1, 3, 1, 5)
+          and
+            axes = []
+          we get:
+            Out.shape = (3, 5)
 
     Args:
         input (Variable): The input variable to be squeezed.
@@ -5842,6 +5853,9 @@ def unsqueeze(input, axes, name=None):
     Dimension indices in axes are as seen in the output tensor.
 
     For example:
+
+    .. code-block:: text
+
       Given a tensor such that tensor with shape [3, 4, 5],
       then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1].
 
@@ -6729,8 +6743,11 @@ def sequence_scatter(input, index, updates, name=None):
     the columns to update in each row of X.
 
     Here is an example:
+
     Given the following input:
+
     .. code-block:: text
+
         input.data = [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
                       [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
                       [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
@@ -6743,7 +6760,9 @@ def sequence_scatter(input, index, updates, name=None):
         updates.lod =  [[  0,            3,                                 8,                         12]]
 
     Then we have the output:
+
     .. code-block:: text
+
         out.data = [[1.3, 1.3, 1.4, 1.0, 1.0, 1.0],
                     [1.0, 1.0, 1.4, 1.3, 1.2, 1.1],
                     [1.0, 1.0, 1.3, 1.2, 1.4, 1.1]]
@@ -6759,7 +6778,7 @@ def sequence_scatter(input, index, updates, name=None):
         name (str|None): The output variable name. Default None.
 
     Returns:
-        output (Variable): The output is a tensor with the same shape as input.
+        Variable: The output is a tensor with the same shape as input.
 
     Examples:
 
@@ -6933,7 +6952,7 @@ def mean_iou(input, label, num_classes):
 
     .. math::
 
-        IOU = \\frac{true\_positiv}{(true\_positive + false\_positive + false\_negative)}.
+        IOU = \\frac{true\_positive}{(true\_positive + false\_positive + false\_negative)}.
 
     The predictions are accumulated in a confusion matrix and mean-IOU
     is then calculated from it.
@@ -6946,9 +6965,13 @@ def mean_iou(input, label, num_classes):
         num_classes (int): The possible number of labels.
 
     Returns:
-        mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1].
-        out_wrong(Variable): A Tensor with shape [num_classes]. The wrong numbers of each class.
-        out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class.
+        mean_iou (Variable),out_wrong(Variable),out_correct(Variable): 
+        
+                     Three variables:
+                      
+                     - mean_iou : A Tensor representing the mean intersection-over-union with shape [1].
+                     - out_wrong: A Tensor with shape [num_classes]. The wrong numbers of each class.
+                     - out_correct: A Tensor with shape [num_classes]. The correct numbers of each class.
 
     Examples:
 
@@ -7143,8 +7166,8 @@ def affine_grid(theta, out_shape, name=None):
 
     Args:
         theta (Variable): A batch of affine transform parameters with shape [N, 2, 3].
-        out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W].
-        out_shape can be a Variable or a list or tuple.
+        out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W]. 
+                                             ``out_shape`` can be a Variable or a list or tuple.
         name(str|None): A name for this layer(optional). If set None, the layer
                         will be named automatically.
 
@@ -7157,6 +7180,7 @@ def affine_grid(theta, out_shape, name=None):
     Examples:
 
         .. code-block:: python
+
             theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32")
             out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32")
             data = fluid.layers.affine_grid(theta, out_shape)
@@ -7192,9 +7216,10 @@ def affine_grid(theta, out_shape, name=None):
 
 def rank_loss(label, left, right, name=None):
     """
+
     **Rank loss layer for RankNet**
 
-    RankNet(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf)
+    `RankNet <http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf>`_
     is a pairwise ranking model with a training sample consisting of a pair
     of documents, A and B. Label P indicates whether A is ranked higher than B
     or not:
@@ -7202,16 +7227,19 @@ def rank_loss(label, left, right, name=None):
     P = {0, 1} or {0, 0.5, 1}, where 0.5 means that there is no information
     about the rank of the input pair.
 
-    Rank loss layer takes three inputs: left (o_i), right (o_j) and
-    label (P_{i,j}). The inputs respectively represent RankNet's output scores
+    Rank loss layer takes three inputs: left ( :math:`o_i` ), right ( :math:`o_j` ) and
+    label ( :math:`P_{i,j}` ). The inputs respectively represent RankNet's output scores
     for documents A and B and the value of label P. The following equation
     computes rank loss C_{i,j} from the inputs:
 
-    $$
-      C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\
-      o_{i,j} =  o_i - o_j  \\
-      \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
-    $$
+    .. math::
+
+      C_{i,j} &= -\\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\\\
+
+      o_{i,j} &=  o_i - o_j  \\\\
+
+      \\tilde{P_{i,j}} &= \\left \{0, 0.5, 1 \\right \} \ or \ \\left \{0, 1 \\right \}
+
 
     Rank loss layer takes batch inputs with size batch_size (batch_size >= 1).
 
@@ -7237,7 +7265,6 @@ def rank_loss(label, left, right, name=None):
             right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32")
             out = fluid.layers.rank_loss(label, left, right)
 
-
     """
     helper = LayerHelper('rank_loss', **locals())
 
@@ -7269,7 +7296,7 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None):
 
     .. math::
 
-        rank\_loss &= max(0, -label * (left - right) + margin)
+        rank\_loss = max(0, -label * (left - right) + margin)
 
     Args:
        label (Variable): Indicates whether the left is ranked higher than the right or not.
@@ -7278,12 +7305,17 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None):
        margin (float): Indicates the given margin.
        name (str|None): A name for this layer (optional). If set None, the layer
                        will be named automatically.
+
     Returns:
        Variable: The ranking loss.
+
     Raises:
        ValueError: Any of label, left, and right is not a Variable.
+
     Examples:
+
         .. code-block:: python
+
            label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32")
            left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32")
            right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32")
@@ -7587,7 +7619,8 @@ def prelu(x, mode, param_attr=None, name=None):
     """
     Equation:
 
-        y = \max(0, x) + alpha * \min(0, x)
+    .. math::
+        y = \max(0, x) + \\alpha * \min(0, x)
 
     Args:
         x (Variable): The input tensor.
@@ -7653,8 +7686,8 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
 
         .. code-block:: python
 
-        x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32")
-        y = fluid.layers.brelu(x, t_min=1.0, t_max=20.0)
+            x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32")
+            y = fluid.layers.brelu(x, t_min=1.0, t_max=20.0)
     """
     helper = LayerHelper('brelu', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
@@ -7683,8 +7716,8 @@ def leaky_relu(x, alpha=0.02, name=None):
 
         .. code-block:: python
 
-        x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32")
-        y = fluid.layers.leaky_relu(x, alpha=0.01)
+            x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32")
+            y = fluid.layers.leaky_relu(x, alpha=0.01)
     """
     helper = LayerHelper('leaky_relu', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
@@ -7712,8 +7745,8 @@ def soft_relu(x, threshold=40.0, name=None):
 
         .. code-block:: python
 
-        x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32")
-        y = fluid.layers.soft_relu(x, threshold=20.0)
+            x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32")
+            y = fluid.layers.soft_relu(x, threshold=20.0)
     """
     helper = LayerHelper('soft_relu', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
@@ -7729,23 +7762,32 @@ def flatten(x, axis=1, name=None):
     """
     **Flatten layer**
     Flattens the input tensor into a 2D matrix.
+    
+    For Example:
+    
+    .. code-block:: text
 
-    Examples:
-    Case 1:
-      Given
-        X.shape = (3, 100, 100, 4)
-      and
-        axis = 2
-      We get:
-        Out.shape = (3 * 100, 4 * 100)
-
-    Case 2:
-      Given
-        X.shape = (3, 100, 100, 4)
-      and
-        axis = 0
-      We get:
-        Out.shape = (1, 3 * 100 * 100 * 4)
+        Case 1:
+
+          Given
+            X.shape = (3, 100, 100, 4)
+
+          and
+            axis = 2
+
+          We get:
+            Out.shape = (3 * 100, 4 * 100)
+
+        Case 2:
+
+          Given
+            X.shape = (3, 100, 100, 4)
+
+          and
+            axis = 0
+
+          We get:
+            Out.shape = (1, 3 * 100 * 100 * 4)
 
     Args:
         x (Variable): A tensor of rank >= axis.
@@ -7759,9 +7801,9 @@ def flatten(x, axis=1, name=None):
                         will be named automatically.
 
     Returns:
-        Variable: A 2D tensor with the contents of the input tensor, with input
-                  dimensions up to axis flattened to the outer dimension of
-                  the output and remaining input dimensions flattened into the
+        Variable: A 2D tensor with the contents of the input tensor, with input \
+                  dimensions up to axis flattened to the outer dimension of \
+                  the output and remaining input dimensions flattened into the \
                   inner dimension of the output.
 
     Raises:
@@ -7801,19 +7843,23 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
     The enumerated sequence has the same 1st dimension with variable `input`, and
     the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation.
 
-    Examples:
-    Case 1:
-      Input:
-        X.lod = [[0, 3, 5]]
-        X.data = [[1], [2], [3], [4], [5]]
-        X.dims = [5, 1]
-      Attrs:
-        win_size = 2
-        pad_value = 0
-      Output:
-        Out.lod = [[0, 3, 5]]
-        Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
-        Out.dims = [5, 2]
+    .. code-block:: text
+
+        Case 1:
+
+          Input:
+            X.lod = [[0, 3, 5]]
+            X.data = [[1], [2], [3], [4], [5]]
+            X.dims = [5, 1]
+
+          Attrs:
+            win_size = 2
+            pad_value = 0
+
+          Output:
+            Out.lod = [[0, 3, 5]]
+            Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
+            Out.dims = [5, 2]
 
     Args:
         input (Variable): The input variable which is a index sequence.
@@ -8896,6 +8942,7 @@ def similarity_focus(input, axis, indexes, name=None):
     SimilarityFocus Operator
 
     Generate a similarity focus mask with the same shape of input using the following method:
+    
     1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding
        to the axis according to the indexes. For example, if axis=1 and indexes=[a],
        it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X
@@ -8969,14 +9016,16 @@ def similarity_focus(input, axis, indexes, name=None):
         indexes(list): Indicating the indexes of the selected dimension.
 
     Returns:
-        Variable: A tensor variable with the same shape and same type
-            as the input.
+        Variable: A tensor variable with the same shape and same type \
+                  as the input.
 
     Examples:
         .. code-block:: python
+
             data = fluid.layers.data(
               name='data', shape=[2, 3, 2, 2], dtype='float32')
             x = fluid.layers.layer_norm(input=data, axis=1, indexes=[0])
+
     """
     helper = LayerHelper('similarity_focus', **locals())
     # check attrs
@@ -9055,6 +9104,7 @@ def hash(input, hash_size, num_hash=1, name=None):
 
     Examples:
        .. code-block:: python
+
            word_dict = paddle.dataset.imdb.word_dict()
            x = fluid.layers.data(shape[1], dtype='int32', lod_level=1)
            out = fluid.layers.hash(input=x, num_hash=4, hash_size=1000)
@@ -9075,50 +9125,52 @@ def hash(input, hash_size, num_hash=1, name=None):
 def grid_sampler(x, grid, name=None):
     """
     This operation samples input X by using bilinear interpolation based on
-    flow field grid, which is usually gennerated by affine_grid. The grid of
+    flow field grid, which is usually gennerated by :code:`affine_grid` . The grid of
     shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates
     with shape [N, H, W] each, where grid_x is indexing the 4th dimension
     (in width dimension) of input data x and grid_y is indexng the 3rd
     dimention (in height dimension), finally results is the bilinear
     interpolation value of 4 nearest corner points.
 
-    Step 1:
-    Get (x, y) grid coordinates and scale to [0, H-1/W-1].
+    .. code-block:: text
 
-    grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
-    grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)
+        Step 1:
+        Get (x, y) grid coordinates and scale to [0, H-1/W-1].
 
-    Step 2:
-    Indices input data X with grid (x, y) in each [H, W] area, and bilinear
-    interpolate point value by 4 nearest points.
+        grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
+        grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)
 
-      wn ------- y_n ------- en
-      |           |           |
-      |          d_n          |
-      |           |           |
-     x_w --d_w-- grid--d_e-- x_e
-      |           |           |
-      |          d_s          |
-      |           |           |
-      ws ------- y_s ------- wn
+        Step 2:
+        Indices input data X with grid (x, y) in each [H, W] area, and bilinear
+        interpolate point value by 4 nearest points.
 
-    x_w = floor(x)              // west side x coord
-    x_e = x_w + 1               // east side x coord
-    y_n = floor(y)              // north side y coord
-    y_s = y_s + 1               // south side y coord
+          wn ------- y_n ------- en
+          |           |           |
+          |          d_n          |
+          |           |           |
+         x_w --d_w-- grid--d_e-- x_e
+          |           |           |
+          |          d_s          |
+          |           |           |
+          ws ------- y_s ------- wn
 
-    d_w = grid_x - x_w          // distance to west side
-    d_e = x_e - grid_x          // distance to east side
-    d_n = grid_y - y_n          // distance to north side
-    d_s = y_s - grid_y          // distance to south side
+        x_w = floor(x)              // west side x coord
+        x_e = x_w + 1               // east side x coord
+        y_n = floor(y)              // north side y coord
+        y_s = y_s + 1               // south side y coord
 
-    wn = X[:, :, y_n, x_w]      // north-west point value
-    en = X[:, :, y_n, x_e]      // north-east point value
-    ws = X[:, :, y_s, x_w]      // south-east point value
-    es = X[:, :, y_s, x_w]      // north-east point value
+        d_w = grid_x - x_w          // distance to west side
+        d_e = x_e - grid_x          // distance to east side
+        d_n = grid_y - y_n          // distance to north side
+        d_s = y_s - grid_y          // distance to south side
 
-    output = wn * d_e * d_s + en * d_w * d_s
-           + ws * d_e * d_n + es * d_w * d_n
+        wn = X[:, :, y_n, x_w]      // north-west point value
+        en = X[:, :, y_n, x_e]      // north-east point value
+        ws = X[:, :, y_s, x_w]      // south-east point value
+        es = X[:, :, y_s, x_w]      // north-east point value
+
+        output = wn * d_e * d_s + en * d_w * d_s
+               + ws * d_e * d_n + es * d_w * d_n
 
     Args:
         x(Variable): Input data of shape [N, C, H, W].
@@ -9126,16 +9178,18 @@ def grid_sampler(x, grid, name=None):
         name (str, default None): The name of this layer.
 
     Returns:
-        out(Variable): Output of shape [N, C, H, W] data samples input X
+        Variable: Output of shape [N, C, H, W] data samples input X
         using bilnear interpolation based on input grid.
 
-    Exmples:
-    .. code-block:: python
+    Examples:
+
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[3, 10, 32, 32], dtype='float32')
+            theta = fluid.layers.data(name='theta', shape=[3, 2, 3], dtype='float32')
+            grid = fluid.layers.affine_grid(input=theta, size=[3, 10, 32, 32]})
+            out = fluid.layers.grid_sampler(x=x, grid=grid)
 
-        x = fluid.layers.data(name='x', shape=[3, 10, 32, 32], dtype='float32')
-        theta = fluid.layers.data(name='theta', shape=[3, 2, 3], dtype='float32')
-        grid = fluid.layers.affine_grid(input=theta, size=[3, 10, 32, 32]})
-        out = fluid.layers.grid_sampler(x=x, grid=grid)
     """
     helper = LayerHelper("grid_sampler", **locals())
 
@@ -9203,19 +9257,19 @@ def add_position_encoding(input, alpha, beta, name=None):
     """
     **Add Position Encoding Layer**
 
-    This layer accepts an input 3D-Tensor of shape [N x M x P], and return an
+    This layer accepts an input 3D-Tensor of shape [N x M x P], and returns an
     output Tensor of shape [N x M x P] with positional encoding value.
 
-    Refer to `Attention Is All You Need<http://arxiv.org/pdf/1706.03762.pdf>`_ .
+    Refer to `Attention Is All You Need <http://arxiv.org/pdf/1706.03762.pdf>`_ .
 
     .. math::
-        PE(pos, 2i) = \\sin{(pos / 10000^{2i / P})}   \\\\
-        PE(pos, 2i + 1) = \\cos{(pos / 10000^{2i / P})}  \\\\
-        Out(:, pos, i) = \\alpha * input(:, pos, i) + \\beta * PE(pos, i)
+        PE(pos, 2i) &= \\sin{(pos / 10000^{2i / P})}   \\\\
+        PE(pos, 2i + 1) &= \\cos{(pos / 10000^{2i / P})}  \\\\
+        Out(:, pos, i) &= \\alpha * input(:, pos, i) + \\beta * PE(pos, i)
 
     Where:
-    * PE(pos, 2i): the increment for the number at even position
-    * PE(pos, 2i + 1): the increment for the number at odd position
+      - :math:`PE(pos, 2i)` : the increment for the number at even position
+      - :math:`PE(pos, 2i + 1)` : the increment for the number at odd position
 
     Args:
         input (Variable): 3-D input tensor with shape [N x M x P]
@@ -9230,6 +9284,7 @@ def add_position_encoding(input, alpha, beta, name=None):
         .. code-block:: python
 
           position_tensor = fluid.layers.add_position_encoding(input=tensor)
+
     """
     helper = LayerHelper('add_position_encoding', **locals())
     dtype = helper.input_dtype()
@@ -9262,13 +9317,13 @@ def bilinear_tensor_product(x,
     For example:
 
     .. math::
-       out{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1
+       out_{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1
 
     In this formula:
       - :math:`x`: the first input contains M elements, shape is [batch_size, M].
       - :math:`y`: the second input contains N elements, shape is [batch_size, N].
       - :math:`W_{i}`: the i-th learned weight, shape is [M, N]
-      - :math:`out{i}`: the i-th element of out, shape is [batch_size, size].
+      - :math:`out_{i}`: the i-th element of out, shape is [batch_size, size].
       - :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`.
 
     Args:
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 49a486cf0c..4399d96626 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -393,9 +393,6 @@ def fill_constant_batch_size_like(input,
 
     It also sets *stop_gradient* to True.
 
-    >>> data = fluid.layers.fill_constant_batch_size_like(
-    >>>             input=like, shape=[1], value=0, dtype='int64')
-
     Args:
         input(${input_type}): ${input_comment}.
 
@@ -411,6 +408,14 @@ def fill_constant_batch_size_like(input,
 
     Returns:
         ${out_comment}.
+
+    Examples:
+
+        .. code-block:: python
+
+             data = fluid.layers.fill_constant_batch_size_like(
+                         input=like, shape=[1], value=0, dtype='int64')
+
     """
     helper = LayerHelper("fill_constant_batch_size_like", **locals())
     out = helper.create_variable_for_type_inference(dtype=dtype)
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index 85af8fea13..fd07ff0ba3 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -361,8 +361,8 @@ class ChunkEvaluator(MetricBase):
     Accumulate counter numbers output by chunk_eval from mini-batches and
     compute the precision recall and F1-score using the accumulated counter
     numbers.
-    For some basics of chunking, please refer to
-    'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.
+    For some basics of chunking, please refer to 
+    `Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>`_ .
     ChunkEvalEvaluator computes the precision, recall, and F1-score of chunk detection,
     and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
 
@@ -391,6 +391,7 @@ class ChunkEvaluator(MetricBase):
     def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks):
         """
         Update the states based on the layers.chunk_eval() ouputs.
+
         Args:
             num_infer_chunks(int|numpy.array): The number of chunks in Inference on the given minibatch.
             num_label_chunks(int|numpy.array): The number of chunks in Label on the given mini-batch.
@@ -450,9 +451,9 @@ class EditDistance(MetricBase):
                 distance, instance_error = distance_evaluator.eval()
 
     In the above example:
-        'distance' is the average of the edit distance in a pass.
 
-        'instance_error' is the instance error rate in a pass.
+        - 'distance' is the average of the edit distance in a pass.
+        - 'instance_error' is the instance error rate in a pass.
 
     """
 
@@ -567,12 +568,15 @@ class DetectionMAP(object):
     Calculate the detection mean average precision (mAP).
 
     The general steps are as follows:
+
     1. calculate the true positive and false positive according to the input
-        of detection and labels.
+       of detection and labels.
     2. calculate mAP value, support two versions: '11 point' and 'integral'.
 
     Please get more information from the following articles:
+
       https://sanchom.wordpress.com/tag/average-precision/
+
       https://arxiv.org/abs/1512.02325
 
     Args:
@@ -613,10 +617,12 @@ class DetectionMAP(object):
                 for data in batches:
                     loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch)
 
-        In the above example:
+    In the above example:
+
+            - 'cur_map_v' is the mAP of current mini-batch.
+            - 'accum_map_v' is the accumulative mAP of one pass.
 
-            'cur_map_v' is the mAP of current mini-batch.
-            'accum_map_v' is the accumulative mAP of one pass.
+ 
     """
 
     def __init__(self,
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
index 6cd71e39e4..a27212f38f 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
@@ -32,6 +32,8 @@ class TestConv2dFusionOp(OpTest):
         self.activation = 'relu'
         self.add_bias = True
         self.add_residual_data = True
+        self.channels = None
+        self.outputs = None
 
         self.init_group()
         self.init_dilation()
@@ -49,8 +51,8 @@ class TestConv2dFusionOp(OpTest):
         input = np.random.random(self.input_size).astype(self.dtype)
         filter = np.random.random(self.filter_size).astype(self.dtype)
 
-        output = conv2d_forward_naive(input, filter, self.groups,
-                                      conv2d_param).astype(self.dtype)
+        self.output = conv2d_forward_naive(input, filter, self.groups,
+                                           conv2d_param).astype(self.dtype)
 
         self.inputs = {
             'Input': OpTest.np_dtype_to_fluid_dtype(input),
@@ -58,19 +60,20 @@ class TestConv2dFusionOp(OpTest):
         }
 
         if self.add_residual_data:
-            residual_data = np.random.random(output.shape).astype(self.dtype)
+            residual_data = np.random.random(self.output.shape).astype(
+                self.dtype)
             self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype(
                 residual_data)
-            output += residual_data
+            self.output += residual_data
 
         if self.add_bias:
             bias = np.random.random(self.filter_size[0]).astype(self.dtype)
             self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias)
-            output = output + bias.reshape((1, bias.size, 1, 1))
+            self.output = self.output + bias.reshape((1, bias.size, 1, 1))
 
         assert self.activation in ['relu', 'identity']
         if self.activation == 'relu':
-            output = np.maximum(output, 0)
+            self.output = np.maximum(self.output, 0)
 
         self.attrs = {
             'strides': self.stride,
@@ -79,9 +82,12 @@ class TestConv2dFusionOp(OpTest):
             'dilations': self.dilations,
             'data_format': self.data_format,
             'exhaustive_search': self.exhaustive_search,
-            'activation': self.activation
+            'activation': self.activation,
+            'split_channels': self.channels
         }
-        self.outputs = {'Output': output}
+        self.outputs = {'Output': self.output}
+
+        self.set_outputs()
 
     def testcuda(self):
         return core.is_compiled_with_cuda()
@@ -117,6 +123,9 @@ class TestConv2dFusionOp(OpTest):
     def set_search_method(self):
         self.exhaustive_search = False
 
+    def set_outputs(self):
+        pass
+
 
 class TestWithoutResidual(TestConv2dFusionOp):
     def init_bias_residual(self):
@@ -160,5 +169,21 @@ class TestCUDNNExhaustiveSearch(TestConv2dFusionOp):
         self.exhaustive_search = True
 
 
+class TestMultipleOutputs(TestConv2dFusionOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [1, 32, 17, 17]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [126, f_c, 3, 3]
+        self.channels = [84, 42]
+
+    def set_outputs(self):
+        out1 = self.output[:, 0:84, :, :]
+        out2 = self.output[:, 84:126, :, :]
+        self.outputs['Outputs'] = [('out1', out1), ('out2', out2)]
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py
index aa19a5edc7..9a6b7cf476 100644
--- a/python/paddle/fluid/tests/unittests/test_data_balance.py
+++ b/python/paddle/fluid/tests/unittests/test_data_balance.py
@@ -194,4 +194,6 @@ class TestDataBalance(unittest.TestCase):
 
 
 if __name__ == '__main__':
-    unittest.main()
+    # Disable data balance unittest, because data balance would be removed
+    # unittest.main()
+    pass
diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py
index 0fe69d1bd4..6b6ab227de 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative.py
@@ -38,7 +38,9 @@ class MyLayer(fluid.imperative.PyLayer):
     def forward(self, inputs):
         x = fluid.layers.relu(inputs[0])
         self._x_for_debug = x
-        return [fluid.layers.elementwise_mul(x, x)]
+        x = fluid.layers.elementwise_mul(x, x)
+        x = fluid.layers.reduce_sum(x)
+        return [x]
 
 
 class MLP(fluid.imperative.PyLayer):
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index e180822c2b..90f5d797a6 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -243,6 +243,10 @@ class TestBook(unittest.TestCase):
             pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True)
             self.assertIsNotNone(pool)
             self.assertIsNotNone(mask)
+            self.assertIsNotNone(layers.adaptive_pool2d(x, 3, pool_type='avg'))
+            pool, mask = layers.adaptive_pool2d(x, 3, require_index=True)
+            self.assertIsNotNone(pool)
+            self.assertIsNotNone(mask)
 
     def test_adaptive_pool3d(self):
         program = Program()
@@ -255,6 +259,10 @@ class TestBook(unittest.TestCase):
                 x, [3, 3, 3], require_index=True)
             self.assertIsNotNone(pool)
             self.assertIsNotNone(mask)
+            self.assertIsNotNone(layers.adaptive_pool3d(x, 3, pool_type='avg'))
+            pool, mask = layers.adaptive_pool3d(x, 3, require_index=True)
+            self.assertIsNotNone(pool)
+            self.assertIsNotNone(mask)
 
     def test_lstm_unit(self):
         program = Program()
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
index d94494e219..559386545e 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
@@ -209,6 +209,7 @@ class TestPyReaderUsingExecutor(unittest.TestCase):
             else:
                 thread = threading.Thread(
                     target=feed_data, args=(feed_queue, reader))
+                thread.daemon = True
                 thread.start()
 
             self.outputs = []
@@ -219,6 +220,8 @@ class TestPyReaderUsingExecutor(unittest.TestCase):
 
             feed_queue.close()
             self.validate()
+            if not use_decorate_paddle_reader:
+                thread.join()
 
     def validate(self):
         self.assertEqual(len(self.inputs), len(self.outputs))
diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py
index dc3b2cb8bc..c4eb26893c 100644
--- a/python/paddle/fluid/tests/unittests/testsuite.py
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
@@ -137,9 +137,9 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):
     var_dict = {}
     for var_proto in proto_list:
         var_name = str(var_proto.name)
+        if (var_name not in np_list) and var_proto.dispensable:
+            continue
         if is_input:
-            if (var_name not in np_list) and var_proto.dispensable:
-                continue
             assert (var_name in np_list) or (var_proto.dispensable), \
                 "Missing {} as input".format(var_name)
         if var_proto.duplicable:
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index d21ec42dcc..c128843885 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -125,14 +125,23 @@ def slice_variable(var_list, slice_count, min_block_size):
 
 class DistributeTranspilerConfig(object):
     """
-    Args:
-        slice_var_up (bool): Do Tensor slice for pservers, default is True.
-        split_method (PSDispatcher): RoundRobin or HashName can be used
-          try to choose the best method to balance loads for pservers.
-        min_block_size (int): Minimum splitted element number in block.
-          According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
+    .. py:attribute:: slice_var_up (bool)
+
+          Do Tensor slice for pservers, default is True.
+
+    .. py:attribute:: split_method (PSDispatcher)
+
+          RoundRobin or HashName can be used.
+          Try to choose the best method to balance loads for pservers.
+
+    .. py:attribute:: min_block_size (int)
+
+          Minimum number of splitted elements in block.
+
+          According to : https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
           We can use bandwidth effiently when data size is larger than 2MB.If you
-          want to change it, please be sure you see the slice_variable function.
+          want to change it, please be sure you have read the slice_variable function.
+
     """
 
     slice_var_up = True