commit 48324c32f2 by sneaxiy (6 years ago)
branch: revert-15207-remove_op_handle_lock_and_fix_var
commit message: merge develop, test=develop

@ -139,10 +139,12 @@ endfunction()
message(STATUS "CUDA detected: " ${CUDA_VERSION})
if (${CUDA_VERSION} LESS 7.0)
set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
add_definitions("-DPADDLE_CUDA_BINVER=\"60\"")
elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
add_definitions("-DPADDLE_CUDA_BINVER=\"70\"")
elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
@ -150,6 +152,7 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
# CUDA 8 may complain that sm_20 is no longer supported. Suppress the
# warning for now.
list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
add_definitions("-DPADDLE_CUDA_BINVER=\"80\"")
endif()
include_directories(${CUDA_INCLUDE_DIRS})

@ -89,6 +89,7 @@ if(CUDNN_FOUND)
if(NOT CUDNN_MAJOR_VERSION)
set(CUDNN_VERSION "???")
else()
add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"")
math(EXPR CUDNN_VERSION
"${CUDNN_MAJOR_VERSION} * 1000 +
${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")

@ -32,4 +32,4 @@ endif()
add_dependencies(cub extern_cub)
LIST(APPEND externl_project_dependencies cub)
LIST(APPEND external_project_dependencies cub)

@ -28,4 +28,4 @@ endif()
add_dependencies(dlpack extern_dlpack)
LIST(APPEND externl_project_dependencies dlpack)
LIST(APPEND external_project_dependencies dlpack)

@ -110,7 +110,7 @@ function(op_library TARGET)
# Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op")
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()

@ -72,13 +72,13 @@ cc_test(reader_test SRCS reader_test.cc DEPS reader)
cc_library(threadpool SRCS threadpool.cc DEPS enforce)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto)
cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto)
if (WITH_GPU)
target_link_libraries(var_type_traits dynload_cuda)
endif()
cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
cc_library(scope SRCS scope.cc DEPS glog threadpool var_type_traits)
cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits)
cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
@ -189,9 +189,9 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS
fast_threaded_ssa_graph_executor variable_helper)
if(WITH_PSLIB)
cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib)
cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib timer)
else()
cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper)
cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper timer)
endif(WITH_PSLIB)

@ -304,8 +304,13 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
// start executing ops in multiple threads
for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
threads.push_back(
std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
if (debug) {
threads.push_back(std::thread(&ExecutorThreadWorker::TrainFilesWithTimer,
workers[thidx].get()));
} else {
threads.push_back(
std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
}
}
for (auto& th : threads) {

@ -50,7 +50,7 @@ void AllReduceOpHandle::RunImpl() {
// FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR,
// this is a distributed or inter-process call, find a better way.
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if (NoDummyInputSize() == 1 &&
local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) {
#else

@ -25,7 +25,7 @@ struct ExecutionStrategy {
size_t num_threads_{0};
bool use_cuda_{true};
bool allow_op_delay_{false};
size_t num_iteration_per_drop_scope_{100};
size_t num_iteration_per_drop_scope_{1};
ExecutorType type_{kDefault};
bool dry_run_{false};
};
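
The default of num_iteration_per_drop_scope_ is lowered from 100 to 1, so local execution scopes are now cleared after (almost) every iteration unless the caller raises it again. A minimal sketch of overriding the field from C++, assuming the struct is used via the paddle::framework::details namespace as elsewhere in the tree; the surrounding executor setup is omitted:

  paddle::framework::details::ExecutionStrategy strategy;
  strategy.num_threads_ = 4;   // worker threads per place
  strategy.use_cuda_ = true;   // run on GPU places
  // Keep local scopes alive for 100 iterations before dropping them,
  // trading higher memory use for fewer scope teardowns (the old default).
  strategy.num_iteration_per_drop_scope_ = 100;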

@ -64,20 +64,26 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
}
platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
drop_scope_counter_ += 1;
++drop_scope_counter_;
if (!fetch_tensors.empty() ||
drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
drop_scope_counter_ = 0;
// Wait All computational streams
for (auto p : places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
bool stream_end = false;
if (!fetch_tensors.empty()) {
WaitComputationalStreams();
stream_end = true;
}
if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
if (!stream_end) {
WaitComputationalStreams();
}
for (auto &scope : local_scopes_) {
auto &local_scope =
*scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
scope->DeleteScope(local_scope);
}
drop_scope_counter_ = 0;
}
if (eptr) {
std::rethrow_exception(eptr);

@ -47,6 +47,14 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
private:
inline void WaitComputationalStreams() {
// Wait All computational streams
for (auto p : places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
}
}
private:
size_t drop_scope_counter_{0};

@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/timer.h"
#include "paddle/fluid/pybind/pybind.h"
namespace paddle {
namespace framework {
@ -180,6 +181,7 @@ void ExecutorThreadWorker::SetDevice() {
return;
#else
static unsigned concurrency_cap = std::thread::hardware_concurrency();
LOG(WARNING) << "concurrency capacity " << concurrency_cap;
int thread_id = this->thread_id_;
if (static_cast<unsigned>(thread_id) < concurrency_cap) {
@ -238,6 +240,55 @@ static void print_fetch_var(Scope* scope, const std::string& var_name) {
VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type();
}
void ExecutorThreadWorker::TrainFilesWithTimer() {
platform::SetNumThreads(1);
SetDevice();
thread_reader_->Start();
std::vector<double> op_total_time;
std::vector<std::string> op_name;
for (auto& op : ops_) {
op_name.push_back(op->Type());
}
op_total_time.resize(ops_.size());
for (size_t i = 0; i < op_total_time.size(); ++i) {
op_total_time[i] = 0.0;
}
platform::Timer timeline;
double total_time = 0.0;
double read_time = 0.0;
int cur_batch;
int batch_cnt = 0;
timeline.Start();
while ((cur_batch = thread_reader_->Next()) > 0) {
timeline.Pause();
read_time += timeline.ElapsedSec();
total_time += timeline.ElapsedSec();
for (size_t i = 0; i < ops_.size(); ++i) {
timeline.Start();
ops_[i]->Run(*thread_scope_, place_);
timeline.Pause();
op_total_time[i] += timeline.ElapsedSec();
total_time += timeline.ElapsedSec();
}
++batch_cnt;
thread_scope_->DropKids();
if (thread_id_ == 0) {
if (batch_cnt > 0 && batch_cnt % 1000 == 0) {
for (size_t i = 0; i < ops_.size(); ++i) {
fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i,
op_name[i].c_str(), op_total_time[i] / batch_cnt);
}
fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt);
int fetch_var_num = fetch_var_names_.size();
for (int i = 0; i < fetch_var_num; ++i) {
print_fetch_var(thread_scope_, fetch_var_names_[i]);
}
}
}
timeline.Start();
}
}
void ExecutorThreadWorker::TrainFiles() {
platform::SetNumThreads(1);
@ -320,10 +371,12 @@ void AsyncExecutorThreadWorker::SetPSlibPtr(
std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) {
_pslib_ptr = pslib_ptr;
}
void AsyncExecutorThreadWorker::SetPullDenseThread(
std::shared_ptr<DensePullThread> dpt) {
_pull_dense_thread = dpt;
}
void AsyncExecutorThreadWorker::TrainOneNetwork() {
PrepareParams();

@ -155,6 +155,8 @@ class ExecutorThreadWorker {
void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
// A multi-thread training function
virtual void TrainFiles();
// with timer log
virtual void TrainFilesWithTimer();
// set fetch variable names from python interface assigned by users
void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
#ifdef PADDLE_WITH_PSLIB
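
TrainFilesWithTimer above accumulates per-op wall time by wrapping each Run() call in Timer::Start()/Pause() and summing ElapsedSec(). A stripped-down sketch of the same accumulation pattern; the ProfileSteps helper and its callables are illustrative, only the platform::Timer calls come from the patch:

#include <cstdio>
#include <functional>
#include <vector>
#include "paddle/fluid/platform/timer.h"

// Illustrative helper: time a list of steps the way TrainFilesWithTimer does.
void ProfileSteps(const std::vector<std::function<void()>>& steps,
                  int num_batches) {
  std::vector<double> step_time(steps.size(), 0.0);
  paddle::platform::Timer timeline;
  for (int batch = 0; batch < num_batches; ++batch) {
    for (size_t i = 0; i < steps.size(); ++i) {
      timeline.Start();
      steps[i]();        // stands in for ops_[i]->Run(*thread_scope_, place_)
      timeline.Pause();  // stop the clock, keep the elapsed interval
      step_time[i] += timeline.ElapsedSec();
    }
  }
  for (size_t i = 0; i < steps.size(); ++i) {
    fprintf(stderr, "step:[%zu], mean_time:[%fs]\n", i,
            step_time[i] / num_batches);
  }
}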

@ -16,7 +16,9 @@ limitations under the License. */
#if !defined(_WIN32)
#include <pthread.h>
#endif // !_WIN32
#else
#include <mutex> // NOLINT
#endif // !_WIN32
#include "paddle/fluid/platform/enforce.h"
@ -29,17 +31,17 @@ struct RWLock {
~RWLock() { pthread_rwlock_destroy(&lock_); }
void RDLock() {
inline void RDLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0,
"acquire read lock failed");
}
void WRLock() {
inline void WRLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0,
"acquire write lock failed");
}
void UNLock() {
inline void UNLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
}
@ -51,81 +53,46 @@ struct RWLock {
// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
// In windows, rw_lock seems like a hack. Use empty object and do nothing.
struct RWLock {
void RDLock() {}
void WRLock() {}
void UNLock() {}
// FIXME(minqiyang): use mutex here to do fake lock
inline void RDLock() { mutex_.lock(); }
inline void WRLock() { mutex_.lock(); }
inline void UNLock() { mutex_.unlock(); }
private:
std::mutex mutex_;
};
#endif
class RWLockGuard {
class AutoWRLock {
public:
enum Status { kUnLock, kWRLock, kRDLock };
RWLockGuard(RWLock* rw_lock, Status init_status)
: lock_(rw_lock), status_(Status::kUnLock) {
switch (init_status) {
case Status::kRDLock: {
RDLock();
break;
}
case Status::kWRLock: {
WRLock();
break;
}
case Status::kUnLock: {
break;
}
}
}
explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
void WRLock() {
switch (status_) {
case Status::kUnLock: {
lock_->WRLock();
status_ = Status::kWRLock;
break;
}
case Status::kWRLock: {
break;
}
case Status::kRDLock: {
PADDLE_THROW(
"Please unlock read lock first before invoking write lock.");
break;
}
}
}
~AutoWRLock() { UnLock(); }
void RDLock() {
switch (status_) {
case Status::kUnLock: {
lock_->RDLock();
status_ = Status::kRDLock;
break;
}
case Status::kRDLock: {
break;
}
case Status::kWRLock: {
PADDLE_THROW(
"Please unlock write lock first before invoking read lock.");
break;
}
}
}
private:
inline void Lock() { lock_->WRLock(); }
void UnLock() {
if (status_ != Status::kUnLock) {
lock_->UNLock();
status_ = Status::kUnLock;
}
}
inline void UnLock() { lock_->UNLock(); }
private:
RWLock* lock_;
};
class AutoRDLock {
public:
explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
~AutoRDLock() { UnLock(); }
private:
inline void Lock() { lock_->RDLock(); }
~RWLockGuard() { UnLock(); }
inline void UnLock() { lock_->UNLock(); }
private:
RWLock* lock_;
Status status_;
};
} // namespace framework
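
The stateful RWLockGuard is replaced by two single-purpose RAII guards, and the Windows fallback now takes a real std::mutex instead of doing nothing. A minimal sketch of guarding a shared map with the new guards; the Registry class and its members are illustrative, only RWLock/AutoRDLock/AutoWRLock come from the header above:

#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/rw_lock.h"

// Illustrative reader/writer-protected registry.
class Registry {
 public:
  void Insert(const std::string& key, int value) {
    paddle::framework::AutoWRLock guard(&lock_);  // exclusive: one writer
    table_[key] = value;
  }
  int Get(const std::string& key) const {
    paddle::framework::AutoRDLock guard(&lock_);  // shared: many readers
    auto it = table_.find(key);
    return it == table_.end() ? -1 : it->second;
  }

 private:
  mutable paddle::framework::RWLock lock_;
  std::unordered_map<std::string, int> table_;
};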

@ -47,9 +47,15 @@ DEFINE_bool(fast_eager_deletion_mode, false,
// the mutex will cause serious performance issue.
// So the mutex is disabled when `ON_INFER`.
#ifdef PADDLE_ON_INFERENCE
#define SCOPE_LOCK_GUARD
#define SCOPE_KIDS_READER_LOCK
#define SCOPE_KIDS_WRITER_LOCK
#define SCOPE_VARS_READER_LOCK
#define SCOPE_VARS_WRITER_LOCK
#else
#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);
#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_);
#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_);
#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_);
#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_);
#endif
namespace paddle {
@ -67,64 +73,69 @@ bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
Scope::~Scope() { DropKids(); }
Scope& Scope::NewScope() const {
SCOPE_LOCK_GUARD
kids_.push_back(new Scope(this));
return *kids_.back();
Scope* child = new Scope(this);
{
SCOPE_KIDS_WRITER_LOCK
kids_.push_back(child);
}
return *child;
}
Variable* Scope::Var(const std::string& name) {
SCOPE_LOCK_GUARD
SCOPE_VARS_WRITER_LOCK
return VarInternal(name);
}
Variable* Scope::Var(std::string* name) {
SCOPE_LOCK_GUARD
auto new_name = string::Sprintf("%p.%d", this, vars_.size());
if (name != nullptr) {
*name = new_name;
}
SCOPE_VARS_WRITER_LOCK
return VarInternal(new_name);
}
Variable* Scope::FindVar(const std::string& name) const {
SCOPE_LOCK_GUARD
SCOPE_VARS_READER_LOCK
return FindVarInternal(name);
}
Variable* Scope::FindLocalVar(const std::string& name) const {
SCOPE_LOCK_GUARD
SCOPE_VARS_READER_LOCK
return FindVarLocally(name);
}
const Scope* Scope::FindScope(const Variable* var) const {
SCOPE_LOCK_GUARD
SCOPE_VARS_READER_LOCK
return FindScopeInternal(var);
}
void Scope::DropKids() {
SCOPE_LOCK_GUARD
SCOPE_KIDS_WRITER_LOCK
for (Scope* s : kids_) delete s;
kids_.clear();
}
bool Scope::HasKid(const Scope* scope) const {
SCOPE_LOCK_GUARD
SCOPE_KIDS_READER_LOCK
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
return it != this->kids_.end();
}
std::vector<std::string> Scope::LocalVarNames() const {
SCOPE_LOCK_GUARD
std::vector<std::string> known_vars;
known_vars.reserve(this->vars_.size());
for (auto& p : vars_) {
known_vars.emplace_back(p.first);
{
SCOPE_VARS_READER_LOCK
known_vars.reserve(this->vars_.size());
for (auto& p : vars_) {
known_vars.emplace_back(p.first);
}
}
return known_vars;
}
void Scope::DeleteScope(Scope* scope) const {
SCOPE_LOCK_GUARD
SCOPE_KIDS_WRITER_LOCK
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope",
this, scope);
@ -138,8 +149,8 @@ void Scope::DeleteScope(Scope* scope) const {
}
void Scope::EraseVars(const std::vector<std::string>& var_names) {
SCOPE_LOCK_GUARD
std::set<std::string> var_set(var_names.begin(), var_names.end());
SCOPE_VARS_WRITER_LOCK
for (auto it = vars_.begin(); it != vars_.end();) {
if (var_set.find(it->first) != var_set.end()) {
it = vars_.erase(it);
@ -151,12 +162,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
void Scope::Rename(const std::string& origin_name,
const std::string& new_name) const {
SCOPE_LOCK_GUARD
SCOPE_VARS_WRITER_LOCK
RenameInternal(origin_name, new_name);
}
std::string Scope::Rename(const std::string& origin_name) const {
SCOPE_LOCK_GUARD
SCOPE_VARS_WRITER_LOCK
auto new_name = string::Sprintf("%p.%d", this, vars_.size());
RenameInternal(origin_name, new_name);
return new_name;

@ -14,12 +14,18 @@ limitations under the License. */
#pragma once
extern "C" {
#include <xxhash.h>
}
#include <list>
#include <mutex> // NOLINT
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/macros.h"
@ -95,7 +101,14 @@ class Scope {
std::string Rename(const std::string& origin_name) const;
protected:
mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
struct KeyHasher {
std::size_t operator()(const std::string& key) const {
return XXH32(key.c_str(), key.size(), 1);
}
};
mutable std::unordered_map<std::string, std::unique_ptr<Variable>, KeyHasher>
vars_;
private:
// Call Scope::NewScope for a sub-scope.
@ -124,7 +137,8 @@ class Scope {
DISABLE_COPY_AND_ASSIGN(Scope);
private:
mutable std::mutex mutex_;
mutable RWLock kids_lock_;
mutable RWLock vars_lock_;
};
// Generate some debug string about the inheritance structure of scope, quite
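
Scope now hashes variable names with xxHash (seed 1) instead of std::hash, and protects kids_ and vars_ with separate reader/writer locks. A self-contained sketch of the hashing part, i.e. plugging an XXH32-based functor into std::unordered_map; everything except the XXH32 call itself is illustrative:

#include <cstddef>
#include <string>
#include <unordered_map>
extern "C" {
#include <xxhash.h>
}

struct XXH32Hasher {
  std::size_t operator()(const std::string& key) const {
    // Same call as Scope::KeyHasher: hash the raw bytes with seed 1.
    return XXH32(key.c_str(), key.size(), 1);
  }
};

int main() {
  std::unordered_map<std::string, int, XXH32Hasher> vars;
  vars["fc_0.w_0"] = 1;
  return vars.count("fc_0.w_0") == 1 ? 0 : 1;
}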

@ -19,6 +19,10 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/platform/cudnn_helper.h"
DECLARE_uint64(conv_workspace_size_limit);
DECLARE_bool(cudnn_exhaustive_search);
DECLARE_int64(cudnn_exhaustive_search_times);
namespace paddle {
namespace operators {
@ -45,6 +49,7 @@ static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
template <typename TAlgorithm>
class AlgorithmsCache {
public:
AlgorithmsCache() : search_times_(0) { hash_.clear(); }
// Caches the best algorithm for a given
// combination of tensor dimensions & compute data type.
TAlgorithm GetAlgorithm(
@ -54,9 +59,14 @@ class AlgorithmsCache {
int algorithmFlags, // can set for different data type
std::function<TAlgorithm()> gen_func);
TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags,
std::function<TAlgorithm()> gen_func);
private:
std::unordered_map<int64_t, TAlgorithm> hash_;
std::mutex mutex_;
int search_times_;
};
template <typename TAlgorithm>
@ -107,5 +117,29 @@ TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
return hash_[seed];
}
template <typename TAlgorithm>
TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
int64_t area, int search_times, int algorithmFlags,
std::function<TAlgorithm()> gen_func) {
if (hash_.find(area) != hash_.end()) {
return hash_[area];
}
if (search_times_ < search_times) {
auto algo = gen_func();
hash_[area] = algo;
++search_times_;
return algo;
}
TAlgorithm algo;
int64_t min = static_cast<uint64_t>(INT_MAX);
for (const auto& m : hash_) {
if (m.first < min) {
min = m.first;
algo = m.second;
}
}
return algo;
}
} // namespace operators
} // namespace paddle
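
The new GetAlgorithm overload keys the cache on a single int64_t area (the caller passes H*W of the input), runs gen_func only while fewer than search_times distinct areas have been searched, and afterwards reuses a cached entry (the selection loop at the end picks the smallest stored area). A hedged usage sketch with int standing in for cudnnConvolutionFwdAlgo_t; it assumes a CUDA build since the header pulls in cudnn_helper.h, and only the AlgorithmsCache API comes from the patch:

#include <cstdint>
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"

int PickAlgoFor(paddle::operators::AlgorithmsCache<int>* cache, int64_t area) {
  return cache->GetAlgorithm(area, /*search_times=*/2, /*algorithmFlags=*/0,
                             [area]() {
                               // Stand-in for the exhaustive cuDNN search.
                               return static_cast<int>(area % 7);
                             });
}

// Usage: the first two distinct areas trigger the search lambda and are
// cached; any later area reuses the entry with the smallest cached key.
//   paddle::operators::AlgorithmsCache<int> cache;
//   PickAlgoFor(&cache, 28 * 28);    // searched and cached
//   PickAlgoFor(&cache, 56 * 56);    // searched and cached
//   PickAlgoFor(&cache, 112 * 112);  // budget spent: reuses a cached algo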

@ -28,6 +28,8 @@ namespace operators {
// x is Input,
// z is ResidualData,
// bias is Bias
// When `split_channels` is set, y will be split into multiple outputs,
// each output has split_channels[i] number of channels.
class Conv2DFusionOpMaker : public Conv2DOpMaker {
protected:
void Apply() override {
@ -36,8 +38,65 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker {
"The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' "
"'relux' , 'tanh', 'band_pass'")
.SetDefault("relu");
AddAttr<std::vector<int>>(
"split_channels",
"When `split_channels` are set, there will be multiple outputs, the "
"output size is equal to the number of `split_channels`.")
.SetDefault({});
AddOutput("Outputs",
"This Outputs is used when setting `split_channels`."
"Usually used to fuse conv with same input and same filter size, "
"padding, stride, dilation size.")
.AsDuplicable()
.AsDispensable();
AddInput("AlgoCache",
"The cache of convolution algorithm, a RAW type variable.")
.AsDispensable();
AddAttr<int>(
"search_times",
"The number of exhaustive search times for convolution algorithm.")
.SetDefault(-1);
}
};
class Conv2DFusionOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of ConvOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Filter"),
"Input(Filter) of ConvOp should not be null.");
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
std::vector<int> dilations =
ctx->Attrs().Get<std::vector<int>>("dilations");
std::vector<int64_t> oshape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
oshape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i], strides[i]));
}
PADDLE_ENFORCE(ctx->HasOutput("Output"),
"Output(Output) of ConvOp should not be null.");
ctx->SetOutputDim("Output", framework::make_ddim(oshape));
std::vector<int> channels =
ctx->Attrs().Get<std::vector<int>>("split_channels");
if (channels.size()) {
PADDLE_ENFORCE(ctx->HasOutputs("Outputs"),
"Output(Outputs) of ConvOp should not be null.");
std::vector<framework::DDim> oshapes;
oshapes.reserve(channels.size());
for (size_t i = 0; i < channels.size(); ++i) {
oshapes.push_back({oshape[0], channels[i], oshape[2], oshape[3]});
}
ctx->SetOutputsDim("Outputs", oshapes);
}
}
};
// TODO(qingqing): add gradient operator for conv2d_fusion
} // namespace operators
@ -45,4 +104,5 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker {
namespace ops = paddle::operators;
REGISTER_OPERATOR(conv2d_fusion, ops::ConvOp, ops::Conv2DFusionOpMaker,
ops::ConvOpInferVarType, paddle::framework::EmptyGradOpMaker);
ops::Conv2DFusionOpInferShape, ops::ConvOpInferVarType,
paddle::framework::EmptyGradOpMaker);
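
A worked (hypothetical) example of the split_channels path: for an NCHW Output of shape [1, 96, 28, 28] produced by the fused convolution, setting split_channels = {32, 64} makes InferShape register two extra outputs in Outputs with shapes [1, 32, 28, 28] and [1, 64, 28, 28], i.e. a channel-wise split of the fused result; the CUDA kernel later materializes them by slicing the Output tensor (currently only for batch size 1).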

@ -16,8 +16,9 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/cudnn_helper.h"
DECLARE_uint64(conv_workspace_size_limit);
DECLARE_bool(cudnn_exhaustive_search);
DEFINE_int64(cudnn_exhaustive_search_times, -1,
"Exhaustive search times for cuDNN convolution, "
"defalut is 1, only search once.");
namespace paddle {
namespace operators {
@ -117,41 +118,60 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
workspace_size_limit, &algo));
VLOG(3) << "cuDNN forward algo " << algo;
} else {
auto search_func = [&]() {
int returned_algo_count;
std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
fwd_perf_stat;
auto cudnn_find_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(
platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
handle, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit));
};
workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
VLOG(3) << "Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = fwd_perf_stat[i];
VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time << " "
<< stat.memory;
}
return fwd_perf_stat[0].algo;
};
AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
int search_times = ctx.Attr<int>("search_times");
search_times = std::max(
static_cast<int>(FLAGS_cudnn_exhaustive_search_times), search_times);
if (search_times > 0) {
// The searched algo will be cached by `search_times` times for
// different input dimension. For other dimensions, select the algo
// of closest area.
auto var_name = ctx.Inputs("AlgoCache")[0];
algo_cache =
ctx.scope()
.FindVar(kCUDNNFwdAlgoCache)
.FindVar(var_name)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
algo = algo_cache->GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0,
search_func);
} else {
algo_cache =
const_cast<framework::Scope&>(ctx.scope())
.Var(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
// Cache searched algo in Var(kCUDNNFwdAlgoCache).
// all conv ops use the same kCUDNNFwdAlgoCache variable.
if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
algo_cache =
ctx.scope()
.FindVar(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
} else {
// TODO(qingqing) remove const_cast
algo_cache =
const_cast<framework::Scope*>(ctx.scope().parent())
->Var(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
}
algo = algo_cache->GetAlgorithm(x_dims, f_dims, strides, paddings,
dilations, 0, search_func);
}
algo = algo_cache->GetAlgorithm(
x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
int returned_algo_count;
std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
fwd_perf_stat;
auto cudnn_find_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(
platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
handle, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, cudnn_output_desc,
output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
fwd_perf_stat.data(), cudnn_workspace,
workspace_size_limit));
};
workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
VLOG(3) << "Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = fwd_perf_stat[i];
VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
<< " " << stat.memory;
}
return fwd_perf_stat[0].algo;
});
VLOG(3) << "choose algo " << algo;
}
@ -195,6 +215,27 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
}
std::vector<int> channels = ctx.Attr<std::vector<int>>("split_channels");
if (channels.size()) {
auto outs = ctx.MultiOutput<framework::Tensor>("Outputs");
if (x_dims[0] == 1) {
// share data with Output
framework::Tensor t;
t.ShareDataWith(*output);
auto y_dims = output->dims();
t.Resize({y_dims[1], y_dims[2], y_dims[3]});
int s = 0;
for (size_t i = 0; i < channels.size(); ++i) {
int e = s + channels[i];
outs[i]->ShareDataWith(t.Slice(s, e));
outs[i]->Resize({x_dims[0], channels[i], y_dims[2], y_dims[3]});
s = e;
}
} else {
// TODO(qingqing): do copy when batch size is larger than 1
PADDLE_THROW("Batch size greater than 1 is unsupported");
}
}
}
};
#endif

@ -52,12 +52,12 @@ std::unique_ptr<framework::Scope> GenerateVars(platform::Place place) {
framework::Scope* scope = new framework::Scope();
framework::Variable* var = scope->Var("var1");
auto* slr = var->GetMutable<framework::SelectedRows>();
slr->set_height(1000);
slr->set_height(20000);
auto* tensor = slr->mutable_value();
auto* rows = slr->mutable_rows();
tensor->Resize(framework::make_ddim({3, 5}));
tensor->Resize(framework::make_ddim({20000, 1024}));
tensor->mutable_data<float>(place);
paddle::operators::math::set_constant(ctx, tensor, 32.7);
@ -83,6 +83,7 @@ void Gather(const std::vector<distributed::RemoteVar>& vars,
}
TEST(PREFETCH, GPU) {
setenv("FLAGS_max_body_size", "2147483647", 1);
platform::CUDAPlace place;
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);

@ -1,6 +1,8 @@
include(operators)
register_operators(EXCLUDES fusion_transpose_flatten_concat_op)
register_operators(EXCLUDES fusion_transpose_flatten_concat_op fusion_conv_inception_op)
if (WITH_GPU)
op_library(fusion_transpose_flatten_concat_op)
op_library(fusion_conv_inception_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n")
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_inception_fusion);\n")
endif()

@ -0,0 +1,110 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
namespace paddle {
namespace operators {
class ConvInceptionFusionOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
// 1 x
auto in_dims = ctx->GetInputDim("Input");
// 4 filters
auto w_dims = ctx->GetInputsDim("Filter");
PADDLE_ENFORCE(in_dims.size(), 4, "Conv input should be 4-D tensor.");
PADDLE_ENFORCE_EQ(w_dims.size(), 4, "There should be 4 filters");
PADDLE_ENFORCE_EQ(w_dims[0][1], in_dims[1]);
PADDLE_ENFORCE_EQ(w_dims[1][1], in_dims[1]);
int n = in_dims[0];
// compute output channel
// 1st channel
int c = w_dims[0][0];
// add 2nd channel
c += (w_dims[1][0] - w_dims[2][1] * 2);
// add 3rd channel
c += (w_dims[2][0] - w_dims[3][1]);
// add 4-th channel
c += w_dims[3][0];
int h = in_dims[2];
int w = in_dims[3];
ctx->SetOutputDim("Output", {n, c, h, w});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
ctx.Input<framework::LoDTensor>("Input")->type(), ctx.device_context());
}
};
class ConvInceptionFusionOpMaker : public framework::OpProtoAndCheckerMaker {
protected:
void Make() override {
AddInput("Input", "(Tensor) NCHW layout.");
AddInput("Filter", "(vector<Tensor>) 4 aggregated filters").AsDuplicable();
AddInput("Bias", "(vector<Tensor>) it's lenght is equal to Filter")
.AsDuplicable();
AddOutput("Output",
"(Tensor) The output tensor of convolution operator. "
"The format of output tensor is also NCHW.");
AddOutput("TempOutput", "").AsDuplicable();
AddAttr<std::string>(
"pooling_type",
"(string), pooling type, can be \"max\" for max-pooling "
"and \"avg\" for average-pooling.")
.InEnum({"max", "avg"});
AddAttr<bool>(
"exclusive",
"(bool, default True) When true, will exclude the zero-padding in the "
"averaging calculating, otherwise, include the zero-padding. Note, it "
"is only used when pooling_type is avg. The defalut is True.")
.SetDefault(true);
AddAttr<std::string>(
"activation",
"The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' "
"'relux' , 'tanh', 'band_pass'")
.SetDefault("relu");
AddAttr<int>("workspace_size_MB",
"Only used in cudnn kernel. Need set use_cudnn to true."
"workspace size for cudnn, in MB, "
"workspace is a section of GPU memory which will be "
"allocated/freed each time the operator runs, larger "
"workspace size can increase performance but also requires "
"better hardware. This size should be chosen carefully.")
.SetDefault(4096);
AddComment(R"DOC(
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(conv2d_inception_fusion, ops::ConvInceptionFusionOp,
ops::ConvInceptionFusionOpMaker,
paddle::framework::EmptyGradOpMaker);
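
To make the channel bookkeeping in InferShape concrete (the filter shapes below are hypothetical): with Filter shapes w0 = [64, Cin, 1, 1], w1 = [192, Cin, 1, 1], w2 = [128, 64, 3, 3] and w3 = [96, 96, 3, 3], the fused output channel count is c = 64 + (192 - 64*2) + (128 - 96) + 96 = 256. The subtracted terms are the intermediate 1x1 channels consumed by the following 3x3 convolutions, which presumably flow through TempOutput rather than into the concatenated Output.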

File diff suppressed because it is too large.

@ -84,6 +84,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
cc_library(timer SRCS timer.cc)
cc_test(timer_test SRCS timer_test.cc DEPS timer)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)

@ -38,6 +38,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_R6
CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_R7
CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
#endif

Some files were not shown because too many files have changed in this diff.
