Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into hard_example

emailweixu-patch-1
wanghaox 7 years ago
commit 62dc593e51

@ -186,6 +186,11 @@ function(cc_library TARGET_NAME)
add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
endif()
if (cc_library_DEPS)
# No need to link libwarpctc.so
if ("${cc_library_DEPS};" MATCHES "warpctc;")
list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc)
endif()
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
endif()
@ -465,10 +470,10 @@ function(py_test TARGET_NAME)
if(WITH_TESTING)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS ARGS)
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()

@ -0,0 +1,32 @@
### Design Doc: Switch
### Background
Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid.
The following example shows the usage of `fluid.switch`.
```python
a = fluid.Var(10)
b = fluid.Var(0)
switch = fluid.switch()
with switch.block():
    with switch.case(fluid.less_equal(a, 10)):
        fluid.print("Case 1")
    with switch.case(fluid.larger(a, 0)):
        fluid.print("Case 2")
    with switch.default():
        fluid.print("Case 3")
```
### The Semantics
1. A `switch` control-flow checks cases one-by-one.
1. The condition of each case is a boolean value, which is a scalar; this differs from the `fluid.if_else` control-flow, whose condition can be a vector of boolean values.
1. It runs the first matched case, or the default case if there is one.
1. Once it matches a case, it runs the corresponding branch and only that branch. It's like there is a C's `break` keyword at the end of each case.
The above program should print and print only "Case 1".
The implementation of the backward pass of the `switch` control-flow is easier than that of `if_else`, because `switch` runs at most one branch, whereas `if_else` could run more than one branch.
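For readers who think in host code, the semantics above are roughly those of an `if`/`else if` chain with an implicit `break` after each case. A minimal C++ analogy (illustrative only, not Fluid code) shows why the example prints only "Case 1":
```c++
#include <iostream>

int main() {
  int a = 10;
  // Only the first matching case runs, as if every case ended with `break`.
  if (a <= 10) {
    std::cout << "Case 1\n";
  } else if (a > 0) {
    std::cout << "Case 2\n";  // skipped even though a > 0 also holds
  } else {
    std::cout << "Case 3\n";  // the `default` branch
  }
  return 0;
}
```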

@ -92,11 +92,11 @@ paddle.init(
Parameter description
- use_gpu: **optional, default False**, whether to enable GPU training
- trainer_count: **required, default 1**, total number of trainers in the current training job
- trainer_count: **required, default 1**, number of threads in the current trainer
- port: **required, default 7164**, port used to connect to the pserver
- ports_num: **required, default 1**, number of ports used to connect to the pserver
- ports_num_for_sparse: **required, default 0**, number of ports used for sparse-parameter communication with the pserver
- num_gradient_servers: **required, default 1**, total number of pservers in the current training job
- num_gradient_servers: **required, default 1**, total number of trainers in the current training job
- trainer_id: **required, default 0**, unique ID of each trainer, an integer starting from 0
- pservers: **required, default 127.0.0.1**, list of IPs of the pservers for the current training job, separated by ","

@ -95,11 +95,11 @@ paddle.init(
Parameter Description
- use_gpu: **optional, default False**, set to "True" to enable GPU training.
- trainer_count: **required, default 1**, total count of trainers in the training job.
- trainer_count: **required, default 1**, number of threads in current trainer.
- port: **required, default 7164**, port to connect to parameter server.
- ports_num: **required, default 1**, number of ports for communication.
- ports_num_for_sparse: **required, default 0**, number of ports for sparse type calculation.
- num_gradient_servers: **required, default 1**, total number of gradient server.
- num_gradient_servers: **required, default 1**, number of trainers in current job.
- trainer_id: **required, default 0**, ID for every trainer, start from 0.
- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".

@ -29,16 +29,16 @@ TEST(Channel, MakeAndClose) {
{
// MakeChannel should return a buffered channel if buffer_size > 0.
auto ch = MakeChannel<int>(10);
EXPECT_NE(dynamic_cast<Buffered<int>*>(ch), nullptr);
EXPECT_EQ(dynamic_cast<UnBuffered<int>*>(ch), nullptr);
EXPECT_NE(dynamic_cast<Buffered<int> *>(ch), nullptr);
EXPECT_EQ(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
CloseChannel(ch);
delete ch;
}
{
// MakeChannel should return an un-buffered channel if buffer_size = 0.
auto ch = MakeChannel<int>(0);
EXPECT_EQ(dynamic_cast<Buffered<int>*>(ch), nullptr);
EXPECT_NE(dynamic_cast<UnBuffered<int>*>(ch), nullptr);
EXPECT_EQ(dynamic_cast<Buffered<int> *>(ch), nullptr);
EXPECT_NE(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
CloseChannel(ch);
delete ch;
}
@ -78,3 +78,132 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
t.join();
delete ch;
}
TEST(Channel, SimpleUnbufferedChannelTest) {
auto ch = MakeChannel<int>(0);
unsigned sum_send = 0;
std::thread t([&]() {
for (int i = 0; i < 5; i++) {
ch->Send(&i);
sum_send += i;
}
});
for (int i = 0; i < 5; i++) {
int recv;
ch->Receive(&recv);
EXPECT_EQ(recv, i);
}
CloseChannel(ch);
t.join();
EXPECT_EQ(sum_send, 10U);
delete ch;
}
// This tests that closing an unbuffered channel also unblocks
// any receivers blocked while waiting for a sender
TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
auto ch = MakeChannel<int>(0);
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to read and are blocked because there are no writers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
ch->Receive(&data);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the channel
// This should unblock all receivers
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch;
}
// This tests that closing an unbuffered channel also unblocks
// any senders blocked while waiting for a receiver
TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
auto ch = MakeChannel<int>(0);
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to write and are blocked because there are no readers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data = 10;
ch->Send(&data);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the channel
// This should unblock all senders
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch;
}
TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
auto ch = MakeChannel<int>(0);
unsigned sum_send = 0;
// Send should block after three iterations
// since we only receive three times.
std::thread t([&]() {
// Try to send more times
// than we receive
for (int i = 0; i < 4; i++) {
ch->Send(&i);
sum_send += i;
}
});
for (int i = 0; i < 3; i++) {
int recv;
ch->Receive(&recv);
EXPECT_EQ(recv, i);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
EXPECT_EQ(sum_send, 3U);
CloseChannel(ch);
t.join();
delete ch;
}

@ -1,4 +1,4 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <atomic>
#include <condition_variable>
#include <deque>
#include <mutex>
#include "paddle/framework/channel.h"
@ -36,20 +36,104 @@ class UnBuffered : public paddle::framework::Channel<T> {
virtual ~UnBuffered();
private:
UnBuffered() {}
std::mutex mu_ch_;
// Mutexes that serialize concurrent readers and concurrent writers,
// and on which each side waits for its counterpart to complete the exchange
std::recursive_mutex mu_read_, mu_write_;
// reader_found_ is set true when a reader is ready to accept data
// writer_found_ is set true when a writer is ready to send data
// A transaction occurs only when both are true
std::atomic<bool> reader_found_{false}, writer_found_{false};
std::condition_variable cv_channel_;
std::condition_variable_any cv_reader_, cv_writer_;
T* item{nullptr};
std::atomic<bool> closed_{false};
UnBuffered() : closed_(false) {}
void NotifyAllParticipants(std::unique_lock<std::mutex>*);
};
// Implements how data is handed over from a writer to a reader.
template <typename T>
void UnBuffered<T>::Send(T* data) {
// Prevent other writers from entering
std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
writer_found_ = true;
std::unique_lock<std::recursive_mutex> cv_lock(mu_write_);
// If writer comes first, it should wait till a reader arrives
cv_writer_.wait(cv_lock,
[this]() { return reader_found_ == true || closed_; });
cv_reader_.notify_one();
if (!closed_) {
std::unique_lock<std::mutex> channel_lock(mu_ch_);
item = data;
channel_lock.unlock();
cv_channel_.notify_one();
channel_lock.lock();
cv_channel_.wait(channel_lock,
[this]() { return item == nullptr || closed_; });
}
writer_found_ = false;
}
// Implements how data sent by a writer is read by a reader.
template <typename T>
void UnBuffered<T>::Send(T* channel_element) {}
void UnBuffered<T>::Receive(T* data) {
// Prevent other readers from entering
std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
reader_found_ = true;
std::unique_lock<std::recursive_mutex> cv_lock{mu_read_};
// If reader comes first, it should wait till a writer arrives
cv_reader_.wait(cv_lock,
[this]() { return writer_found_ == true || closed_; });
cv_writer_.notify_one();
if (!closed_) {
std::unique_lock<std::mutex> lock_ch{mu_ch_};
// Reader should wait for the writer to first write its data
cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; });
if (!closed_) {
*data = std::move(*item);
item = nullptr;
lock_ch.unlock();
}
cv_channel_.notify_one();
}
reader_found_ = false;
}
// This function implements the sequence of events
// that take place once the channel is closed.
template <typename T>
void UnBuffered<T>::Receive(T*) {}
void UnBuffered<T>::Close() {
std::unique_lock<std::mutex> lock(mu_ch_);
item = nullptr;
closed_ = true;
NotifyAllParticipants(&lock);
}
// This function implements the sequence of events that are
// executed once an UnBuffered channel object is destroyed.
template <typename T>
void UnBuffered<T>::Close() {}
UnBuffered<T>::~UnBuffered() {
std::unique_lock<std::mutex> lock(mu_ch_);
item = nullptr;
closed_ = true;
NotifyAllParticipants(&lock);
}
// Notifies all readers and writers blocked on the reader, writer
// and channel condition variables.
template <typename T>
UnBuffered<T>::~UnBuffered() {}
void UnBuffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
lock->unlock();
cv_writer_.notify_all();
cv_channel_.notify_all();
cv_reader_.notify_all();
}
} // namespace details
} // namespace framework
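The handshake described by the comments above (a transaction happens only when both a reader and a writer are present) can be illustrated with plain standard-library primitives. The following sketch is a simplified analogy, not the Paddle implementation: it performs exactly one exchange between one writer and one reader and ignores `Close()`.
```c++
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

int main() {
  std::mutex mu;
  std::condition_variable cv;
  bool has_item = false;  // plays the role of the writer_found_/item pair above
  int slot = 0;

  std::thread writer([&] {
    std::unique_lock<std::mutex> lock(mu);
    slot = 42;
    has_item = true;
    cv.notify_one();                           // a writer is ready
    cv.wait(lock, [&] { return !has_item; });  // block until the reader consumed it
  });

  {
    std::unique_lock<std::mutex> lock(mu);
    cv.wait(lock, [&] { return has_item; });   // block until a writer is ready
    std::cout << "received " << slot << std::endl;
    has_item = false;
    cv.notify_one();                           // release the blocked writer
  }
  writer.join();
  return 0;
}
```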

@ -34,18 +34,6 @@ namespace framework {
template <typename T>
class Vector : public std::vector<T> {
public:
/* NOTE(dzhwinter):
* Data always store and modified on Host.
* If the data is modified when use cuda_data interface,
* You need to call the CopyFromCUDA explicitly to synchronize data.
*
*/
enum class kDataPosition {
kDataOnHost = 0,
kDataOnDevice = 1,
};
public:
using std::vector<T>::vector;
@ -55,11 +43,12 @@ class Vector : public std::vector<T> {
virtual ~Vector() {
#ifdef PADDLE_WITH_CUDA
if (cuda_ptr_ != nullptr) {
memory::Free<platform::CUDAPlace>(place_, static_cast<void *>(cuda_ptr_));
memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
}
#endif
}
/* Get device vector */
T *cuda_data() {
CopyToCUDA();
PADDLE_ENFORCE_NOT_NULL(
@ -67,81 +56,73 @@ class Vector : public std::vector<T> {
return static_cast<T *>(cuda_ptr_);
}
/* Get host vector */
T *data() { return std::vector<T>::data(); }
const T *data() const { return std::vector<T>::data(); }
/* Synchronize host vector to device vector */
void CopyToCUDA();
/* Synchronize device vector to host vector */
void CopyFromCUDA();
/* Switch device vector location */
void CopyToPeer(platform::Place);
private:
void *cuda_ptr_ = nullptr;
size_t cuda_size_ = 0;
/*The DataPosition is unused now,
if we want support random access from cpu and cuda,
we need to overload all the vector method */
kDataPosition position_ = kDataPosition::kDataOnHost;
size_t cuda_size_ = 0; // device vector numel
platform::CUDAPlace place_;
};
template <typename T>
void Vector<T>::CopyToCUDA() {
#ifdef PADDLE_WITH_CUDA
if (cuda_ptr_ == nullptr) {
if (cuda_size_ < this->size()) {
if (cuda_ptr_ != nullptr) {
memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
}
cuda_ptr_ =
memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T));
}
cuda_size_ = this->size();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *cuda_ctx = pool.GetByPlace(place_);
memory::Copy(place_, static_cast<void *>(cuda_ptr_), platform::CPUPlace(),
auto *ctx = pool.GetByPlace(place_);
memory::Copy(place_, cuda_ptr_, platform::CPUPlace(),
static_cast<const void *>(this->data()),
this->size() * sizeof(T), cuda_ctx->stream());
cuda_ctx->Wait();
cuda_size_ = this->size();
this->size() * sizeof(T), ctx->stream());
ctx->Wait();
#endif
}
template <typename T>
void Vector<T>::CopyFromCUDA() {
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *cuda_ctx = pool.GetByPlace(place_);
if (cuda_ptr_ == nullptr) {
LOG(WARNING) << "No uncommited cuda data.";
LOG(WARNING) << "No uncommitted cuda data.";
return;
}
this->resize(cuda_size_);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *ctx = pool.GetByPlace(place_);
memory::Copy(platform::CPUPlace(), static_cast<void *>(this->data()), place_,
static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
cuda_ctx->stream());
cuda_ctx->Wait();
ctx->stream());
ctx->Wait();
#endif
}
template <typename T>
void Vector<T>::CopyToPeer(platform::Place peer_place) {
if (platform::is_cpu_place(peer_place)) {
return;
}
#ifdef PADDLE_WITH_CUDA
auto *cuda_ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
void *peer_cuda_ptr_ = memory::Alloc<platform::CUDAPlace>(
auto *ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
void *peer_cuda_ptr = memory::Alloc<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(peer_place), this->size() * sizeof(T));
memory::Copy(boost::get<platform::CUDAPlace>(peer_place),
static_cast<void *>(peer_cuda_ptr_), place_,
static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
cuda_ctx->stream());
cuda_ctx->Wait();
memory::Free<platform::CUDAPlace>(place_, static_cast<void *>(cuda_ptr_));
memory::Copy(boost::get<platform::CUDAPlace>(peer_place), peer_cuda_ptr,
place_, cuda_ptr_, this->size() * sizeof(T), ctx->stream());
ctx->Wait();
memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
place_ = boost::get<platform::CUDAPlace>(peer_place);
cuda_ptr_ = peer_cuda_ptr_;
cuda_ptr_ = peer_cuda_ptr;
#endif
}
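A minimal usage sketch of the interface above, assuming a CUDA build and that the class is reachable as `paddle::framework::Vector` from `paddle/framework/mixed_vector.h` (the include path is an assumption): the data lives in ordinary host `std::vector` storage, `cuda_data()` lazily mirrors it to the device, and device-side writes become visible on the host only after an explicit `CopyFromCUDA()`.
```c++
#include "paddle/framework/mixed_vector.h"  // assumed header path

void MixedVectorExample() {
  // Inherits the std::vector<T> constructors, so list-initialization works.
  paddle::framework::Vector<int64_t> rows{0, 4, 7, 9};

  // Host side: plain std::vector access.
  const int64_t *h_rows = rows.data();

  // Device side: allocates (or grows) the device buffer and copies host data.
  int64_t *d_rows = rows.cuda_data();
  // ... launch a kernel that reads or writes d_rows ...

  // Pull device-side modifications back to the host explicitly.
  rows.CopyFromCUDA();

  (void)h_rows;
  (void)d_rows;
}
```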

@ -178,19 +178,22 @@ public:
real* inputData = inputs[0].data<real>();
real* filterData = inputs[1].data<real>();
real* outputData = outputs[0].data<real>();
real* colData = NULL;
bool needIm2col = isNeedIm2col(filter);
TensorShape imShape =
TensorShape({inputChannels / groups_, inputHeight, inputWidth});
TensorShape colShape;
real* colData = NULL;
size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
size_t colWidth = outputHeight * outputWidth;
// Max col matrix height 256, Max col matrix width 1024
size_t stepColHeight = std::min(colHeight, static_cast<size_t>(256));
size_t stepColWidth = std::min(colWidth, static_cast<size_t>(2048));
// Max col matrix width 4096, Max col matrix size 4M.
size_t outputHeightSteps =
std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight);
size_t maxColWidth = outputHeightSteps * outputWidth;
size_t channelSteps =
std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth,
(size_t)1),
inputChannels / groups_);
size_t maxColHeight = channelSteps * filterHeight * filterWidth;
if (needIm2col) {
colShape = TensorShape({inputChannels / groups_,
@ -199,7 +202,7 @@ public:
outputHeight,
outputWidth});
resizeBuffer<Device>(stepColHeight * stepColWidth * sizeof(real));
resizeBuffer<Device>(maxColHeight * maxColWidth * sizeof(real));
colData = reinterpret_cast<real*>(memory_->getBuf());
}
@ -209,20 +212,24 @@ public:
(outputChannels / groups_) * outputHeight * outputWidth;
size_t filterOffset = filter.getElements() / groups_;
int nStride = colWidth;
int kStride = colHeight;
int nStride = outputHeight * outputWidth;
int kStride = inputChannels / groups_ * filterHeight * filterWidth;
for (size_t i = 0; i < batchSize; i++) {
filterData = inputs[1].data<real>();
for (size_t g = 0; g < groups_; g++) {
if (needIm2col) {
real beta_ = beta;
for (size_t colHeightStart = 0; colHeightStart < colHeight;
colHeightStart += stepColHeight) {
for (size_t colWidthStart = 0; colWidthStart < colWidth;
colWidthStart += stepColWidth) {
int N = std::min(colWidth - colWidthStart, stepColWidth);
int K = std::min(colHeight - colHeightStart, stepColHeight);
for (size_t ic = 0; ic < inputChannels / groups_;
ic += channelSteps) {
int channels = std::min(inputChannels / groups_ - ic, channelSteps);
for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) {
int height = std::min(outputHeight - oh, outputHeightSteps);
int M = outputChannels / groups_;
int N = height * outputWidth;
int K = channels * filterHeight * filterWidth;
// im2col
im2col(inputData + g * inputOffset,
im2col(inputData,
imShape,
colData,
colShape,
@ -232,13 +239,12 @@ public:
paddingW(),
dilationH(),
dilationW(),
colHeightStart,
K,
colWidthStart,
channels,
oh,
height,
N);
// gemm
int M = outputChannels / groups_;
BlasGemm<Device, real>::compute(
false,
false,
@ -246,12 +252,12 @@ public:
N,
K,
1.0f,
filterData + g * filterOffset + colHeightStart,
filterData + ic * filterHeight * filterWidth,
kStride,
colData,
N,
beta_,
outputData + g * outputOffset + colWidthStart,
outputData + oh * outputWidth,
nStride);
}
beta_ = 1.0;
@ -266,17 +272,18 @@ public:
N,
K,
1.0f,
filterData + g * filterOffset,
filterData,
K,
inputData + g * inputOffset,
inputData,
N,
beta,
outputData + g * outputOffset,
outputData,
N);
}
inputData += inputOffset;
outputData += outputOffset;
filterData += filterOffset;
}
inputData += inputChannels * inputHeight * inputWidth;
outputData += outputChannels * outputHeight * outputWidth;
}
memory_.reset();

@ -111,39 +111,42 @@ public:
int paddingWidth,
int dilationHeight,
int dilationWidth,
int colHeightStart,
int colHeightSize,
int colWidthStart,
int colWidthSize) {
int inputChannels,
int colOffset,
int colOutputHeight,
int colWidth) {
int inputHeight = imShape[1];
int inputWidth = imShape[2];
int filterHeight = colShape[1];
int filterWidth = colShape[2];
int outputWidth = colShape[4];
for (int colh = 0; colh < colHeightSize; colh++) {
int wOffset = (colHeightStart + colh) % filterWidth;
int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight;
int c_im = (colHeightStart + colh) / filterWidth / filterHeight;
for (int colw = 0; colw < colWidthSize; colw++) {
int h = (colWidthStart + colw) / outputWidth;
int w = (colWidthStart + colw) % outputWidth;
int imRowIdx = h * strideHeight + hOffset * dilationHeight;
int imColIdx = w * strideWidth + wOffset * dilationWidth;
if ((imRowIdx - paddingHeight) < 0 ||
(imRowIdx - paddingHeight) >= inputHeight ||
(imColIdx - paddingWidth) < 0 ||
(imColIdx - paddingWidth) >= inputWidth) {
colData[colh * colWidthSize + colw] = static_cast<T>(0);
} else {
imRowIdx += c_im * inputHeight - paddingHeight;
imColIdx -= paddingWidth;
colData[colh * colWidthSize + colw] =
imData[imRowIdx * inputWidth + imColIdx];
for (int ic = 0; ic < inputChannels; ic++) {
for (int oh = 0; oh < colOutputHeight; oh++) {
T* dstData = colData + oh * outputWidth;
for (int fh = 0; fh < filterHeight; fh++) {
for (int fw = 0; fw < filterWidth; fw++) {
int imRowIdx = (oh + colOffset) * strideHeight +
fh * dilationHeight - paddingHeight;
if (imRowIdx < 0 || imRowIdx >= inputHeight) {
memset(dstData, 0, outputWidth * sizeof(T));
} else {
for (int ow = 0; ow < outputWidth; ow++) {
int imColIdx =
ow * strideWidth + fw * dilationWidth - paddingWidth;
if (imColIdx < 0 || imColIdx >= inputWidth) {
dstData[ow] = T(0);
} else {
dstData[ow] = imData[imRowIdx * inputWidth + imColIdx];
}
}
}
dstData += colWidth;
}
}
}
colData += filterHeight * filterWidth * colWidth;
imData += inputHeight * inputWidth;
}
}
};

@ -202,10 +202,10 @@ void TestIm2ColMobileFunctor() {
padding,
dilation,
dilation,
channels,
0,
height,
0,
width);
outputHeight,
outputHeight * outputWidth);
autotest::TensorCheckEqual(*output1, *output2);
}

@ -4,4 +4,4 @@ cc_test(test_inference_recognize_digits_mlp
DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
set_tests_properties(test_inference_recognize_digits_mlp
PROPERTIES DEPENDS test_recognize_digits_mlp_cpu)
PROPERTIES DEPENDS test_recognize_digits)

@ -2015,13 +2015,6 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
CHECK_EQ(channels * outLength, maskMatP->getWidth());
}
/* initialize the data_ */
for (size_t i = 0; i < height_; i++) {
for (size_t j = 0; j < width_; j++) {
outData[i * outStride + j] = -(real)FLT_MAX;
}
}
/* pool max one by one */
for (size_t n = 0; n < num; ++n) { // frame by frame
if (!isContiguous()) {
@ -2030,19 +2023,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
for (size_t c = 0; c < channels; ++c) { // channel by channel
for (size_t ph = 0; ph < outputH; ++ph) {
int hstart = ph * strideH - paddingH;
int hend = std::min(hstart + sizeY, imgSizeH);
hstart = std::max(hstart, 0);
int hend = hstart + sizeY;
hstart = hstart < 0 ? 0 : hstart;
hend = hend < (int)imgSizeH ? hend : (int)imgSizeH;
for (size_t pw = 0; pw < outputW; ++pw) {
int wstart = pw * strideW - paddingW;
int wend = std::min(wstart + sizeX, imgSizeW);
wstart = std::max(wstart, 0);
int wend = wstart + sizeX;
wstart = wstart < 0 ? 0 : wstart;
wend = wend < (int)imgSizeW ? wend : (int)imgSizeW;
if (maskData == NULL) {
real tmp = -(real)FLT_MAX;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
outData[ph * outputW + pw] = std::max(
outData[ph * outputW + pw], inputData[h * imgSizeW + w]);
tmp = tmp < inputData[h * imgSizeW + w]
? inputData[h * imgSizeW + w]
: tmp;
}
}
outData[ph * outputW + pw] = tmp;
} else {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {

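The window clamping in the hunk above is easy to verify by hand. The small standalone snippet below uses made-up sizes (not Paddle code) and prints the row range each output position actually pools over when padding pushes the window past the image border.
```c++
#include <cstdio>

int main() {
  // Made-up sizes: image height 5, window 3, stride 2, padding 1, 3 outputs.
  const int imgSizeH = 5, sizeY = 3, strideH = 2, paddingH = 1, outputH = 3;
  for (int ph = 0; ph < outputH; ++ph) {
    int hstart = ph * strideH - paddingH;
    int hend = hstart + sizeY;
    hstart = hstart < 0 ? 0 : hstart;
    hend = hend < imgSizeH ? hend : imgSizeH;
    std::printf("ph=%d pools rows [%d, %d)\n", ph, hstart, hend);
  }
  // Prints: [0, 2), [1, 4), [3, 5)
  return 0;
}
```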
@ -122,9 +122,11 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor)
else()
set(DEPS_OPS ${DEPS_OPS} send_op recv_op)
set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op)
endif()
op_library(cond_op DEPS framework_proto tensor net_op)

@ -0,0 +1,207 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint.h>
#include <sys/stat.h>
#include <ostream>
#include <thread>
#include <unistd.h>
#include "paddle/framework/executor.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/proto_desc.h"
#include "paddle/operators/detail/grpc_server.h"
#include "paddle/operators/detail/sendrecvop_utils.h"
#include "paddle/operators/detail/simple_block_queue.h"
#include "paddle/string/printf.h"
namespace paddle {
namespace operators {
constexpr char kOptimizeBlock[] = "OptimizeBlock";
void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
service->RunSyncUpdate();
VLOG(4) << "RunServer thread end";
}
static void CreateTensorFromMessageType(framework::Variable *var,
sendrecv::VarType var_type) {
if (var_type == sendrecv::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
} else if (var_type == sendrecv::VarType::SELECTED_ROWS) {
var->GetMutable<framework::SelectedRows>();
} else {
PADDLE_THROW(
"VariableMessage type %d is not in "
"[LoDTensor, SelectedRows]",
var_type);
}
}
class ListenAndServOp : public framework::OperatorBase {
public:
ListenAndServOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {
if (!rpc_service_) {
std::string endpoint = Attr<std::string>("endpoint");
rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
server_thread_.reset(new std::thread(RunServer, rpc_service_));
}
}
void Stop() override {
detail::MessageWithName term_msg;
term_msg.first = LISTEN_TERMINATE_MESSAGE;
rpc_service_->Push(term_msg);
rpc_service_->ShutDown();
server_thread_->join();
}
std::string GetGradVarNameForTrainer(const std::string &varname) const {
if (grads_counter_.find(varname) == grads_counter_.end()) {
grads_counter_[varname] = 0;
}
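// For example, with fan_in = 3 a gradient named "x1" received from the three
// trainers is stored as "x1.trainer_0", "x1.trainer_1" and "x1.trainer_2".
// (grads_counter_ is cleared after every batch barrier, see Run() below.)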
return string::Sprintf("%s.trainer_%d", varname, grads_counter_[varname]++);
}
void Run(const framework::Scope &scope,
const platform::Place &dev_place) const override {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
framework::Scope &recv_scope = scope.NewScope();
// FIXME(Yancey1989): initialize rpc server with lazy mode.
rpc_service_->SetScope(&recv_scope);
rpc_service_->SetDevCtx(&dev_ctx);
auto param_list = Attr<std::vector<std::string>>("ParamList");
auto grad_list = Attr<std::vector<std::string>>("GradList");
auto fan_in = Attr<int>("Fanin");
auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
auto *program = block->Program();
framework::Executor executor(dev_place);
// TODO(typhoonzero): change this to a while_op for every cluster-batch.
bool exit_flag = false;
while (!exit_flag) {
// Get gradients from multiple trainers; we don't care about the order in
// which they arrive, just add suffix 0~n and merge them.
rpc_service_->SetCond(0);
size_t recv_var_cnt = 0;
int batch_barrier = 0;
while (batch_barrier != fan_in) {
const detail::MessageWithName &v = rpc_service_->Get();
auto grad_var_name = v.first;
if (grad_var_name == LISTEN_TERMINATE_MESSAGE) {
LOG(INFO) << "received terminate message and exit";
exit_flag = true;
break;
} else if (grad_var_name == BATCH_BARRIER_MESSAGE) {
VLOG(3) << "recv batch barrier message";
batch_barrier++;
continue;
} else {
// receive a variable
recv_var_cnt++;
auto it =
std::find(grad_list.begin(), grad_list.end(), grad_var_name);
std::string param_var_name;
if (it != grad_list.end()) {
param_var_name = param_list[it - grad_list.begin()];
} else {
LOG(ERROR) << "grad has no paired param:" << grad_var_name;
}
VLOG(3) << "received grad: " << grad_var_name
<< " updating param: " << param_var_name;
if (fan_in > 1) {
grad_var_name = this->GetGradVarNameForTrainer(grad_var_name);
}
auto *var = recv_scope.FindVar(grad_var_name);
if (var == nullptr) {
LOG(ERROR) << "Can not find server side var: " << grad_var_name;
PADDLE_THROW("Can not find server side var");
}
detail::DeserializeFromMessage(v.second, dev_ctx, var);
}
}
VLOG(3) << "recv " << recv_var_cnt << " parameters for one barrier.";
// TODO(Yancey1989): merge SelectedRows variables here
if (exit_flag) {
rpc_service_->ShutDown();
}
try {
executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
false /*create_local_scope*/, false /*create_vars*/);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
rpc_service_->SetCond(1);
rpc_service_->WaitClientGet(recv_var_cnt);
grads_counter_.clear();
} // while(true)
}
protected:
std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
std::shared_ptr<std::thread> server_thread_;
mutable std::unordered_map<std::string, int> grads_counter_;
};
class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ListenAndServOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddComment(R"DOC(
ListenAndServ operator
This operator will start an RPC server which can receive variables
from send_op and send back variables to recv_op.
)DOC");
AddAttr<std::string>("endpoint",
"(string, default 127.0.0.1:6164)"
"IP address to listen on.")
.SetDefault("127.0.0.1:6164")
.AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
AddAttr<framework::BlockDesc *>(kOptimizeBlock,
"BlockID to run on server side.");
AddAttr<std::vector<std::string>>(
"ParamList", "type list of string",
"grad->param name mapping to find which parameters to optimize.")
.SetDefault({});
AddAttr<std::vector<std::string>>(
"GradList", "type list of string",
"grad->param name mapping to find which parameters to optimize.")
.SetDefault({});
AddAttr<int>("Fanin", "type int",
"Number of trainers in the current cluster job")
.SetDefault(1);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(listen_and_serv, ops::ListenAndServOp,
ops::ListenAndServOpMaker);

@ -12,187 +12,60 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint.h>
#include <sys/stat.h>
#include <ostream>
#include <thread>
#include <unistd.h>
#include "paddle/framework/executor.h"
#include "paddle/framework/data_type.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/proto_desc.h"
#include "paddle/operators/detail/grpc_server.h"
#include "paddle/operators/detail/sendrecvop_utils.h"
#include "paddle/operators/detail/simple_block_queue.h"
#include "paddle/string/printf.h"
#include <future>
#include "paddle/operators/detail/grpc_client.h"
namespace paddle {
namespace operators {
constexpr char kOptimizeBlock[] = "OptimizeBlock";
void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
service->RunSyncUpdate();
VLOG(4) << "RunServer thread end";
}
static void CreateTensorFromMessageType(framework::Variable *var,
sendrecv::VarType var_type) {
if (var_type == sendrecv::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
} else if (var_type == sendrecv::VarType::SELECTED_ROWS) {
var->GetMutable<framework::SelectedRows>();
} else {
PADDLE_THROW(
"VariableMessage type %d is not in "
"[LoDTensor, SelectedRows]",
var_type);
}
}
class RecvOp : public framework::OperatorBase {
public:
RecvOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {
if (!rpc_service_) {
std::string endpoint = Attr<std::string>("endpoint");
rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
server_thread_.reset(new std::thread(RunServer, rpc_service_));
}
}
void Stop() override {
detail::MessageWithName term_msg;
term_msg.first = LISTEN_TERMINATE_MESSAGE;
rpc_service_->Push(term_msg);
rpc_service_->ShutDown();
server_thread_->join();
}
std::string GetGradVarNameForTrainer(const std::string &varname) const {
if (grads_counter_.find(varname) == grads_counter_.end()) {
grads_counter_[varname] = 0;
RecvOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope& scope,
const platform::Place& place) const override {
auto outs = Outputs("Out");
std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
for (size_t i = 0; i < outs.size(); i++) {
VLOG(3) << "getting " << outs[i];
client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
}
return string::Sprintf("%s.trainer_%d", varname, grads_counter_[varname]++);
PADDLE_ENFORCE(client_.Wait());
}
void Run(const framework::Scope &scope,
const platform::Place &dev_place) const override {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
framework::Scope &recv_scope = scope.NewScope();
// FIXME(Yancey1989): initialize rpc server with laze mode.
rpc_service_->SetScope(&recv_scope);
rpc_service_->SetDevCtx(&dev_ctx);
auto param_list = Attr<std::vector<std::string>>("ParamList");
auto grad_list = Attr<std::vector<std::string>>("GradList");
auto fan_in = Attr<int>("Fanin");
auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
auto *program = block->Program();
framework::Executor executor(dev_place);
// TODO(typhoonzero): change this to a while_op for every cluster-batch.
bool exit_flag = false;
while (!exit_flag) {
// Get from multiple trainers, we don't care about the order in which
// the gradients arrives, just add suffix 0~n and merge the gradient.
rpc_service_->SetCond(0);
size_t recv_var_cnt = 0;
int batch_barrier = 0;
while (batch_barrier != fan_in) {
const detail::MessageWithName &v = rpc_service_->Get();
auto grad_var_name = v.first;
if (grad_var_name == LISTEN_TERMINATE_MESSAGE) {
LOG(INFO) << "received terminate message and exit";
exit_flag = true;
break;
} else if (grad_var_name == BATCH_BARRIER_MESSAGE) {
VLOG(3) << "recv batch barrier message";
batch_barrier++;
continue;
} else {
// receive a variable
recv_var_cnt++;
auto it =
std::find(grad_list.begin(), grad_list.end(), grad_var_name);
std::string param_var_name;
if (it != grad_list.end()) {
param_var_name = param_list[it - grad_list.begin()];
} else {
LOG(ERROR) << "grad has no paired param:" << grad_var_name;
}
VLOG(3) << "received grad: " << grad_var_name
<< " updating param: " << param_var_name;
if (fan_in > 1) {
grad_var_name = this->GetGradVarNameForTrainer(grad_var_name);
}
auto *var = recv_scope.FindVar(grad_var_name);
if (var == nullptr) {
LOG(ERROR) << "Can not find server side var: " << grad_var_name;
PADDLE_THROW("Can not find server side var");
}
detail::DeserializeFromMessage(v.second, dev_ctx, var);
}
}
VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier.";
// TODO(Yancey1989): merge SelectedRows variables here
if (exit_flag) {
break;
}
try {
executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
false /*create_local_scope*/, false /*create_vars*/);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
rpc_service_->SetCond(1);
rpc_service_->WaitClientGet(recv_var_cnt);
grads_counter_.clear();
} // while(true)
}
protected:
std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
std::shared_ptr<std::thread> server_thread_;
mutable std::unordered_map<std::string, int> grads_counter_;
private:
mutable detail::RPCClient client_;
};
class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
RecvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
RecvOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable();
AddComment(R"DOC(
Recv operator
This operator will recieve tensor from send_op
This operator can get variables from server side.
)DOC");
AddAttr<std::string>("endpoint",
"(string, default 127.0.0.1:6164)"
"IP address to listen on.")
.SetDefault("127.0.0.1:6164")
.AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
AddAttr<framework::BlockDesc *>(
kOptimizeBlock, "Serialized ProgramDesc string for recv to run.");
AddAttr<std::vector<std::string>>(
"ParamList", "type list of string",
"grad->param name mapping to find which parameters to optimize.")
.SetDefault({});
AddAttr<std::vector<std::string>>(
"GradList", "type list of string",
"grad->param name mapping to find which parameters to optimize.")
AddAttr<std::vector<std::string>>("epmap",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input "
"variables for mapping")
.SetDefault({});
AddAttr<int>("Fanin", "type int",
"Number of trainers in the current cluster job")
.SetDefault(1);
}
};

@ -62,11 +62,13 @@ class SendOp : public framework::OperatorBase {
}
PADDLE_ENFORCE(rpc_client->Wait());
for (size_t i = 0; i < outs.size(); i++) {
VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
if (outs.size() > 0) {
for (size_t i = 0; i < outs.size(); i++) {
VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
}
PADDLE_ENFORCE(rpc_client->Wait());
}
PADDLE_ENFORCE(rpc_client->Wait());
}
};
@ -85,6 +87,8 @@ Send operator
This operator will send tensor to recv_op at the parameter server.
)DOC");
// TODO(typhoonzero): remove this attr generate de-duplicated vector from
// epmap when initializing.
AddAttr<std::vector<std::string>>("endpoints",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints to send variables to.")

@ -25,7 +25,7 @@ limitations under the License. */
#include "paddle/string/printf.h"
USE_NO_KERNEL_OP(send);
USE_NO_KERNEL_OP(recv);
USE_NO_KERNEL_OP(listen_and_serv);
USE_OP(sum);
namespace f = paddle::framework;
@ -33,7 +33,7 @@ namespace p = paddle::platform;
namespace m = paddle::operators::math;
// global for simplicity.
std::unique_ptr<f::OperatorBase> recv_op;
std::unique_ptr<f::OperatorBase> listen_and_serv_op;
void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
p::CPUDeviceContext ctx(place);
@ -120,7 +120,7 @@ void StartServerNet(bool is_sparse) {
InitTensorsInScope(scope, place);
}
// sub program run in recv_op, for simple test we use sum
// sub program run in listen_and_serv_op, for simple test we use sum
f::ProgramDesc program;
f::BlockDesc *block = program.MutableBlock(0);
// X for server side tensors, RX for received tensors, must be of the same shape.
@ -131,8 +131,9 @@ void StartServerNet(bool is_sparse) {
attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
attrs.insert({"GradList", std::vector<std::string>({"x1"})});
attrs.insert({"OptimizeBlock", block});
recv_op = f::OpRegistry::CreateOp("recv", {{"RX", {"x1"}}}, {}, attrs);
recv_op->Run(scope, place);
listen_and_serv_op =
f::OpRegistry::CreateOp("listen_and_serv", {}, {}, attrs);
listen_and_serv_op->Run(scope, place);
}
TEST(SendRecvOp, CPUDense) {
@ -161,9 +162,9 @@ TEST(SendRecvOp, CPUDense) {
for (int64_t i = 0; i < target->numel(); ++i) {
EXPECT_EQ(expected[i] * 2, actual[i]);
}
recv_op->Stop();
listen_and_serv_op->Stop();
server_thread.join();
recv_op.reset(nullptr);
listen_and_serv_op.reset(nullptr);
}
TEST(SendRecvOp, CPUSparse) {
@ -200,7 +201,7 @@ TEST(SendRecvOp, CPUSparse) {
EXPECT_EQ(expect_value->mutable_data<float>(place)[i],
actual->mutable_data<float>(place)[i]);
}
recv_op->Stop();
listen_and_serv_op->Stop();
server_thread.join();
recv_op.reset();
listen_and_serv_op.reset();
}

@ -53,6 +53,8 @@ class WhileOp : public framework::OperatorBase {
auto step_scopes =
scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
PADDLE_ENFORCE(platform::is_cpu_place(cond.place()),
"Condition of while op must be in CPU memory.");
while (cond.data<bool>()[0]) {
auto &current_scope = scope.NewScope();
step_scopes->push_back(&current_scope);
@ -99,6 +101,9 @@ class WhileGradOp : public framework::OperatorBase {
void Run(const framework::Scope &scope,
const platform::Place &dev_place) const override {
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
framework::Executor executor(dev_place);
auto *block = Attr<framework::BlockDesc *>(kStepBlock);
auto *program = block->Program();
@ -205,6 +210,8 @@ class WhileGradOp : public framework::OperatorBase {
sum_op->Run(cur_scope, dev_place);
cur_scope.Rename(new_inside_name, inside_grad_name);
}
dev_ctx.Wait();
const_cast<framework::Scope &>(scope).DeleteScope(&cur_scope);
}
}
};

@ -32,7 +32,7 @@ function cmake_gen() {
cat <<EOF
========================================
Configuring cmake in /paddle/build ...
-DCMAKE_BUILD_TYPE=${BUILD_TYPE:Release}
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
${PYTHON_FLAGS}
-DWITH_DOC=OFF
-DWITH_GPU=${WITH_GPU:-OFF}
@ -55,7 +55,7 @@ EOF
# docker environment is fully controlled by this script.
# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
cmake .. \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE:Release} \
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
${PYTHON_FLAGS} \
-DWITH_DOC=OFF \
-DWITH_GPU=${WITH_GPU:-OFF} \

@ -27,9 +27,10 @@ int main(int argc, char** argv) {
}
#ifdef PADDLE_WITH_CUDA
new_argv.push_back(
strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory,"
"warpctc_dir"));
#else
new_argv.push_back(strdup("--tryfromenv=use_pinned_memory"));
new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,warpctc_dir"));
#endif
int new_argc = static_cast<int>(new_argv.size());
char** new_argv_address = new_argv.data();

@ -76,7 +76,9 @@ def __bootstrap__():
os.environ['OMP_NUM_THREADS'] = str(num_threads)
read_env_flags = ['use_pinned_memory', 'check_nan_inf', 'benchmark']
read_env_flags = [
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir'
]
if core.is_compiled_with_cuda():
read_env_flags += ['fraction_of_gpu_memory_to_use']
core.init_gflags([sys.argv[0]] +

@ -0,0 +1,73 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from graphviz import GraphPreviewGenerator
import proto.framework_pb2 as framework_pb2
def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
'''
Generate a debug graph for block.
Args:
block(Block): a block.
'''
graph = GraphPreviewGenerator("some graph")
# collect parameters and args
protostr = block.desc.serialize_to_string()
desc = framework_pb2.BlockDesc.FromString(str(protostr))
def need_highlight(name):
if highlights is None: return False
for pattern in highlights:
assert type(pattern) is str
if re.match(pattern, name):
return True
return False
# draw parameters and args
vars = {}
for var in desc.vars:
shape = [str(i) for i in var.lod_tensor.tensor.dims]
if not shape:
shape = ['null']
# create var
if var.persistable:
varn = graph.add_param(
var.name, var.type, shape, highlight=need_highlight(var.name))
else:
varn = graph.add_arg(var.name, highlight=need_highlight(var.name))
vars[var.name] = varn
def add_op_link_var(op, var, op2var=False):
for arg in var.arguments:
if arg not in vars:
# add missing variables as argument
vars[arg] = graph.add_arg(arg, highlight=need_highlight(arg))
varn = vars[arg]
highlight = need_highlight(op.description) or need_highlight(
varn.description)
if op2var:
graph.add_edge(op, varn, highlight=highlight)
else:
graph.add_edge(varn, op, highlight=highlight)
for op in desc.ops:
opn = graph.add_op(op.type, highlight=need_highlight(op.type))
for var in op.inputs:
add_op_link_var(opn, var, False)
for var in op.outputs:
add_op_link_var(opn, var, True)
graph(path, show=True)

@ -478,9 +478,9 @@ class DistributeTranspiler:
else:
self._append_pserver_non_opt_ops(optimize_sub_program,
pserver_program, opt_op)
# Append the recv op
# Append the listen_and_serv op
pserver_program.global_block().append_op(
type="recv",
type="listen_and_serv",
inputs={},
outputs={},
attrs={

@ -451,9 +451,8 @@ class Operator(object):
if not given == need:
raise ValueError(("Incorrect setting for output(s) of "
"operator \"%s\". Need: [%s] Given: [%s]") %
(type, ", ".join(str(e)
for e in need), ", ".join(
str(e) for e in given)))
(type, ", ".join(str(e) for e in need),
", ".join(str(e) for e in given)))
for out_proto in proto.outputs:
out_args = outputs[out_proto.name]
@ -489,7 +488,8 @@ class Operator(object):
no_kernel_op_set = {
'feed', 'fetch', 'save', 'load', 'recurrent',
'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
'recv', 'parallel_do'
'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
'load_combine'
}
if type not in no_kernel_op_set:
self.desc.infer_var_type(self.block.desc)

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff
