Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dev_reader

8 years ago · 6e6f5c7e79
parent da8a56ea61 c8ba6d51c8
commit 6e6f5c7e79
45 changed files with 1499 additions and 165 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -137,7 +137,7 @@ include(external/openblas)  # download, build, install openblas
 include(external/mkldnn)    # download, build, install mkldnn
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
-include(external/boost)     # download, build, install boost
+include(external/boost)     # download boost
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@ -21,6 +21,7 @@ set(BOOST_URL           "http://sourceforge.net/projects/boost/files/boost/${BOO
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
+set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)

 include_directories(${BOOST_INCLUDE_DIR})

--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@ -47,3 +47,5 @@ sphinx_add_target(paddle_docs_cn
                  ${SPHINX_CACHE_DIR_CN}
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_CN})
+
+add_subdirectory(api)
--- a/doc/api/CMakeLists.txt
+++ b/doc/api/CMakeLists.txt
@ -0,0 +1,20 @@
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output director
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
+    @ONLY)
+
+sphinx_add_target(paddle_api_docs
+                  html
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_EN})
--- a/doc/design/csp.md
+++ b/doc/design/csp.md
@ -42,7 +42,7 @@ The type *channel* is conceptually the blocking queue.  In Go, its implemented i

 The `select` operation has been in OS kernels long before Go language.  All Unix kernels implement system calls *poll* and *select*.  They monitor multiple file descriptors to see if I/O is possible on any of them.  This takes O(N) time.  Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time.  In BSD systems, there is a similar system call *kqueue*.  Go's Linux implementation uses epoll.

-It might be a good idea to implement Fluid's select using epoll too.  In this design doc, we start from the O(N) way, so we could focus on Python binding and the syntax.
+It might be a good idea to implement Fluid's select using epoll too.  In this design doc, we start from the O(N) way so that we could focus on Python binding and the syntax.

 ### Type Channel

@ -71,14 +71,14 @@ ch1 := make(chan int, 100)  // a channel that can buffer 100 ints.
 In Fluid, we should be able to do the same:

 ```python
-ch  = fluid.make_chan(dtype=INT)
-ch1 = fluid.make_chan(dtype=INT, 100)
+ch  = fluid.make_channel(dtype=INT)
+ch1 = fluid.make_channel(dtype=INT, 100)
 ```

 In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:

 ```python
-ch = fluid.make_chan(dtype=Tensor, etype=float16)
+ch = fluid.make_channel(dtype=Tensor, etype=float16)
 ```

 or Tensors of Tensors of float16 etc.
@ -87,8 +87,135 @@ The point here is that we need a consistent way to compose types, like in C++ we

 ### Send and Recv

+Go's CSP implementation depends on data type *channel*. There are two types of channels:
+
+1. The unblocked channel, or buffered channel, is a blocking queue with a non-zero sized buffer. The sending to buffered channel blocks if the buffer is full, and the receive operation blocks if the buffer is empty.
+1. blocked channel, or unbuffered channel, is a blocking queue with no buffer.  Both sending and receiving block with unbuffered channels.
+
+There are four types of actions with a channel:
+
+1. Create a channel
+
+   ```go
+   ch := make(chan int) // this is an unbuffered channel
+   ch := make(chan int, 100) // this is a buffered channel of 100 ints.
+   ```
+
+1. Send
+
+   ```go
+   ch <- 111
+   ```
+
+1. Recv
+
+   ```go
+   y, ok <- ch
+   ```
+
+1. Close
+
+   ```go
+   close(ch)
+   ```
+   
+   Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
+   
+There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
+
+1. A send to a nil channel blocks forever
+
+1. A receive from a nil channel blocks forever
+
+1. A send to a closed channel panics
+
+1. A receive from a closed channel returns the residual values and then zeros.
+
+In Fluid, we have [buffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [unbuffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h)
+
+The following program illustrates the Python syntax for accessing Fluid buffers.
+
+```python
+import fluid
+
+buffer_size = 10
+ch = fluid.make_channel(dtype=INT, buffer_size)
+
+# Now write three elements to the channel
+with fluid.while(steps=buffer_size):
+  fluid.send(ch, step)
+  fluid.close_channel(ch)
+  
+with fluid.while(steps=buffer_size):
+  fluid.print(fluid.recv(ch))
+```
+
+The following example shows that to avoid the always-blocking behavior of unbuffered channels, we need to use Fluid's goroutines.
+
+```python
+import fluid
+
+ch = fluid.make_channel(dtype=INT)
+
+with fluid.go():
+  fluid.send(ch)
+
+y = fluid.recv(ch)
+
+fluid.close_channel(ch)
+```
+
 ### Select

+In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready.
+
+```go
+
+ch1  := make(chan int)       
+ch2  := make(chan int, 100)
+
+x := 0
+
+for {
+    select {
+    case ch1 <- x:
+      x := x + 1
+    case y <- ch2:
+      fmt.Println("Received on channel")
+    default:
+      fmt.Println("Default")
+    }
+  }
+
+```
+
+In Fluid, we should be able to do the same:
+
+```python
+ch1  = fluid.make_chan(dtype=INT)
+ch2 = fluid.make_chan(dtype=INT, 100)
+
+sel = fluid.select()
+
+with sel.case(ch1, 'w', X):
+    fluid.layers.increment(X)
+
+with sel.case(ch2, 'r', Y):
+    fluid.print("Received on Channel")
+
+with sel.default():
+    fluid.print("Default")
+
+```
+
+In the above code snippet, `X` and `Y` are variables. Now let us look at each of these statements one by one.
+
+- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. The character `w` is used here to make the syntax familiar to write syntax in Python I/O.
+
+- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to make the syntax familiar to read syntax in Python I/O.
+
+- `sel.default()` : This is equivalent to the default in Go `select`. If none of the channels are ready for read or write, then the fluid code in the default block will be executed.
+
 ## Example Programs

 ### 1. RPC between Trainers and Parameter Servers
--- a/doc/index_cn.rst
+++ b/doc/index_cn.rst
@ -8,4 +8,3 @@ PaddlePaddle 文档
  howto/index_cn.rst
  api/index_cn.rst
  faq/index_cn.rst
-  mobile/index_cn.rst
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@ -7,4 +7,3 @@ PaddlePaddle Documentation
  getstarted/index_en.rst
  howto/index_en.rst
  api/index_en.rst
-  mobile/index_en.rst
--- a/doc/mobile/index_cn.rst
+++ b/doc/mobile/index_cn.rst
@ -1,9 +0,0 @@
-MOBILE
-======
-
-..  toctree::
-  :maxdepth: 1
-
-  cross_compiling_for_android_cn.md
-  cross_compiling_for_ios_cn.md
-  cross_compiling_for_raspberry_cn.md
--- a/doc/mobile/index_en.rst
+++ b/doc/mobile/index_en.rst
@ -1,9 +0,0 @@
-MOBILE
-======
-
-..  toctree::
-  :maxdepth: 1
-
-  cross_compiling_for_android_en.md
-  cross_compiling_for_ios_en.md
-  cross_compiling_for_raspberry_en.md
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@ -534,7 +534,7 @@ ParamGradInfoMap AppendBackward(
  auto root_block = program_desc.MutableBlock(root_block_idx);

  std::string fill_one_op_out = GradVarName(target.Name());
-  bool is_scalar = target.Shape() == std::vector<int64_t>{1};
+  bool is_scalar = target.GetShape() == std::vector<int64_t>{1};
  PADDLE_ENFORCE(is_scalar, "target should be scalar");
  VLOG(3) << "backward from loss=" << target.Name()
          << " data_type=" << target.GetDataType();
@ -565,7 +565,7 @@ ParamGradInfoMap AppendBackward(

  auto var = root_block->Var(fill_one_op_out);
  var->SetDataType(target.GetDataType());
-  var->SetShape(target.Shape());
+  var->SetShape(target.GetShape());
  auto& target_grad = retv[target.Name()];
  target_grad.name_ = fill_one_op_out;
  target_grad.block_idx_ = root_block_idx;
--- a/paddle/framework/channel.h
+++ b/paddle/framework/channel.h
@ -23,8 +23,8 @@ namespace framework {
 template <typename T>
 class Channel {
 public:
-  virtual void Send(T*) = 0;
-  virtual void Receive(T*) = 0;
+  virtual bool Send(T*) = 0;
+  virtual bool Receive(T*) = 0;
  virtual size_t Cap() = 0;
  virtual void Close() = 0;
  virtual ~Channel() {}
--- a/paddle/framework/channel_test.cc
+++ b/paddle/framework/channel_test.cc
@ -48,12 +48,12 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) {
  const size_t buffer_size = 10;
  auto ch = MakeChannel<size_t>(buffer_size);
  for (size_t i = 0; i < buffer_size; ++i) {
-    ch->Send(&i);  // should not block
+    EXPECT_EQ(ch->Send(&i), true);  // should not block
  }

  size_t out;
  for (size_t i = 0; i < buffer_size; ++i) {
-    ch->Receive(&out);  // should not block
+    EXPECT_EQ(ch->Receive(&out), true);  // should not block
    EXPECT_EQ(out, i);
  }
  CloseChannel(ch);
@ -67,7 +67,10 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
  std::thread t([&]() {
    // Try to write more than buffer size.
    for (size_t i = 0; i < 2 * buffer_size; ++i) {
-      ch->Send(&i);  // should not block
+      if (i < buffer_size)
+        EXPECT_EQ(ch->Send(&i), true);  // should block after 10 iterations
+      else
+        EXPECT_EQ(ch->Send(&i), false);
      sum += i;
    }
  });
@ -84,13 +87,13 @@ TEST(Channel, SimpleUnbufferedChannelTest) {
  unsigned sum_send = 0;
  std::thread t([&]() {
    for (int i = 0; i < 5; i++) {
-      ch->Send(&i);
+      EXPECT_EQ(ch->Send(&i), true);
      sum_send += i;
    }
  });
  for (int i = 0; i < 5; i++) {
    int recv;
-    ch->Receive(&recv);
+    EXPECT_EQ(ch->Receive(&recv), true);
    EXPECT_EQ(recv, i);
  }

@ -100,6 +103,102 @@ TEST(Channel, SimpleUnbufferedChannelTest) {
  delete ch;
 }

+// This tests that closing a buffered channel also unblocks
+//  any receivers waiting on the channel
+TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
+  auto ch = MakeChannel<int>(1);
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+
+  // Launches threads that try to read and are blocked because of no writers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          // All reads should return false
+          EXPECT_EQ(ch->Receive(&data), false);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  // Verify that all threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+
+  // Explicitly close the channel
+  // This should unblock all receivers
+  CloseChannel(ch);
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  delete ch;
+}
+
+// This tests that closing a buffered channel also unblocks
+//  any senders waiting for channel to have write space
+TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
+  auto ch = MakeChannel<int>(1);
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  bool send_success[num_threads];
+
+  // Launches threads that try to write and are blocked because of no readers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    send_success[i] = false;
+    t[i] = std::thread(
+        [&](bool *ended, bool *success) {
+          int data = 10;
+          *success = ch->Send(&data);
+          *ended = true;
+        },
+        &thread_ended[i], &send_success[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  // Verify that atleast 4 threads are blocked
+  int ct = 0;
+  for (size_t i = 0; i < num_threads; i++) {
+    if (thread_ended[i] == false) ct++;
+  }
+  // Atleast 4 threads must be blocked
+  EXPECT_GE(ct, 4);
+
+  // Explicitly close the thread
+  // This should unblock all senders
+  CloseChannel(ch);
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  // Verify that only 1 send was successful
+  ct = 0;
+  for (size_t i = 0; i < num_threads; i++) {
+    if (send_success[i]) ct++;
+  }
+  // Only 1 send must be successful
+  EXPECT_EQ(ct, 1);
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  delete ch;
+}
+
 // This tests that closing an unbuffered channel also unblocks
 //  unblocks any receivers waiting for senders
 TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
@ -114,7 +213,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
    t[i] = std::thread(
        [&](bool *p) {
          int data;
-          ch->Receive(&data);
+          EXPECT_EQ(ch->Receive(&data), false);
          *p = true;
        },
        &thread_ended[i]);
@ -155,7 +254,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
    t[i] = std::thread(
        [&](bool *p) {
          int data = 10;
-          ch->Send(&data);
+          EXPECT_EQ(ch->Send(&data), false);
          *p = true;
        },
        &thread_ended[i]);
@ -207,3 +306,37 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
  t.join();
  delete ch;
 }
+
+TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
+  auto ch = MakeChannel<int>(0);
+  unsigned sum_send = 0;
+  unsigned sum_receive = 0;
+  // The receiver should block after 5
+  // iterations, since there are only 5 senders.
+  std::thread t([&]() {
+    for (int i = 0; i < 8; i++) {
+      int recv;
+      ch->Receive(&recv);  // should block after the fifth iteration.
+      EXPECT_EQ(recv, i);
+      sum_receive += i;
+    }
+  });
+  for (int i = 0; i < 5; i++) {
+    ch->Send(&i);
+    sum_send += i;
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  EXPECT_EQ(sum_send, 10U);
+  EXPECT_EQ(sum_receive, 10U);
+  // send three more elements
+  for (int i = 5; i < 8; i++) {
+    ch->Send(&i);
+    sum_send += i;
+  }
+
+  CloseChannel(ch);
+  t.join();
+  EXPECT_EQ(sum_send, 28U);
+  EXPECT_EQ(sum_receive, 28U);
+  delete ch;
+}
--- a/paddle/framework/details/buffered_channel.h
+++ b/paddle/framework/details/buffered_channel.h
@ -30,8 +30,8 @@ class Buffered : public paddle::framework::Channel<T> {
  friend void paddle::framework::CloseChannel<T>(Channel<T>*);

 public:
-  virtual void Send(T*);
-  virtual void Receive(T*);
+  virtual bool Send(T*);
+  virtual bool Receive(T*);
  virtual size_t Cap() { return cap_; }
  virtual void Close();
  virtual ~Buffered();
@ -48,39 +48,43 @@ class Buffered : public paddle::framework::Channel<T> {
    PADDLE_ENFORCE_GT(cap, 0);
  }

-  void NotifyAllSenders(std::unique_lock<std::mutex>*);
+  void NotifyAllParticipants(std::unique_lock<std::mutex>*);
 };

 template <typename T>
-void Buffered<T>::Send(T* item) {
+bool Buffered<T>::Send(T* item) {
  std::unique_lock<std::mutex> lock(mu_);
  full_cond_var_.wait(lock,
                      [this]() { return channel_.size() < cap_ || closed_; });
+  bool ret = false;
  if (!closed_) {
    channel_.push_back(std::move(*item));
    lock.unlock();
    empty_cond_var_.notify_one();
+    ret = true;
  }
+  return ret;
 }

 template <typename T>
-void Buffered<T>::Receive(T* item) {
+bool Buffered<T>::Receive(T* item) {
  std::unique_lock<std::mutex> lock(mu_);
  empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
+  bool ret = false;
  if (!closed_) {
    *item = std::move(channel_.front());
    channel_.pop_front();
-    NotifyAllSenders(&lock);
-  } else {
-    item = nullptr;
+    full_cond_var_.notify_one();
+    ret = true;
  }
+  return ret;
 }

 template <typename T>
 void Buffered<T>::Close() {
  std::unique_lock<std::mutex> lock(mu_);
  closed_ = true;
-  NotifyAllSenders(&lock);
+  NotifyAllParticipants(&lock);
 }

 template <typename T>
@ -88,13 +92,14 @@ Buffered<T>::~Buffered() {
  std::unique_lock<std::mutex> lock(mu_);
  closed_ = true;
  channel_.clear();
-  NotifyAllSenders(&lock);
+  NotifyAllParticipants(&lock);
 }

 template <typename T>
-void Buffered<T>::NotifyAllSenders(std::unique_lock<std::mutex>* lock) {
+void Buffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
  lock->unlock();
  full_cond_var_.notify_all();
+  empty_cond_var_.notify_all();
 }

 }  // namespace details
--- a/paddle/framework/details/unbuffered_channel.h
+++ b/paddle/framework/details/unbuffered_channel.h
@ -29,8 +29,8 @@ class UnBuffered : public paddle::framework::Channel<T> {
  friend void paddle::framework::CloseChannel<T>(Channel<T>*);

 public:
-  virtual void Send(T*);
-  virtual void Receive(T*);
+  virtual bool Send(T*);
+  virtual bool Receive(T*);
  virtual size_t Cap() { return 0; }
  virtual void Close();
  virtual ~UnBuffered();
@ -57,7 +57,7 @@ class UnBuffered : public paddle::framework::Channel<T> {
 // This function implements the concept of how data should
 // be sent from a writer to a reader.
 template <typename T>
-void UnBuffered<T>::Send(T* data) {
+bool UnBuffered<T>::Send(T* data) {
  // Prevent other writers from entering
  std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
  writer_found_ = true;
@ -66,6 +66,7 @@ void UnBuffered<T>::Send(T* data) {
  cv_writer_.wait(cv_lock,
                  [this]() { return reader_found_ == true || closed_; });
  cv_reader_.notify_one();
+  bool ret = false;
  if (!closed_) {
    std::unique_lock<std::mutex> channel_lock(mu_ch_);
    item = data;
@ -74,14 +75,16 @@ void UnBuffered<T>::Send(T* data) {
    channel_lock.lock();
    cv_channel_.wait(channel_lock,
                     [this]() { return item == nullptr || closed_; });
+    ret = true;
  }
  writer_found_ = false;
+  return ret;
 }

 // This function implements the concept of how
 // data that was sent by a writer is read from a reader.
 template <typename T>
-void UnBuffered<T>::Receive(T* data) {
+bool UnBuffered<T>::Receive(T* data) {
  // Prevent other readers from entering
  std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
  reader_found_ = true;
@ -90,6 +93,7 @@ void UnBuffered<T>::Receive(T* data) {
  cv_reader_.wait(cv_lock,
                  [this]() { return writer_found_ == true || closed_; });
  cv_writer_.notify_one();
+  bool ret = false;
  if (!closed_) {
    std::unique_lock<std::mutex> lock_ch{mu_ch_};
    // Reader should wait for the writer to first write its data
@ -98,10 +102,12 @@ void UnBuffered<T>::Receive(T* data) {
      *data = std::move(*item);
      item = nullptr;
      lock_ch.unlock();
+      ret = true;
    }
    cv_channel_.notify_one();
  }
  reader_found_ = false;
+  return ret;
 }

 // This function implements the sequence of events
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@ -116,6 +116,8 @@ message LoDTensorArrayDesc {
  optional int32 lod_level = 2 [ default = 0 ];
 }

+message Reader { repeated LoDTensorDesc lod_tensor = 1; }
+
 message VarDesc {
  enum VarType {
    LOD_TENSOR = 1;
@ -126,13 +128,15 @@ message VarDesc {
    LOD_RANK_TABLE = 6;
    LOD_TENSOR_ARRAY = 7;
    PLACE_LIST = 8;
+    READER = 9;
  }
  required string name = 1;
  required VarType type = 2;
-  optional LoDTensorDesc lod_tensor = 3;
-  optional TensorDesc selected_rows = 4;
+  optional bool persistable = 3 [ default = false ];
+  optional LoDTensorDesc lod_tensor = 4;
+  optional TensorDesc selected_rows = 5;
  optional LoDTensorArrayDesc tensor_array = 6;
-  optional bool persistable = 5 [ default = false ];
+  optional Reader reader = 7;
 }

 message BlockDesc {
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@ -458,11 +458,11 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
  auto var = block_.FindVarRecursive(name);
  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
  try {
-    auto shape = var->Shape();
+    auto shape = var->GetShape();
    if (shape.empty()) {
      return framework::make_ddim({0UL});
    } else {
-      return framework::make_ddim(var->Shape());
+      return framework::make_ddim(var->GetShape());
    }
  } catch (...) {
    VLOG(5) << "GetDim of variable " << name << " error";
--- a/paddle/framework/program_desc_test.cc
+++ b/paddle/framework/program_desc_test.cc
@ -53,7 +53,7 @@ TEST(ProgramDesc, copy_ctor) {
    ASSERT_NE(copy, var_before);
    ASSERT_EQ(copy->Name(), var_before->Name());
    ASSERT_EQ(copy->GetType(), var_before->GetType());
-    ASSERT_EQ(copy->Shape(), var_before->Shape());
+    ASSERT_EQ(copy->GetShape(), var_before->GetShape());
    ASSERT_EQ(copy->Proto()->SerializeAsString(),
              var_before->Proto()->SerializeAsString());
  };
@ -117,7 +117,7 @@ TEST(ProgramDescBind, serialize_and_deserialize) {
    ASSERT_NE(restored, var_before);
    ASSERT_EQ(restored->Name(), var_before->Name());
    ASSERT_EQ(restored->GetType(), var_before->GetType());
-    ASSERT_EQ(restored->Shape(), var_before->Shape());
+    ASSERT_EQ(restored->GetShape(), var_before->GetShape());
    ASSERT_EQ(restored->Proto()->SerializeAsString(),
              var_before->Proto()->SerializeAsString());
  };
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@ -26,18 +26,91 @@ void VarDesc::SetShape(const std::vector<int64_t> &dims) {
  VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims());
 }

+void VarDesc::SetTensorDescNum(size_t num) {
+  switch (desc_.type()) {
+    case proto::VarDesc::READER: {
+      auto *lod_tensors_ptr = desc_.mutable_reader()->mutable_lod_tensor();
+      lod_tensors_ptr->Clear();
+      for (size_t i = 0; i < num; ++i) {
+        lod_tensors_ptr->Add();
+      }
+      return;
+    } break;
+    default:
+      PADDLE_THROW(
+          "Setting 'sub_tensor_number' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+size_t VarDesc::GetTensorDescNum() const {
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      return desc_.reader().lod_tensor_size();
+      break;
+    default:
+      PADDLE_THROW(
+          "Getting 'sub_tensor_number' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+void VarDesc::SetShapes(
+    const std::vector<std::vector<int64_t>> &multiple_dims) {
+  PADDLE_ENFORCE_EQ(multiple_dims.size(), GetTensorDescNum(),
+                    "The number of given shapes(%d) doesn't equal to the "
+                    "number of sub tensor.",
+                    multiple_dims.size(), GetTensorDescNum());
+  std::vector<proto::TensorDesc *> tensors = mutable_tensor_descs();
+  for (size_t i = 0; i < multiple_dims.size(); ++i) {
+    VectorToRepeated(multiple_dims[i], tensors[i]->mutable_dims());
+  }
+}
+
+std::vector<int64_t> VarDesc::GetShape() const {
+  return RepeatedToVector(tensor_desc().dims());
+}
+
+std::vector<std::vector<int64_t>> VarDesc::GetShapes() const {
+  std::vector<proto::TensorDesc> descs = tensor_descs();
+  std::vector<std::vector<int64_t>> res;
+  res.reserve(descs.size());
+  for (const auto &tensor_desc : descs) {
+    res.push_back(RepeatedToVector(tensor_desc.dims()));
+  }
+  return res;
+}
+
 void VarDesc::SetDataType(proto::DataType data_type) {
  mutable_tensor_desc()->set_data_type(data_type);
 }

-std::vector<int64_t> VarDesc::Shape() const {
-  return RepeatedToVector(tensor_desc().dims());
+void VarDesc::SetDataTypes(
+    const std::vector<proto::DataType> &multiple_data_type) {
+  PADDLE_ENFORCE_EQ(multiple_data_type.size(), GetTensorDescNum(),
+                    "The number of given data types(%d) doesn't equal to the "
+                    "number of sub tensor.",
+                    multiple_data_type.size(), GetTensorDescNum());
+  std::vector<proto::TensorDesc *> tensor_descs = mutable_tensor_descs();
+  for (size_t i = 0; i < multiple_data_type.size(); ++i) {
+    tensor_descs[i]->set_data_type(multiple_data_type[i]);
+  }
 }

 proto::DataType VarDesc::GetDataType() const {
  return tensor_desc().data_type();
 }

+std::vector<proto::DataType> VarDesc::GetDataTypes() const {
+  std::vector<proto::TensorDesc> descs = tensor_descs();
+  std::vector<proto::DataType> res;
+  res.reserve(descs.size());
+  for (const auto &tensor_desc : descs) {
+    res.push_back(tensor_desc.data_type());
+  }
+  return res;
+}
+
 void VarDesc::SetLoDLevel(int32_t lod_level) {
  switch (desc_.type()) {
    case proto::VarDesc::LOD_TENSOR:
@ -47,8 +120,28 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
      desc_.mutable_tensor_array()->set_lod_level(lod_level);
      break;
    default:
-      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
-                   desc_.tensor_array().lod_level());
+      PADDLE_THROW(
+          "Setting 'lod_level' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
+  PADDLE_ENFORCE_EQ(multiple_lod_level.size(), GetTensorDescNum(),
+                    "The number of given data types(%d) doesn't equal to the "
+                    "number of sub tensor.",
+                    multiple_lod_level.size(), GetTensorDescNum());
+  switch (desc_.type()) {
+    case proto::VarDesc::READER: {
+      size_t i = 0;
+      for (auto &lod_tensor : *desc_.mutable_reader()->mutable_lod_tensor()) {
+        lod_tensor.set_lod_level(multiple_lod_level[i++]);
+      }
+    } break;
+    default:
+      PADDLE_THROW(
+          "Setting 'lod_levels' is not supported by the type of var %s.",
+          this->Name());
  }
 }

@ -59,13 +152,31 @@ int32_t VarDesc::GetLoDLevel() const {
    case proto::VarDesc::LOD_TENSOR_ARRAY:
      return desc_.tensor_array().lod_level();
    default:
-      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
-                   desc_.tensor_array().lod_level());
+      PADDLE_THROW(
+          "Getting 'lod_level' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+std::vector<int32_t> VarDesc::GetLoDLevels() const {
+  std::vector<int32_t> res;
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      res.reserve(desc_.reader().lod_tensor_size());
+      for (auto &lod_tensor : desc_.reader().lod_tensor()) {
+        res.push_back(lod_tensor.lod_level());
+      }
+      return res;
+      break;
+    default:
+      PADDLE_THROW(
+          "Getting 'lod_levels' is not supported by the type of var %s.",
+          this->Name());
  }
 }

 const proto::TensorDesc &VarDesc::tensor_desc() const {
-  PADDLE_ENFORCE(desc_.has_type(), "invoke TensorDesc must after set type");
+  PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
  switch (desc_.type()) {
    case proto::VarDesc::SELECTED_ROWS:
      return desc_.selected_rows();
@ -74,13 +185,32 @@ const proto::TensorDesc &VarDesc::tensor_desc() const {
    case proto::VarDesc::LOD_TENSOR_ARRAY:
      return desc_.tensor_array().tensor();
    default:
-      PADDLE_THROW("The type of var %s is unsupported.", this->Name());
+      PADDLE_THROW(
+          "Getting 'tensor_desc' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+std::vector<proto::TensorDesc> VarDesc::tensor_descs() const {
+  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
+  std::vector<proto::TensorDesc> res;
+  res.reserve(GetTensorDescNum());
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      for (const auto &lod_tensor : desc_.reader().lod_tensor()) {
+        res.push_back(lod_tensor.tensor());
+      }
+      return res;
+    default:
+      PADDLE_THROW(
+          "Getting 'tensor_descs' is not supported by the type of var "
+          "%s.",
+          this->Name());
  }
 }

 proto::TensorDesc *VarDesc::mutable_tensor_desc() {
-  PADDLE_ENFORCE(desc_.has_type(),
-                 "invoke MutableTensorDesc must after set type");
+  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
  switch (desc_.type()) {
    case proto::VarDesc::SELECTED_ROWS:
      return desc_.mutable_selected_rows();
@ -89,8 +219,30 @@ proto::TensorDesc *VarDesc::mutable_tensor_desc() {
    case proto::VarDesc::LOD_TENSOR_ARRAY:
      return desc_.mutable_tensor_array()->mutable_tensor();
    default:
-      PADDLE_THROW("Unexpected branch.");
+      PADDLE_THROW(
+          "Getting 'mutable_tensor_desc' is not supported by the type of var "
+          "%s.",
+          this->Name());
  }
 }
+
+std::vector<proto::TensorDesc *> VarDesc::mutable_tensor_descs() {
+  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
+  std::vector<proto::TensorDesc *> res;
+  res.reserve(GetTensorDescNum());
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      for (auto &lod_tensor : *desc_.mutable_reader()->mutable_lod_tensor()) {
+        res.push_back(lod_tensor.mutable_tensor());
+      }
+      return res;
+    default:
+      PADDLE_THROW(
+          "Getting 'tensor_descs' is not supported by the type of var "
+          "%s.",
+          this->Name());
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
@ -68,18 +68,34 @@ class VarDesc {

  void SetName(std::string name) { desc_.set_name(name); }

+  void SetTensorDescNum(size_t num);
+
+  size_t GetTensorDescNum() const;
+
  void SetShape(const std::vector<int64_t> &dims);

+  void SetShapes(const std::vector<std::vector<int64_t>> &multiple_dims);
+
+  std::vector<int64_t> GetShape() const;
+
+  std::vector<std::vector<int64_t>> GetShapes() const;
+
  void SetDataType(proto::DataType data_type);

-  std::vector<int64_t> Shape() const;
+  void SetDataTypes(const std::vector<proto::DataType> &multiple_data_type);

  proto::DataType GetDataType() const;

+  std::vector<proto::DataType> GetDataTypes() const;
+
  void SetLoDLevel(int32_t lod_level);

+  void SetLoDLevels(const std::vector<int32_t> &multiple_lod_level);
+
  int32_t GetLoDLevel() const;

+  std::vector<int32_t> GetLoDLevels() const;
+
  proto::VarDesc::VarType GetType() const;

  void SetType(proto::VarDesc::VarType type);
@ -90,7 +106,9 @@ class VarDesc {

 private:
  const proto::TensorDesc &tensor_desc() const;
+  std::vector<proto::TensorDesc> tensor_descs() const;
  proto::TensorDesc *mutable_tensor_desc();
+  std::vector<proto::TensorDesc *> mutable_tensor_descs();

  proto::VarDesc desc_;
 };
--- a/paddle/inference/io.cc
+++ b/paddle/inference/io.cc
@ -55,7 +55,7 @@ void LoadPersistables(framework::Executor& executor,
      VLOG(3) << "parameter's name: " << var->Name();

      framework::VarDesc* new_var = load_block->Var(var->Name());
-      new_var->SetShape(var->Shape());
+      new_var->SetShape(var->GetShape());
      new_var->SetDataType(var->GetDataType());
      new_var->SetType(var->GetType());
      new_var->SetLoDLevel(var->GetLoDLevel());
--- a/paddle/inference/tests/book/test_inference_recognize_digits.cc
+++ b/paddle/inference/tests/book/test_inference_recognize_digits.cc
@ -58,6 +58,47 @@ void TestInference(const std::string& dirname,
  delete scope;
 }

+template <typename T>
+void SetupTensor(paddle::framework::LoDTensor& input,
+                 paddle::framework::DDim dims,
+                 T lower,
+                 T upper) {
+  srand(time(0));
+  float* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
+  for (int i = 0; i < input.numel(); ++i) {
+    input_ptr[i] =
+        (static_cast<T>(rand()) / static_cast<T>(RAND_MAX)) * (upper - lower) +
+        lower;
+  }
+}
+
+template <typename T>
+void CheckError(paddle::framework::LoDTensor& output1,
+                paddle::framework::LoDTensor& output2) {
+  // Check lod information
+  EXPECT_EQ(output1.lod(), output2.lod());
+
+  EXPECT_EQ(output1.dims(), output2.dims());
+  EXPECT_EQ(output1.numel(), output2.numel());
+
+  T err = static_cast<T>(0);
+  if (typeid(T) == typeid(float)) {
+    err = 1E-3;
+  } else if (typeid(T) == typeid(double)) {
+    err = 1E-6;
+  } else {
+    err = 0;
+  }
+
+  size_t count = 0;
+  for (int64_t i = 0; i < output1.numel(); ++i) {
+    if (fabs(output1.data<T>()[i] - output2.data<T>()[i]) > err) {
+      count++;
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
+}
+
 TEST(inference, recognize_digits) {
  if (FLAGS_dirname.empty()) {
    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
@ -70,12 +111,10 @@ TEST(inference, recognize_digits) {
  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc

  paddle::framework::LoDTensor input;
-  srand(time(0));
-  float* input_ptr =
-      input.mutable_data<float>({1, 28, 28}, paddle::platform::CPUPlace());
-  for (int i = 0; i < 784; ++i) {
-    input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
-  }
+  // Use normilized image pixels as input data,
+  // which should be in the range [-1.0, 1.0].
+  SetupTensor<float>(
+      input, {1, 28, 28}, static_cast<float>(-1), static_cast<float>(1));
  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
  cpu_feeds.push_back(&input);

@ -98,16 +137,6 @@ TEST(inference, recognize_digits) {
      dirname, cpu_feeds, cpu_fetchs2);
  LOG(INFO) << output2.dims();

-  EXPECT_EQ(output1.dims(), output2.dims());
-  EXPECT_EQ(output1.numel(), output2.numel());
-
-  float err = 1E-3;
-  int count = 0;
-  for (int64_t i = 0; i < output1.numel(); ++i) {
-    if (fabs(output1.data<float>()[i] - output2.data<float>()[i]) > err) {
-      count++;
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
+  CheckError<float>(output1, output2);
 #endif
 }
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@ -159,7 +159,10 @@ op_library(create_reader_op DEPS reader)

 # Regist multiple Kernel to pybind
 if (WITH_GPU)
-op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col)
+
+op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS
+    vol2col depthwise_conv)
+
 op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function)
 op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling)
 op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@ -318,9 +318,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
 namespace ops = paddle::operators;
 REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
            ops::ConvOpGrad);
+
+// depthwise convolution op
+REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
+            depthwise_conv2d_grad, ops::ConvOpGrad);
 REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
            ops::ConvOpGrad);

+// depthwise conv kernel
+// TODO(xingzhaolong): neon kernel for mobile
+REGISTER_OP_CPU_KERNEL(
+    depthwise_conv2d,
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_CPU_KERNEL(
+    depthwise_conv2d_grad,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
+
 REGISTER_OP_CPU_KERNEL(
    conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/operators/conv_op.cu.cc
+++ b/paddle/operators/conv_op.cu.cc
@ -16,6 +16,16 @@ limitations under the License. */

 namespace ops = paddle::operators;

+REGISTER_OP_CUDA_KERNEL(
+    depthwise_conv2d,
+    ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
+
+REGISTER_OP_CUDA_KERNEL(
+    depthwise_conv2d_grad,
+    ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
+
 REGISTER_OP_CUDA_KERNEL(
    conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
    ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/operators/conv_op.h
+++ b/paddle/operators/conv_op.h
@ -16,6 +16,7 @@ limitations under the License. */

 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/depthwise_conv.h"
 #include "paddle/operators/math/im2col.h"
 #include "paddle/operators/math/math_function.h"
 #include "paddle/operators/math/vol2col.h"
@ -350,5 +351,72 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
    }
  }
 };
+
+template <typename DeviceContext, typename T>
+class DepthwiseConvKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    Tensor filter = *context.Input<Tensor>("Filter");
+    Tensor* output = context.Output<Tensor>("Output");
+    output->mutable_data<T>(context.GetPlace());
+
+    PADDLE_ENFORCE_EQ(
+        output->dims()[1] % input->dims()[1], 0,
+        "The output channels must be a multiple of the input channels");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+
+    math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    depthwiseConv(dev_ctx, *input, filter, strides, paddings, output);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class DepthwiseConvGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    const Tensor* output_grad =
+        context.Input<Tensor>(framework::GradVarName("Output"));
+    Tensor* input_grad =
+        context.Output<Tensor>(framework::GradVarName("Input"));
+    Tensor* filter_grad =
+        context.Output<Tensor>(framework::GradVarName("Filter"));
+    Tensor filter = *context.Input<Tensor>("Filter");
+
+    if (!input_grad && !filter_grad) return;
+
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+
+    math::SetConstant<DeviceContext, T> set_zero;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+
+    math::DepthwiseConvInputGradFunctor<DeviceContext, T>
+        depthwiseConvInputGrad;
+    math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
+        depthwiseConvFilterGrad;
+
+    if (input_grad) {
+      input_grad->mutable_data<T>(context.GetPlace());
+      set_zero(dev_ctx, input_grad, static_cast<T>(0));
+      depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
+                             paddings, input_grad);
+    }
+
+    if (filter_grad) {
+      filter_grad->mutable_data<T>(context.GetPlace());
+      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
+      depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings,
+                              filter_grad);
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
--- a/Show More
+++ b/Show More