diff --git a/README.md b/README.md
index db0fbd88b2..ceeb6d9e51 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 
 
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://doc.paddlepaddle.org/develop/doc/)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/develop/doc_cn/)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
 [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -36,7 +36,7 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
     examples:
 
       - Optimized math operations through SSE/AVX intrinsics, BLAS libraries
-      (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels.
+      (e.g. MKL, OpenBLAS, cuBLAS) or customized CPU/GPU kernels.
       - Highly optimized recurrent networks which can handle **variable-length**
       sequence without padding.
       - Optimized local and distributed training for models with high dimensional
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 13294c0548..6320b17520 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -3,7 +3,7 @@
 # It will search MKLML, atlas, OpenBlas, reference-cblas in order.
 #
 # If any cblas implementation found, the following variable will be set.
-#    CBLAS_PROVIDER  # one of MKLML, ATLAS, OPENBLAS, REFERENCE
+#    CBLAS_PROVIDER  # one of MKLML, OPENBLAS, REFERENCE
 #    CBLAS_INC_DIR   # the include directory for cblas.
 #    CBLAS_LIBS      # a list of libraries should be linked by paddle.
 #                    # Each library should be full path to object file.
@@ -25,42 +25,6 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
   return()
 endif()
 
-## Then find atlas.
-set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas")
-set(ATLAS_INCLUDE_SEARCH_PATHS
-        ${ATLAS_ROOT}/include
-        /usr/include
-        /usr/include/atlas)
-set(ATLAS_LIB_SEARCH_PATHS
-        ${ATLAS_ROOT}/lib
-        /usr/lib
-        /usr/lib/blas/atlas
-        /usr/lib/atlas
-        /usr/lib/atlas-base   # special for ubuntu 14.04.
-    )
-find_path(ATLAS_INC_DIR NAMES cblas.h
-  PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
-find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
-  PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
-find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
-  PATHS ${ATLAS_LIB_SEARCH_PATHS})
-find_library(ATLAS_CLAPACK_LIB NAMES lapack_atlas liblapack_atlas.so.3
-  PATHS ${ATLAS_LIB_SEARCH_PATHS})
-
-if(ATLAS_CLAPACK_INC_DIR AND ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_CLAPACK_LIB)
-  set(CBLAS_FOUND ON)
-  set(CBLAS_PROVIDER ATLAS)
-  set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
-  set(CBLAS_LIBRARIES ${ATLAS_CLAPACK_LIB} ${ATLAS_CBLAS_LIB})
-
-  add_definitions(-DPADDLE_USE_ATLAS)
-  add_definitions(-DLAPACK_FOUND)
-
-  message(STATUS "Found ATLAS (include: ${ATLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
-  message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})")
-  return()
-endif()
-
 ## Then find openblas.
 set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas")
 set(OPENBLAS_INCLUDE_SEARCH_PATHS
diff --git a/doc/design/fluid-compiler.graffle b/doc/design/fluid-compiler.graffle
new file mode 100644
index 0000000000..c933df2cb8
Binary files /dev/null and b/doc/design/fluid-compiler.graffle differ
diff --git a/doc/design/fluid-compiler.png b/doc/design/fluid-compiler.png
new file mode 100644
index 0000000000..1b0ffed203
Binary files /dev/null and b/doc/design/fluid-compiler.png differ
diff --git a/doc/design/fluid.md b/doc/design/fluid.md
new file mode 100644
index 0000000000..585dc8ef39
--- /dev/null
+++ b/doc/design/fluid.md
@@ -0,0 +1,122 @@
+# Design Doc: PaddlePaddle Fluid
+
+## Why Fluid
+
+When Baidu developed PaddlePaddle in 2013, the only well-known open source deep learning system at the time was Caffe.  However, when PaddlePaddle was open-sourced in 2016, many other choices were available. There was a challenge -- what is the need for open sourcing yet another deep learning framework?
+
+Fluid is the answer.  Fluid is similar to PyTorch and TensorFlow Eager Execution, which describe the "process" of training or inference instead of using the concept of a model.  In fact, in PyTorch, TensorFlow Eager Execution and Fluid, there is no concept of a model at all. The details are covered in the sections below. Fluid currently takes this idea further than PyTorch and Eager Execution, and we are trying to push Fluid towards the direction of a compiler and a new programming language for deep learning.
+
+## The Evolution of Deep Learning Systems
+
+Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
+
+| Existed since | model as sequence of layers | model as graph of operators | No model |
+|--|--|--|--|
+| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
+| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
+| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
+
+From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model.  To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
+
+## Deep Learning Programming Paradigms
+
+With the systems listed as the first or second generation, e.g., Caffe or TensorFlow, an AI application training program looks like the following:
+
+```python
+x = layer.data("image")
+l = layer.data("label")
+f = layer.fc(x, W)
+s = layer.softmax(f)
+c = layer.mse(l, s)
+
+for i in xrange(1000): # train for 1000 iterations
+    m = read_minibatch()
+    forward({input=x, data=m}, minimize=c)
+    backward(...)
+
+print W # print the trained model parameters.
+```
+
+The above program includes two parts:
+
+1. The first part describes the model, and
+2. The second part describes the training process (or inference process) for the model.
+
+This paradigm has a well-known problem that limits the productivity of programmers. If the programmer made a mistake in configuring the model, the error messages wouldn't show up until the second part is executed and `forward` and `backward` propagations are performed. This makes it difficult for the programmer to debug and locate a mistake that is located blocks away from the actual error prompt.
+
+This difficulty in debugging and iterating quickly on a program is the primary reason that programmers, in general, prefer PyTorch over the older systems.  Using PyTorch, we would write the above program as follows:
+
+```python
+W = tensor(...)
+
+for i in xrange(1000): # train for 1000 iterations
+    m = read_minibatch()
+    x = m["image"]
+    l = m["label"]
+    f = layer.fc(x, W)
+    s = layer.softmax(f)
+    c = layer.mse(l, s)
+    backward()
+
+print W # print the trained model parameters.
+```
+
+We can see that the main difference is that the model configuration part (the first step) moves into the training loop.  This change allows mistakes in model configuration to be reported where they actually appear in the program.  This change also represents the model, or rather its forward pass, better by keeping the configuration process in the training loop.
+
+## Describe Arbitrary Models for the Future
+
+Describing the process instead of the model also gives Fluid the flexibility to define non-standard models that haven't been invented yet.
+
+As we write out the program for the process, we can write an RNN as a loop, instead of an RNN as a layer or as an operator.  A PyTorch example would look like the following:
+
+```python
+for i in xrange(1000):
+    m = read_minibatch()
+    x = m["sentence"]
+    for t in xrange x.len():
+        h[t] = the_step(x[t])
+```        
+
+With Fluid, the training loop and the RNN in the above program are not really Python loops, but just a "loop structure" provided by Fluid and implemented in C++ as the following:
+
+```python
+train_loop = layers.While(cond)
+with train_loop.block():
+  m = read_minibatch()
+  x = m["sentence"]
+  rnn = layers.While(...)
+  with rnn.block():
+    h[t] = the_step(input[t])
+```    
+
+An actual Fluid example is described  [here](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44).
+
+From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
+
+We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid.
+
+## Turing Completeness
+
+In computability theory, a system of data-manipulation rules, such as a programming language, is said to be Turing complete if it can be used to simulate any Turing machine.  For a programming language, if it provides if-then-else and loop, it is Turing complete.  From the above examples, Fluid seems to be Turing complete; however, it is worth noting that there is a slight difference between the `if-then-else` of Fluid and that of a programming language. The difference is that the former runs both of its branches and splits the input mini-batch into two -- one for the True condition and another for the False condition. Whether this is equivalent to the `if-then-else` that makes programming languages Turing complete has not been researched in depth.  Based on a conversation with [Yuang Yu](https://research.google.com/pubs/104812.html), it seems to be the case, but this needs to be looked into in depth.
+
+## The Execution of a Fluid Program
+
+There are two ways to execute a Fluid program.  When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
+
+There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
+
+Fluid is moving towards the direction of a compiler, which is explained in more detail later in this article.
+
+## Backward Compatibility of Fluid
+
+Despite all the advantages of removing the concept of a *model*, hardware manufacturers might still prefer that the concept of a model exist, since it makes it easier for them to support multiple frameworks at once and to run a trained model during inference.  For example, Nervana, a startup company acquired by Intel, has been working on an XPU that reads the models in the format known as [n-graph](https://github.com/NervanaSystems/ngraph).  Similarly, [Movidius](https://www.movidius.com/) is producing a mobile deep learning chip that reads and runs graphs of operators.  The well-known [ONNX](https://github.com/onnx/onnx) is also a file format of graphs of operators.
+
+For Fluid, we can write a converter that extracts the parts in the `ProgramDesc` protobuf message, converts them into a graph of operators, and exports the graph into the ONNX or n-graph format.
+
+## Towards a Deep Learning Language and the Compiler
+
+We can change the `if-then-else` and loop structure a little bit in the above Fluid example programs, to make it into a new programming language, different than Python.
+
+Even if we do not invent a new language, as long as we get the `ProgramDesc` message filled in, we can write a transpiler, which translates each invocation of an operator into a C++ call to a kernel function of that operator. For example, a transpiler that weaves the CUDA kernels outputs an NVIDIA-friendly C++ program, which can be built using `nvcc`.  Another transpiler could generate MKL-friendly code that should be built using `icc` from Intel.  More interestingly, we can translate a Fluid program into its distributed version of two `ProgramDesc` messages, one for running on the trainer process, and the other one for the parameter server.  For more details of the last example, the [concurrent programming design](concurrent_programming.md) document would be a good pointer.  The following figure explains the proposed two-stage process:
+
+![](fluid-compiler.png)
diff --git a/doc/design/support_new_device.md b/doc/design/support_new_device.md
new file mode 100644
index 0000000000..fd23dc211a
--- /dev/null
+++ b/doc/design/support_new_device.md
@@ -0,0 +1,248 @@
+# Design Doc: Supporting new Device/Library
+
+## Background
+
+Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries flexibly and efficiently.
+
+On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
+
+On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.
+
+So, how to support a new Device/Library in Fluid becomes a challenge.
+
+
+## Basic: Integrate A New Device/Library
+
+For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/read_source.md).
+
+There are mainly three parts that we have to consider while integrating a new device/library:
+
+- Place and DeviceContext: indicates the device id and manages hardware resources
+
+- Memory and Tensor: malloc/free data on certain device
+
+- Math Functor and OpKernel: implement computing unit on certain devices/libraries
+
+### Place and DeviceContext
+
+
+#### Place
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent different devices and computing libraries. There are inheritance relationships between different kinds of `Place`.
+
+```
+        |   CPUPlace   --> MKLDNNPlace
+Place --|   CUDAPlace  --> CUDNNPlace
+        |   FPGAPlace
+```
+
+And `Place` is defined as follows:
+
+```
+typedef boost::variant<CUDAPlace, CPUPlace, FPGAPlace> Place;
+```
+
+#### DeviceContext
+
+Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different hardware, such as CUDA stream in `CUDADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
+
+
+```
+                /->  CPUDeviceContext   --> MKLDeviceContext
+DeviceContext ---->  CUDADeviceContext  --> CUDNNDeviceContext
+                \->  FPGADeviceContext
+```
+
+An example of Nvidia GPU is as follows:
+
+- DeviceContext
+
+
+```
+class DeviceContext {
+  virtual Place GetPlace() const = 0;
+};  
+```
+
+
+- CUDADeviceContext
+
+
+```
+class CUDADeviceContext : public DeviceContext {
+  Place GetPlace() const override { return place_; }
+private:
+  CUDAPlace place_;
+  cudaStream_t stream_; 
+  cublasHandle_t cublas_handle_;
+  std::unique_ptr<Eigen::GpuDevice> eigen_device_;  // binds with stream_
+};
+```
+
+- CUDNNDeviceContext
+
+```
+class CUDNNDeviceContext : public CUDADeviceContext {
+  private:
+    cudnnHandle_t cudnn_handle_;
+};
+```
+
+
+### Memory and Tensor
+
+
+#### memory module
+
+Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/memory/memory.h#L36):
+
+```
+template <typename Place>
+void* Alloc(Place place, size_t size);
+
+template <typename Place>
+void Free(Place place, void* ptr);
+
+template <typename Place>
+size_t Used(Place place);
+```
+
+To implement these interfaces, we have to implement MemoryAllocator for different devices.
+
+
+#### Tensor
+
+[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h#L36) holds data with some shape in a specific Place.
+
+```cpp
+class Tensor {
+ public:
+  /*! Return a pointer to mutable memory block. */
+  template <typename T>
+  inline T* data();
+
+  /**
+   * @brief   Return a pointer to mutable memory block.
+   * @note    If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(platform::Place place);
+
+  /**
+   * @brief     Return a pointer to mutable memory block.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] place   The place of the memory block.
+   *
+   * @note      If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(DDim dims, platform::Place place);
+
+  /*! Resize the dimensions of the memory block. */
+  inline Tensor& Resize(const DDim& dims);
+
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+
+ private:
+  /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+
+  /*! points to dimensions of memory block. */
+  DDim dim_;
+};
+```
+
+`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutable_data` to allocate the actual memory.
+
+```cpp
+paddle::framework::Tensor t;
+paddle::platform::CPUPlace place;
+// set size first
+t.Resize({2, 3});
+// allocate memory on CPU later
+t.mutable_data(place);
+```
+
+
+
+### Math Functor and OpKernel
+
+Fluid implements computing units based on different DeviceContexts. Some computing units are shared between operators. This common part will be put in operators/math directory as basic Functors.
+
+Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example:
+
+The interface is defined in header file.
+
+```
+template <typename DeviceContext, typename T>
+class MaxOutFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  framework::Tensor* output, int groups);
+};
+```
+
+CPU implementation is in .cc file
+
+```
+template <typename T>
+class MaxOutFunctor<platform::CPUDeviceContext, T> {
+  public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
+                  ...
+                  }
+};
+```
+
+CUDA implementation is in .cu file
+
+```
+template <typename T>
+class MaxOutFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
+                  ...
+                  }
+};                  
+```
+
+
+We get the computing handle from a concrete DeviceContext and perform computation on tensors.
+
+The implementation of `OpKernel` is similar to math functors; the extra thing we need to do is to register the OpKernel in a global map.
+
+Fluid provides different register interfaces in op_registry.h
+
+
+Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/crop_op.cc#L134) operator as an example:
+
+In .cc file:
+
+```
+REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CPU_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
+```
+
+In .cu file:
+
+```
+REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CUDA_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CUDADeviceContext, float>);
+```
+
+
+## Advanced topics: How to switch between different Device/Library
+
+Generally, we will implement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network on GPU. However, some OpKernels are not suitable for a specific Device. For example, the crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstances, we have to switch between different Device/Library.
+
+
+We will discuss how to implement an efficient OpKernel switch policy. 
+
+- TBD
diff --git a/doc/faq/build_and_install/index_cn.rst b/doc/faq/build_and_install/index_cn.rst
index f1677e216f..a2bdeead78 100644
--- a/doc/faq/build_and_install/index_cn.rst
+++ b/doc/faq/build_and_install/index_cn.rst
@@ -14,7 +14,7 @@
 
     $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
     $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    $ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddlepaddle:latest-gpu
+    $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
 
 更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 <http://www.paddlepaddle.org/doc_cn/build_and_install/install/docker_install.html>`_ 。
 
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index f78b1fb0e1..1eb06e4182 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -114,7 +114,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
 
   .. code-block:: bash
 
-     nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
 
 **注: 如果没有安装nvidia-docker，可以尝试以下的方法，将CUDA库和Linux设备挂载到Docker容器内：**
 
@@ -122,7 +122,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
 
      export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
      export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-     docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
 
 **关于AVX：**
 
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index d7acc7aeb7..5a46c598f2 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -122,7 +122,7 @@ GPU driver installed before move on.
 
   .. code-block:: bash
 
-     nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
 
 **NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.**
 
@@ -130,7 +130,7 @@ GPU driver installed before move on.
 
      export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
      export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-     docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
 
 **About AVX:**
 
diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md
index 6993901452..3e0bf7b397 100644
--- a/doc/howto/dev/contribute_to_paddle_cn.md
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
@@ -76,18 +76,18 @@ no changes added to commit (use "git add" and/or "git commit -a")
 
 ## 构建和测试
 
-编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家，我们的标准开发流程是把这些工具都装进一个Docker image，称为*开发镜像*，通常名字是 `paddle:dev`。然后所有用 `cmake && make` 的地方（比如IDE配置里）都用 `docker run paddle:dev`来代替。
+编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家，我们的标准开发流程是把这些工具都装进一个Docker image，称为*开发镜像*，通常名字是 `paddle:latest-dev` 或者 `paddle:[version tag]-dev` 如 `paddle:0.11.0-dev`。然后所有用 `cmake && make` 的地方（比如IDE配置里）都用 `docker run paddle:latest-dev`来代替。
 
 如要build这个开发镜像，在源码目录树的根目录中运行：
 
 ```bash
-➜  docker build -t paddle:dev .
+➜  docker build -t paddle:latest-dev .
 ```
 
 随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU，但是支持AVX指令集，并且包括unit tests的PaddlePaddle，可以：
 
 ```bash
-➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev
+➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:latest-dev
 ```
 
 这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`，并且输出一个 `./build/paddle.deb`文件之外，还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*（`paddle:prod`）：
@@ -99,7 +99,7 @@ no changes added to commit (use "git add" and/or "git commit -a")
 如果要运行所有的单元测试，可以用如下命令：
 
 ```bash
-➜  docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
+➜  docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest"
 ```
 
 关于构建和测试的更多信息，请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。
diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md
index 44dbeecbbd..757a5840bc 100644
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -1,17 +1,18 @@
 # 如何写新的Operator
 
  - [概念简介](#概念简介)
- - [实现C++类](#实现C++类)
-   - [定义ProtoMaker类](#定义ProtoMaker类)
-   - [定义Operator类](#定义Operator类)
-   - [定义OpKernel类](#定义OpKernel类)
-   - [注册Operator](#注册Operator)
+ - [实现C++类](#实现c类)
+   - [定义ProtoMaker类](#定义protomaker类)
+   - [定义Operator类](#定义operator类)
+   - [定义OpKernel类](#定义opkernel类)
+   - [注册Operator](#注册operator)
    - [编译](#编译)
- - [绑定Python](#绑定Python)
+ - [绑定Python](#绑定python)
  - [实现单元测试](#实现单元测试)
-   - [前向Operator单测](#前向Operator单测)
-   - [反向Operator单测](#反向Operator单测)
+   - [前向Operator单测](#前向operator单测)
+   - [反向Operator单测](#反向operator单测)
    - [编译和执行](#编译和执行)
+ - [注意事项](#注意事项)
 
 
 ## 概念简介
@@ -43,7 +44,7 @@ Kernel实现       | CPU、CUDA共享Kernel实现在`.h`文件中，否则，CPU
 ## 实现C++类
 
 
-### 1. 定义ProtoMaker类
+### 定义ProtoMaker类
 
 矩阵乘法的公式：$Out = X * Y$, 可见该计算由两个输入，一个输出组成。
 
@@ -100,7 +101,7 @@ The equation is: Out = scale*X
 - `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。
 
 
-### 2. 定义Operator类
+### 定义Operator类
 
 下面的点实现了MulOp的定义：
 
@@ -149,7 +150,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
 通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中，和下面将要介绍的注册函数一起放在`.cc`中
 
-### 3. 定义OpKernel类
+### 定义OpKernel类
 
 `MulKernel`继承自`framework::OpKernel`，带有下面两个模板参数:
 
@@ -177,6 +178,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
     math::matmul<DeviceContext, T>(*X, false, *Y, false, 1, Z, 0, device_context);
   }
   };
+  ```
 
 需要注意：**不同设备(CPU、CUDA)共享一个Op定义，是否则共享同一个`OpKernel`，取决于`Compute`调用的函数是否支持不同设备。**
 
@@ -188,7 +190,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 到此，前向Op实现完成。接下来，需要在`.cc`文件中注册该op和kernel。
 反向Op类的定义，反向OpKernel的定义与前向Op类似，这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。
 
-### 4. 注册Operator
+### 注册Operator
 
 - 在`.cc`文件中注册前向、反向Op类，注册CPU Kernel。
 
@@ -220,7 +222,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
                            ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
     ```
 
-### 5. 编译
+### 编译
 
 运行下面命令可以进行编译：
 
@@ -236,6 +238,7 @@ make mul_op
 
 单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。
 
+### 前向Operator单测
 
 Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp`里完成。测试Operator，需要：
 
@@ -273,8 +276,7 @@ Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp
       def test_check_grad_ingore_y(self):
           self.check_grad(
               ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-
-    ```
+  ```
 
 上面的代码首先导入依赖的包，下面是对`setUp`函数中操作的重要变量的详细解释：
 
@@ -282,6 +284,8 @@ Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp
 - `self.inputs` : 定义输入，类型为`numpy.array`，并初始化。
 - `self.outputs` : 定义输出，并在Python脚本中完成与operator同样的计算逻辑，返回Python端的计算结果。
 
+### 反向Operator单测
+
 而反向测试中：
 - `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
   - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
@@ -290,7 +294,7 @@ Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp
 - `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。
 
 
-### 编译和执行单元测试
+### 编译和执行
 
 `python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。
 
diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md
index 510233306c..fe86936bc1 100644
--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
@@ -1,8 +1,8 @@
 # How to write a new operator
 
  - [Background](#background)
- - [Implementing C++ Types](#implementing-c++-types)
-   - [Defining ProtoMaker](#defining-protoMaker)
+ - [Implementing C++ Types](#implementing-c-types)
+   - [Defining ProtoMaker](#defining-protomaker)
    - [Defining Operator](#defining-operator)
    - [Registering Operator](#registering-operator)
    - [Compilation](#compilation)
@@ -41,7 +41,7 @@ Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePadd
 ## Implementing C++ Types
 
 
-### 1. Defining Class ProtoMaker
+### Defining ProtoMaker
 
 Matrix Multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and pne output.
 
@@ -98,7 +98,7 @@ There are two changes in this example:
 - `AddAttr<AttrType>("scale", "...").SetDefault(1.0);`  adds `scale`constant as an attribute, and sets the default value to 1.0.
 
 
-### 2. Defining Operator
+### Defining Operator
 
 The following code defines the interface for MulOp:
 
@@ -147,7 +147,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
 Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later.
 
-### 3. Defining OpKernel
+### Defining OpKernel
 
 `MulKernel` inherits `framework::OpKernel`, which includes the following templates:
 
@@ -188,7 +188,7 @@ This concludes the forward implementation of an operator. Next its operation and
 
 The definition of its corresponding backward operator, if applicable, is similar to that of an forward operator. **Note that a backward operator does not include a `ProtoMaker`**.
 
-### 4. Registering Operator
+### Registering Operator
 
 - In `.cc` files, register forward and backward operator classes and the CPU kernel.
 
@@ -220,7 +220,7 @@ The definition of its corresponding backward operator, if applicable, is similar
                            ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
     ```
 
-### 5. Compilation
+### Compilation
 
 Run the following commands to compile.
 
@@ -284,8 +284,7 @@ A forward operator unit test inherits `unittest.TestCase` and defines metaclass
       def test_check_grad_ingore_y(self):
           self.check_grad(
               ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-
-    ```
+  ```
 Get its output, and compare it with the forward operator's own output.
 
 The code above first loads required packages. In addition, we have
@@ -294,6 +293,8 @@ The code above first loads required packages. In addition, we have
 - `self.inputs` defines input, with type `numpy.array` and initializes it.
 - `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script.
 
+### Testing Backward Operators
+
 Some key points in checking gradient above include:
 
 - `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods.
diff --git a/doc/howto/read_source.md b/doc/howto/read_source.md
new file mode 100644
index 0000000000..383acb0c82
--- /dev/null
+++ b/doc/howto/read_source.md
@@ -0,0 +1,67 @@
+# PaddlePaddle Fluid Source Code Overview
+
+Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/tests/book
+
+Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework
+
+Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators
+
+Optimizer: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/optimizer
+
+Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory
+
+# Compile Time
+
+The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto).
+
+```python
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+```
+
+- Variables: `x`,  `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py#L93)
+- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/layers.py)
+  - Every Layer has one or more operators and variables/parameters
+    - All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other files worth looking at:
+      - Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h)
+      - Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h) 
+      - Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h)
+- Optimizer: `fluid.optimizer.SGD`. It does the following:
+  - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/backward.cc)]
+  - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py), [C++](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/optimizer)]
+
+# Run Time
+
+The following **evaluates** the NN. It instantiates all the variables and operators.
+
+```python
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+# Allocate memory. Initialize Parameter.
+exe.run(fluid.default_startup_program())
+
+# Allocate memory. Do computation.
+exe.run(fluid.default_main_program(),
+        feed=feeder.feed(data),
+        fetch_list=[avg_cost])
+```
+
+- Place: `place`. One of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h)
+  - The device handles are defined in [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h)
+- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)]
+  - Feeds the data: `feed=feeder.feed(data)`
+  - Evaluates all the operators
+  - Fetches the result: `fetch_list=[avg_cost]`
+- Other files worth looking at:
+  - Scope: [paddle/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/scope.h). Where all the variables live
+    - Variable: [paddle/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h). Where all the data (most likely tensors) live
+      - Tensor: [paddle/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h). Where we allocate memory through [`paddle/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory)
diff --git a/paddle/capi/error.cpp b/paddle/capi/error.cpp
index 169b65f921..96ce31b45f 100644
--- a/paddle/capi/error.cpp
+++ b/paddle/capi/error.cpp
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "error.h"
 
-const char* paddle_error_string(paddle_error err) {
+extern "C" const char* paddle_error_string(paddle_error err) {
   switch (err) {
     case kPD_NULLPTR:
       return "nullptr error";
diff --git a/paddle/capi/error.h b/paddle/capi/error.h
index 9d9d0ed63a..2da9e0a3ef 100644
--- a/paddle/capi/error.h
+++ b/paddle/capi/error.h
@@ -29,9 +29,17 @@ typedef enum {
   kPD_UNDEFINED_ERROR = -1,
 } paddle_error;
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * Error string for Paddle API.
  */
 PD_API const char* paddle_error_string(paddle_error err);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index a17036c652..faf6e60cbd 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -430,14 +430,14 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
     std::vector<std::unique_ptr<OpDescBind>> op_grads;
 
     if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") {
-      int step_block_idx = (*it)->GetBlockAttr("step_block");
+      int step_block_idx = (*it)->GetBlockAttr("sub_block");
       BlockDescBind* backward_block = CreateStepBlock(
           program_desc, no_grad_vars, grad_to_var, step_block_idx);
       op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
     } else if ((*it)->Type() == "conditional_block") {
       BlockDescBind* backward_block =
           CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
-                          (*it)->GetBlockAttr("block"));
+                          (*it)->GetBlockAttr("sub_block"));
       op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
     } else {
       op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 7e5a1db44a..afb8d9d599 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -244,7 +244,7 @@ TEST(Matrix, unary) {
     LOG(WARNING) << "This version of PaddlePaddle was not built with LAPACK"
                  << "support so we cannot test matrix inverse. To test "
                  << "matrix inverse, please install LAPACKE "
-                 << "and MKL/Openblas/ATLAS, and re-build PaddlePaddle.";
+                 << "and MKL/Openblas, and re-build PaddlePaddle.";
 #endif
   }
 }
diff --git a/paddle/operators/cast_op.cc b/paddle/operators/cast_op.cc
index 42bff69a1e..d641b8fc9f 100644
--- a/paddle/operators/cast_op.cc
+++ b/paddle/operators/cast_op.cc
@@ -74,4 +74,5 @@ REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape,
 REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel<CPU, float>,
                        ops::CastOpKernel<CPU, double>,
                        ops::CastOpKernel<CPU, int>,
-                       ops::CastOpKernel<CPU, int64_t>);
+                       ops::CastOpKernel<CPU, int64_t>,
+                       ops::CastOpKernel<CPU, bool>);
diff --git a/paddle/operators/cast_op.cu b/paddle/operators/cast_op.cu
index 4681deaa62..91e6fb391c 100644
--- a/paddle/operators/cast_op.cu
+++ b/paddle/operators/cast_op.cu
@@ -19,4 +19,5 @@ using CastOpKernel =
     paddle::operators::CastOpKernel<paddle::platform::CUDADeviceContext, T>;
 
 REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel<float>, CastOpKernel<double>,
-                        CastOpKernel<int>, CastOpKernel<int64_t>);
+                        CastOpKernel<int>, CastOpKernel<int64_t>,
+                        CastOpKernel<bool>);
diff --git a/paddle/operators/conditional_block_op.cc b/paddle/operators/conditional_block_op.cc
index 03c58a7eab..6f2ef9174e 100644
--- a/paddle/operators/conditional_block_op.cc
+++ b/paddle/operators/conditional_block_op.cc
@@ -65,7 +65,7 @@ class ConditionalBlockOp : public ConditionalOp {
       scopes->front() = &scope.NewScope();
       auto &cur_scope = *scopes->front();
 
-      auto *block = Attr<framework::BlockDescBind *>("block");
+      auto *block = Attr<framework::BlockDescBind *>("sub_block");
       framework::Executor exec(dev_ctx);
       exec.Run(*block->Program(), &cur_scope, block->ID(), false);
     }
@@ -88,7 +88,7 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
               "unify the conditional block, rnn and while op, the type of "
               "scope is std::vector<Scope*>");
     AddAttr<framework::BlockDescBind *>(
-        "block", "The step block of conditional block operator");
+        "sub_block", "The step block of conditional block operator");
     AddComment(R"DOC(Conditional block operator
 
 Run the sub-block if X is not empty. Params is the other inputs and Out is the
@@ -117,7 +117,7 @@ class ConditionalBlockGradOp : public ConditionalOp {
       auto &scopes = scope_var->Get<std::vector<framework::Scope *>>();
       framework::Scope &cur_scope = *scopes[0];
 
-      auto *block = Attr<framework::BlockDescBind *>("block");
+      auto *block = Attr<framework::BlockDescBind *>("sub_block");
       framework::Executor exec(dev_ctx);
       exec.Run(*block->Program(), &cur_scope, block->ID(), false);
 
@@ -181,7 +181,7 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
     grad_op->SetInput("Scope", Output("Scope"));
     grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
     grad_op->SetOutput(framework::GradVarName("Params"), InputGrad("Params"));
-    grad_op->SetBlockAttr("block", *this->grad_block_[0]);
+    grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]);
     return std::unique_ptr<framework::OpDescBind>(grad_op);
   }
 };
diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h
index 749258183b..d2de4e80f7 100644
--- a/paddle/operators/conv_op.h
+++ b/paddle/operators/conv_op.h
@@ -261,8 +261,12 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
 
     if (input_grad) {
       input_grad->mutable_data<T>(context.GetPlace());
-      set_zero(dev_ctx, input_grad, static_cast<T>(0));
 
+      // if is_expand is false, the operation of set_zero is unnecessary,
+      // because math::matmul will reset input_grad.
+      if (is_expand) {
+        set_zero(dev_ctx, input_grad, static_cast<T>(0));
+      }
       math::Col2VolFunctor<DeviceContext, T> col2vol;
       math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
 
diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h
index 42dfbb23bc..0a77eb9623 100644
--- a/paddle/operators/conv_transpose_op.h
+++ b/paddle/operators/conv_transpose_op.h
@@ -225,7 +225,6 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
 
       if (input_grad) {
         input_grad->mutable_data<T>(context.GetPlace());
-        set_zero(dev_ctx, input_grad, static_cast<T>(0));
       }
       if (filter_grad) {  // filter size (m, c, k_h, k_w)
         filter_grad->mutable_data<T>(context.GetPlace());
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
index 1b560a7e2d..e33070c40f 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -273,6 +273,13 @@ void set_constant_with_place<platform::GPUPlace>(
                            TensorSetConstantGPU(context, tensor, value));
 }
 
+template <>
+void set_constant_with_place<platform::CudnnPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  set_constant_with_place<platform::GPUPlace>(context, tensor, value);
+}
+
 template struct RowwiseAdd<platform::CUDADeviceContext, float>;
 template struct RowwiseAdd<platform::CUDADeviceContext, double>;
 template struct ColwiseSum<platform::CUDADeviceContext, float>;
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index 29f9163643..232d926f7b 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -25,7 +25,7 @@ constexpr char kOutputs[] = "outputs";
 constexpr char kStepScopes[] = "step_scopes";
 constexpr char kExStates[] = "ex_states";
 constexpr char kStates[] = "states";
-constexpr char kStepBlock[] = "step_block";
+constexpr char kStepBlock[] = "sub_block";
 constexpr char kReverse[] = "reverse";
 constexpr char kIsTrain[] = "is_train";
 #define GRAD_SUFFIX "@GRAD"
diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc
index b754637bf2..fedc2a5c37 100644
--- a/paddle/operators/reduce_op.cc
+++ b/paddle/operators/reduce_op.cc
@@ -37,18 +37,23 @@ class ReduceOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_LT(
         dim, x_rank,
         "The dim should be in the range [-rank(input), rank(input)).");
-    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
-    auto dims_vector = vectorize(x_dims);
-    if (keep_dim || x_rank == 1) {
-      dims_vector[dim] = 1;
+    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
+    if (reduce_all) {
+      ctx->SetOutputDim("Out", {1});
     } else {
-      dims_vector.erase(dims_vector.begin() + dim);
-    }
-    auto out_dims = framework::make_ddim(dims_vector);
-    ctx->SetOutputDim("Out", out_dims);
-    if (dim != 0) {
-      // Only pass LoD when not reducing on the first dim.
-      ctx->ShareLoD("X", /*->*/ "Out");
+      bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
+      auto dims_vector = vectorize(x_dims);
+      if (keep_dim || x_rank == 1) {
+        dims_vector[dim] = 1;
+      } else {
+        dims_vector.erase(dims_vector.begin() + dim);
+      }
+      auto out_dims = framework::make_ddim(dims_vector);
+      ctx->SetOutputDim("Out", out_dims);
+      if (dim != 0) {
+        // Only pass LoD when not reducing on the first dim.
+        ctx->ShareLoD("X", /*->*/ "Out");
+      }
     }
   }
 };
@@ -95,11 +100,16 @@ class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(bool, default false) "
                   "If true, retain the reduced dimension with length 1.")
         .SetDefault(false);
+    AddAttr<bool>("reduce_all",
+                  "(bool, default false) "
+                  "If true, output a scalar reduced along all dimensions.")
+        .SetDefault(false);
     comment_ = R"DOC(
 {ReduceOp} Operator.
 
 This operator computes the {reduce} of input tensor along the given dimension. 
 The result tensor has 1 fewer dimension than the input unless keep_dim is true.
+If reduce_all is true, just reduce along all dimensions and output a scalar.
 
 )DOC";
     AddComment(comment_);
diff --git a/paddle/operators/reduce_op.h b/paddle/operators/reduce_op.h
index 47ce910f28..7bd99cb1e6 100644
--- a/paddle/operators/reduce_op.h
+++ b/paddle/operators/reduce_op.h
@@ -26,10 +26,12 @@ using DDim = framework::DDim;
 template <typename T, size_t D, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-
 template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 struct SumFunctor {
   template <typename DeviceContext, typename X, typename Y, typename Dim>
@@ -95,26 +97,41 @@ template <typename DeviceContext, typename T, typename Functor>
 class ReduceKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    int rank = context.Input<Tensor>("X")->dims().size();
-    switch (rank) {
-      case 1:
-        ReduceCompute<1>(context);
-        break;
-      case 2:
-        ReduceCompute<2>(context);
-        break;
-      case 3:
-        ReduceCompute<3>(context);
-        break;
-      case 4:
-        ReduceCompute<4>(context);
-        break;
-      case 5:
-        ReduceCompute<5>(context);
-        break;
-      case 6:
-        ReduceCompute<6>(context);
-        break;
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    if (reduce_all) {
+      // Flatten and reduce 1-D tensor
+      auto* input = context.Input<Tensor>("X");
+      auto* output = context.Output<Tensor>("Out");
+      output->mutable_data<T>(context.GetPlace());
+      auto x = EigenVector<T>::Flatten(*input);
+      auto out = EigenScalar<T>::From(*output);
+      auto& place =
+          *context.template device_context<DeviceContext>().eigen_device();
+      auto reduce_dim = Eigen::array<int, 1>({{0}});
+      Functor functor;
+      functor(place, x, out, reduce_dim);
+    } else {
+      int rank = context.Input<Tensor>("X")->dims().size();
+      switch (rank) {
+        case 1:
+          ReduceCompute<1>(context);
+          break;
+        case 2:
+          ReduceCompute<2>(context);
+          break;
+        case 3:
+          ReduceCompute<3>(context);
+          break;
+        case 4:
+          ReduceCompute<4>(context);
+          break;
+        case 5:
+          ReduceCompute<5>(context);
+          break;
+        case 6:
+          ReduceCompute<6>(context);
+          break;
+      }
     }
   }
 
@@ -157,26 +174,46 @@ template <typename DeviceContext, typename T, typename Functor>
 class ReduceGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    int rank = context.Input<Tensor>("X")->dims().size();
-    switch (rank) {
-      case 1:
-        ReduceGradCompute<1>(context);
-        break;
-      case 2:
-        ReduceGradCompute<2>(context);
-        break;
-      case 3:
-        ReduceGradCompute<3>(context);
-        break;
-      case 4:
-        ReduceGradCompute<4>(context);
-        break;
-      case 5:
-        ReduceGradCompute<5>(context);
-        break;
-      case 6:
-        ReduceGradCompute<6>(context);
-        break;
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    if (reduce_all) {
+      auto* input0 = context.Input<Tensor>("X");
+      auto* input1 = context.Input<Tensor>("Out");
+      auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* output = context.Output<Tensor>(framework::GradVarName("X"));
+      output->mutable_data<T>(context.GetPlace());
+      auto x = EigenVector<T>::Flatten(*input0);
+      auto x_reduce = EigenVector<T>::From(*input1);
+      auto x_reduce_grad = EigenVector<T>::From(*input2);
+      auto x_grad = EigenVector<T>::Flatten(*output);
+      auto& place =
+          *context.template device_context<DeviceContext>().eigen_device();
+      auto broadcast_dim =
+          Eigen::array<int, 1>({{static_cast<int>(input0->numel())}});
+      Functor functor;
+      functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim,
+              broadcast_dim[0]);
+    } else {
+      int rank = context.Input<Tensor>("X")->dims().size();
+      switch (rank) {
+        case 1:
+          ReduceGradCompute<1>(context);
+          break;
+        case 2:
+          ReduceGradCompute<2>(context);
+          break;
+        case 3:
+          ReduceGradCompute<3>(context);
+          break;
+        case 4:
+          ReduceGradCompute<4>(context);
+          break;
+        case 5:
+          ReduceGradCompute<5>(context);
+          break;
+        case 6:
+          ReduceGradCompute<6>(context);
+          break;
+      }
     }
   }
 
diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc
index 39bf2118d6..7fd33bf662 100644
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
@@ -84,9 +84,9 @@ Given a 2-D tensor X with 2 rows and 2 columns
     [[1, 2], [3, 4]]
 
 and target shape = [1, 4], the reshape operator will transform
-the tensor X into a 1-D tensor:
+the tensor X into a 2-D tensor:
 
-    [1, 2, 3, 4]
+    [[1, 2, 3, 4]]
 
 )DOC");
   }
diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc
index b8e44bcc5a..9a092a570f 100644
--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
@@ -25,7 +25,7 @@ namespace operators {
 using StepScopeVar = std::vector<framework::Scope *>;
 using LoDTensor = framework::LoDTensor;
 
-constexpr char kStepBlock[] = "step_block";
+constexpr char kStepBlock[] = "sub_block";
 constexpr char kCondition[] = "Condition";
 constexpr char kStepScopes[] = "StepScopes";
 constexpr char kParameters[] = "X";
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index 2c7f964216..1c72b50559 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -125,6 +125,22 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
 
 cudaStream_t CUDADeviceContext::stream() const { return stream_; }
 
+CudnnDeviceContext::CudnnDeviceContext(CudnnPlace place)
+    : CUDADeviceContext(place), place_(place) {
+  PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
+  PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream()));
+}
+
+CudnnDeviceContext::~CudnnDeviceContext() {
+  SetDeviceId(place_.device);
+  Wait();
+  PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
+}
+
+Place CudnnDeviceContext::GetPlace() const { return place_; }
+
+cudnnHandle_t CudnnDeviceContext::cudnn_handle() const { return cudnn_handle_; }
+
 #endif
 
 }  // namespace platform
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index 596d9d0bba..f67194993d 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -86,6 +86,22 @@ class CUDADeviceContext : public DeviceContext {
   cublasHandle_t cublas_handle_;
 };
 
+class CudnnDeviceContext : public CUDADeviceContext {
+ public:
+  explicit CudnnDeviceContext(CudnnPlace place);
+  virtual ~CudnnDeviceContext();
+
+  /*! \brief  Return place in the device context. */
+  Place GetPlace() const final;
+
+  /*! \brief  Return cudnn  handle in the device context. */
+  cudnnHandle_t cudnn_handle() const;
+
+ private:
+  cudnnHandle_t cudnn_handle_;
+  CudnnPlace place_;
+};
+
 #endif
 
 }  // namespace platform
diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc
index 4893cd92f6..be3b2af5af 100644
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
@@ -46,3 +46,19 @@ TEST(Device, CUDADeviceContext) {
     delete device_context;
   }
 }
+
+TEST(Device, CudnnDeviceContext) {
+  using paddle::platform::CudnnDeviceContext;
+  using paddle::platform::CudnnPlace;
+  if (paddle::platform::dynload::HasCUDNN()) {
+    int count = paddle::platform::GetCUDADeviceCount();
+    for (int i = 0; i < count; ++i) {
+      CudnnDeviceContext* device_context =
+          new CudnnDeviceContext(CudnnPlace(i));
+      cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
+      ASSERT_NE(nullptr, cudnn_handle);
+      ASSERT_NE(nullptr, device_context->stream());
+      delete device_context;
+    }
+  }
+}
diff --git a/paddle/platform/place.h b/paddle/platform/place.h
index 5370360a7d..f0dcec8f52 100644
--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
@@ -43,6 +43,11 @@ struct GPUPlace {
   int device;
 };
 
+struct CudnnPlace : public GPUPlace {
+  CudnnPlace() : GPUPlace() {}
+  explicit CudnnPlace(int d) : GPUPlace(d) {}
+};
+
 struct IsGPUPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const GPUPlace &gpu) const { return true; }
@@ -52,7 +57,7 @@ struct IsGPUPlace : public boost::static_visitor<bool> {
 // should be less equal than 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT)
 #define NUM_PLACE_TYPE_LIMIT_IN_BIT 4
 
-typedef boost::variant<GPUPlace, CPUPlace> Place;
+typedef boost::variant<CudnnPlace, GPUPlace, CPUPlace> Place;
 
 // static check number of place types is less equal than
 // 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT)
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index c16d3e0cbe..1faf24bcb8 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -282,6 +282,23 @@ All parameter, weight, gradient are variables in Paddle.
     }
     return ret_values;
   });
+  m.def("get_grad_op_descs",
+        [](const OpDescBind &op_desc,
+           const std::unordered_set<std::string> &no_grad_set,
+           std::unordered_map<std::string, std::string> &grad_to_var,
+           const std::vector<BlockDescBind *> &grad_sub_block) {
+          std::vector<std::unique_ptr<OpDescBind>> grad_op_descs =
+              framework::OpInfoMap::Instance()
+                  .Get(op_desc.Type())
+                  .GradOpMaker()(op_desc, no_grad_set, &grad_to_var,
+                                 grad_sub_block);
+          std::vector<OpDescBind *> grad_op_desc_ptrs(grad_op_descs.size());
+          std::transform(
+              grad_op_descs.begin(), grad_op_descs.end(),
+              grad_op_desc_ptrs.begin(),
+              [](std::unique_ptr<OpDescBind> &p) { return p.release(); });
+          return grad_op_desc_ptrs;
+        });
   m.def("prune", [](const ProgramDescBind &origin,
                     const std::vector<std::array<size_t, 2>> &targets) {
     ProgramDescBind prog_with_targets(origin);
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
index 1a2d19e823..c2f631bdf4 100644
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
@@ -1,7 +1,7 @@
 # Build this image:  docker build -t mpi .
 #
 
-FROM paddledev/paddle:0.10.0rc3
+FROM paddlepaddle/paddle:0.10.0rc3
 
 ENV DEBIAN_FRONTEND noninteractive
 
diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md
index f3a6f1dba7..f0620498cf 100644
--- a/paddle/scripts/docker/README.md
+++ b/paddle/scripts/docker/README.md
@@ -20,7 +20,7 @@ binaries.
 
 ## Run The Build
 
-### Build Evironments
+### Build Environments
 
 The pre-built build environment images are:
 
@@ -192,7 +192,7 @@ For developers who are interested in the C++ source code, please use -e "WOBOQ=O
 - The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host:
 
 ```bash
-docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" -e "WOBOQ=ON" paddlepaddle/paddle:latest-dev
+docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" -e "WOBOQ=ON" paddlepaddle/paddle:latest-dev
 ```
 
 - You can open the generated HTML files in your Web browser. Or, if you want to run a Nginx container to serve them for a wider audience, you can run:
diff --git a/paddle/scripts/tools/build_docs/build_docs.sh b/paddle/scripts/tools/build_docs/build_docs.sh
index c6cbbc4eef..f9bc8bf63a 100755
--- a/paddle/scripts/tools/build_docs/build_docs.sh
+++ b/paddle/scripts/tools/build_docs/build_docs.sh
@@ -5,4 +5,4 @@ docker run --rm \
        -e "WITH_AVX=ON" \
        -e "WITH_DOC=ON" \
        -e "WOBOQ=ON" \
-       ${1:-"paddledev/paddle:dev"}
+       ${1:-"paddlepaddle/paddle:latest-dev"}
diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py
deleted file mode 100644
index 9d6ff7c47a..0000000000
--- a/python/paddle/v2/fluid/layers.py
+++ /dev/null
@@ -1,2195 +0,0 @@
-import core
-import proto.framework_pb2 as framework_pb2
-from framework import OpProtoHolder, Variable, Program, Operator
-from initializer import Constant, Normal, Xavier, Initializer
-from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
-import re
-import cStringIO
-from param_attr import ParamAttr
-import contextlib
-
-__all__ = [
-    'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat',
-    'StaticRNN', 'cast', 'sequence_conv', 'sequence_pool', 'sums', 'cos_sim',
-    'batch_norm', 'accuracy', 'split_lod_tensor', 'While'
-]
-
-
-def fc(input,
-       size,
-       num_flatten_dims=1,
-       param_attr=None,
-       bias_attr=None,
-       act=None,
-       name=None,
-       main_program=None,
-       startup_program=None):
-    """
-    Fully Connected Layer.
-
-    Args:
-       input: The input tensor to the function
-       size: The size of the layer
-       num_flatten_dims: Number of columns in input
-       param_attr: The parameters/weights to the FC Layer
-       param_initializer: Initializer used for the weight/parameter. If None, XavierInitializer() is used
-       bias_attr: The bias parameter for the FC layer
-       bias_initializer: Initializer used for the bias. If None, then ConstantInitializer() is used
-       act: Activation to be applied to the output of FC layer
-       name: Name/alias of the function
-       main_program: Name of the main program that calls this
-       startup_program: Name of the startup program
-
-    This function can take in multiple inputs and performs the Fully Connected
-    function (linear transformation) on top of each of them.
-    So for input x, the output will be : Wx + b. Where W is the parameter,
-    b the bias and x is the input.
-
-    The function also applies an activation (non-linearity) on top of the
-    output, if activation is passed in the input.
-
-    All the input variables of this function are passed in as local variables
-    to the LayerHelper constructor.
-
-    """
-    helper = LayerHelper('fc', **locals())
-
-    dtype = helper.input_dtype()
-
-    mul_results = []
-    for input_var, param_attr in helper.iter_inputs_and_params():
-        input_shape = input_var.shape
-        param_shape = [
-            reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
-        ] + [size]
-        w = helper.create_parameter(
-            attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)
-        tmp = helper.create_tmp_variable(dtype)
-        helper.append_op(
-            type="mul",
-            inputs={
-                "X": input_var,
-                "Y": w,
-            },
-            outputs={"Out": tmp},
-            attrs={'x_num_col_dims': num_flatten_dims,
-                   'y_num_col_dims': 1})
-        mul_results.append(tmp)
-
-    # sum
-    if len(mul_results) == 1:
-        pre_bias = mul_results[0]
-    else:
-        pre_bias = helper.create_tmp_variable(dtype)
-        helper.append_op(
-            type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
-    # add bias
-    pre_activation = helper.append_bias_op(pre_bias)
-    # add activation
-    return helper.append_activation(pre_activation)
-
-
-def embedding(input,
-              size,
-              is_sparse=False,
-              param_attr=None,
-              dtype='float32',
-              main_program=None,
-              startup_program=None):
-    """
-    Embedding Layer.
-
-    Args:
-       param_initializer:
-       input: The input to the function
-       size: The size of the layer
-       is_sparse: A flag that decleares whether the input is sparse
-       param_attr: Parameters for this layer
-       dtype: The type of data : float32, float_16, int etc
-       main_program: Name of the main program that calls this
-       startup_program: Name of the startup program
-
-    This function can take in the input (which is a vector of IDs) and
-    performs a lookup in the lookup_table using these IDs, to result into
-    the embedding of each ID in the input.
-
-    All the input variables of this function are passed in as local variables
-    to the LayerHelper constructor.
-
-    """
-
-    helper = LayerHelper('embedding', **locals())
-    w = helper.create_parameter(
-        attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
-    tmp = helper.create_tmp_variable(dtype)
-    helper.append_op(
-        type='lookup_table',
-        inputs={'Ids': input,
-                'W': w},
-        outputs={'Out': tmp},
-        attrs={'is_sparse': is_sparse})
-    return tmp
-
-
-# TODO(qijun): expose H0 and C0
-def dynamic_lstm(input,
-                 size,
-                 param_attr=None,
-                 bias_attr=None,
-                 use_peepholes=True,
-                 is_reverse=False,
-                 gate_activation='sigmoid',
-                 cell_activation='tanh',
-                 candidate_activation='tanh',
-                 dtype='float32',
-                 main_program=None,
-                 startup_program=None):
-    helper = LayerHelper('lstm', **locals())
-    size = size / 4
-    weight = helper.create_parameter(
-        attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
-    bias_size = [1, 7 * size]
-    if not use_peepholes:
-        bias_size[1] = 4 * size
-    bias = helper.create_parameter(
-        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
-
-    hidden = helper.create_tmp_variable(dtype)
-    cell = helper.create_tmp_variable(dtype)
-    batch_gate = helper.create_tmp_variable(dtype)
-    batch_cell_pre_act = helper.create_tmp_variable(dtype)
-
-    helper.append_op(
-        type='lstm',
-        inputs={'Input': input,
-                'Weight': weight,
-                'Bias': bias},
-        outputs={
-            'Hidden': hidden,
-            'Cell': cell,
-            'BatchGate': batch_gate,
-            'BatchCellPreAct': batch_cell_pre_act
-        },
-        attrs={
-            'use_peepholes': use_peepholes,
-            'is_reverse': is_reverse,
-            'gate_activation': gate_activation,
-            'cell_activation': cell_activation,
-            'candidate_activation': candidate_activation
-        })
-    return hidden, cell
-
-
-def gru_unit(input,
-             hidden,
-             size,
-             weight=None,
-             bias=None,
-             activation='tanh',
-             gate_activation='sigmoid',
-             main_program=None,
-             startup_program=None):
-    """
-    GRUUnit Operator implements partial calculations of the GRU unit as following:
-
-    $$
-    update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
-    reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r)  \\
-    output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
-    output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
-    $$
-
-    which is same as one time step of GRU Operator.
-
-    @note To implement the complete GRU unit, fully-connected operator must be
-    used before to feed xu, xr and xc as the Input of GRUUnit operator.
-
-    TODO(ChunweiYan) add more document here
-    """
-    activation_dict = dict(
-        identity=0,
-        sigmoid=1,
-        tanh=2,
-        relu=3, )
-    activation = activation_dict[activation]
-    gate_activation = activation_dict[gate_activation]
-
-    helper = LayerHelper('gru_unit', **locals())
-    dtype = helper.input_dtype()
-    size = size / 3
-
-    # create weight
-    if weight is None:
-        weight = helper.create_parameter(
-            attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
-
-    # create bias
-    if bias is None:
-        bias_size = [1, 3 * size]
-        bias = helper.create_parameter(
-            attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
-
-    gate = helper.create_tmp_variable(dtype)
-    reset_hidden_pre = helper.create_tmp_variable(dtype)
-    updated_hidden = helper.create_tmp_variable(dtype)
-
-    helper.append_op(
-        type='gru_unit',
-        inputs={'Input': input,
-                'HiddenPrev': hidden,
-                'Weight': weight},
-        outputs={
-            'Gate': gate,
-            'ResetHiddenPrev': reset_hidden_pre,
-            'Hidden': updated_hidden,
-        },
-        attrs={
-            'activation': 0,
-            'gate_activation': 1,
-        })
-
-    return updated_hidden, reset_hidden_pre, gate
-
-
-def data(name,
-         shape,
-         append_batch_size=True,
-         dtype='float32',
-         lod_level=0,
-         type=core.VarDesc.VarType.LOD_TENSOR,
-         main_program=None,
-         startup_program=None,
-         stop_gradient=True):
-    """
-    Data Layer.
-
-    Args:
-       name: The name/alias of the function
-       shape: Tuple declaring the shape.
-       append_batch_size: Whether or not to append the data as a batch.
-       dtype: The type of data : float32, float_16, int etc
-       type: The output type. By default it is LOD_TENSOR.
-       lod_level(int): The LoD Level. 0 means the input data is not a sequence.
-       main_program: Name of the main program that calls this
-       startup_program: Name of the startup program
-       stop_gradient: A boolean that mentions whether gradient should flow.
-
-    This function takes in input and based on whether data has
-    to be returned back as a minibatch, it creates the global variable using
-    the helper functions. The global variables can be accessed by all the
-    following operations and layers in the graph.
-
-    All the input variables of this function are passed in as local variables
-    to the LayerHelper constructor.
-
-    """
-    helper = LayerHelper('data', **locals())
-    shape = list(shape)
-    for i in xrange(len(shape)):
-        if shape[i] is None:
-            shape[i] = -1
-            append_batch_size = False
-        elif shape[i] < 0:
-            append_batch_size = False
-
-    if append_batch_size:
-        shape = [-1] + shape  # append batch size as -1
-
-    return helper.create_global_variable(
-        name=name,
-        shape=shape,
-        dtype=dtype,
-        type=type,
-        stop_gradient=stop_gradient,
-        lod_level=lod_level)
-
-
-def create_tensor(dtype, name=None, main_program=None, startup_program=None):
-    helper = LayerHelper("create_tensor", **locals())
-    return helper.create_variable(name=helper.name, dtype=dtype)
-
-
-def _convert_(name):
-    """
-    Formatting.
-
-    Args:
-       name: The name/alias
-
-    This function takes in a name and converts it to a standard format of
-    group1_group2. Where as per the regular expression, group1 can have
-    alphabets and numbers and group2 has capital alphabets.
-
-    """
-    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
-    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
-
-
-def _generate_doc_string_(op_proto):
-    """
-    Generate docstring by OpProto
-
-    Args:
-        op_proto (framework_pb2.OpProto): a protobuf message typed OpProto
-
-    Returns:
-        str: the document string
-    """
-
-    def _type_to_str_(tp):
-        return framework_pb2.AttrType.Name(tp)
-
-    if not isinstance(op_proto, framework_pb2.OpProto):
-        raise TypeError("OpProto should be `framework_pb2.OpProto`")
-
-    buf = cStringIO.StringIO()
-    buf.write(op_proto.comment)
-    buf.write('\nArgs:\n')
-    for each_input in op_proto.inputs:
-        line_begin = '    {0}: '.format(_convert_(each_input.name))
-        buf.write(line_begin)
-        buf.write(each_input.comment)
-        buf.write('\n')
-        buf.write(' ' * len(line_begin))
-        buf.write('Duplicable: ')
-        buf.write(str(each_input.duplicable))
-        buf.write('  Optional: ')
-        buf.write(str(each_input.dispensable))
-        buf.write('\n')
-
-    for each_attr in op_proto.attrs:
-        buf.write('    ')
-        buf.write(each_attr.name)
-        buf.write(' (')
-        buf.write(_type_to_str_(each_attr.type))
-        buf.write('): ')
-        buf.write(each_attr.comment)
-        buf.write('\n')
-
-    if len(op_proto.outputs) != 0:
-        buf.write('\nReturns:\n')
-        buf.write('    ')
-        for each_opt in op_proto.outputs:
-            if not each_opt.intermediate:
-                break
-        buf.write(each_opt.comment)
-
-    return buf.getvalue()
-
-
-def _create_op_func_(op_type):
-    """
-    Create an Operator for a Function.
-
-    Args:
-       op_type: The name of the operator to be created
-
-    This function takes in the operator type (sigmoid, mean , average etc) and
-    creates the operator functionality.
-
-    """
-    op_proto = OpProtoHolder.instance().get_op_proto(op_type)
-    not_intermediate_outputs = \
-        filter(lambda output: not output.intermediate, op_proto.outputs)
-    intermediate_outputs = \
-        filter(lambda output: output.intermediate, op_proto.outputs)
-
-    if len(not_intermediate_outputs) != 1:
-        raise ValueError("Only one non intermediate output operator can be",
-                         "automatically generated")
-
-    if not_intermediate_outputs[0].duplicable:
-        raise ValueError(
-            "Only non duplicable op can be automatically generated")
-
-    for output in intermediate_outputs:
-        if output.duplicable:
-            raise ValueError("The op can be automatically generated only when ",
-                             "all intermediate ops are not duplicable")
-
-    o_name = not_intermediate_outputs[0].name
-    intermediate_output_names = [output.name for output in intermediate_outputs]
-
-    def infer_and_check_dtype(op_proto, **kwargs):
-        """
-        This function performs the sanity check for dtype and
-        instance type.
-        """
-        dtype = None
-        for ipt in op_proto.inputs:
-            name = _convert_(ipt.name)
-            val = kwargs.pop(name, [])
-            if not isinstance(val, list) and not isinstance(val, tuple):
-                val = [val]
-            for each in val:
-                if not isinstance(each, Variable):
-                    raise ValueError("input of {0} must be variable".format(
-                        op_type))
-
-                if dtype is None:
-                    dtype = each.dtype
-                elif dtype != each.dtype:
-                    raise ValueError(
-                        "operator {0} must input same dtype. {1} vs {2}".format(
-                            op_type, dtype, each.dtype))
-
-        return dtype
-
-    def func(**kwargs):
-        helper = LayerHelper(op_type, **kwargs)
-
-        dtype = infer_and_check_dtype(op_proto, **kwargs)
-
-        inputs = dict()
-        for ipt in op_proto.inputs:
-            name = _convert_(ipt.name)
-            val = kwargs.pop(name, [])
-            if not isinstance(val, list) and not isinstance(val, tuple):
-                val = [val]
-            inputs[ipt.name] = val
-
-        outputs = dict()
-        out = helper.create_tmp_variable(dtype=dtype)
-        outputs[o_name] = [out]
-        for name in intermediate_output_names:
-            outputs[name] = [helper.create_tmp_variable(dtype=dtype)]
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs)
-        return helper.append_activation(out)
-
-    func.__name__ = op_type
-    globals()[op_type] = func
-    func.__doc__ = _generate_doc_string_(op_proto)
-    global __all__
-    __all__.append(op_type)
-
-
-_create_op_func_('mean')
-_create_op_func_('mul')
-_create_op_func_('elementwise_add')
-_create_op_func_('elementwise_div')
-_create_op_func_('dropout')
-_create_op_func_('reshape')
-_create_op_func_('sigmoid')
-_create_op_func_('scale')
-_create_op_func_('reshape')
-_create_op_func_('transpose')
-_create_op_func_('sigmoid_cross_entropy_with_logits')
-
-
-def cast(x, dtype, main_program=None):
-    """
-    This function takes in the input with input_dtype
-    and casts it to the output_dtype as the output.
-    """
-    helper = LayerHelper('cast', **locals())
-    out = helper.create_tmp_variable(dtype=dtype)
-    helper.append_op(
-        type='cast',
-        inputs={'X': [x]},
-        outputs={'Out': [out]},
-        attrs={'in_dtype': x.dtype,
-               'out_dtype': out.dtype})
-    return out
-
-
-def concat(input, axis, main_program=None, startup_program=None):
-    """
-    This function concats the input along the axis mentioned
-    and returns that as the output.
-    """
-    helper = LayerHelper('concat', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
-    helper.append_op(
-        type='concat',
-        inputs={'X': input},
-        outputs={'Out': [out]},
-        attrs={'axis': axis})
-    return out
-
-
-def sums(input, out=None, main_program=None, startup_program=None):
-    """
-    This function takes in the input and performs the sum operation on it
-    and returns that as the output.
-    """
-    helper = LayerHelper('sum', **locals())
-    if out is None:
-        out = helper.create_tmp_variable(dtype=helper.input_dtype())
-    helper.append_op(type='sum', inputs={'X': input}, outputs={'Out': out})
-    return out
-
-
-def linear_chain_crf(input,
-                     label,
-                     param_attr=None,
-                     main_program=None,
-                     startup_program=None):
-    helper = LayerHelper('linear_chain_crf', **locals())
-    size = input.shape[1]
-    transition = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=[size + 2, size],
-        dtype=helper.input_dtype())
-    alpha = helper.create_tmp_variable(dtype=helper.input_dtype())
-    emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
-    transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
-    log_likelihood = helper.create_tmp_variable(dtype=helper.input_dtype())
-    helper.append_op(
-        type='linear_chain_crf',
-        inputs={"Emission": [input],
-                "Transition": transition,
-                "Label": label},
-        outputs={
-            "Alpha": [alpha],
-            "EmissionExps": [emission_exps],
-            "TransitionExps": transition_exps,
-            "LogLikelihood": log_likelihood
-        })
-
-    return log_likelihood
-
-
-def crf_decoding(input,
-                 param_attr,
-                 label=None,
-                 main_program=None,
-                 startup_program=None):
-    helper = LayerHelper('crf_decoding', **locals())
-    transition = helper.get_parameter(param_attr.name)
-    viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype())
-    helper.append_op(
-        type='crf_decoding',
-        inputs={"Emission": [input],
-                "Transition": transition,
-                "Label": label},
-        outputs={"ViterbiPath": [viterbi_path]})
-
-    return viterbi_path
-
-
-def assign(input, output, main_program=None, startup_program=None):
-    helper = LayerHelper('assign', **locals())
-    helper.append_op(
-        type='scale',
-        inputs={'X': [input]},
-        outputs={'Out': [output]},
-        attrs={'scale': 1.0})
-    return output
-
-
-def split_lod_tensor(input,
-                     mask,
-                     level=0,
-                     main_program=None,
-                     startup_program=None):
-    helper = LayerHelper('split_lod_tensor', **locals())
-    out_true = helper.create_tmp_variable(dtype=input.dtype)
-    out_false = helper.create_tmp_variable(dtype=input.dtype)
-    helper.append_op(
-        type='split_lod_tensor',
-        inputs={
-            'X': input,
-            'Mask': mask,
-        },
-        outputs={'OutTrue': out_true,
-                 'OutFalse': out_false},
-        attrs={'level': level})
-    return out_true, out_false
-
-
-def merge_lod_tensor(in_true,
-                     in_false,
-                     x,
-                     mask,
-                     level=0,
-                     main_program=None,
-                     startup_program=None):
-    helper = LayerHelper('merge_lod_tensor', **locals())
-    out = helper.create_tmp_variable(dtype=in_true.dtype)
-    helper.append_op(
-        type='merge_lod_tensor',
-        inputs={'X': x,
-                'Mask': mask,
-                'InTrue': in_true,
-                'InFalse': in_false},
-        outputs={'Out': out},
-        attrs={'level': level})
-    return out
-
-
-def cos_sim(X, Y, **kwargs):
-    """
-    This function performs the cosine similarity between two tensors
-    X and Y and returns that as the output.
-    """
-    helper = LayerHelper('cos_sim', **kwargs)
-    out = helper.create_tmp_variable(dtype=X.dtype)
-    xnorm = helper.create_tmp_variable(dtype=X.dtype)
-    ynorm = helper.create_tmp_variable(dtype=X.dtype)
-    helper.append_op(
-        type='cos_sim',
-        inputs={'X': [X],
-                'Y': [Y]},
-        outputs={'Out': [out],
-                 'XNorm': [xnorm],
-                 'YNorm': [ynorm]})
-    return out
-
-
-def cross_entropy(input, label, **kwargs):
-    """
-    This function computes cross_entropy using the input and label.
-    """
-    helper = LayerHelper('cross_entropy', **kwargs)
-    out = helper.create_tmp_variable(dtype=input.dtype)
-    helper.append_op(
-        type='cross_entropy',
-        inputs={'X': [input],
-                'Label': [label]},
-        outputs={'Y': [out]},
-        attrs=kwargs)
-    return out
-
-
-def square_error_cost(input, label, **kwargs):
-    """
-    This functions returns the squared error cost using the input and label.
-    The output is appending the op to do the above.
-    """
-    helper = LayerHelper('square_error_cost', **kwargs)
-    minus_out = helper.create_tmp_variable(dtype=input.dtype)
-    helper.append_op(
-        type='elementwise_sub',
-        inputs={'X': [input],
-                'Y': [label]},
-        outputs={'Out': [minus_out]})
-
-    square_out = helper.create_tmp_variable(dtype=input.dtype)
-    helper.append_op(
-        type='square', inputs={'X': [minus_out]}, outputs={'Y': [square_out]})
-    return square_out
-
-
-def accuracy(input, label, k=1, correct=None, total=None, **kwargs):
-    """
-    This function computes the accuracy using the input and label.
-    The output is the top_k inputs and their indices.
-    """
-    helper = LayerHelper("accuracy", **kwargs)
-    topk_out = helper.create_tmp_variable(dtype=input.dtype)
-    topk_indices = helper.create_tmp_variable(dtype="int64")
-    helper.append_op(
-        type="top_k",
-        inputs={"X": [input]},
-        outputs={"Out": [topk_out],
-                 "Indices": [topk_indices]},
-        attrs={"k": k})
-    acc_out = helper.create_tmp_variable(dtype="float32")
-    if correct is None:
-        correct = helper.create_tmp_variable(dtype="int64")
-    if total is None:
-        total = helper.create_tmp_variable(dtype="int64")
-    helper.append_op(
-        type="accuracy",
-        inputs={
-            "Out": [topk_out],
-            "Indices": [topk_indices],
-            "Label": [label]
-        },
-        outputs={
-            "Accuracy": [acc_out],
-            "Correct": [correct],
-            "Total": [total],
-        })
-    return acc_out
-
-
-def chunk_eval(input,
-               label,
-               chunk_scheme,
-               num_chunk_types,
-               excluded_chunk_types=None,
-               **kwargs):
-    """
-    This function computes the accuracy using the input and label.
-    The output is the top_k inputs and their indices.
-    """
-    helper = LayerHelper("chunk_eval", **kwargs)
-
-    # prepare output
-    precision = helper.create_tmp_variable(dtype="float32")
-    recall = helper.create_tmp_variable(dtype="float32")
-    f1_score = helper.create_tmp_variable(dtype="float32")
-
-    helper.append_op(
-        type="chunk_eval",
-        inputs={"Inference": [input],
-                "Label": [label]},
-        outputs={
-            "Precision": [precision],
-            "Recall": [recall],
-            "F1-Score": [f1_score]
-        },
-        attrs={
-            "num_chunk_types": num_chunk_types,
-            'chunk_scheme': chunk_scheme,
-            'excluded_chunk_types': excluded_chunk_types or []
-        })
-    return precision, recall, f1_score
-
-
-def sequence_conv(input,
-                  num_filters,
-                  filter_size=3,
-                  filter_stride=1,
-                  padding=None,
-                  bias_attr=None,
-                  param_attr=None,
-                  act=None,
-                  main_program=None,
-                  startup_program=None):
-    """
-    This function creates the op for sequence_conv, using the inputs and
-    other convolutional configurations for the filters and stride as given
-    in the input parameters to the function.
-    """
-
-    # FIXME(dzh) : want to unify the argument of python layer
-    # function. So we ignore some unecessary attributes.
-    # such as, padding_trainable, context_start.
-
-    helper = LayerHelper('sequence_conv', **locals())
-    dtype = helper.input_dtype()
-    filter_shape = [filter_size * input.shape[1], num_filters]
-    filter_param = helper.create_parameter(
-        attr=helper.param_attr, shape=filter_shape, dtype=dtype)
-    pre_bias = helper.create_tmp_variable(dtype)
-
-    helper.append_op(
-        type='sequence_conv',
-        inputs={
-            'X': [input],
-            'Filter': [filter_param],
-        },
-        outputs={"Out": pre_bias},
-        attrs={
-            'contextStride': filter_stride,
-            'contextStart': -int(filter_size / 2),
-            'contextLength': filter_size
-        })
-    pre_act = helper.append_bias_op(pre_bias)
-    return helper.append_activation(pre_act)
-
-
-def conv2d(input,
-           num_filters,
-           filter_size,
-           stride=None,
-           padding=None,
-           groups=None,
-           param_attr=None,
-           bias_attr=None,
-           act=None,
-           name=None,
-           main_program=None,
-           startup_program=None):
-    """
-    This function creates the op for a 2-dimensional Convolution.
-    This is performed using the parameters of filters(size, dimensionality etc)
-    , stride and other configurations for a Convolution operation.
-    This funciton can also append an activation on top of the
-    conv-2d output, if mentioned in the input parameters.
-    """
-
-    if stride is None:
-        stride = [1, 1]
-    helper = LayerHelper('conv2d', **locals())
-    dtype = helper.input_dtype()
-
-    num_channels = input.shape[1]
-    if groups is None:
-        num_filter_channels = num_channels
-    else:
-        if num_channels % groups != 0:
-            raise ValueError("num_channels must be divisible by groups.")
-        num_filter_channels = num_channels / groups
-
-    if isinstance(filter_size, int):
-        filter_size = [filter_size, filter_size]
-    if isinstance(stride, int):
-        stride = [stride, stride]
-    if isinstance(padding, int):
-        padding = [padding, padding]
-
-    input_shape = input.shape
-    filter_shape = [num_filters, num_filter_channels] + filter_size
-
-    def _get_default_param_initializer():
-        std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
-        return Normal(0.0, std, 0)
-
-    filter_param = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=filter_shape,
-        dtype=dtype,
-        default_initializer=_get_default_param_initializer())
-
-    pre_bias = helper.create_tmp_variable(dtype)
-
-    helper.append_op(
-        type='conv2d_cudnn',
-        inputs={
-            'Input': input,
-            'Filter': filter_param,
-        },
-        outputs={"Output": pre_bias},
-        attrs={'strides': stride,
-               'paddings': padding,
-               'groups': groups})
-
-    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
-
-    return helper.append_activation(pre_act)
-
-
-def sequence_pool(input, pool_type, **kwargs):
-    """
-    This function add the operator for sequence pooling.
-    This is applied on top of the input using pool_type mentioned
-    in the parameters.
-    """
-    helper = LayerHelper('sequence_pool', input=input, **kwargs)
-    dtype = helper.input_dtype()
-    pool_out = helper.create_tmp_variable(dtype)
-    max_index = helper.create_tmp_variable(dtype)
-
-    helper.append_op(
-        type="sequence_pool",
-        inputs={"X": input},
-        outputs={"Out": pool_out,
-                 "MaxIndex": max_index},
-        attrs={"pooltype": pool_type.upper()})
-
-    return pool_out
-
-
-def pool2d(input,
-           pool_size,
-           pool_type,
-           pool_stride=None,
-           pool_padding=None,
-           global_pooling=False,
-           main_program=None,
-           startup_program=None):
-    """
-    This function adds the operator for pooling in 2 dimensions, using the
-    pooling configurations mentioned in input parameters.
-    """
-    if pool_padding is None:
-        pool_padding = [0, 0]
-    if pool_stride is None:
-        pool_stride = [1, 1]
-    if pool_type not in ["max", "avg"]:
-        raise ValueError(
-            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
-            str(pool_type))
-    if isinstance(pool_size, int):
-        pool_size = [pool_size, pool_size]
-    if isinstance(pool_stride, int):
-        pool_stride = [pool_stride, pool_stride]
-    if isinstance(pool_padding, int):
-        pool_padding = [pool_padding, pool_padding]
-
-    helper = LayerHelper('pool2d', **locals())
-    dtype = helper.input_dtype()
-    pool_out = helper.create_tmp_variable(dtype)
-
-    helper.append_op(
-        type="pool2d",
-        inputs={"X": input},
-        outputs={"Out": pool_out},
-        attrs={
-            "pooling_type": pool_type,
-            "ksize": pool_size,
-            "global_pooling": global_pooling,
-            "strides": pool_stride,
-            "paddings": pool_padding
-        })
-
-    return pool_out
-
-
-def batch_norm(input,
-               act=None,
-               is_test=False,
-               momentum=0.9,
-               epsilon=1e-05,
-               param_attr=None,
-               bias_attr=None,
-               data_layout='NCHW',
-               main_program=None,
-               startup_program=None):
-    """
-    This function helps create an operator to implement
-    the BatchNorm layer using the configurations from the input parameters.
-    """
-    helper = LayerHelper('batch_norm', **locals())
-    dtype = helper.input_dtype()
-
-    input_shape = input.shape
-    if data_layout == 'NCHW':
-        channel_num = input_shape[1]
-    else:
-        if data_layout == 'NHWC':
-            channel_num = input_shape[-1]
-        else:
-            raise ValueError("unsupported data layout:" + data_layout)
-
-    param_shape = [channel_num]
-
-    # create parameter
-    scale = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=param_shape,
-        dtype=dtype,
-        default_initializer=Constant(1.0))
-
-    bias = helper.create_parameter(
-        attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=True)
-
-    mean = helper.create_global_variable(
-        dtype=input.dtype, shape=param_shape, persistable=True)
-    helper.set_variable_initializer(var=mean, initializer=Constant(0.0))
-
-    variance = helper.create_global_variable(
-        dtype=input.dtype, shape=param_shape, persistable=True)
-    helper.set_variable_initializer(var=variance, initializer=Constant(1.0))
-
-    # create output
-    # mean and mean_out share the same memory
-    mean_out = mean
-    # variance and variance out share the same memory
-    variance_out = variance
-    saved_mean = helper.create_tmp_variable(dtype)
-    saved_variance = helper.create_tmp_variable(dtype)
-
-    batch_norm_out = helper.create_tmp_variable(dtype)
-
-    helper.append_op(
-        type="batch_norm",
-        inputs={
-            "X": input,
-            "Scale": scale,
-            "Bias": bias,
-            "Mean": mean,
-            "Variance": variance
-        },
-        outputs={
-            "Y": batch_norm_out,
-            "MeanOut": mean_out,
-            "VarianceOut": variance_out,
-            "SavedMean": saved_mean,
-            "SavedVariance": saved_variance
-        },
-        attrs={"momentum": momentum,
-               "epsilon": epsilon,
-               "is_test": is_test})
-
-    return helper.append_activation(batch_norm_out)
-
-
-def beam_search_decode(ids, scores, main_program=None, startup_program=None):
-    helper = LayerHelper('beam_search_decode', **locals())
-    sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
-    sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
-
-    helper.append_op(
-        type="beam_search_decode",
-        inputs={"Ids": ids,
-                "Scores": scores},
-        outputs={
-            "SentenceIds": sentence_ids,
-            "SentenceScores": sentence_scores
-        })
-
-    return sentence_ids, sentence_scores
-
-
-class BlockGuard(object):
-    """
-    BlockGuard class.
-
-    BlockGuard class is used to create a sub-block in a program by
-    using the Python `with` keyword.
-    """
-
-    def __init__(self, main_program):
-        if not isinstance(main_program, Program):
-            raise TypeError("BlockGuard takes a program")
-        self.main_program = main_program
-
-    def __enter__(self):
-        self.main_program.create_block()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.main_program.rollback()
-        if exc_type is not None:
-            return False  # re-raise exception
-        return True
-
-
-class StaticRNNGuard(BlockGuard):
-    """
-    StaticRNNGuard class.
-
-    StaticRNNGuard class is used to create a StaticRNN block in a program.
-    """
-
-    def __init__(self, rnn):
-        if not isinstance(rnn, StaticRNN):
-            raise TypeError("StaticRNNGuard takes a StaticRNN")
-        super(StaticRNNGuard, self).__init__(rnn.helper.main_program)
-        self.rnn = rnn
-
-    def __enter__(self):
-        self.rnn.status = StaticRNN.IN_RNN_BLOCK
-        return super(StaticRNNGuard, self).__enter__()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if exc_type is not None:
-            return False
-        self.rnn.status = StaticRNN.AFTER_RNN_BLOCK
-        self.rnn.complete_rnn_op()
-        return super(StaticRNNGuard, self).__exit__(exc_type, exc_val, exc_tb)
-
-
-class StaticRNNMemoryLink(object):
-    """
-    StaticRNNMemoryLink class.
-
-    Args:
-        init: the initial variable for Memory
-        init: Variable
-        pre_mem: the memory variable in previous time step
-        pre_mem: Variable
-        mem: the memory variable in current time step
-        mem: Variable
-
-    StaticRNNMemoryLink class is used to create a link between two
-    memory cells of a StaticRNN.
-    """
-
-    def __init__(self, init, pre_mem, mem=None):
-        self.init = init
-        self.pre_mem = pre_mem
-        self.mem = mem
-
-
-class StaticRNN(object):
-    """
-    StaticRNN class.
-
-    StaticRNN class is used to create a StaticRNN. The RNN will have its
-    own parameters like inputs, outputs, memories, status and length.
-    """
-    BEFORE_RNN_BLOCK = 0
-    IN_RNN_BLOCK = 1
-    AFTER_RNN_BLOCK = 2
-
-    def __init__(self, name=None, main_program=None):
-        self.helper = LayerHelper(
-            "static_rnn", name=name, main_program=main_program)
-        self.memories = {}  # memory map, from pre_mem.name --> MemoryLink
-        self.inputs = []  # input variable list in current block
-        self.outputs = []  # output variable list in parent block
-        self.status = StaticRNN.BEFORE_RNN_BLOCK  # status flag.
-        # sequence length, since it is a static RNN, sequence length are fixed.
-        self.seq_len = None
-
-    def step(self):
-        return StaticRNNGuard(self)
-
-    def _assert_in_rnn_block_(self, method):
-        if self.status != StaticRNN.IN_RNN_BLOCK:
-            raise ValueError("You must invoke {0} in rnn block".format(method))
-
-    def memory(self,
-               init=None,
-               shape=None,
-               batch_ref=None,
-               init_value=0.0,
-               init_batch_dim_idx=0,
-               ref_batch_dim_idx=1):
-        """
-        Args:
-            init: boot memory, if not set, a shape, batch_ref must be provided
-            shape: shape of the boot memory
-            batch_ref: batch size reference variable
-            init_value: the init value of boot memory
-            init_batch_dim_idx: the index of batch size in init's dimension
-            ref_batch_dim_idx: the index of batch size in batch_ref's dimension
-        """
-        self._assert_in_rnn_block_('memory')
-        if init is None:
-            if shape is None or batch_ref is None:
-                raise ValueError(
-                    "if init is None, memory at least need shape and batch_ref")
-            parent_block = self.parent_block()
-            var_name = unique_name("@".join([self.helper.name, "memory_boot"]))
-            boot_var = parent_block.create_var(
-                name=var_name,
-                shape=shape,
-                dtype=batch_ref.dtype,
-                persistable=False)
-
-            parent_block.append_op(
-                type="fill_constant_batch_size_like",
-                inputs={'Input': [batch_ref]},
-                outputs={'Out': [boot_var]},
-                attrs={
-                    'value': init_value,
-                    'shape': boot_var.shape,
-                    'dtype': boot_var.dtype,
-                    'input_dim_idx': ref_batch_dim_idx,
-                    'output_dim_idx': init_batch_dim_idx
-                })
-
-            return self.memory(init=boot_var)
-        else:
-            pre_mem = self.helper.create_variable(
-                name=unique_name("@".join([self.helper.name, "mem"])),
-                dtype=init.dtype,
-                shape=init.shape)
-            self.memories[pre_mem.name] = StaticRNNMemoryLink(
-                init=init, pre_mem=pre_mem)
-            return pre_mem
-
-    def step_input(self, x):
-        self._assert_in_rnn_block_('step_input')
-        if not isinstance(x, Variable):
-            raise TypeError("step input takes a Variable")
-        if self.seq_len is None:
-            self.seq_len = x.shape[0]
-        elif self.seq_len != x.shape[0]:
-            raise ValueError("Static RNN only take fix seq_len input")
-
-        ipt = self.helper.create_variable(
-            name=x.name, dtype=x.dtype, shape=list(x.shape[1:]), type=x.type)
-        self.inputs.append(ipt)
-        return ipt
-
-    def step_output(self, o):
-        self._assert_in_rnn_block_('step_output')
-        if not isinstance(o, Variable):
-            raise TypeError("step output takes a Variable")
-
-        tmp_o = self.helper.create_tmp_variable(dtype=o.dtype)
-        self.helper.append_op(
-            type='rnn_memory_helper',
-            inputs={'X': [o]},
-            outputs={'Out': tmp_o},
-            attrs={'dtype': o.dtype})
-
-        out_var = self.parent_block().create_var(
-            name=tmp_o.name,
-            shape=[self.seq_len] + list(tmp_o.shape),
-            dtype=tmp_o.dtype)
-
-        self.outputs.append(out_var)
-
-    def output(self, *outputs):
-        for each in outputs:
-            self.step_output(each)
-
-    def update_memory(self, mem, var):
-        if not isinstance(mem, Variable) or not isinstance(var, Variable):
-            raise TypeError("update memory should take variables")
-        self.memories[mem.name].mem = var
-
-    def parent_block(self):
-        prog = self.helper.main_program
-        parent_idx = prog.current_block().parent_idx
-        assert parent_idx >= 0
-        parent_block = prog.block(parent_idx)
-        return parent_block
-
-    def __call__(self, *args, **kwargs):
-        if self.status != StaticRNN.AFTER_RNN_BLOCK:
-            raise ValueError("RNN output can only be retrieved after rnn block")
-        if len(self.outputs) == 0:
-            raise ValueError("RNN has no output")
-        elif len(self.outputs) == 1:
-            return self.outputs[0]
-        else:
-            return self.outputs
-
-    def complete_rnn_op(self):
-        main_program = self.helper.main_program
-        rnn_block = main_program.current_block()
-        parent_block = self.parent_block()
-
-        local_inputs = set()
-
-        for op in rnn_block.ops:
-            assert isinstance(op, Operator)
-            for oname in op.output_names:
-                for out_var_name in op.output(oname):
-                    local_inputs.add(out_var_name)
-
-        for var in self.inputs:
-            local_inputs.add(var.name)
-        for m in self.memories:
-            local_inputs.add(m)
-
-        params = list()
-        for op in rnn_block.ops:
-            assert isinstance(op, Operator)
-            for iname in op.input_names:
-                for in_var_name in op.input(iname):
-                    if in_var_name not in local_inputs:
-                        params.append(in_var_name)
-
-        parameters = [parent_block.var(name) for name in params]
-
-        step_scope = parent_block.create_var(
-            type=core.VarDesc.VarType.STEP_SCOPES)
-
-        inlinks = [parent_block.var(i.name) for i in self.inputs]
-        outlinks = self.outputs
-
-        boot_memories = []
-        pre_memories = []
-        memories = []
-        for _, mem in self.memories.iteritems():
-            boot_memories.append(mem.init)
-            pre_memories.append(mem.pre_mem.name)
-            mem_var = rnn_block.var(mem.mem.name)
-            assert isinstance(mem_var, Variable)
-            new_mem = self.helper.create_tmp_variable(dtype=mem_var.dtype)
-
-            rnn_block.append_op(
-                type='rnn_memory_helper',
-                inputs={'X': [mem_var]},
-                outputs={'Out': [new_mem]},
-                attrs={'dtype': mem_var.dtype})
-
-            memories.append(new_mem.name)
-
-        parent_block.append_op(
-            type='recurrent',
-            inputs={
-                'inputs': inlinks,
-                'initial_states': boot_memories,
-                'parameters': parameters
-            },
-            outputs={'outputs': outlinks,
-                     'step_scopes': [step_scope]},
-            attrs={
-                'ex_states': pre_memories,
-                'states': memories,
-                'step_block': rnn_block
-            })
-
-
-class WhileGuard(BlockGuard):
-    def __init__(self, while_op):
-        if not isinstance(while_op, While):
-            raise TypeError("WhileGuard takes a while op")
-        super(WhileGuard, self).__init__(while_op.helper.main_program)
-        self.while_op = while_op
-
-    def __enter__(self):
-        self.while_op.status = While.IN_WHILE_BLOCK
-        return super(WhileGuard, self).__enter__()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if exc_type is not None:
-            return False
-        self.while_op.status = While.AFTER_WHILE_BLOCK
-        self.while_op.complete()
-        return super(WhileGuard, self).__exit__(exc_type, exc_val, exc_tb)
-
-
-class While(object):
-    BEFORE_WHILE_BLOCK = 0
-    IN_WHILE_BLOCK = 1
-    AFTER_WHILE_BLOCK = 2
-
-    def __init__(self, cond, name=None, main_program=None):
-        self.helper = LayerHelper("while", name=name, main_program=main_program)
-        self.status = While.BEFORE_WHILE_BLOCK
-        if not isinstance(cond, Variable):
-            raise TypeError("condition should be a variable")
-        assert isinstance(cond, Variable)
-        if cond.dtype != core.DataType.BOOL:
-            raise TypeError("condition should be a bool variable")
-        if reduce(lambda a, b: a * b, cond.shape, 1) != 1:
-            raise TypeError("condition should be a bool scalar")
-        self.cond_var = cond
-
-    def block(self):
-        return WhileGuard(self)
-
-    def complete(self):
-        main_program = self.helper.main_program
-        while_block = main_program.current_block()
-        parent_block = main_program.block(main_program.current_block()
-                                          .parent_idx)
-
-        inner_outputs = {self.cond_var.name}
-        x_name_list = set()
-        for op in while_block.ops:
-            for iname in op.input_names:
-                for in_var_name in op.input(iname):
-                    if in_var_name not in inner_outputs:
-                        x_name_list.add(in_var_name)
-
-            for oname in op.output_names:
-                for out_var_name in op.output(oname):
-                    inner_outputs.add(out_var_name)
-
-        out_vars = []
-        for inner_out_name in inner_outputs:
-            if inner_out_name in parent_block.vars:
-                out_vars.append(parent_block.var(inner_out_name))
-
-        step_scope = parent_block.create_var(
-            type=core.VarDesc.VarType.STEP_SCOPES)
-
-        parent_block.append_op(
-            type='while',
-            inputs={
-                'X': [parent_block.var(x_name) for x_name in x_name_list],
-                'Condition': [self.cond_var]
-            },
-            outputs={'Out': out_vars,
-                     'StepScopes': [step_scope]},
-            attrs={'step_block': while_block})
-
-
-def lstm(x,
-         c_pre_init,
-         hidden_dim,
-         forget_bias=None,
-         main_program=None,
-         startup_program=None):
-    """
-    This function helps create an operator for the LSTM (Long Short Term
-    Memory) cell that can be used inside an RNN.
-    """
-    helper = LayerHelper('lstm_unit', **locals())
-    rnn = StaticRNN()
-    with rnn.step():
-        c_pre = rnn.memory(init=c_pre_init)
-        x_t = rnn.step_input(x)
-
-        before_fc = concat(
-            input=[x_t, c_pre],
-            axis=1,
-            main_program=main_program,
-            startup_program=startup_program)
-        after_fc = fc(input=before_fc,
-                      size=hidden_dim * 4,
-                      main_program=main_program,
-                      startup_program=startup_program)
-
-        dtype = x.dtype
-        c = helper.create_tmp_variable(dtype)
-        h = helper.create_tmp_variable(dtype)
-
-        helper.append_op(
-            type='lstm_unit',
-            inputs={"X": after_fc,
-                    "C_prev": c_pre},
-            outputs={"C": c,
-                     "H": h},
-            attrs={"forget_bias": forget_bias})
-
-        rnn.update_memory(c_pre, c)
-        rnn.output(h)
-
-    return rnn()
-
-
-def lod_rank_table(x, level=0, main_program=None):
-    """
-    This function creates an operator for creating a LOD_RANK_TABLE
-    using the input x.
-    """
-    helper = LayerHelper("lod_rank_table", **locals())
-    table = helper.create_variable(
-        type=core.VarDesc.VarType.LOD_RANK_TABLE,
-        name=unique_name("lod_rank_table"))
-    helper.append_op(
-        type='lod_rank_table',
-        inputs={'X': x},
-        outputs={'Out': table},
-        attrs={'level': level})
-    return table
-
-
-def max_sequence_len(rank_table, main_program=None):
-    """
-    This function creates an operator to calculate the length of
-    max seqence through input rank_table(should be a lod_rank_table)
-    """
-    helper = LayerHelper("max_seqence_len", **locals())
-    res = helper.create_tmp_variable(dtype="int64")
-    helper.append_op(
-        type="max_sequence_len",
-        inputs={"RankTable": rank_table},
-        outputs={"Out": res})
-    return res
-
-
-def topk(input, k, main_program=None, startup_program=None):
-    helper = LayerHelper('topk', **locals())
-    topk_out = helper.create_tmp_variable(dtype=input.data_type)
-    topk_indices = helper.create_tmp_variable(dtype='int64')
-    helper.append_op(
-        type='top_k',
-        inputs={'X': [input]},
-        outputs={'Out': [topk_out],
-                 'Indices': [topk_indices]},
-        attrs={'k': k})
-    return topk_out, topk_indices
-
-
-def lod_tensor_to_array(x, table, main_program=None):
-    """
-    This function creates an operator to convert an LOD_Tensor to
-    an array.
-    """
-    helper = LayerHelper("lod_tensor_to_array", **locals())
-    array = helper.create_variable(
-        name=unique_name("lod_tensor_to_array"),
-        type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-        dtype=x.dtype)
-    helper.append_op(
-        type='lod_tensor_to_array',
-        inputs={'X': x,
-                'RankTable': table},
-        outputs={'Out': array})
-    return array
-
-
-def array_to_lod_tensor(x, table, main_program=None, startup_program=None):
-    """
-    This function creates an operator to convert an array to a
-    LOD_Tensor.
-    """
-    helper = LayerHelper("array_to_lod_tensor", **locals())
-    tmp = helper.create_tmp_variable(dtype=x.dtype)
-    helper.append_op(
-        type="array_to_lod_tensor",
-        inputs={'X': x,
-                'RankTable': table},
-        outputs={'Out': tmp})
-    return tmp
-
-
-def fill_constant(shape,
-                  dtype,
-                  value,
-                  out=None,
-                  main_program=None,
-                  startup_program=None):
-    """
-    This function creates a tensor , with shape as mentioned in the input and
-    specified dtype and fills this up with a constant value that
-    comes in the input. It also sets the stop_gradient to be True.
-    """
-    helper = LayerHelper("fill_constant", **locals())
-    if out is None:
-        out = helper.create_tmp_variable(dtype=dtype)
-    helper.append_op(
-        type='fill_constant',
-        inputs={},
-        outputs={'Out': [out]},
-        attrs={'shape': shape,
-               'dtype': out.dtype,
-               'value': float(value)})
-    out.stop_gradient = True
-    return out
-
-
-def fill_constant_batch_size_like(input,
-                                  shape,
-                                  dtype,
-                                  value,
-                                  input_dim_idx=0,
-                                  output_dim_idx=0,
-                                  main_program=None,
-                                  startup_program=None):
-    helper = LayerHelper("fill_constant_batch_size_like", **locals())
-    out = helper.create_tmp_variable(dtype=dtype)
-    helper.append_op(
-        type='fill_constant_batch_size_like',
-        inputs={'Input': input},
-        outputs={'Out': [out]},
-        attrs={
-            'shape': shape,
-            'dtype': out.dtype,
-            'value': float(value),
-            'input_dim_idx': input_dim_idx,
-            'output_dim_idx': output_dim_idx
-        })
-    out.stop_gradient = True
-    return out
-
-
-def ones(shape, dtype, main_program=None):
-    """
-    This function performs the same function as fill_constant() declared above
-    with the constant value being 1.0.
-    """
-    return fill_constant(value=1.0, **locals())
-
-
-def zeros(shape, dtype, main_program=None):
-    """
-    This function performs the same function as fill_constant() declared above
-    with the constant value being 0.0.
-    """
-    return fill_constant(value=0.0, **locals())
-
-
-def increment(x,
-              value=1.0,
-              in_place=True,
-              main_program=None,
-              startup_program=None):
-    """
-    This function creates an operator to increment each value in the input
-    `x` by an amount: `value` as mentioned in the input parameter. This
-    operation is performed in-place by default.
-    """
-    helper = LayerHelper("increment", **locals())
-    if not in_place:
-        out = helper.create_tmp_variable(dtype=x.dtype)
-    else:
-        out = x
-    helper.append_op(
-        type='increment',
-        inputs={'X': [x]},
-        outputs={'Out': [out]},
-        attrs={'step': float(value)})
-    return out
-
-
-def array_write(x, i, array=None, main_program=None, startup_program=None):
-    """
-    This function creates an operator to write the data out as a
-    LOD_TENSOR_ARRAY.
-    """
-    helper = LayerHelper('array_write', **locals())
-    if array is None:
-        array = helper.create_variable(
-            name="{0}.out".format(helper.name),
-            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-            dtype=x.dtype)
-    helper.append_op(
-        type='write_to_array',
-        inputs={'X': [x],
-                'I': [i]},
-        outputs={'Out': [array]})
-    return array
-
-
-def create_array(dtype, main_program=None):
-    helper = LayerHelper("array", **locals())
-    return helper.create_variable(
-        name="{0}.out".format(helper.name),
-        type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-        dtype=dtype)
-
-
-def less_than(x, y, cond=None, main_program=None, **ignored):
-    helper = LayerHelper("less_than", **locals())
-    if cond is None:
-        cond = helper.create_tmp_variable(dtype='bool')
-        cond.stop_gradient = True
-
-    helper.append_op(
-        type='less_than', inputs={'X': [x],
-                                  'Y': [y]}, outputs={'Out': [cond]})
-    return cond
-
-
-def array_read(array, i, main_program=None, startup_program=None):
-    """
-    This function creates an operator to read the data in as a
-    LOD_TENSOR_ARRAY.
-    """
-    helper = LayerHelper('array_read', **locals())
-    if not isinstance(
-            array,
-            Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
-        raise TypeError("array should be tensor array vairable")
-    out = helper.create_tmp_variable(dtype=array.dtype)
-    helper.append_op(
-        type='read_from_array',
-        inputs={'X': [array],
-                'I': [i]},
-        outputs={'Out': [out]})
-    return out
-
-
-def shrink_memory(x, i, table, main_program=None, startup_program=None):
-    """
-    This function creates an operator to shrink_rnn_memory using the RankTable
-    as mentioned in the input parameter.
-    """
-    helper = LayerHelper('shrink_memory', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
-    helper.append_op(
-        type='shrink_rnn_memory',
-        inputs={'X': [x],
-                'I': [i],
-                'RankTable': [table]},
-        outputs={'Out': [out]},
-        attrs={})
-    return out
-
-
-def array_length(array, main_program=None):
-    """
-    This function creates an operator to find the length of the
-    LOD_TENSOR_ARRAY.
-    """
-    helper = LayerHelper('array_length', **locals())
-    tmp = helper.create_tmp_variable(dtype='int64')
-    tmp.stop_gradient = True
-    helper.append_op(
-        type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]})
-    return tmp
-
-
-def conv2d_transpose(input,
-                     num_filters,
-                     output_size=None,
-                     filter_size=None,
-                     padding=None,
-                     stride=None,
-                     dilation=None,
-                     param_attr=None,
-                     main_program=None,
-                     startup_program=None):
-    """
-    The transpose of conv2d layer.
-
-    This layer is also known as deconvolution layer.
-
-    Args:
-        input(Variable): The input image with [N, C, H, W] format.
-        num_filters(int): The number of filter. It is as same as the output
-            image channel.
-        output_size(int|tuple|None): The output image size. If output size is a
-            tuple, it must contain two integers, (image_H, image_W). This
-            parameter only works when filter_size is None.
-        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
-            it must contain two integers, (filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square.  None if use output size to
-            calculate filter_size
-        padding(int|tuple): The padding size. If padding is a tuple, it must
-            contain two integers, (padding_H, padding_W). Otherwise, the
-            padding_H = padding_W = padding.
-        stride(int|tuple): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_H, stride_W). Otherwise, the
-            stride_H = stride_W = stride.
-        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
-            contain two integers, (dilation_H, dilation_W). Otherwise, the
-            dilation_H = dilation_W = dilation.
-        param_attr: Parameter Attribute.
-        main_program(Program): the main program
-        startup_program(Program): the startup program
-
-    Returns:
-        Variable: Output image.
-    """
-    helper = LayerHelper("conv2d_transpose", **locals())
-    if not isinstance(input, Variable):
-        raise TypeError("Input of conv2d_transpose must be Variable")
-    input_channel = input.shape[1]
-
-    op_attr = dict()
-
-    if isinstance(padding, int):
-        op_attr['paddings'] = [padding, padding]
-    elif padding is not None:
-        op_attr['paddings'] = padding
-
-    if isinstance(stride, int):
-        op_attr['strides'] = stride
-    elif stride is not None:
-        op_attr['strides'] = stride
-
-    if isinstance(dilation, int):
-        op_attr['dilations'] = dilation
-    elif stride is not None:
-        op_attr['dilations'] = dilation
-
-    if filter_size is None:
-        if output_size is None:
-            raise ValueError("output_size must be set when filter_size is None")
-        if isinstance(output_size, int):
-            output_size = [output_size, output_size]
-
-        padding = op_attr.get('paddings', [0, 0])
-        stride = op_attr.get('strides', [1, 1])
-        dilation = op_attr.get('dilations', [1, 1])
-
-        h_in = input.shape[2]
-        w_in = input.shape[3]
-
-        filter_size_h = (output_size[0] - (h_in - 1) * stride[0] + 2 *
-                         padding[0] - 1) / dilation[0] + 1
-        filter_size_w = (output_size[1] - (w_in - 1) * stride[1] + 2 *
-                         padding[1] - 1) / dilation[1] + 1
-        filter_size = [filter_size_h, filter_size_w]
-
-    elif isinstance(filter_size, int):
-        filter_size = [filter_size, filter_size]
-
-    filter_shape = [input_channel, num_filters] + filter_size
-    img_filter = helper.create_parameter(
-        dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
-
-    out = helper.create_tmp_variable(dtype=input.dtype)
-    helper.append_op(
-        type='conv2d_transpose',
-        inputs={'Input': [input],
-                'Filter': [img_filter]},
-        outputs={'Output': out},
-        attrs=op_attr)
-
-    return out
-
-
-class ConditionalBlockGuard(BlockGuard):
-    def __init__(self, block):
-        if not isinstance(block, ConditionalBlock):
-            raise TypeError("block should be conditional block")
-        super(ConditionalBlockGuard, self).__init__(block.helper.main_program)
-        self.block = block
-
-    def __enter__(self):
-        return super(ConditionalBlockGuard, self).__enter__()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.block.complete()
-        return super(ConditionalBlockGuard, self).__exit__(exc_type, exc_val,
-                                                           exc_tb)
-
-
-class ConditionalBlock(object):
-    def __init__(self,
-                 inputs,
-                 name=None,
-                 main_program=None,
-                 startup_program=None):
-        for each_input in inputs:
-            if not isinstance(each_input, Variable):
-                raise TypeError("Each input should be variable")
-        self.inputs = inputs
-        self.helper = LayerHelper(
-            'conditional_block',
-            name=name,
-            main_program=main_program,
-            startup_program=startup_program)
-
-    def block(self):
-        return ConditionalBlockGuard(self)
-
-    def complete(self):
-        inside_block = self.helper.main_program.current_block()
-        parent_block = self.helper.main_program.block(inside_block.parent_idx)
-
-        intermediate = set()
-        params = set()
-
-        for each_op in inside_block.ops:
-            assert isinstance(each_op, Operator)
-            for iname in each_op.input_names:
-                for in_var_name in each_op.input(iname):
-                    if in_var_name not in intermediate:
-                        params.add(in_var_name)
-
-            for oname in each_op.output_names:
-                for out_var_name in each_op.output(oname):
-                    intermediate.add(out_var_name)
-        input_set = set([ipt.name for ipt in self.inputs])
-
-        param_list = [
-            parent_block.var(each_name) for each_name in params
-            if each_name not in input_set
-        ]
-
-        out_list = [
-            parent_block.var(var_name) for var_name in parent_block.vars
-            if var_name not in intermediate
-        ]
-
-        step_scope = parent_block.create_var(
-            type=core.VarDesc.VarType.STEP_SCOPES)
-        parent_block.append_op(
-            type='conditional_block',
-            inputs={
-                'X': self.inputs,
-                'Params': param_list,
-            },
-            outputs={'Out': out_list,
-                     'Scope': [step_scope]},
-            attrs={'block': inside_block})
-
-
-class IfElseBlockGuard(object):
-    def __init__(self, is_true, ifelse):
-        if not isinstance(ifelse, IfElse):
-            raise TypeError("ifelse must be an instance of IfElse class")
-
-        if ifelse.status != IfElse.OUT_IF_ELSE_BLOCKS:
-            raise ValueError("You cannot invoke IfElse.block() inside a block")
-
-        self.is_true = is_true
-        self.ie = ifelse
-        if is_true:
-            self.cond_block = ifelse.conditional_true_block
-        else:
-            self.cond_block = ifelse.conditional_false_block
-
-        if not isinstance(self.cond_block, ConditionalBlock):
-            raise TypeError("Unexpected situation")
-
-        self.cond_block = self.cond_block.block()
-
-    def __enter__(self):
-        self.ie.status = IfElse.IN_IF_ELSE_TRUE_BLOCKS if self.is_true else IfElse.IN_IF_ELSE_FALSE_BLOCKS
-        self.cond_block.__enter__()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if not self.cond_block.__exit__(exc_type, exc_val, exc_tb):
-            # re-raise inside exception
-            return False
-        if len(self.ie.output_table[1 if self.is_true else 0]) == 0:
-            raise ValueError("Must set output inside block")
-        self.ie.status = IfElse.OUT_IF_ELSE_BLOCKS
-
-
-class IfElse(object):
-    OUT_IF_ELSE_BLOCKS = 0
-    IN_IF_ELSE_TRUE_BLOCKS = 1
-    IN_IF_ELSE_FALSE_BLOCKS = 2
-
-    def __init__(self, cond, name=None, main_program=None,
-                 startup_program=None):
-        if not isinstance(cond, Variable):
-            raise TypeError("cond must be a Variable")
-        self.helper = LayerHelper(
-            'ifelse',
-            name=name,
-            main_program=main_program,
-            startup_program=startup_program)
-        self.cond = cond
-        self.input_table = {}
-        self.status = IfElse.OUT_IF_ELSE_BLOCKS
-        self.conditional_true_block = ConditionalBlock(inputs=[self.cond])
-        self.conditional_false_block = ConditionalBlock(inputs=[self.cond])
-        self.output_table = ([], [])  # (true_outs, false_outs)
-
-    def input(self, x):
-        if self.status == IfElse.OUT_IF_ELSE_BLOCKS:
-            raise ValueError("input must in true/false blocks")
-        if id(x) not in self.input_table:
-            parent_block = self.parent_block()
-            out_true = parent_block.create_var(
-                name=unique_name('ifelse_input' + self.helper.name),
-                dtype=x.dtype)
-
-            out_false = parent_block.create_var(
-                name=unique_name('ifelse_input' + self.helper.name),
-                dtype=x.dtype)
-            parent_block.append_op(
-                type='split_lod_tensor',
-                inputs={
-                    'X': x,
-                    'Mask': self.cond,
-                },
-                outputs={'OutTrue': out_true,
-                         'OutFalse': out_false},
-                attrs={'level': 0})
-            self.input_table[id(x)] = (out_true, out_false)
-        else:
-            out_true, out_false = self.input_table[id(x)]
-
-        if self.status == IfElse.IN_IF_ELSE_TRUE_BLOCKS:
-            return out_true
-        else:
-            return out_false
-
-    def parent_block(self):
-        current_block = self.helper.main_program.current_block()
-        return self.helper.main_program.block(current_block.parent_idx)
-
-    def true_block(self):
-        return IfElseBlockGuard(True, self)
-
-    def false_block(self):
-        return IfElseBlockGuard(False, self)
-
-    def output(self, *outs):
-        if self.status == self.OUT_IF_ELSE_BLOCKS:
-            raise ValueError("output can only be invoked in the sub-block")
-
-        out_table = self.output_table[1 if self.status ==
-                                      self.IN_IF_ELSE_TRUE_BLOCKS else 0]
-        parent_block = self.parent_block()
-        for each_out in outs:
-            if not isinstance(each_out, Variable):
-                raise TypeError("Each output should be a variable")
-            # create outside tensor
-            outside_out = parent_block.create_var(
-                name=unique_name("_".join([self.helper.name, 'output'])),
-                dtype=each_out.dtype)
-            out_table.append(outside_out)
-
-            # assign local var to outside
-            assign(
-                input=each_out,
-                output=outside_out,
-                main_program=self.helper.main_program,
-                startup_program=self.helper.startup_program)
-
-    def __call__(self):
-        if self.status != self.OUT_IF_ELSE_BLOCKS:
-            raise ValueError("IfElse::__call__ must be out of sub-block")
-        false_len, true_len = map(len, self.output_table)
-        if false_len == 0 and true_len == 0:
-            raise ValueError("Must invoke true_block/false_block before "
-                             "__call__")
-        elif false_len != true_len and false_len != 0 and true_len != 0:
-            raise ValueError("The output side must be same")
-        elif false_len == 0 or true_len == 0:
-            return self.output_table[0 if false_len != 0 else 1]
-
-        # else none of false_len/true_len is zero
-        # merge together
-        rlist = []
-        for false_var, true_var in zip(*self.output_table):
-            rlist.append(
-                merge_lod_tensor(
-                    in_true=true_var,
-                    in_false=false_var,
-                    mask=self.cond,
-                    x=self.cond,
-                    level=0,
-                    main_program=self.helper.main_program,
-                    startup_program=self.helper.startup_program))
-        return rlist
-
-
-class DynamicRNN(object):
-    BEFORE_RNN = 0
-    IN_RNN = 1
-    AFTER_RNN = 2
-
-    def __init__(self, name=None, main_program=None, startup_program=None):
-        self.helper = LayerHelper(
-            'dynamic_rnn',
-            name=name,
-            main_program=main_program,
-            startup_program=startup_program)
-        self.status = DynamicRNN.BEFORE_RNN
-        self.lod_rank_table = None
-        self.max_seq_len = None
-        self.step_idx = None
-        self.zero_idx = fill_constant(shape=[1], value=0, dtype='int64')
-        self.mem_dict = dict()
-        self.output_array = []
-        self.outputs = []
-        self.cond = self.helper.create_tmp_variable(dtype='bool')
-        self.cond.stop_gradient = False
-        self.while_op = While(self.cond)
-        self.input_array = []
-        self.mem_link = []
-
-    def step_input(self, x):
-        self._assert_in_rnn_block_("step_input")
-        if not isinstance(x, Variable):
-            raise TypeError(
-                "step_input() can only take a Variable as its input")
-        parent_block = self._parent_block_()
-        if self.lod_rank_table is None:
-            self.lod_rank_table = parent_block.create_var(
-                name=unique_name('lod_rank_table'),
-                type=core.VarDesc.VarType.LOD_RANK_TABLE)
-            self.lod_rank_table.stop_gradient = True
-            parent_block.append_op(
-                type='lod_rank_table',
-                inputs={"X": x},
-                outputs={"Out": self.lod_rank_table})
-            self.max_seq_len = parent_block.create_var(
-                name=unique_name('dynamic_rnn_max_seq_len'), dtype='int64')
-            self.max_seq_len.stop_gradient = False
-            parent_block.append_op(
-                type='max_sequence_len',
-                inputs={'RankTable': self.lod_rank_table},
-                outputs={"Out": self.max_seq_len})
-            self.cond.stop_gradient = True
-            parent_block.append_op(
-                type='less_than',
-                inputs={'X': self.step_idx,
-                        'Y': self.max_seq_len},
-                outputs={'Out': self.cond})
-
-        input_array = parent_block.create_var(
-            name=unique_name('dynamic_rnn_input_array'),
-            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-            dtype=x.dtype)
-        self.input_array.append((input_array, x.dtype))
-        parent_block.append_op(
-            type='lod_tensor_to_array',
-            inputs={'X': x,
-                    'RankTable': self.lod_rank_table},
-            outputs={'Out': input_array})
-        return array_read(
-            array=input_array, i=self.step_idx, **self.helper.to_kwargs)
-
-    @contextlib.contextmanager
-    def block(self):
-        if self.status != DynamicRNN.BEFORE_RNN:
-            raise ValueError("rnn.block() can only be invoke once")
-        self.step_idx = fill_constant(shape=[1], dtype='int64', value=0)
-        self.step_idx.stop_gradient = False
-        self.status = DynamicRNN.IN_RNN
-        with self.while_op.block():
-            yield
-            increment(
-                x=self.step_idx,
-                value=1.0,
-                in_place=True,
-                **self.helper.to_kwargs)
-
-            for new_mem, mem_array in self.mem_link:
-                array_write(
-                    x=new_mem,
-                    i=self.step_idx,
-                    array=mem_array,
-                    **self.helper.to_kwargs)
-
-            less_than(
-                x=self.step_idx,
-                y=self.max_seq_len,
-                cond=self.cond,
-                **self.helper.to_kwargs)
-
-        self.status = DynamicRNN.AFTER_RNN
-        for each_array in self.output_array:
-            self.outputs.append(
-                array_to_lod_tensor(
-                    x=each_array,
-                    table=self.lod_rank_table,
-                    **self.helper.to_kwargs))
-
-    def __call__(self, *args, **kwargs):
-        if self.status != DynamicRNN.AFTER_RNN:
-            raise ValueError(
-                "Dynamic RNN outputs can only be retrieved after rnn block")
-        if len(self.outputs) == 1:
-            return self.outputs[0]
-        else:
-            return self.outputs
-
-    def memory(self, init=None, shape=None, value=0.0, dtype='float32'):
-        self._assert_in_rnn_block_('memory')
-        if init is not None:
-            if not isinstance(init, Variable):
-                raise TypeError(
-                    "The input arg `init` of memory() must be a Variable")
-            parent_block = self._parent_block_()
-            mem_array = parent_block.create_var(
-                name=unique_name('dynamic_rnn_mem_array'),
-                type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-                dtype=init.dtype)
-            parent_block.append_op(
-                type='write_to_array',
-                inputs={'X': init,
-                        'I': self.zero_idx},
-                outputs={'Out': mem_array})
-            retv = array_read(
-                array=mem_array, i=self.step_idx, **self.helper.to_kwargs)
-            retv = shrink_memory(
-                x=retv,
-                i=self.step_idx,
-                table=self.lod_rank_table,
-                **self.helper.to_kwargs)
-            self.mem_dict[retv.name] = mem_array
-            return retv
-        else:
-            if len(self.input_array) == 0:
-                raise ValueError(
-                    "step_input should be invoked before memory(shape=..., value=...)"
-                )
-            parent_block = self._parent_block_()
-            init = parent_block.create_var(
-                name=unique_name('mem_init'), dtype=dtype)
-            arr, dtype = self.input_array[0]
-            in0 = parent_block.create_var(name=unique_name('in0'), dtype=dtype)
-            parent_block.append_op(
-                type='read_from_array',
-                inputs={'X': [arr],
-                        'I': [self.zero_idx]},
-                outputs={'Out': [in0]})
-            parent_block.append_op(
-                type='fill_constant_batch_size_like',
-                inputs={'Input': [in0]},
-                outputs={'Out': [init]},
-                attrs={
-                    'shape': [-1] + shape,
-                    'value': float(value),
-                    'dtype': init.dtype
-                })
-            return self.memory(init=init)
-
-    def update_memory(self, ex_mem, new_mem):
-        self._assert_in_rnn_block_('update_memory')
-        if not isinstance(ex_mem, Variable):
-            raise TypeError("The input arg `ex_mem` of update_memory() must "
-                            "be a Variable")
-        if not isinstance(new_mem, Variable):
-            raise TypeError("The input arg `new_mem` of update_memory() must "
-                            "be a Variable")
-
-        mem_array = self.mem_dict.get(ex_mem.name, None)
-        if mem_array is None:
-            raise ValueError("Please invoke memory before update_memory")
-        if self.lod_rank_table is None:
-            raise ValueError("Please invoke step_input before update_memory")
-
-        self.mem_link.append((new_mem, mem_array))
-
-    def output(self, *outputs):
-        self._assert_in_rnn_block_('output')
-        parent_block = self._parent_block_()
-        for each in outputs:
-            outside_array = parent_block.create_var(
-                name=unique_name("_".join(
-                    [self.helper.name, "output_array", each.name])),
-                type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-                dtype=each.dtype)
-            array_write(x=each, i=self.step_idx, array=outside_array)
-            self.output_array.append(outside_array)
-
-    def _parent_block_(self):
-        prog = self.helper.main_program
-        parent_idx = prog.current_block().parent_idx
-        assert parent_idx >= 0
-        parent_block = prog.block(parent_idx)
-
-        return parent_block
-
-    def _assert_in_rnn_block_(self, method):
-        if self.status != DynamicRNN.IN_RNN:
-            raise ValueError("{0} can only be invoked inside rnn block.".format(
-                method))
diff --git a/python/paddle/v2/fluid/layers/__init__.py b/python/paddle/v2/fluid/layers/__init__.py
new file mode 100644
index 0000000000..249f570e13
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/__init__.py
@@ -0,0 +1,17 @@
+import ops
+from ops import *
+import nn
+from nn import *
+import io
+from io import *
+import tensor
+from tensor import *
+import control_flow
+from control_flow import *
+
+__all__ = []
+__all__ += nn.__all__
+__all__ += io.__all__
+__all__ += tensor.__all__
+__all__ += control_flow.__all__
+__all__ += ops.__all__
diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py
new file mode 100644
index 0000000000..5af6c78977
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -0,0 +1,1022 @@
+from ..layer_helper import LayerHelper, unique_name
+from ..framework import Program, Variable, Operator
+from .. import core
+from tensor import assign, fill_constant
+import contextlib
+
+__all__ = [
+    'split_lod_tensor', 'merge_lod_tensor', 'BlockGuard', 'StaticRNNGuard',
+    'StaticRNNMemoryLink', 'WhileGuard', 'While', 'lod_rank_table',
+    'max_sequence_len', 'topk', 'lod_tensor_to_array', 'array_to_lod_tensor',
+    'increment', 'array_write', 'create_array', 'less_than', 'array_read',
+    'shrink_memory', 'array_length', 'IfElse', 'DynamicRNN', 'ConditionalBlock',
+    'StaticRNN'
+]
+
+
+def split_lod_tensor(input,
+                     mask,
+                     level=0,
+                     main_program=None,
+                     startup_program=None):
+    """
+    Append a `split_lod_tensor` operator on `input` and `mask`; return the
+    (out_true, out_false) pair of temporaries, each with `input`'s dtype.
+    """
+    helper = LayerHelper('split_lod_tensor', **locals())
+    true_part = helper.create_tmp_variable(dtype=input.dtype)
+    false_part = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='split_lod_tensor',
+        inputs={'X': input, 'Mask': mask},
+        outputs={'OutTrue': true_part, 'OutFalse': false_part},
+        attrs={'level': level})
+    return true_part, false_part
+
+
+def merge_lod_tensor(in_true,
+                     in_false,
+                     x,
+                     mask,
+                     level=0,
+                     main_program=None,
+                     startup_program=None):
+    """Append a `merge_lod_tensor` operator and return its output variable,
+    a temporary with the dtype of `in_true`; `level` becomes the op attribute.
+    """
+    helper = LayerHelper('merge_lod_tensor', **locals())
+    merged = helper.create_tmp_variable(dtype=in_true.dtype)
+    helper.append_op(
+        type='merge_lod_tensor',
+        inputs={'X': x, 'Mask': mask, 'InTrue': in_true, 'InFalse': in_false},
+        outputs={'Out': merged},
+        attrs={'level': level})
+    return merged
+
+
+class BlockGuard(object):
+    """
+    Context manager that opens a sub-block of a program.
+
+    Entering the `with` statement calls create_block() on the program and
+    leaving it calls rollback(); exceptions from the body are never swallowed.
+    """
+
+    def __init__(self, main_program):
+        if not isinstance(main_program, Program):
+            raise TypeError("BlockGuard takes a program")
+        self.main_program = main_program
+
+    def __enter__(self):
+        self.main_program.create_block()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # Always leave the sub-block, whether or not the body raised.
+        self.main_program.rollback()
+        # False lets a pending exception propagate; True marks a clean exit.
+        return exc_type is None
+
+
+class StaticRNNGuard(BlockGuard):
+    """
+    StaticRNNGuard class.
+
+    StaticRNNGuard class is used to create a StaticRNN block in a program.
+    """
+
+    def __init__(self, rnn):
+        if not isinstance(rnn, StaticRNN):
+            raise TypeError("StaticRNNGuard takes a StaticRNN")
+        super(StaticRNNGuard, self).__init__(rnn.helper.main_program)
+        self.rnn = rnn
+
+    def __enter__(self):
+        self.rnn.status = StaticRNN.IN_RNN_BLOCK
+        return super(StaticRNNGuard, self).__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is not None:
+            return False
+        self.rnn.status = StaticRNN.AFTER_RNN_BLOCK
+        self.rnn.complete_rnn_op()
+        return super(StaticRNNGuard, self).__exit__(exc_type, exc_val, exc_tb)
+
+
+class StaticRNNMemoryLink(object):
+    """
+    Links the three variables that make up one memory of a StaticRNN.
+
+    Args:
+        init (Variable): the initial value of the memory.
+        pre_mem (Variable): the memory variable of the previous time
+            step.
+        mem (Variable): the memory variable of the current time step;
+            defaults to None, StaticRNN.update_memory() assigns it later.
+
+    StaticRNN stores one link per memory in its `memories` dict, keyed
+    by pre_mem.name, and reads all three fields back in
+    complete_rnn_op() when it assembles the recurrent operator.
+    """
+
+    def __init__(self, init, pre_mem, mem=None):
+        self.init = init
+        self.pre_mem = pre_mem
+        self.mem = mem
+
+
+class StaticRNN(object):
+    """
+    StaticRNN class.
+
+    StaticRNN class is used to create a StaticRNN. The RNN will have its
+    own parameters like inputs, outputs, memories, status and length.
+    """
+    BEFORE_RNN_BLOCK = 0
+    IN_RNN_BLOCK = 1
+    AFTER_RNN_BLOCK = 2
+
+    def __init__(self, name=None, main_program=None):
+        self.helper = LayerHelper(
+            "static_rnn", name=name, main_program=main_program)
+        self.memories = {}  # memory map, from pre_mem.name --> MemoryLink
+        self.inputs = []  # input variable list in current block
+        self.outputs = []  # output variable list in parent block
+        self.status = StaticRNN.BEFORE_RNN_BLOCK  # status flag.
+        # sequence length, since it is a static RNN, sequence length are fixed.
+        self.seq_len = None
+
+    def step(self):
+        return StaticRNNGuard(self)
+
+    def _assert_in_rnn_block_(self, method):
+        if self.status != StaticRNN.IN_RNN_BLOCK:
+            raise ValueError("You must invoke {0} in rnn block".format(method))
+
+    def memory(self,
+               init=None,
+               shape=None,
+               batch_ref=None,
+               init_value=0.0,
+               init_batch_dim_idx=0,
+               ref_batch_dim_idx=1):
+        """
+        Args:
+            init: boot memory, if not set, a shape, batch_ref must be provided
+            shape: shape of the boot memory
+            batch_ref: batch size reference variable
+            init_value: the init value of boot memory
+            init_batch_dim_idx: the index of batch size in init's dimension
+            ref_batch_dim_idx: the index of batch size in batch_ref's dimension
+        """
+        self._assert_in_rnn_block_('memory')
+        if init is None:
+            if shape is None or batch_ref is None:
+                raise ValueError(
+                    "if init is None, memory at least need shape and batch_ref")
+            parent_block = self.parent_block()
+            var_name = unique_name("@".join([self.helper.name, "memory_boot"]))
+            boot_var = parent_block.create_var(
+                name=var_name,
+                shape=shape,
+                dtype=batch_ref.dtype,
+                persistable=False)
+
+            parent_block.append_op(
+                type="fill_constant_batch_size_like",
+                inputs={'Input': [batch_ref]},
+                outputs={'Out': [boot_var]},
+                attrs={
+                    'value': init_value,
+                    'shape': boot_var.shape,
+                    'dtype': boot_var.dtype,
+                    'input_dim_idx': ref_batch_dim_idx,
+                    'output_dim_idx': init_batch_dim_idx
+                })
+
+            return self.memory(init=boot_var)
+        else:
+            pre_mem = self.helper.create_variable(
+                name=unique_name("@".join([self.helper.name, "mem"])),
+                dtype=init.dtype,
+                shape=init.shape)
+            self.memories[pre_mem.name] = StaticRNNMemoryLink(
+                init=init, pre_mem=pre_mem)
+            return pre_mem
+
+    def step_input(self, x):
+        self._assert_in_rnn_block_('step_input')
+        if not isinstance(x, Variable):
+            raise TypeError("step input takes a Variable")
+        if self.seq_len is None:
+            self.seq_len = x.shape[0]
+        elif self.seq_len != x.shape[0]:
+            raise ValueError("Static RNN only take fix seq_len input")
+
+        ipt = self.helper.create_variable(
+            name=x.name, dtype=x.dtype, shape=list(x.shape[1:]), type=x.type)
+        self.inputs.append(ipt)
+        return ipt
+
+    def step_output(self, o):
+        self._assert_in_rnn_block_('step_output')
+        if not isinstance(o, Variable):
+            raise TypeError("step output takes a Variable")
+
+        tmp_o = self.helper.create_tmp_variable(dtype=o.dtype)
+        self.helper.append_op(
+            type='rnn_memory_helper',
+            inputs={'X': [o]},
+            outputs={'Out': tmp_o},
+            attrs={'dtype': o.dtype})
+
+        out_var = self.parent_block().create_var(
+            name=tmp_o.name,
+            shape=[self.seq_len] + list(tmp_o.shape),
+            dtype=tmp_o.dtype)
+
+        self.outputs.append(out_var)
+
+    def output(self, *outputs):
+        for each in outputs:
+            self.step_output(each)
+
+    def update_memory(self, mem, var):
+        if not isinstance(mem, Variable) or not isinstance(var, Variable):
+            raise TypeError("update memory should take variables")
+        self.memories[mem.name].mem = var
+
+    def parent_block(self):
+        prog = self.helper.main_program
+        parent_idx = prog.current_block().parent_idx
+        assert parent_idx >= 0
+        parent_block = prog.block(parent_idx)
+        return parent_block
+
+    def __call__(self, *args, **kwargs):
+        if self.status != StaticRNN.AFTER_RNN_BLOCK:
+            raise ValueError("RNN output can only be retrieved after rnn block")
+        if len(self.outputs) == 0:
+            raise ValueError("RNN has no output")
+        elif len(self.outputs) == 1:
+            return self.outputs[0]
+        else:
+            return self.outputs
+
+    def complete_rnn_op(self):
+        main_program = self.helper.main_program
+        rnn_block = main_program.current_block()
+        parent_block = self.parent_block()
+
+        local_inputs = set()
+
+        for op in rnn_block.ops:
+            assert isinstance(op, Operator)
+            for oname in op.output_names:
+                for out_var_name in op.output(oname):
+                    local_inputs.add(out_var_name)
+
+        for var in self.inputs:
+            local_inputs.add(var.name)
+        for m in self.memories:
+            local_inputs.add(m)
+
+        params = list()
+        for op in rnn_block.ops:
+            assert isinstance(op, Operator)
+            for iname in op.input_names:
+                for in_var_name in op.input(iname):
+                    if in_var_name not in local_inputs:
+                        params.append(in_var_name)
+
+        parameters = [parent_block.var(name) for name in params]
+
+        step_scope = parent_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+
+        inlinks = [parent_block.var(i.name) for i in self.inputs]
+        outlinks = self.outputs
+
+        boot_memories = []
+        pre_memories = []
+        memories = []
+        for _, mem in self.memories.iteritems():
+            boot_memories.append(mem.init)
+            pre_memories.append(mem.pre_mem.name)
+            mem_var = rnn_block.var(mem.mem.name)
+            assert isinstance(mem_var, Variable)
+            new_mem = self.helper.create_tmp_variable(dtype=mem_var.dtype)
+
+            rnn_block.append_op(
+                type='rnn_memory_helper',
+                inputs={'X': [mem_var]},
+                outputs={'Out': [new_mem]},
+                attrs={'dtype': mem_var.dtype})
+
+            memories.append(new_mem.name)
+
+        parent_block.append_op(
+            type='recurrent',
+            inputs={
+                'inputs': inlinks,
+                'initial_states': boot_memories,
+                'parameters': parameters
+            },
+            outputs={'outputs': outlinks,
+                     'step_scopes': [step_scope]},
+            attrs={
+                'ex_states': pre_memories,
+                'states': memories,
+                'sub_block': rnn_block
+            })
+
+
+class WhileGuard(BlockGuard):
+    def __init__(self, while_op):
+        if not isinstance(while_op, While):
+            raise TypeError("WhileGuard takes a while op")
+        super(WhileGuard, self).__init__(while_op.helper.main_program)
+        self.while_op = while_op
+
+    def __enter__(self):
+        self.while_op.status = While.IN_WHILE_BLOCK
+        return super(WhileGuard, self).__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is not None:
+            return False
+        self.while_op.status = While.AFTER_WHILE_BLOCK
+        self.while_op.complete()
+        return super(WhileGuard, self).__exit__(exc_type, exc_val, exc_tb)
+
+
+class While(object):
+    BEFORE_WHILE_BLOCK = 0
+    IN_WHILE_BLOCK = 1
+    AFTER_WHILE_BLOCK = 2
+
+    def __init__(self, cond, name=None, main_program=None):
+        self.helper = LayerHelper("while", name=name, main_program=main_program)
+        self.status = While.BEFORE_WHILE_BLOCK
+        if not isinstance(cond, Variable):
+            raise TypeError("condition should be a variable")
+        assert isinstance(cond, Variable)
+        if cond.dtype != core.DataType.BOOL:
+            raise TypeError("condition should be a bool variable")
+        if reduce(lambda a, b: a * b, cond.shape, 1) != 1:
+            raise TypeError("condition should be a bool scalar")
+        self.cond_var = cond
+
+    def block(self):
+        return WhileGuard(self)
+
+    def complete(self):
+        main_program = self.helper.main_program
+        while_block = main_program.current_block()
+        parent_block = main_program.block(main_program.current_block()
+                                          .parent_idx)
+
+        inner_outputs = {self.cond_var.name}
+        x_name_list = set()
+        for op in while_block.ops:
+            for iname in op.input_names:
+                for in_var_name in op.input(iname):
+                    if in_var_name not in inner_outputs:
+                        x_name_list.add(in_var_name)
+
+            for oname in op.output_names:
+                for out_var_name in op.output(oname):
+                    inner_outputs.add(out_var_name)
+
+        out_vars = []
+        for inner_out_name in inner_outputs:
+            if inner_out_name in parent_block.vars:
+                out_vars.append(parent_block.var(inner_out_name))
+
+        step_scope = parent_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+
+        parent_block.append_op(
+            type='while',
+            inputs={
+                'X': [parent_block.var(x_name) for x_name in x_name_list],
+                'Condition': [self.cond_var]
+            },
+            outputs={'Out': out_vars,
+                     'StepScopes': [step_scope]},
+            attrs={'sub_block': while_block})
+
+
+def lod_rank_table(x, level=0, main_program=None):
+    """
+    Append a `lod_rank_table` operator that fills a new LOD_RANK_TABLE
+    variable from input `x`; `level` is forwarded as the op attribute.
+    """
+    helper = LayerHelper("lod_rank_table", **locals())
+    table = helper.create_variable(
+        type=core.VarDesc.VarType.LOD_RANK_TABLE,
+        name=unique_name("lod_rank_table"))
+    helper.append_op(
+        type='lod_rank_table',
+        inputs={'X': x},
+        outputs={'Out': table},
+        attrs={'level': level})
+    return table
+
+
+def max_sequence_len(rank_table, main_program=None):
+    """
+    Append a `max_sequence_len` operator that takes `rank_table` (expected to
+    be a LOD_RANK_TABLE variable) and return its int64 output variable.
+    """
+    helper = LayerHelper("max_seqence_len", **locals())
+    res = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="max_sequence_len",
+        inputs={"RankTable": rank_table},
+        outputs={"Out": res})
+    return res
+
+
+def topk(input, k, main_program=None, startup_program=None):
+    """Append a `top_k` operator on `input`; return (values, int64 indices)."""
+    helper = LayerHelper('topk', **locals())
+    topk_out = helper.create_tmp_variable(dtype=input.dtype)
+    topk_indices = helper.create_tmp_variable(dtype='int64')
+    helper.append_op(
+        type='top_k',
+        inputs={'X': [input]},
+        outputs={'Out': [topk_out], 'Indices': [topk_indices]},
+        attrs={'k': k})
+    return topk_out, topk_indices
+
+
+def lod_tensor_to_array(x, table, main_program=None):
+    """
+    This function creates an operator to convert an LOD_Tensor to
+    an array.
+    """
+    helper = LayerHelper("lod_tensor_to_array", **locals())
+    array = helper.create_variable(
+        name=unique_name("lod_tensor_to_array"),
+        type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+        dtype=x.dtype)
+    helper.append_op(
+        type='lod_tensor_to_array',
+        inputs={'X': x,
+                'RankTable': table},
+        outputs={'Out': array})
+    return array
+
+
+def array_to_lod_tensor(x, table, main_program=None, startup_program=None):
+    """
+    This function creates an operator to convert an array to a
+    LOD_Tensor.
+    """
+    helper = LayerHelper("array_to_lod_tensor", **locals())
+    tmp = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type="array_to_lod_tensor",
+        inputs={'X': x,
+                'RankTable': table},
+        outputs={'Out': tmp})
+    return tmp
+
+
+def increment(x,
+              value=1.0,
+              in_place=True,
+              main_program=None,
+              startup_program=None):
+    """
+    Append an `increment` operator that adds `value` (sent as the float
+    attribute `step`) to `x`. With `in_place` true, the default, `x` itself
+    is the output; otherwise a new temporary of the same dtype is returned.
+    """
+    helper = LayerHelper("increment", **locals())
+    if in_place:
+        result = x
+    else:
+        result = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type='increment',
+        inputs={'X': [x]},
+        outputs={'Out': [result]},
+        attrs={'step': float(value)})
+    return result
+
+
+def array_write(x, i, array=None, main_program=None, startup_program=None):
+    """
+    Append a `write_to_array` operator with `x` and index `i` as inputs and
+    `array` as output; a LOD_TENSOR_ARRAY is created when `array` is None.
+    """
+    helper = LayerHelper('array_write', **locals())
+    if array is None:
+        array = helper.create_variable(
+            name="{0}.out".format(helper.name),
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+            dtype=x.dtype)
+    helper.append_op(
+        type='write_to_array',
+        inputs={'X': [x],
+                'I': [i]},
+        outputs={'Out': [array]})
+    return array
+
+
+def create_array(dtype, main_program=None):
+    helper = LayerHelper("array", **locals())
+    return helper.create_variable(
+        name="{0}.out".format(helper.name),
+        type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+        dtype=dtype)
+
+
+def less_than(x, y, cond=None, main_program=None, **ignored):
+    """Append a `less_than` operator on `x` and `y` and return `cond`; when
+    `cond` is None a bool temporary with stop_gradient=True is created."""
+    helper = LayerHelper("less_than", **locals())
+    if cond is None:
+        cond = helper.create_tmp_variable(dtype='bool')
+        cond.stop_gradient = True
+    helper.append_op(
+        type='less_than', inputs={'X': [x], 'Y': [y]}, outputs={'Out': [cond]})
+    return cond
+
+
+def array_read(array, i, main_program=None, startup_program=None):
+    """
+    Append a `read_from_array` operator with `array` and index `i` as inputs
+    and return its output; raises TypeError unless `array` is a tensor array.
+    """
+    helper = LayerHelper('array_read', **locals())
+    if not isinstance(
+            array,
+            Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
+        raise TypeError("array should be tensor array vairable")
+    out = helper.create_tmp_variable(dtype=array.dtype)
+    helper.append_op(
+        type='read_from_array',
+        inputs={'X': [array],
+                'I': [i]},
+        outputs={'Out': [out]})
+    return out
+
+
+def shrink_memory(x, i, table, main_program=None, startup_program=None):
+    """
+    Append a `shrink_rnn_memory` operator.
+
+    Args:
+       x: Variable passed as the operator input `X`.
+       i: Variable passed as the operator input `I`.
+       table: Variable passed as the operator input `RankTable`.
+       main_program: Forwarded to the LayerHelper constructor.
+       startup_program: Forwarded to the LayerHelper constructor.
+
+    Returns:
+       A temporary variable (same dtype as `x`) receiving the operator output.
+    """
+    helper = LayerHelper('shrink_memory', **locals())
+    shrunk = helper.create_tmp_variable(dtype=x.dtype)
+    op_inputs = {'X': [x], 'I': [i], 'RankTable': [table]}
+    helper.append_op(
+        type='shrink_rnn_memory',
+        inputs=op_inputs,
+        outputs={'Out': [shrunk]},
+        attrs={})
+    return shrunk
+
+
+def array_length(array, main_program=None):
+    """
+    Append a `lod_array_length` operator for the given LOD_TENSOR_ARRAY.
+
+    Args:
+       array: The array variable passed as the operator input `X`.
+       main_program: Forwarded to the LayerHelper constructor.
+
+    Returns:
+       A temporary int64 variable (stop_gradient=True) receiving the length.
+    """
+    helper = LayerHelper('array_length', **locals())
+    length = helper.create_tmp_variable(dtype='int64')
+    length.stop_gradient = True
+    helper.append_op(
+        type='lod_array_length',
+        inputs={'X': [array]},
+        outputs={'Out': [length]})
+    return length
+
+
+class ConditionalBlockGuard(BlockGuard):
+    """
+    Context manager wrapping a ConditionalBlock.
+
+    It behaves like its BlockGuard base on the block's main program, and in
+    addition calls `ConditionalBlock.complete()` when the `with` body is left.
+    """
+
+    def __init__(self, block):
+        if not isinstance(block, ConditionalBlock):
+            raise TypeError("block should be conditional block")
+        program = block.helper.main_program
+        super(ConditionalBlockGuard, self).__init__(program)
+        self.block = block
+
+    def __enter__(self):
+        base = super(ConditionalBlockGuard, self)
+        return base.__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # complete() runs first, before the base guard's __exit__.
+        self.block.complete()
+        base = super(ConditionalBlockGuard, self)
+        return base.__exit__(exc_type, exc_val, exc_tb)
+
+
+class ConditionalBlock(object):
+    """
+    Builder for a `conditional_block` operator.
+
+    Use `with cond_block.block():` to describe the body; when the guard exits,
+    `complete()` appends the `conditional_block` operator to the parent of the
+    program's current block.
+
+    Args:
+       inputs: A list of Variables passed as the operator input `X`.
+       name: Forwarded to the LayerHelper constructor.
+       main_program: Forwarded to the LayerHelper constructor.
+       startup_program: Forwarded to the LayerHelper constructor.
+
+    Raises:
+       TypeError: if any element of `inputs` is not a Variable.
+    """
+
+    def __init__(self,
+                 inputs,
+                 name=None,
+                 main_program=None,
+                 startup_program=None):
+        for each_input in inputs:
+            if not isinstance(each_input, Variable):
+                raise TypeError("Each input should be variable")
+        self.inputs = inputs
+        self.helper = LayerHelper(
+            'conditional_block',
+            name=name,
+            main_program=main_program,
+            startup_program=startup_program)
+
+    def block(self):
+        """Return the guard to be used in a `with` statement."""
+        return ConditionalBlockGuard(self)
+
+    def complete(self):
+        """
+        Append the `conditional_block` operator to the parent block.
+
+        Must be called while the sub-block is still the program's current
+        block (ConditionalBlockGuard.__exit__ does this).
+        """
+        inside_block = self.helper.main_program.current_block()
+        parent_block = self.helper.main_program.block(inside_block.parent_idx)
+
+        # intermediate: every name written by an op of the sub-block.
+        # params: every name read by an op before any op in the sub-block
+        # has written it, i.e. values that come from outside.
+        intermediate = set()
+        params = set()
+
+        for each_op in inside_block.ops:
+            assert isinstance(each_op, Operator)
+            for iname in each_op.input_names:
+                for in_var_name in each_op.input(iname):
+                    if in_var_name not in intermediate:
+                        params.add(in_var_name)
+
+            for oname in each_op.output_names:
+                for out_var_name in each_op.output(oname):
+                    intermediate.add(out_var_name)
+        input_set = set([ipt.name for ipt in self.inputs])
+
+        # External values that are not already passed through `X`.
+        param_list = [
+            parent_block.var(each_name) for each_name in params
+            if each_name not in input_set
+        ]
+
+        # The operator's outputs are the parent-block variables that the
+        # sub-block writes. (The previous `not in intermediate` test selected
+        # exactly the variables the sub-block does NOT write.)
+        out_list = [
+            parent_block.var(var_name) for var_name in parent_block.vars
+            if var_name in intermediate
+        ]
+
+        step_scope = parent_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+        parent_block.append_op(
+            type='conditional_block',
+            inputs={
+                'X': self.inputs,
+                'Params': param_list,
+            },
+            outputs={'Out': out_list,
+                     'Scope': [step_scope]},
+            attrs={'sub_block': inside_block})
+
+
+class IfElseBlockGuard(object):
+    """
+    Context manager returned by `IfElse.true_block()` / `IfElse.false_block()`.
+
+    On entry it marks the owning IfElse as being inside the chosen branch and
+    enters the branch's conditional block guard; on a clean exit it verifies
+    that the branch registered at least one output and resets the status.
+    """
+
+    def __init__(self, is_true, ifelse):
+        if not isinstance(ifelse, IfElse):
+            raise TypeError("ifelse must be an instance of IfElse class")
+
+        if ifelse.status != IfElse.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("You cannot invoke IfElse.block() inside a block")
+
+        self.is_true = is_true
+        self.ie = ifelse
+        branch = (ifelse.conditional_true_block
+                  if is_true else ifelse.conditional_false_block)
+
+        if not isinstance(branch, ConditionalBlock):
+            raise TypeError("Unexpected situation")
+
+        # Keep the guard object, not the ConditionalBlock itself.
+        self.cond_block = branch.block()
+
+    def __enter__(self):
+        if self.is_true:
+            self.ie.status = IfElse.IN_IF_ELSE_TRUE_BLOCKS
+        else:
+            self.ie.status = IfElse.IN_IF_ELSE_FALSE_BLOCKS
+        self.cond_block.__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        inner_ok = self.cond_block.__exit__(exc_type, exc_val, exc_tb)
+        if not inner_ok:
+            # re-raise inside exception
+            return False
+        slot = 1 if self.is_true else 0
+        if len(self.ie.output_table[slot]) == 0:
+            raise ValueError("Must set output inside block")
+        self.ie.status = IfElse.OUT_IF_ELSE_BLOCKS
+
+
+class IfElse(object):
+    """
+    Builder for a two-branch conditional driven by a mask variable.
+
+    Inside `with ie.true_block():` / `with ie.false_block():` the caller uses
+    `ie.input(x)` to obtain the branch's share of `x` (produced by a
+    `split_lod_tensor` operator) and `ie.output(...)` to register results.
+    Calling the object afterwards returns the outputs, merged with
+    `merge_lod_tensor` when both branches produced some.
+
+    Args:
+       cond: The mask Variable.
+       name: Forwarded to the LayerHelper constructor.
+       main_program: Forwarded to the LayerHelper and to both branches.
+       startup_program: Forwarded to the LayerHelper and to both branches.
+
+    Raises:
+       TypeError: if `cond` is not a Variable.
+    """
+    OUT_IF_ELSE_BLOCKS = 0
+    IN_IF_ELSE_TRUE_BLOCKS = 1
+    IN_IF_ELSE_FALSE_BLOCKS = 2
+
+    def __init__(self, cond, name=None, main_program=None,
+                 startup_program=None):
+        if not isinstance(cond, Variable):
+            raise TypeError("cond must be a Variable")
+        self.helper = LayerHelper(
+            'ifelse',
+            name=name,
+            main_program=main_program,
+            startup_program=startup_program)
+        self.cond = cond
+        # id(x) -> (out_true, out_false), so each input is split only once.
+        self.input_table = {}
+        self.status = IfElse.OUT_IF_ELSE_BLOCKS
+        # The branches must build into the same programs as this object;
+        # otherwise parent_block() and the branch guards would disagree when
+        # non-default programs are supplied.
+        self.conditional_true_block = ConditionalBlock(
+            inputs=[self.cond],
+            main_program=main_program,
+            startup_program=startup_program)
+        self.conditional_false_block = ConditionalBlock(
+            inputs=[self.cond],
+            main_program=main_program,
+            startup_program=startup_program)
+        # Index 0 holds the false-branch outputs, index 1 the true-branch
+        # outputs (see output() and __call__()).
+        self.output_table = ([], [])  # (false_outs, true_outs)
+
+    def input(self, x):
+        """
+        Return the part of `x` that belongs to the branch currently being
+        built. Only valid inside a true/false block.
+        """
+        if self.status == IfElse.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("input must in true/false blocks")
+        if id(x) not in self.input_table:
+            parent_block = self.parent_block()
+            out_true = parent_block.create_var(
+                name=unique_name('ifelse_input' + self.helper.name),
+                dtype=x.dtype)
+
+            out_false = parent_block.create_var(
+                name=unique_name('ifelse_input' + self.helper.name),
+                dtype=x.dtype)
+            parent_block.append_op(
+                type='split_lod_tensor',
+                inputs={
+                    'X': x,
+                    'Mask': self.cond,
+                },
+                outputs={'OutTrue': out_true,
+                         'OutFalse': out_false},
+                attrs={'level': 0})
+            self.input_table[id(x)] = (out_true, out_false)
+        else:
+            out_true, out_false = self.input_table[id(x)]
+
+        if self.status == IfElse.IN_IF_ELSE_TRUE_BLOCKS:
+            return out_true
+        else:
+            return out_false
+
+    def parent_block(self):
+        """Return the parent of the main program's current block."""
+        current_block = self.helper.main_program.current_block()
+        return self.helper.main_program.block(current_block.parent_idx)
+
+    def true_block(self):
+        """Return the guard for describing the true branch."""
+        return IfElseBlockGuard(True, self)
+
+    def false_block(self):
+        """Return the guard for describing the false branch."""
+        return IfElseBlockGuard(False, self)
+
+    def output(self, *outs):
+        """
+        Register outputs of the branch currently being built. For each given
+        variable, a variable is created in the parent block and the value is
+        assigned to it.
+        """
+        if self.status == self.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("output can only be invoked in the sub-block")
+
+        out_table = self.output_table[1 if self.status ==
+                                      self.IN_IF_ELSE_TRUE_BLOCKS else 0]
+        parent_block = self.parent_block()
+        for each_out in outs:
+            if not isinstance(each_out, Variable):
+                raise TypeError("Each output should be a variable")
+            # create outside tensor
+            outside_out = parent_block.create_var(
+                name=unique_name("_".join([self.helper.name, 'output'])),
+                dtype=each_out.dtype)
+            out_table.append(outside_out)
+
+            # assign local var to outside
+            assign(
+                input=each_out,
+                output=outside_out,
+                main_program=self.helper.main_program,
+                startup_program=self.helper.startup_program)
+
+    def __call__(self):
+        """
+        Return the outputs of the conditional.
+
+        If only one branch registered outputs, that branch's list is returned
+        unchanged. Otherwise both branches must have registered the same
+        number of outputs, and a list of pairwise merged variables is
+        returned.
+        """
+        if self.status != self.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("IfElse::__call__ must be out of sub-block")
+        false_len, true_len = map(len, self.output_table)
+        if false_len == 0 and true_len == 0:
+            raise ValueError("Must invoke true_block/false_block before "
+                             "__call__")
+        elif false_len != true_len and false_len != 0 and true_len != 0:
+            raise ValueError("The output side must be same")
+        elif false_len == 0 or true_len == 0:
+            return self.output_table[0 if false_len != 0 else 1]
+
+        # else none of false_len/true_len is zero
+        # merge together
+        rlist = []
+        for false_var, true_var in zip(*self.output_table):
+            rlist.append(
+                merge_lod_tensor(
+                    in_true=true_var,
+                    in_false=false_var,
+                    mask=self.cond,
+                    x=self.cond,
+                    level=0,
+                    main_program=self.helper.main_program,
+                    startup_program=self.helper.startup_program))
+        return rlist
+
+
+class DynamicRNN(object):
+    """
+    Builder for a recurrent network expressed as a While loop over the time
+    steps of its step inputs.
+
+    Usage outline (all of step_input/memory/update_memory/output must be
+    called inside `with rnn.block():`):
+
+        rnn = DynamicRNN()
+        with rnn.block():
+            word = rnn.step_input(sentence)
+            prev = rnn.memory(shape=[...])
+            ...
+            rnn.update_memory(prev, hidden)
+            rnn.output(hidden)
+        result = rnn()
+
+    Args:
+       name: Forwarded to the LayerHelper constructor.
+       main_program: Forwarded to the LayerHelper constructor.
+       startup_program: Forwarded to the LayerHelper constructor.
+    """
+    BEFORE_RNN = 0
+    IN_RNN = 1
+    AFTER_RNN = 2
+
+    def __init__(self, name=None, main_program=None, startup_program=None):
+        self.helper = LayerHelper(
+            'dynamic_rnn',
+            name=name,
+            main_program=main_program,
+            startup_program=startup_program)
+        self.status = DynamicRNN.BEFORE_RNN
+        self.lod_rank_table = None
+        self.max_seq_len = None
+        self.step_idx = None
+        self.zero_idx = fill_constant(shape=[1], value=0, dtype='int64')
+        # name of the per-step memory variable -> array that backs it
+        self.mem_dict = dict()
+        self.output_array = []
+        self.outputs = []
+        self.cond = self.helper.create_tmp_variable(dtype='bool')
+        self.cond.stop_gradient = False
+        self.while_op = While(self.cond)
+        # list of (array variable, dtype), one entry per step_input() call
+        self.input_array = []
+        # list of (new_mem, mem_array) pairs registered by update_memory()
+        self.mem_link = []
+
+    def step_input(self, x):
+        """
+        Register `x` as a sequence input and return the variable read from it
+        at the current step index.
+
+        The first call also creates the rank table, the maximum sequence
+        length and the initial loop condition in the parent block.
+
+        Raises:
+           ValueError: if called outside the rnn block.
+           TypeError: if `x` is not a Variable.
+        """
+        self._assert_in_rnn_block_("step_input")
+        if not isinstance(x, Variable):
+            raise TypeError(
+                "step_input() can only take a Variable as its input")
+        parent_block = self._parent_block_()
+        if self.lod_rank_table is None:
+            self.lod_rank_table = parent_block.create_var(
+                name=unique_name('lod_rank_table'),
+                type=core.VarDesc.VarType.LOD_RANK_TABLE)
+            self.lod_rank_table.stop_gradient = True
+            parent_block.append_op(
+                type='lod_rank_table',
+                inputs={"X": x},
+                outputs={"Out": self.lod_rank_table})
+            self.max_seq_len = parent_block.create_var(
+                name=unique_name('dynamic_rnn_max_seq_len'), dtype='int64')
+            self.max_seq_len.stop_gradient = False
+            parent_block.append_op(
+                type='max_sequence_len',
+                inputs={'RankTable': self.lod_rank_table},
+                outputs={"Out": self.max_seq_len})
+            self.cond.stop_gradient = True
+            parent_block.append_op(
+                type='less_than',
+                inputs={'X': self.step_idx,
+                        'Y': self.max_seq_len},
+                outputs={'Out': self.cond})
+
+        input_array = parent_block.create_var(
+            name=unique_name('dynamic_rnn_input_array'),
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+            dtype=x.dtype)
+        self.input_array.append((input_array, x.dtype))
+        parent_block.append_op(
+            type='lod_tensor_to_array',
+            inputs={'X': x,
+                    'RankTable': self.lod_rank_table},
+            outputs={'Out': input_array})
+        return array_read(
+            array=input_array, i=self.step_idx, **self.helper.to_kwargs)
+
+    @contextlib.contextmanager
+    def block(self):
+        """
+        Context manager delimiting the step body. May be entered only once.
+
+        After the caller's body, the step index is incremented, every
+        registered memory update is written to its array at the new index,
+        and the loop condition is recomputed. When the loop is closed, each
+        output array is converted back with `array_to_lod_tensor`.
+        """
+        if self.status != DynamicRNN.BEFORE_RNN:
+            raise ValueError("rnn.block() can only be invoke once")
+        self.step_idx = fill_constant(shape=[1], dtype='int64', value=0)
+        self.step_idx.stop_gradient = False
+        self.status = DynamicRNN.IN_RNN
+        with self.while_op.block():
+            yield
+            increment(
+                x=self.step_idx,
+                value=1.0,
+                in_place=True,
+                **self.helper.to_kwargs)
+
+            for new_mem, mem_array in self.mem_link:
+                array_write(
+                    x=new_mem,
+                    i=self.step_idx,
+                    array=mem_array,
+                    **self.helper.to_kwargs)
+
+            less_than(
+                x=self.step_idx,
+                y=self.max_seq_len,
+                cond=self.cond,
+                **self.helper.to_kwargs)
+
+        self.status = DynamicRNN.AFTER_RNN
+        for each_array in self.output_array:
+            self.outputs.append(
+                array_to_lod_tensor(
+                    x=each_array,
+                    table=self.lod_rank_table,
+                    **self.helper.to_kwargs))
+
+    def __call__(self, *args, **kwargs):
+        """
+        Return the RNN outputs: the single output variable when exactly one
+        was registered, otherwise the list of outputs.
+
+        Raises:
+           ValueError: if called before the rnn block has been closed.
+        """
+        if self.status != DynamicRNN.AFTER_RNN:
+            raise ValueError(
+                "Dynamic RNN outputs can only be retrieved after rnn block")
+        if len(self.outputs) == 1:
+            return self.outputs[0]
+        else:
+            return self.outputs
+
+    def memory(self, init=None, shape=None, value=0.0, dtype='float32'):
+        """
+        Create a memory and return the variable holding its value at the
+        current step.
+
+        Args:
+           init: Variable with the initial value. When None, the initial
+               value is built with `fill_constant_batch_size_like` from
+               `shape`, `value` and `dtype`, using the first step input as
+               the batch-size reference.
+           shape: Per-sample shape (list or tuple) of the memory; required
+               when `init` is None. A leading -1 is prepended.
+           value: Fill value used when `init` is None.
+           dtype: Data type of the memory when `init` is None.
+
+        Raises:
+           ValueError: if called outside the rnn block, if `init` is None and
+               no step input exists yet, or if both `init` and `shape` are
+               None.
+           TypeError: if `init` is given but is not a Variable.
+        """
+        self._assert_in_rnn_block_('memory')
+        if init is not None:
+            if not isinstance(init, Variable):
+                raise TypeError(
+                    "The input arg `init` of memory() must be a Variable")
+            parent_block = self._parent_block_()
+            mem_array = parent_block.create_var(
+                name=unique_name('dynamic_rnn_mem_array'),
+                type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+                dtype=init.dtype)
+            parent_block.append_op(
+                type='write_to_array',
+                inputs={'X': init,
+                        'I': self.zero_idx},
+                outputs={'Out': mem_array})
+            retv = array_read(
+                array=mem_array, i=self.step_idx, **self.helper.to_kwargs)
+            retv = shrink_memory(
+                x=retv,
+                i=self.step_idx,
+                table=self.lod_rank_table,
+                **self.helper.to_kwargs)
+            self.mem_dict[retv.name] = mem_array
+            return retv
+        else:
+            if len(self.input_array) == 0:
+                raise ValueError(
+                    "step_input should be invoked before memory(shape=..., value=...)"
+                )
+            if shape is None:
+                # Fail early with a clear message instead of the TypeError
+                # that `[-1] + None` would raise below.
+                raise ValueError(
+                    "memory() requires `shape` when `init` is not given")
+            parent_block = self._parent_block_()
+            init = parent_block.create_var(
+                name=unique_name('mem_init'), dtype=dtype)
+            # Use a separate name so the `dtype` argument is not shadowed by
+            # the dtype of the reference step input.
+            arr, in_dtype = self.input_array[0]
+            in0 = parent_block.create_var(
+                name=unique_name('in0'), dtype=in_dtype)
+            parent_block.append_op(
+                type='read_from_array',
+                inputs={'X': [arr],
+                        'I': [self.zero_idx]},
+                outputs={'Out': [in0]})
+            parent_block.append_op(
+                type='fill_constant_batch_size_like',
+                inputs={'Input': [in0]},
+                outputs={'Out': [init]},
+                attrs={
+                    # list() lets callers pass a tuple as well as a list
+                    'shape': [-1] + list(shape),
+                    'value': float(value),
+                    'dtype': init.dtype
+                })
+            return self.memory(init=init)
+
+    def update_memory(self, ex_mem, new_mem):
+        """
+        Register `new_mem` as the next-step value of the memory `ex_mem`.
+
+        Raises:
+           ValueError: if called outside the rnn block, if `ex_mem` was not
+               returned by memory(), or if step_input() has not been called.
+           TypeError: if either argument is not a Variable.
+        """
+        self._assert_in_rnn_block_('update_memory')
+        if not isinstance(ex_mem, Variable):
+            raise TypeError("The input arg `ex_mem` of update_memory() must "
+                            "be a Variable")
+        if not isinstance(new_mem, Variable):
+            raise TypeError("The input arg `new_mem` of update_memory() must "
+                            "be a Variable")
+
+        mem_array = self.mem_dict.get(ex_mem.name, None)
+        if mem_array is None:
+            raise ValueError("Please invoke memory before update_memory")
+        if self.lod_rank_table is None:
+            raise ValueError("Please invoke step_input before update_memory")
+
+        self.mem_link.append((new_mem, mem_array))
+
+    def output(self, *outputs):
+        """
+        Register per-step outputs. Each variable is written, at the current
+        step index, into a new array created in the parent block.
+        """
+        self._assert_in_rnn_block_('output')
+        parent_block = self._parent_block_()
+        for each in outputs:
+            outside_array = parent_block.create_var(
+                name=unique_name("_".join(
+                    [self.helper.name, "output_array", each.name])),
+                type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+                dtype=each.dtype)
+            # Pass the helper's programs, like every other helper call in
+            # this class, so the write lands in the RNN's own program.
+            array_write(
+                x=each,
+                i=self.step_idx,
+                array=outside_array,
+                **self.helper.to_kwargs)
+            self.output_array.append(outside_array)
+
+    def _parent_block_(self):
+        """Return the parent of the main program's current block."""
+        prog = self.helper.main_program
+        parent_idx = prog.current_block().parent_idx
+        assert parent_idx >= 0
+        parent_block = prog.block(parent_idx)
+
+        return parent_block
+
+    def _assert_in_rnn_block_(self, method):
+        """Raise ValueError unless the rnn block is currently open."""
+        if self.status != DynamicRNN.IN_RNN:
+            raise ValueError("{0} can only be invoked inside rnn block.".format(
+                method))
diff --git a/python/paddle/v2/fluid/layers/io.py b/python/paddle/v2/fluid/layers/io.py
new file mode 100644
index 0000000000..f03d8e3c3e
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/io.py
@@ -0,0 +1,57 @@
+from .. import core
+from ..layer_helper import LayerHelper
+
+__all__ = ['data']
+
+
+def data(name,
+         shape,
+         append_batch_size=True,
+         dtype='float32',
+         lod_level=0,
+         type=core.VarDesc.VarType.LOD_TENSOR,
+         main_program=None,
+         startup_program=None,
+         stop_gradient=True):
+    """
+    Data Layer.
+
+    Creates a global variable that serves as an input of the program.
+
+    Args:
+       name: The name of the created variable.
+       shape: Sequence declaring the shape of one sample. A `None` entry is
+           replaced by -1.
+       append_batch_size: Whether to put a leading -1 (batch) dimension in
+           front of `shape`. It is switched off automatically when `shape`
+           already contains a `None` or negative entry.
+       dtype: The type of data : float32, float_16, int etc
+       lod_level(int): The LoD Level. 0 means the input data is not a sequence.
+       type: The variable type. By default it is LOD_TENSOR.
+       main_program: Forwarded to the LayerHelper constructor.
+       startup_program: Forwarded to the LayerHelper constructor.
+       stop_gradient: Value of the `stop_gradient` flag of the variable.
+
+    Returns:
+       The global variable created through the LayerHelper.
+
+    All the arguments of this function are passed in as local variables
+    to the LayerHelper constructor.
+
+    """
+    helper = LayerHelper('data', **locals())
+    shape = list(shape)
+    for idx, dim in enumerate(shape):
+        if dim is None:
+            # unknown extent: normalise to -1 and keep the shape as given
+            shape[idx] = -1
+            append_batch_size = False
+        elif dim < 0:
+            append_batch_size = False
+
+    if append_batch_size:
+        # the batch dimension goes in front
+        shape.insert(0, -1)
+
+    return helper.create_global_variable(
+        name=name,
+        shape=shape,
+        dtype=dtype,
+        type=type,
+        stop_gradient=stop_gradient,
+        lod_level=lod_level)
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
new file mode 100644
index 0000000000..c0bfc22707
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -0,0 +1,797 @@
+"""
+All layers just related to the neural network.
+"""
+
+from ..layer_helper import LayerHelper
+from ..initializer import Normal, Constant
+from ..framework import Variable
+
+__all__ = [
+    'fc', 'embedding', 'dynamic_lstm', 'gru_unit', 'linear_chain_crf',
+    'crf_decoding', 'cos_sim', 'cross_entropy', 'square_error_cost', 'accuracy',
+    'chunk_eval', 'sequence_conv', 'conv2d', 'sequence_pool', 'pool2d',
+    'batch_norm', 'beam_search_decode', 'conv2d_transpose'
+]
+
+
+def fc(input,
+       size,
+       num_flatten_dims=1,
+       param_attr=None,
+       bias_attr=None,
+       act=None,
+       name=None,
+       main_program=None,
+       startup_program=None):
+    """
+    Fully Connected Layer.
+
+    For every input a weight parameter is created and a `mul` operator is
+    appended; when there are several inputs the products are added with a
+    `sum` operator. Bias and activation are then appended through the
+    LayerHelper.
+
+    Args:
+       input: The input variable (or variables) of the layer.
+       size: The output width; second dimension of each weight.
+       num_flatten_dims: Passed to `mul` as `x_num_col_dims`. The input
+           dimensions from this index on are multiplied together to form the
+           first dimension of the weight.
+       param_attr: Attribute(s) of the weight parameter(s).
+       bias_attr: Attribute of the bias parameter.
+       act: Activation to be applied to the output of FC layer
+       name: Name/alias of the layer.
+       main_program: Forwarded to the LayerHelper constructor.
+       startup_program: Forwarded to the LayerHelper constructor.
+
+    Returns:
+       The variable returned by the helper after bias and activation.
+
+    All the arguments of this function are passed in as local variables
+    to the LayerHelper constructor.
+
+    """
+    helper = LayerHelper('fc', **locals())
+
+    dtype = helper.input_dtype()
+
+    products = []
+    for each_input, each_attr in helper.iter_inputs_and_params():
+        # width of the flattened trailing dimensions of this input
+        flat_width = 1
+        for extent in each_input.shape[num_flatten_dims:]:
+            flat_width = flat_width * extent
+        weight = helper.create_parameter(
+            attr=each_attr,
+            shape=[flat_width, size],
+            dtype=dtype,
+            is_bias=False)
+        product = helper.create_tmp_variable(dtype)
+        helper.append_op(
+            type="mul",
+            inputs={"X": each_input,
+                    "Y": weight},
+            outputs={"Out": product},
+            attrs={'x_num_col_dims': num_flatten_dims,
+                   'y_num_col_dims': 1})
+        products.append(product)
+
+    # a single product needs no summation
+    if len(products) != 1:
+        pre_bias = helper.create_tmp_variable(dtype)
+        helper.append_op(
+            type="sum", inputs={"X": products}, outputs={"Out": pre_bias})
+    else:
+        pre_bias = products[0]
+
+    pre_activation = helper.append_bias_op(pre_bias)
+    return helper.append_activation(pre_activation)
+
+
+def embedding(input,
+              size,
+              is_sparse=False,
+              param_attr=None,
+              dtype='float32',
+              main_program=None,
+              startup_program=None):
+    """
+    Embedding Layer.
+
+    Creates a parameter of shape `size` and appends a `lookup_table`
+    operator that looks the ids in `input` up in it.
+
+    Args:
+       input: The variable holding the ids to look up.
+       size: The shape of the embedding parameter.
+       is_sparse: Passed to the operator as its `is_sparse` attribute.
+       param_attr: Attribute of the embedding parameter.
+       dtype: The type of data : float32, float_16, int etc
+       main_program: Forwarded to the LayerHelper constructor.
+       startup_program: Forwarded to the LayerHelper constructor.
+
+    Returns:
+       A temporary variable receiving the looked-up embeddings.
+
+    All the arguments of this function are passed in as local variables
+    to the LayerHelper constructor.
+
+    """
+
+    helper = LayerHelper('embedding', **locals())
+    table = helper.create_parameter(
+        attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
+    looked_up = helper.create_tmp_variable(dtype)
+    lookup_inputs = {'Ids': input, 'W': table}
+    helper.append_op(
+        type='lookup_table',
+        inputs=lookup_inputs,
+        outputs={'Out': looked_up},
+        attrs={'is_sparse': is_sparse})
+    return looked_up
+
+
+# TODO(qijun): expose H0 and C0
+def dynamic_lstm(input,
+                 size,
+                 param_attr=None,
+                 bias_attr=None,
+                 use_peepholes=True,
+                 is_reverse=False,
+                 gate_activation='sigmoid',
+                 cell_activation='tanh',
+                 candidate_activation='tanh',
+                 dtype='float32',
+                 main_program=None,
+                 startup_program=None):
+    """
+    Append an `lstm` operator together with its weight and bias parameters.
+
+    Args:
+       input: Variable passed as the operator input `Input`.
+       size: Four times the hidden width; it is divided by 4 internally.
+       param_attr: Attribute of the weight, whose shape is
+           [size / 4, size].
+       bias_attr: Attribute of the bias, whose shape is [1, 7 * size / 4]
+           with peepholes and [1, size] without.
+       use_peepholes: Passed to the operator; also selects the bias width.
+       is_reverse: Passed to the operator.
+       gate_activation: Passed to the operator.
+       cell_activation: Passed to the operator.
+       candidate_activation: Passed to the operator.
+       dtype: Data type of the parameters and outputs.
+       main_program: Forwarded to the LayerHelper constructor.
+       startup_program: Forwarded to the LayerHelper constructor.
+
+    Returns:
+       The tuple (hidden, cell) of output variables.
+    """
+    helper = LayerHelper('lstm', **locals())
+    size = size / 4
+    weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
+    bias_width = 7 * size if use_peepholes else 4 * size
+    bias = helper.create_parameter(
+        attr=helper.bias_attr,
+        shape=[1, bias_width],
+        dtype=dtype,
+        is_bias=True)
+
+    hidden = helper.create_tmp_variable(dtype)
+    cell = helper.create_tmp_variable(dtype)
+    batch_gate = helper.create_tmp_variable(dtype)
+    batch_cell_pre_act = helper.create_tmp_variable(dtype)
+
+    op_inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
+    op_outputs = {
+        'Hidden': hidden,
+        'Cell': cell,
+        'BatchGate': batch_gate,
+        'BatchCellPreAct': batch_cell_pre_act
+    }
+    op_attrs = {
+        'use_peepholes': use_peepholes,
+        'is_reverse': is_reverse,
+        'gate_activation': gate_activation,
+        'cell_activation': cell_activation,
+        'candidate_activation': candidate_activation
+    }
+    helper.append_op(
+        type='lstm', inputs=op_inputs, outputs=op_outputs, attrs=op_attrs)
+    return hidden, cell
+
+
+def gru_unit(input,
+             hidden,
+             size,
+             weight=None,
+             bias=None,
+             activation='tanh',
+             gate_activation='sigmoid',
+             main_program=None,
+             startup_program=None):
+    """
+    GRUUnit Operator implements partial calculations of the GRU unit as following:
+
+    $$
+    update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
+    reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r)  \\
+    output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
+    output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
+    $$
+
+    which is same as one time step of GRU Operator.
+
+    @note To implement the complete GRU unit, fully-connected operator must be
+    used before to feed xu, xr and xc as the Input of GRUUnit operator.
+
+    Args:
+       input: Variable passed as the operator input `Input`.
+       hidden: Variable passed as the operator input `HiddenPrev`.
+       size: Three times the hidden width; it is divided by 3 internally.
+       weight: Weight variable of shape [size / 3, size]. Created when None.
+       bias: Bias variable of shape [1, size]. Created when None.
+       activation: Name of the candidate activation, one of 'identity',
+           'sigmoid', 'tanh', 'relu'.
+       gate_activation: Name of the gate activation, same choices.
+       main_program: Forwarded to the LayerHelper constructor.
+       startup_program: Forwarded to the LayerHelper constructor.
+
+    Returns:
+       The tuple (updated_hidden, reset_hidden_pre, gate).
+
+    Raises:
+       KeyError: if an activation name is not one of the listed choices.
+
+    TODO(ChunweiYan) add more document here
+    """
+    # The operator takes the activations as integer codes.
+    activation_dict = dict(
+        identity=0,
+        sigmoid=1,
+        tanh=2,
+        relu=3, )
+    activation = activation_dict[activation]
+    gate_activation = activation_dict[gate_activation]
+
+    helper = LayerHelper('gru_unit', **locals())
+    dtype = helper.input_dtype()
+    size = size / 3
+
+    # create weight
+    if weight is None:
+        weight = helper.create_parameter(
+            attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
+
+    # create bias
+    if bias is None:
+        bias_size = [1, 3 * size]
+        bias = helper.create_parameter(
+            attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+
+    gate = helper.create_tmp_variable(dtype)
+    reset_hidden_pre = helper.create_tmp_variable(dtype)
+    updated_hidden = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='gru_unit',
+        inputs={
+            'Input': input,
+            'HiddenPrev': hidden,
+            'Weight': weight,
+            # The bias used to be created (or accepted) but never handed to
+            # the operator.
+            'Bias': bias,
+        },
+        outputs={
+            'Gate': gate,
+            'ResetHiddenPrev': reset_hidden_pre,
+            'Hidden': updated_hidden,
+        },
+        attrs={
+            # Use the requested activations; these were hard-coded to 0 and
+            # 1, which silently turned the default 'tanh' into identity.
+            'activation': activation,
+            'gate_activation': gate_activation,
+        })
+
+    return updated_hidden, reset_hidden_pre, gate
+
+
+def linear_chain_crf(input,
+                     label,
+                     param_attr=None,
+                     main_program=None,
+                     startup_program=None):
+    """
+    Append a `linear_chain_crf` operator and its transition parameter.
+
+    Args:
+       input: Variable passed as the operator input `Emission`. Its second
+           dimension `n` determines the transition shape [n + 2, n].
+       label: Variable passed as the operator input `Label`.
+       param_attr: Attribute of the transition parameter.
+       main_program: Forwarded to the LayerHelper constructor.
+       startup_program: Forwarded to the LayerHelper constructor.
+
+    Returns:
+       The variable receiving the operator output `LogLikelihood`.
+    """
+    helper = LayerHelper('linear_chain_crf', **locals())
+    dtype = helper.input_dtype()
+    tag_num = input.shape[1]
+    transition = helper.create_parameter(
+        attr=helper.param_attr, shape=[tag_num + 2, tag_num], dtype=dtype)
+    alpha = helper.create_tmp_variable(dtype=dtype)
+    emission_exps = helper.create_tmp_variable(dtype=dtype)
+    transition_exps = helper.create_tmp_variable(dtype=dtype)
+    log_likelihood = helper.create_tmp_variable(dtype=dtype)
+    op_inputs = {
+        "Emission": [input],
+        "Transition": transition,
+        "Label": label
+    }
+    op_outputs = {
+        "Alpha": [alpha],
+        "EmissionExps": [emission_exps],
+        "TransitionExps": transition_exps,
+        "LogLikelihood": log_likelihood
+    }
+    helper.append_op(
+        type='linear_chain_crf', inputs=op_inputs, outputs=op_outputs)
+
+    return log_likelihood
+
+
+def crf_decoding(input,
+                 param_attr,
+                 label=None,
+                 main_program=None,
+                 startup_program=None):
+    """
+    Append a `crf_decoding` operator that reuses an existing transition
+    parameter.
+
+    Args:
+       input: Variable passed as the operator input `Emission`.
+       param_attr: Attribute whose `name` identifies the transition parameter
+           to fetch from the helper.
+       label: Optional variable passed as the operator input `Label`.
+       main_program: Forwarded to the LayerHelper constructor.
+       startup_program: Forwarded to the LayerHelper constructor.
+
+    Returns:
+       The variable receiving the operator output `ViterbiPath`.
+    """
+    helper = LayerHelper('crf_decoding', **locals())
+    transition = helper.get_parameter(param_attr.name)
+    viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype())
+    op_inputs = {
+        "Emission": [input],
+        "Transition": transition,
+        "Label": label
+    }
+    helper.append_op(
+        type='crf_decoding',
+        inputs=op_inputs,
+        outputs={"ViterbiPath": [viterbi_path]})
+
+    return viterbi_path
+
+
+def cos_sim(X, Y, **kwargs):
+    """
+    Append a `cos_sim` operator computing the cosine similarity of the
+    tensors X and Y.
+
+    Args:
+       X: Variable passed as the operator input `X`.
+       Y: Variable passed as the operator input `Y`.
+       **kwargs: Forwarded to the LayerHelper constructor.
+
+    Returns:
+       The variable receiving the operator output `Out`. The `XNorm` and
+       `YNorm` outputs are created but not returned.
+    """
+    helper = LayerHelper('cos_sim', **kwargs)
+    similarity = helper.create_tmp_variable(dtype=X.dtype)
+    x_norm = helper.create_tmp_variable(dtype=X.dtype)
+    y_norm = helper.create_tmp_variable(dtype=X.dtype)
+    op_outputs = {
+        'Out': [similarity],
+        'XNorm': [x_norm],
+        'YNorm': [y_norm]
+    }
+    helper.append_op(
+        type='cos_sim', inputs={'X': [X],
+                                'Y': [Y]}, outputs=op_outputs)
+    return similarity
+
+
+def cross_entropy(input, label, **kwargs):
+    """
+    Append a `cross_entropy` operator on `input` and `label`.
+
+    Args:
+       input: Variable passed as the operator input `X`.
+       label: Variable passed as the operator input `Label`.
+       **kwargs: Forwarded to the LayerHelper constructor and also passed,
+           unchanged, as the operator's attributes.
+
+    Returns:
+       A temporary variable (same dtype as `input`) receiving the output `Y`.
+    """
+    helper = LayerHelper('cross_entropy', **kwargs)
+    loss = helper.create_tmp_variable(dtype=input.dtype)
+    op_inputs = {'X': [input], 'Label': [label]}
+    helper.append_op(
+        type='cross_entropy',
+        inputs=op_inputs,
+        outputs={'Y': [loss]},
+        attrs=kwargs)
+    return loss
+
+
+def square_error_cost(input, label, **kwargs):
+    """
+    Compute the element-wise squared error between `input` and `label`.
+
+    Two operators are appended: `elementwise_sub` (input - label) followed by
+    `square` on the difference.
+
+    Args:
+       input: Variable passed as `X` to the subtraction.
+       label: Variable passed as `Y` to the subtraction.
+       **kwargs: Forwarded to the LayerHelper constructor.
+
+    Returns:
+       A temporary variable (same dtype as `input`) holding the squared
+       difference.
+    """
+    helper = LayerHelper('square_error_cost', **kwargs)
+
+    difference = helper.create_tmp_variable(dtype=input.dtype)
+    sub_inputs = {'X': [input], 'Y': [label]}
+    helper.append_op(
+        type='elementwise_sub',
+        inputs=sub_inputs,
+        outputs={'Out': [difference]})
+
+    squared = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='square',
+        inputs={'X': [difference]},
+        outputs={'Y': [squared]})
+    return squared
+
+
+def accuracy(input, label, k=1, correct=None, total=None, **kwargs):
+    """
+    This function appends a `top_k` op (with the given k) on the input and an
+    `accuracy` op on its result and label; returns the `Accuracy` variable.
+    """
+    helper = LayerHelper("accuracy", **kwargs)
+    topk_out = helper.create_tmp_variable(dtype=input.dtype)
+    topk_indices = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="top_k",
+        inputs={"X": [input]},
+        outputs={"Out": [topk_out],
+                 "Indices": [topk_indices]},
+        attrs={"k": k})
+    acc_out = helper.create_tmp_variable(dtype="float32")
+    if correct is None:
+        correct = helper.create_tmp_variable(dtype="int64")
+    if total is None:
+        total = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="accuracy",
+        inputs={
+            "Out": [topk_out],
+            "Indices": [topk_indices],
+            "Label": [label]
+        },
+        outputs={
+            "Accuracy": [acc_out],
+            "Correct": [correct],
+            "Total": [total],
+        })
+    return acc_out
+
+
+def chunk_eval(input,
+               label,
+               chunk_scheme,
+               num_chunk_types,
+               excluded_chunk_types=None,
+               **kwargs):
+    """
+    This function appends a `chunk_eval` op on the input and label and
+    returns its `Precision`, `Recall` and `F1-Score` outputs as a tuple.
+    """
+    helper = LayerHelper("chunk_eval", **kwargs)
+
+    # prepare output
+    precision = helper.create_tmp_variable(dtype="float32")
+    recall = helper.create_tmp_variable(dtype="float32")
+    f1_score = helper.create_tmp_variable(dtype="float32")
+
+    helper.append_op(
+        type="chunk_eval",
+        inputs={"Inference": [input],
+                "Label": [label]},
+        outputs={
+            "Precision": [precision],
+            "Recall": [recall],
+            "F1-Score": [f1_score]
+        },
+        attrs={
+            "num_chunk_types": num_chunk_types,
+            'chunk_scheme': chunk_scheme,
+            'excluded_chunk_types': excluded_chunk_types or []
+        })
+    return precision, recall, f1_score
+
+
+def sequence_conv(input,
+                  num_filters,
+                  filter_size=3,
+                  filter_stride=1,
+                  padding=None,
+                  bias_attr=None,
+                  param_attr=None,
+                  act=None,
+                  main_program=None,
+                  startup_program=None):
+    """
+    This function appends a `sequence_conv` op with a new filter parameter of
+    shape [filter_size * input.shape[1], num_filters], then the helper's bias
+    and activation. `padding` is only forwarded to LayerHelper via locals().
+    """
+
+    # FIXME(dzh) : want to unify the argument of python layer
+    # function. So we ignore some unnecessary attributes.
+    # such as, padding_trainable, context_start.
+
+    helper = LayerHelper('sequence_conv', **locals())
+    dtype = helper.input_dtype()
+    filter_shape = [filter_size * input.shape[1], num_filters]
+    filter_param = helper.create_parameter(
+        attr=helper.param_attr, shape=filter_shape, dtype=dtype)
+    pre_bias = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='sequence_conv',
+        inputs={
+            'X': [input],
+            'Filter': [filter_param],
+        },
+        outputs={"Out": pre_bias},
+        attrs={
+            'contextStride': filter_stride,
+            'contextStart': -int(filter_size / 2),
+            'contextLength': filter_size
+        })
+    pre_act = helper.append_bias_op(pre_bias)
+    return helper.append_activation(pre_act)
+
+
+def conv2d(input,
+           num_filters,
+           filter_size,
+           stride=None,
+           padding=None,
+           groups=None,
+           param_attr=None,
+           bias_attr=None,
+           act=None,
+           name=None,
+           main_program=None,
+           startup_program=None):
+    """
+    This function appends a 2-dimensional convolution op (`conv2d_cudnn`).
+    filter_size, stride and padding may each be an int (used for both
+    dimensions) or a 2-element list/tuple; stride defaults to [1, 1].
+    The helper's bias op and the activation given by `act` are appended on
+    top of the conv output. Raises ValueError if channels % groups != 0.
+    """
+
+    if stride is None:
+        stride = [1, 1]
+    helper = LayerHelper('conv2d', **locals())
+    dtype = helper.input_dtype()
+
+    num_channels = input.shape[1]
+    if groups is None:
+        num_filter_channels = num_channels
+    else:
+        if num_channels % groups != 0:
+            raise ValueError("num_channels must be divisible by groups.")
+        num_filter_channels = num_channels // groups
+
+    if isinstance(filter_size, int):
+        filter_size = [filter_size, filter_size]
+    if isinstance(stride, int):
+        stride = [stride, stride]
+    if isinstance(padding, int):
+        padding = [padding, padding]
+
+    # list() lets a tuple filter_size be concatenated with the list prefix.
+    filter_shape = [num_filters, num_filter_channels] + list(filter_size)
+
+    def _get_default_param_initializer():
+        std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
+        return Normal(0.0, std, 0)
+
+    filter_param = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=filter_shape,
+        dtype=dtype,
+        default_initializer=_get_default_param_initializer())
+
+    pre_bias = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='conv2d_cudnn',
+        inputs={
+            'Input': input,
+            'Filter': filter_param,
+        },
+        outputs={"Output": pre_bias},
+        attrs={'strides': stride,
+               'paddings': padding,
+               'groups': groups})
+
+    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
+
+    return helper.append_activation(pre_act)
+
+
+def sequence_pool(input, pool_type, **kwargs):
+    """
+    This function appends a `sequence_pool` op on the input, passing the
+    upper-cased pool_type as the `pooltype` attribute. Only the `Out`
+    variable is returned; the `MaxIndex` output is an unreturned temporary.
+    """
+    helper = LayerHelper('sequence_pool', input=input, **kwargs)
+    dtype = helper.input_dtype()
+    pool_out = helper.create_tmp_variable(dtype)
+    max_index = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type="sequence_pool",
+        inputs={"X": input},
+        outputs={"Out": pool_out,
+                 "MaxIndex": max_index},
+        attrs={"pooltype": pool_type.upper()})
+
+    return pool_out
+
+
+def pool2d(input,
+           pool_size,
+           pool_type,
+           pool_stride=None,
+           pool_padding=None,
+           global_pooling=False,
+           main_program=None,
+           startup_program=None):
+    """
+    This function appends a `pool2d` op. pool_type must be 'max' or 'avg'
+    (else ValueError); int size/stride/padding apply to both dimensions.
+    """
+    if pool_padding is None:
+        pool_padding = [0, 0]
+    if pool_stride is None:
+        pool_stride = [1, 1]
+    if pool_type not in ["max", "avg"]:
+        raise ValueError(
+            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'." %
+            str(pool_type))
+    if isinstance(pool_size, int):
+        pool_size = [pool_size, pool_size]
+    if isinstance(pool_stride, int):
+        pool_stride = [pool_stride, pool_stride]
+    if isinstance(pool_padding, int):
+        pool_padding = [pool_padding, pool_padding]
+
+    helper = LayerHelper('pool2d', **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type="pool2d",
+        inputs={"X": input},
+        outputs={"Out": pool_out},
+        attrs={
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "global_pooling": global_pooling,
+            "strides": pool_stride,
+            "paddings": pool_padding
+        })
+
+    return pool_out
+
+
+def batch_norm(input,
+               act=None,
+               is_test=False,
+               momentum=0.9,
+               epsilon=1e-05,
+               param_attr=None,
+               bias_attr=None,
+               data_layout='NCHW',
+               main_program=None,
+               startup_program=None):
+    """
+    This function appends a `batch_norm` op with per-channel scale/bias
+    parameters and persistable mean/variance variables, then the activation.
+    """
+    helper = LayerHelper('batch_norm', **locals())
+    dtype = helper.input_dtype()
+
+    input_shape = input.shape
+    if data_layout == 'NCHW':
+        channel_num = input_shape[1]
+    else:
+        if data_layout == 'NHWC':
+            channel_num = input_shape[-1]
+        else:
+            raise ValueError("unsupported data layout:" + data_layout)
+
+    param_shape = [channel_num]
+
+    # create parameter
+    scale = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=param_shape,
+        dtype=dtype,
+        default_initializer=Constant(1.0))
+
+    bias = helper.create_parameter(
+        attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
+
+    mean = helper.create_global_variable(
+        dtype=input.dtype, shape=param_shape, persistable=True)
+    helper.set_variable_initializer(var=mean, initializer=Constant(0.0))
+
+    variance = helper.create_global_variable(
+        dtype=input.dtype, shape=param_shape, persistable=True)
+    helper.set_variable_initializer(var=variance, initializer=Constant(1.0))
+
+    # create output
+    # mean and mean_out share the same memory
+    mean_out = mean
+    # variance and variance out share the same memory
+    variance_out = variance
+    saved_mean = helper.create_tmp_variable(dtype)
+    saved_variance = helper.create_tmp_variable(dtype)
+
+    batch_norm_out = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type="batch_norm",
+        inputs={
+            "X": input,
+            "Scale": scale,
+            "Bias": bias,
+            "Mean": mean,
+            "Variance": variance
+        },
+        outputs={
+            "Y": batch_norm_out,
+            "MeanOut": mean_out,
+            "VarianceOut": variance_out,
+            "SavedMean": saved_mean,
+            "SavedVariance": saved_variance
+        },
+        attrs={"momentum": momentum,
+               "epsilon": epsilon,
+               "is_test": is_test})
+
+    return helper.append_activation(batch_norm_out)
+
+
+def beam_search_decode(ids, scores, main_program=None, startup_program=None):
+    helper = LayerHelper('beam_search_decode', **locals())
+    sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
+    sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
+    # NOTE(review): sentence_scores is typed from ids, not scores -- confirm.
+    helper.append_op(
+        type="beam_search_decode",
+        inputs={"Ids": ids,
+                "Scores": scores},
+        outputs={
+            "SentenceIds": sentence_ids,
+            "SentenceScores": sentence_scores
+        })
+    # Returns the op's (SentenceIds, SentenceScores) output variables.
+    return sentence_ids, sentence_scores
+
+
+def conv2d_transpose(input,
+                     num_filters,
+                     output_size=None,
+                     filter_size=None,
+                     padding=None,
+                     stride=None,
+                     dilation=None,
+                     param_attr=None,
+                     main_program=None,
+                     startup_program=None):
+    """
+    The transpose of conv2d layer.
+
+    This layer is also known as deconvolution layer.
+
+    Args:
+        input(Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of filters. It is the same as the output
+            image channel.
+        output_size(int|tuple|None): The output image size. If output size is a
+            tuple, it must contain two integers, (image_H, image_W). This
+            parameter only works when filter_size is None.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square. If None, filter_size is
+            calculated from output_size.
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride.
+        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation.
+        param_attr: Parameter Attribute.
+        main_program(Program): the main program
+        startup_program(Program): the startup program
+
+    Returns:
+        Variable: Output image.
+    """
+    helper = LayerHelper("conv2d_transpose", **locals())
+    if not isinstance(input, Variable):
+        raise TypeError("Input of conv2d_transpose must be Variable")
+    input_channel = input.shape[1]
+
+    op_attr = dict()
+
+    if isinstance(padding, int):
+        op_attr['paddings'] = [padding, padding]
+    elif padding is not None:
+        op_attr['paddings'] = padding
+
+    if isinstance(stride, int):
+        op_attr['strides'] = [stride, stride]
+    elif stride is not None:
+        op_attr['strides'] = stride
+
+    if isinstance(dilation, int):
+        op_attr['dilations'] = [dilation, dilation]
+    elif dilation is not None:
+        op_attr['dilations'] = dilation
+
+    if filter_size is None:
+        if output_size is None:
+            raise ValueError("output_size must be set when filter_size is None")
+        if isinstance(output_size, int):
+            output_size = [output_size, output_size]
+
+        padding = op_attr.get('paddings', [0, 0])
+        stride = op_attr.get('strides', [1, 1])
+        dilation = op_attr.get('dilations', [1, 1])
+
+        h_in = input.shape[2]
+        w_in = input.shape[3]
+
+        filter_size_h = (output_size[0] - (h_in - 1) * stride[0] + 2 *
+                         padding[0] - 1) // dilation[0] + 1
+        filter_size_w = (output_size[1] - (w_in - 1) * stride[1] + 2 *
+                         padding[1] - 1) // dilation[1] + 1
+        filter_size = [filter_size_h, filter_size_w]
+
+    elif isinstance(filter_size, int):
+        filter_size = [filter_size, filter_size]
+    # list() accepts the documented tuple form of filter_size.
+    filter_shape = [input_channel, num_filters] + list(filter_size)
+    img_filter = helper.create_parameter(
+        dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
+
+    out = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='conv2d_transpose',
+        inputs={'Input': [input],
+                'Filter': [img_filter]},
+        outputs={'Output': out},
+        attrs=op_attr)
+
+    return out
diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py
new file mode 100644
index 0000000000..fa312ace60
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/ops.py
@@ -0,0 +1,9 @@
+from ..registry import register_layer
+__all__ = [
+    'mean', 'mul', 'dropout', 'reshape', 'sigmoid', 'scale', 'transpose',
+    'sigmoid_cross_entropy_with_logits', 'elementwise_add', 'elementwise_div',
+    'elementwise_sub', 'elementwise_mul', 'clip', 'abs'
+]
+# Bind one generated layer function per listed operator name.
+for _OP in set(__all__):
+    globals()[_OP] = register_layer(_OP)
diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py
new file mode 100644
index 0000000000..a839ed897d
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/tensor.py
@@ -0,0 +1,130 @@
+from ..layer_helper import LayerHelper
+
+__all__ = [
+    'create_tensor', 'cast', 'concat', 'sums', 'assign',
+    'fill_constant_batch_size_like', 'fill_constant', 'ones', 'zeros'
+]
+
+
+def create_tensor(dtype, name=None, main_program=None, startup_program=None):
+    helper = LayerHelper("create_tensor", **locals())
+    return helper.create_variable(name=helper.name, dtype=dtype)  # named after the helper
+
+
+def cast(x, dtype, main_program=None):
+    """
+    This function appends a `cast` op converting x from x.dtype (`in_dtype`)
+    to the requested dtype (`out_dtype`) and returns the new variable.
+    """
+    helper = LayerHelper('cast', **locals())
+    out = helper.create_tmp_variable(dtype=dtype)
+    helper.append_op(
+        type='cast',
+        inputs={'X': [x]},
+        outputs={'Out': [out]},
+        attrs={'in_dtype': x.dtype,
+               'out_dtype': out.dtype})
+    return out
+
+
+def concat(input, axis, main_program=None, startup_program=None):
+    """
+    This function appends a `concat` op joining the inputs along `axis`
+    and returns the output variable (typed by the helper's input dtype).
+    """
+    helper = LayerHelper('concat', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='concat',
+        inputs={'X': input},
+        outputs={'Out': [out]},
+        attrs={'axis': axis})
+    return out
+
+
+def sums(input, out=None, main_program=None, startup_program=None):
+    """
+    This function appends a `sum` op over the input and returns `out`; a
+    temporary output variable is created when `out` is not supplied.
+    """
+    helper = LayerHelper('sum', **locals())
+    if out is None:
+        out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(type='sum', inputs={'X': input}, outputs={'Out': out})
+    return out
+
+
+def assign(input, output, main_program=None, startup_program=None):
+    helper = LayerHelper('assign', **locals())
+    helper.append_op(
+        type='scale',  # copy expressed as a `scale` op with factor 1.0
+        inputs={'X': [input]},
+        outputs={'Out': [output]},
+        attrs={'scale': 1.0})
+    return output
+
+
+def fill_constant(shape,
+                  dtype,
+                  value,
+                  out=None,
+                  main_program=None,
+                  startup_program=None):
+    """
+    This function appends a `fill_constant` op that fills `out` (a new
+    temporary of the given dtype when not supplied) with float(value) for
+    the given shape. It also sets out.stop_gradient to True.
+    """
+    helper = LayerHelper("fill_constant", **locals())
+    if out is None:
+        out = helper.create_tmp_variable(dtype=dtype)
+    helper.append_op(
+        type='fill_constant',
+        inputs={},
+        outputs={'Out': [out]},
+        attrs={'shape': shape,
+               'dtype': out.dtype,
+               'value': float(value)})
+    out.stop_gradient = True
+    return out
+
+
+def fill_constant_batch_size_like(input,
+                                  shape,
+                                  dtype,
+                                  value,
+                                  input_dim_idx=0,
+                                  output_dim_idx=0,
+                                  main_program=None,
+                                  startup_program=None):
+    helper = LayerHelper("fill_constant_batch_size_like", **locals())
+    out = helper.create_tmp_variable(dtype=dtype)
+    helper.append_op(
+        type='fill_constant_batch_size_like',
+        inputs={'Input': input},
+        outputs={'Out': [out]},
+        attrs={
+            'shape': shape,
+            'dtype': out.dtype,
+            'value': float(value),
+            'input_dim_idx': input_dim_idx,
+            'output_dim_idx': output_dim_idx
+        })
+    out.stop_gradient = True  # same as fill_constant: output stops gradients
+    return out
+
+
+def ones(shape, dtype, main_program=None):
+    """
+    This function calls fill_constant() with value=1.0, forwarding shape,
+    dtype and main_program (captured via locals()).
+    """
+    return fill_constant(value=1.0, **locals())
+
+
+def zeros(shape, dtype, main_program=None):
+    """
+    This function calls fill_constant() with value=0.0, forwarding shape,
+    dtype and main_program (captured via locals()).
+    """
+    return fill_constant(value=0.0, **locals())
diff --git a/python/paddle/v2/fluid/param_attr.py b/python/paddle/v2/fluid/param_attr.py
index 86088fdd7c..7952a5ea51 100644
--- a/python/paddle/v2/fluid/param_attr.py
+++ b/python/paddle/v2/fluid/param_attr.py
@@ -36,6 +36,8 @@ class ParamAttr(object):
     def to_attr(arg):
         if arg is None:
             return ParamAttr()
+        elif isinstance(arg, list) or isinstance(arg, tuple):
+            return [ParamAttr.to_attr(a) for a in arg]
         elif isinstance(arg, ParamAttr):
             return arg
         elif isinstance(arg, str) or isinstance(arg, unicode):
diff --git a/python/paddle/v2/fluid/registry.py b/python/paddle/v2/fluid/registry.py
new file mode 100644
index 0000000000..6f5dd365de
--- /dev/null
+++ b/python/paddle/v2/fluid/registry.py
@@ -0,0 +1,186 @@
+import re
+import cStringIO
+import warnings
+import functools
+import inspect
+
+import proto.framework_pb2 as framework_pb2
+from framework import OpProtoHolder, Variable, Program, Operator
+from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
+
+__all__ = ['deprecated', 'register_layer']
+
+
+def _convert_(name):
+    """
+    Formatting.
+
+    Args:
+       name: The name/alias
+
+    This function converts a CamelCase name to lower-case snake_case: an
+    underscore is inserted before each capitalized word and between a
+    lower-case letter or digit and a following capital, then the result
+    is lower-cased (e.g. "XNorm" -> "x_norm").
+    """
+    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
+    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
+
+
+def _generate_doc_string_(op_proto):
+    """
+    Generate docstring by OpProto
+
+    Args:
+        op_proto (framework_pb2.OpProto): a protobuf message typed OpProto
+
+    Returns:
+        str: the document string
+    """
+
+    def _type_to_str_(tp):
+        return framework_pb2.AttrType.Name(tp)
+
+    if not isinstance(op_proto, framework_pb2.OpProto):
+        raise TypeError("OpProto should be `framework_pb2.OpProto`")
+
+    buf = cStringIO.StringIO()
+    buf.write(op_proto.comment)
+    buf.write('\nArgs:\n')
+    for each_input in op_proto.inputs:
+        line_begin = '    {0}: '.format(_convert_(each_input.name))
+        buf.write(line_begin)
+        buf.write(each_input.comment)
+        buf.write('\n')
+        buf.write(' ' * len(line_begin))
+        buf.write('Duplicable: ')
+        buf.write(str(each_input.duplicable))
+        buf.write('  Optional: ')
+        buf.write(str(each_input.dispensable))
+        buf.write('\n')
+
+    for each_attr in op_proto.attrs:
+        buf.write('    ')
+        buf.write(each_attr.name)
+        buf.write(' (')
+        buf.write(_type_to_str_(each_attr.type))
+        buf.write('): ')
+        buf.write(each_attr.comment)
+        buf.write('\n')
+    # Document the first non-intermediate output (or the last output if none).
+    if len(op_proto.outputs) != 0:
+        buf.write('\nReturns:\n')
+        buf.write('    ')
+        for each_opt in op_proto.outputs:
+            if not each_opt.intermediate:
+                break
+        buf.write(each_opt.comment)
+
+    return buf.getvalue()
+
+
+def register_layer(op_type):
+    """
+    Register a Python layer for an Operator
+
+    Args:
+       op_type: The name of the operator to be created
+
+    This function looks up the operator proto for op_type (sigmoid, mean, etc)
+    and returns a keyword-argument-only layer function that appends that op.
+    Raises ValueError unless the op has exactly one non-intermediate output
+    and none of its outputs is duplicable.
+    """
+    op_proto = OpProtoHolder.instance().get_op_proto(op_type)
+    not_intermediate_outputs = \
+        filter(lambda output: not output.intermediate, op_proto.outputs)
+    intermediate_outputs = \
+        filter(lambda output: output.intermediate, op_proto.outputs)
+
+    if len(not_intermediate_outputs) != 1:
+        raise ValueError("Only one non intermediate output operator can be "
+                         "automatically generated")
+
+    if not_intermediate_outputs[0].duplicable:
+        raise ValueError(
+            "Only non duplicable op can be automatically generated")
+
+    for output in intermediate_outputs:
+        if output.duplicable:
+            raise ValueError("The op can be automatically generated only when "
+                             "all intermediate ops are not duplicable")
+
+    o_name = not_intermediate_outputs[0].name
+    intermediate_output_names = [output.name for output in intermediate_outputs]
+
+    def infer_and_check_dtype(op_proto, **kwargs):
+        """
+        This function performs the sanity check for dtype and
+        instance type.
+        """
+        dtype = None
+        for ipt in op_proto.inputs:
+            name = _convert_(ipt.name)
+            val = kwargs.pop(name, [])
+            if not isinstance(val, list) and not isinstance(val, tuple):
+                val = [val]
+            for each in val:
+                if not isinstance(each, Variable):
+                    raise ValueError("input of {0} must be variable".format(
+                        op_type))
+
+                if dtype is None:
+                    dtype = each.dtype
+                elif dtype != each.dtype:
+                    raise ValueError(
+                        "operator {0} must input same dtype. {1} vs {2}".format(
+                            op_type, dtype, each.dtype))
+
+        return dtype
+
+    def func(**kwargs):
+        helper = LayerHelper(op_type, **kwargs)
+
+        dtype = infer_and_check_dtype(op_proto, **kwargs)
+
+        inputs = dict()
+        for ipt in op_proto.inputs:
+            name = _convert_(ipt.name)
+            val = kwargs.pop(name, [])
+            if not isinstance(val, list) and not isinstance(val, tuple):
+                val = [val]
+            inputs[ipt.name] = val
+
+        outputs = dict()
+        out = helper.create_tmp_variable(dtype=dtype)
+        outputs[o_name] = [out]
+        for name in intermediate_output_names:
+            outputs[name] = [helper.create_tmp_variable(dtype=dtype)]
+        helper.append_op(
+            type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs)
+        return helper.append_activation(out)
+
+    func.__name__ = op_type
+    func.__doc__ = _generate_doc_string_(op_proto)
+    return func
+
+
+def deprecated(func_or_class):
+    """
+    Deprecated warning decorator. It emits a DeprecationWarning on each call.
+    Should be used before class or function, member function
+    """
+
+    @functools.wraps(func_or_class)
+    def func_wrapper(*args, **kwargs):
+        """
+        Wrap func_or_class with deprecated warning
+        """
+        warnings.simplefilter('always', DeprecationWarning)  # turn off filter
+        warnings.warn(
+            "Call to deprecated function {}.".format(func_or_class.__name__),
+            category=DeprecationWarning,
+            stacklevel=2)
+        warnings.simplefilter('default', DeprecationWarning)  # reset filter
+        return func_or_class(*args, **kwargs)
+
+    return func_wrapper
diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
index 4e71b6f345..3d336ffe95 100644
--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
@@ -1,9 +1,9 @@
 from __future__ import print_function
 
-import numpy as np
+import sys
+
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
-import sys
 
 
 def resnet_cifar10(input, depth=32):
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
index 80f8599679..c0b051f862 100644
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
@@ -1,6 +1,51 @@
 import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+from paddle.v2.fluid.layer_helper import LayerHelper
+
+
+def lstm(x,
+         c_pre_init,
+         hidden_dim,
+         forget_bias=None,
+         main_program=None,
+         startup_program=None):
+    """
+    This function builds a StaticRNN whose step concatenates the step input
+    with the cell memory, applies fc (4 * hidden_dim) and an `lstm_unit` op.
+    """
+    helper = LayerHelper('lstm_unit', **locals())
+    rnn = fluid.layers.StaticRNN()
+    with rnn.step():
+        c_pre = rnn.memory(init=c_pre_init)
+        x_t = rnn.step_input(x)
+
+        before_fc = fluid.layers.concat(
+            input=[x_t, c_pre],
+            axis=1,
+            main_program=main_program,
+            startup_program=startup_program)
+        after_fc = fluid.layers.fc(input=before_fc,
+                                   size=hidden_dim * 4,
+                                   main_program=main_program,
+                                   startup_program=startup_program)
+
+        dtype = x.dtype
+        c = helper.create_tmp_variable(dtype)
+        h = helper.create_tmp_variable(dtype)
+
+        helper.append_op(
+            type='lstm_unit',
+            inputs={"X": after_fc,
+                    "C_prev": c_pre},
+            outputs={"C": c,
+                     "H": h},
+            attrs={"forget_bias": forget_bias})
+
+        rnn.update_memory(c_pre, c)
+        rnn.output(h)
+
+    return rnn()
 
 
 def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
@@ -23,8 +68,7 @@ def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
     c_pre_init = fluid.layers.fill_constant(
         dtype=emb.dtype, shape=[batch_size, emb_dim], value=0.0)
     c_pre_init.stop_gradient = False
-    layer_1_out = fluid.layers.lstm(
-        emb, c_pre_init=c_pre_init, hidden_dim=emb_dim)
+    layer_1_out = lstm(emb, c_pre_init=c_pre_init, hidden_dim=emb_dim)
     layer_1_out = fluid.layers.transpose(x=layer_1_out, axis=[1, 0, 2])
 
     prediction = fluid.layers.fc(input=layer_1_out,
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
index 57f6a362de..9b88080158 100644
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -29,7 +29,10 @@ class TestBook(unittest.TestCase):
             label = layers.data(name='label', shape=[1], dtype='int32')
             hidden1 = layers.fc(input=images, size=128, act='relu')
             hidden2 = layers.fc(input=hidden1, size=64, act='relu')
-            predict = layers.fc(input=hidden2, size=10, act='softmax')
+            predict = layers.fc(input=[hidden2, hidden1],
+                                size=10,
+                                act='softmax',
+                                param_attr=["sftmax.w1", "sftmax.w2"])
             cost = layers.cross_entropy(input=predict, label=label)
             avg_cost = layers.mean(x=cost)
             self.assertIsNotNone(avg_cost)
diff --git a/python/paddle/v2/fluid/tests/test_reduce_op.py b/python/paddle/v2/fluid/tests/test_reduce_op.py
index 70359d60cb..a021d4dd91 100644
--- a/python/paddle/v2/fluid/tests/test_reduce_op.py
+++ b/python/paddle/v2/fluid/tests/test_reduce_op.py
@@ -85,5 +85,19 @@ class Test1DReduce(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestReduceAll(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")}
+        self.attrs = {'reduce_all': True}  # expected output is the full sum
+        self.outputs = {'Out': self.inputs['X'].sum()}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_registry.py b/python/paddle/v2/fluid/tests/test_registry.py
new file mode 100644
index 0000000000..f8328f31cf
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_registry.py
@@ -0,0 +1,24 @@
+import unittest
+import warnings
+
+import numpy as np
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.registry as registry
+
+
+class TestRegistry(unittest.TestCase):
+    def test_registry_layer(self):
+        """Run a `mean` layer on random 10x10 input and compare with np.mean."""
+        self.layer_type = "mean"
+        program = framework.Program()
+
+        x = fluid.layers.data(name='X', shape=[10, 10], dtype='float32')
+        output = layers.mean(x)
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+
+        X = np.random.random((10, 10)).astype("float32")
+        mean_out = exe.run(program, feed={"X": X}, fetch_list=[output])
+        self.assertAlmostEqual(np.mean(X), mean_out)
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
index bd97dc1199..7b7d1a1d16 100644
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
@@ -383,19 +383,25 @@ class Parameters(object):
             params.deserialize(param_name, f)
         return params
 
-    def init_from_tar(self, f):
+    def init_from_tar(self, f, exclude_params=None):
         """
         Different from `from_tar`, this interface can be used to
         init partial network parameters from another saved model.
 
         :param f: the initialized model file.
         :type f: tar file
+        :param exclude_params: the names of parameters that should
+            not be initialized from the model file. Defaults to
+            None, which excludes nothing.
+        :type exclude_params: list of strings
         :return: Nothing.
         """
 
+        if exclude_params is None:
+            exclude_params = []
         tar_param = Parameters.from_tar(f)
         for pname in tar_param.names():
-            if pname in self.names():
+            if pname in self.names() and pname not in exclude_params:
                 self.set(pname, tar_param.get(pname))
 
 
diff --git a/python/setup.py.in b/python/setup.py.in
index 9ccb4dc176..8396fb44cf 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -68,6 +68,7 @@ packages=['paddle',
           'paddle.v2.plot',
           'paddle.v2.fluid',
           'paddle.v2.fluid.proto',
+          'paddle.v2.fluid.layers',
           'py_paddle']
 
 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: