diff --git a/.gitignore b/.gitignore
index ac56a3320e..fe0d13f4d9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,9 @@
+paddle/operators/check_t.save
+paddle/operators/check_tensor.ls
+paddle/operators/tensor.save
+python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
+python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
+python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
 *.DS_Store
 build/
 build_doc/
@@ -27,5 +33,5 @@ CMakeFiles
 cmake_install.cmake
 paddle/.timestamp
 python/paddlepaddle.egg-info/
-paddle/pybind/pybind.h
+paddle/fluid/pybind/pybind.h
 python/paddle/version.py
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e8ea828dd2..3a21574b85 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -137,7 +137,7 @@ include(external/openblas)  # download, build, install openblas
 include(external/mkldnn)    # download, build, install mkldnn
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
-include(external/boost)     # download, build, install boost
+include(external/boost)     # download boost
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
@@ -156,6 +156,7 @@ include(rdma)               # set rdma libraries
 include(flags)              # set paddle compile flags
 include(version)            # set PADDLE_VERSION
 include(coveralls)          # set code coverage
+include(inference_lib)      # add paddle fluid inference libraries
 
 
 include_directories("${PADDLE_SOURCE_DIR}")
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index a60453ff4e..3c36cffcb4 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,5 +1,8 @@
 # Contribute Code
 
+You are welcome to contribute to project PaddlePaddle. To contribute to PaddlePaddle, you have to agree with the 
+[PaddlePaddle Contributor License Agreement](https://gist.github.com/wangkuiyi/0c22c7b1bd3bb7eb27d76f85c3a3e329).
+
 We sincerely appreciate your contribution.  This document explains our workflow and work style.
 
 ## Workflow
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 6bea7cf302..de94bd5008 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -181,7 +181,8 @@ elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
 elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
     list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
 elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
-    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
+    # nvcc 9 does not support -Os. Use Release flags instead
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
 endif()
 
 mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index c70d83b3f4..dbc676bdac 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -21,6 +21,7 @@ set(BOOST_URL           "http://sourceforge.net/projects/boost/files/boost/${BOO
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
+set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
 
 include_directories(${BOOST_INCLUDE_DIR})
 
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index d49c8d6011..6a701e076c 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -28,9 +28,3 @@ endif()
 add_dependencies(eigen3 extern_eigen3)
 
 LIST(APPEND external_project_dependencies eigen3)
-
-IF(NOT WITH_C_API AND WITH_FLUID)
-    INSTALL(FILES ${EIGEN_INCLUDE_DIR}/Eigen/Core DESTINATION third_party/eigen3/Eigen)
-    INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/Eigen/src DESTINATION third_party/eigen3/Eigen)
-    INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/unsupported/Eigen DESTINATION third_party/eigen3/unsupported)
-ENDIF()
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index 6094630454..d4f252bb9f 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -52,7 +52,7 @@ ADD_DEPENDENCIES(gflags extern_gflags)
 
 LIST(APPEND external_project_dependencies gflags)
 
-IF(WITH_C_API OR WITH_FLUID)
+IF(WITH_C_API)
   INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags)
   IF(ANDROID)
     INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI})
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index 382fbda3b5..0c6b3aafcb 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -68,7 +68,7 @@ LINK_LIBRARIES(glog gflags)
 
 LIST(APPEND external_project_dependencies glog)
 
-IF(WITH_C_API OR WITH_FLUID)
+IF(WITH_C_API)
   INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog)
   IF(ANDROID)
     INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 365a370a9c..ff5855052d 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -250,7 +250,7 @@ IF(NOT PROTOBUF_FOUND)
     SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY}
         CACHE FILEPATH "protoc library." FORCE)
 
-    IF(WITH_C_API OR WITH_FLUID)
+    IF(WITH_C_API)
         INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
         IF(ANDROID)
             INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 7cb4efa7bf..5fa60df7b3 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -52,6 +52,7 @@ ExternalProject_Add(
                     -DWITH_TORCH=OFF
                     -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
                     -DBUILD_SHARED=ON
+                    -DBUILD_TESTS=OFF
                     -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                     -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                     ${EXTERNAL_OPTIONAL_ARGS}
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 33ef6860e1..1cb54ba216 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -179,20 +179,24 @@ function(cc_library TARGET_NAME)
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS)
   cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  if (cc_library_SRCS)
-    if (cc_library_SHARED OR cc_library_shared) # build *.so
+  if(cc_library_SRCS)
+    if(cc_library_SHARED OR cc_library_shared) # build *.so
       add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
     else()
       add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
     endif()
-    if (cc_library_DEPS)
+    if(cc_library_DEPS)
       # Don't need link libwarpctc.so
-      if ("${cc_library_DEPS};" MATCHES "warpctc;")
+      if("${cc_library_DEPS};" MATCHES "warpctc;")
         list(REMOVE_ITEM cc_library_DEPS warpctc)
         add_dependencies(${TARGET_NAME} warpctc)
       endif()
+      # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
+      target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
+      if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
+        list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
+      endif()
       add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
-      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
     endif()
     
     # cpplint code style
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
new file mode 100644
index 0000000000..7d53554358
--- /dev/null
+++ b/cmake/inference_lib.cmake
@@ -0,0 +1,90 @@
+# make package for paddle fluid shared and static library
+function(copy TARGET)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DSTS DEPS)
+    cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
+    list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
+    if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len})
+        message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
+    endif()
+    math(EXPR len "${copy_lib_SRCS_len} - 1")
+    
+    add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
+    foreach(index RANGE ${len})
+        list(GET copy_lib_SRCS ${index} src)
+        list(GET copy_lib_DSTS ${index} dst)
+        add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND mkdir -p "${dst}")
+        if(IS_DIRECTORY ${src})
+            add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp -r "${src}" "${dst}")
+        else()
+            add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp "${src}" "${dst}")
+        endif()
+    endforeach()
+endfunction()
+
+# third party
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/eigen3")
+copy(eigen3_lib
+  SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
+  DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
+)
+
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/gflags")
+copy(gflags_lib
+  SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
+  DSTS ${dst_dir} ${dst_dir}/lib
+)
+
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/glog")
+copy(glog_lib
+  SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
+  DSTS ${dst_dir} ${dst_dir}/lib
+)
+
+IF(NOT PROTOBUF_FOUND)
+    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
+    copy(protobuf_lib
+      SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LITE_LIBRARY}
+      DSTS ${dst_dir} ${dst_dir}/lib
+    )
+ENDIF(NOT PROTOBUF_FOUND)
+
+# paddle fluid module
+set(src_dir "${PADDLE_SOURCE_DIR}/paddle")
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle")
+set(module "framework")
+copy(framework_lib DEPS framework_py_proto 
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/framework/framework.pb.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
+)
+
+set(module "memory")
+copy(memory_lib
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
+)
+
+set(module "inference")
+copy(inference_lib DEPENDS paddle_fluid_shared
+  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/inference/libpaddle_fluid.so
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}
+)
+
+set(module "platform")
+copy(platform_lib
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details
+)
+
+set(module "string")
+copy(string_lib
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
+)
+
+add_custom_target(inference_lib_dist DEPENDS 
+  inference_lib framework_lib memory_lib platform_lib string_lib
+  gflags_lib glog_lib protobuf_lib eigen3_lib)
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 94dd3457fb..58ce5d61c9 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -47,3 +47,5 @@ sphinx_add_target(paddle_docs_cn
                   ${SPHINX_CACHE_DIR_CN}
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_CN})
+
+add_subdirectory(api)
diff --git a/doc/api/CMakeLists.txt b/doc/api/CMakeLists.txt
new file mode 100644
index 0000000000..4e0bc1d5b8
--- /dev/null
+++ b/doc/api/CMakeLists.txt
@@ -0,0 +1,20 @@
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output director
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
+    @ONLY)
+
+sphinx_add_target(paddle_api_docs
+                  html
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_EN})
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index e24613b94b..58c493fd74 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -323,6 +323,12 @@ batch_norm
 ..  autofunction:: paddle.v2.fluid.layers.batch_norm
     :noindex:
 
+layer_norm
+----------
+
+..  autofunction:: paddle.v2.fluid.layers.layer_norm
+    :noindex:
+
 beam_search_decode
 ------------------
 
diff --git a/doc/howto/dev/build_cn.md b/doc/build_and_install/build_cn.md
similarity index 100%
rename from doc/howto/dev/build_cn.md
rename to doc/build_and_install/build_cn.md
diff --git a/doc/howto/dev/build_en.md b/doc/build_and_install/build_en.md
similarity index 100%
rename from doc/howto/dev/build_en.md
rename to doc/build_and_install/build_en.md
diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/build_and_install/build_from_source_cn.rst
similarity index 100%
rename from doc/getstarted/build_and_install/build_from_source_cn.rst
rename to doc/build_and_install/build_from_source_cn.rst
diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/build_and_install/build_from_source_en.rst
similarity index 100%
rename from doc/getstarted/build_and_install/build_from_source_en.rst
rename to doc/build_and_install/build_from_source_en.rst
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/build_and_install/docker_install_cn.rst
similarity index 100%
rename from doc/getstarted/build_and_install/docker_install_cn.rst
rename to doc/build_and_install/docker_install_cn.rst
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/build_and_install/docker_install_en.rst
similarity index 100%
rename from doc/getstarted/build_and_install/docker_install_en.rst
rename to doc/build_and_install/docker_install_en.rst
diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/build_and_install/index_cn.rst
similarity index 94%
rename from doc/getstarted/build_and_install/index_cn.rst
rename to doc/build_and_install/index_cn.rst
index c9ba84c842..4220ff2279 100644
--- a/doc/getstarted/build_and_install/index_cn.rst
+++ b/doc/build_and_install/index_cn.rst
@@ -13,7 +13,7 @@ PaddlePaddle提供pip和Docker的安装方式：
 
    pip_install_cn.rst
    docker_install_cn.rst
-   ../../howto/dev/build_cn.md
+   build_cn.md
 
 编译流程
 ++++++++
diff --git a/doc/getstarted/build_and_install/index_en.rst b/doc/build_and_install/index_en.rst
similarity index 95%
rename from doc/getstarted/build_and_install/index_en.rst
rename to doc/build_and_install/index_en.rst
index 32d66d63dd..db6b5be742 100644
--- a/doc/getstarted/build_and_install/index_en.rst
+++ b/doc/build_and_install/index_en.rst
@@ -13,7 +13,7 @@ You can choose either pip or Docker to complete your install:
 
    pip_install_en.rst
    docker_install_en.rst
-   ../../howto/dev/build_en.md
+   build_en.md
 
 
 Build from Source
diff --git a/doc/getstarted/build_and_install/paddleci.png b/doc/build_and_install/paddleci.png
similarity index 100%
rename from doc/getstarted/build_and_install/paddleci.png
rename to doc/build_and_install/paddleci.png
diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/build_and_install/pip_install_cn.rst
similarity index 100%
rename from doc/getstarted/build_and_install/pip_install_cn.rst
rename to doc/build_and_install/pip_install_cn.rst
diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/build_and_install/pip_install_en.rst
similarity index 100%
rename from doc/getstarted/build_and_install/pip_install_en.rst
rename to doc/build_and_install/pip_install_en.rst
diff --git a/doc/design/auto_gradient_check.md b/doc/design/auto_gradient_check.md
index f9991541bc..773b7b6a76 100644
--- a/doc/design/auto_gradient_check.md
+++ b/doc/design/auto_gradient_check.md
@@ -1,23 +1,23 @@
-## Auto Gradient Checker Design
+## Auto Gradient Check Design
 
-## Backgraound：
-- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right:
-  1. you should get the right backpropagation formula according to the forward computation.
-  2. you should implement it right in CPP.
-  3. it's difficult to prepare test data.
+## Background：
+- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right because of the following challenges:
+  1. The formula for backpropagation formula should be correct according to the forward computation.
+  2. The Implementation of the above shoule be correct in CPP.
+  3. It is difficult to prepare an unbiased test data.
 
-- Auto gradient checking gets a numerical gradient by forward Operator and use it as a reference of the backward Operator's result. It has several advantages:
-  1. numerical gradient checker only need forward operator.
-  2. user only need to prepare the input data for forward Operator.
+- Auto gradient checking gets a numerical gradient using forward Operator and uses it as a reference for the backward Operator's result. It has several advantages:
+  1. Numerical gradient checker only needs the forward operator.
+  2. The user only needs to prepare the input data for forward Operator and not worry about the backward Operator.
 
 ## Mathematical Theory
-The following two document from Stanford has a detailed explanation of how to get numerical gradient and why it's useful.
+The following documents from Stanford have a detailed explanation of how to compute the numerical gradient and why it is useful.
 
 - [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
 - [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
 
 
-## Numeric Gradient Implementation
+## Numerical Gradient Implementation
 ### Python Interface
 ```python
 def get_numerical_gradient(op,
@@ -27,73 +27,76 @@ def get_numerical_gradient(op,
                          delta=0.005,
                          local_scope=None):
     """
-    Get Numeric Gradient for an operator's input.
+    Get Numerical Gradient for the input of an operator.
 
-    :param op: C++ operator instance, could be an network
+    :param op: C++ operator instance, could be an network.
     :param input_values: The input variables. Should be an dictionary, whose key is
-    variable name, and value is numpy array.
+    variable name, and value is a numpy array.
     :param output_name: The final output variable name.
-    :param input_to_check: The input variable with respect to which to compute the gradient.
-    :param delta: The perturbation value for numeric gradient method. The
-    smaller delta is, the more accurate result will get. But if that delta is
-     too small, it will suffer from numerical stability problem.
+    :param input_to_check: The input variable with respect to which the gradient has to be computed.
+    :param delta: The perturbation value for numerical gradient method. The
+    smaller the delta, the more accurate the result. But if the delta is too
+    small, it will suffer from the numerical stability problem.
     :param local_scope: The local scope used for get_numeric_gradient.
     :return: The gradient array in numpy format.
     """
 ```
 
-### Explaination:
+### Explanation:
 
-- Why need `output_name`
-  - An Operator may have multiple Output, one can get independent gradient from each Output. So caller should specify the name of the output variable.
+- Why do we need an `output_name`
+  - An Operator may have multiple Outputs, one can compute an independent gradient from each Output. So the caller should specify the name of the output variable.
 
-- Why need `input_to_check`
-  - One operator may have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numeric Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times.
+- Why do we need `input_to_check`
+  - One operator can have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numerical Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times each with a different input.
 
 
 ### Core Algorithm Implementation
 
 
 ```python
-    # we only compute gradient of one element a time.
+    # we only compute the gradient of one element a time.
     # we use a for loop to compute the gradient of each element.
     for i in xrange(tensor_size):
-        # get one input element by its index i.
-        origin = tensor_to_check.get_float_element(i)
+        # get one input element using the index i.
+        original = tensor_to_check.get_float_element(i)
 
-        # add delta to it, run op and then get the new value of the result tensor.
-        x_pos = origin + delta
+        # add delta to it, run the forward op and then
+        # get the new value of the result tensor.
+        x_pos = original + delta
         tensor_to_check.set_float_element(i, x_pos)
         y_pos = get_output()
 
-        # plus delta to this element, run op and get the new value of the result tensor.
-        x_neg = origin - delta
+        # Subtract delta from this element, run the op again
+        # and get the new value of the result tensor.
+        x_neg = original - delta
         tensor_to_check.set_float_element(i, x_neg)
         y_neg = get_output()
 
         # restore old value
-        tensor_to_check.set_float_element(i, origin)
+        tensor_to_check.set_float_element(i, original)
 
-        # compute the gradient of this element and store it into a numpy array.
+        # compute the gradient of this element and store
+        # it into a numpy array.
         gradient_flat[i] = (y_pos - y_neg) / delta / 2
 
     # reshape the gradient result to the shape of the source tensor.
     return gradient_flat.reshape(tensor_to_check.get_dims())
 ```
 
-## Auto Graident Checker Framework
+## Auto Gradient Check Framework
 
 Each Operator Kernel has three kinds of Gradient:
 
 1. Numerical gradient
 2. CPU kernel gradient
-3. GPU kernel gradient (if supported)
+3. GPU kernel gradient (if supported by the device)
 
-The numerical gradient only relies on forward Operator. So we use the numerical gradient as the reference value. And the gradient checking is performed in the following three steps:
+The numerical gradient only relies on the forward Operator, so we use the numerical gradient as the reference value. The gradient checking is performed in the following three steps:
 
-1. calculate the numerical gradient
-2. calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient
-3. calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient (if supported)
+1. Calculate the numerical gradient
+2. Calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient.
+3. Calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient. (if supported)
 
 #### Python Interface
 
@@ -109,26 +112,27 @@ The numerical gradient only relies on forward Operator. So we use the numerical
         """
         :param forward_op: used to create backward_op
         :param input_vars: numpy value of input variable. The following
-            computation will use these variables.
-        :param inputs_to_check: the input variable with respect to which to compute the gradient.
+          computation will use these variables.
+        :param inputs_to_check: the input variable with respect to which the
+          gradient will be computed.
         :param output_name: The final output variable name.
         :param max_relative_error: The relative tolerance parameter.
-        :param no_grad_set: used when create backward ops
+        :param no_grad_set: used to create backward ops
         :param only_cpu: only compute and check gradient on cpu kernel.
         :return:
         """
 ```
 
-### How to check if two numpy array is close enough?
-if `abs_numerical_grad` is nearly zero, then use abs error for numerical_grad
+### How to check if two numpy arrays are close enough?
+if `abs_numerical_grad` is nearly zero, then use absolute error for numerical_grad.
 
 ```python
 numerical_grad = ...
 operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
 
 abs_numerical_grad = numpy.abs(numerical_grad)
-# if abs_numerical_grad is nearly zero, then use abs error for numeric_grad, not relative
-# error.
+# if abs_numerical_grad is nearly zero, then use abs error for
+# numeric_grad, instead of relative error.
 abs_numerical_grad[abs_numerical_grad < 1e-3] = 1
 
 diff_mat = numpy.abs(abs_numerical_grad - operator_grad) / abs_numerical_grad
@@ -137,10 +141,10 @@ max_diff = numpy.max(diff_mat)
 
 
 #### Notes：
-The Input data for auto gradient checker should be reasonable to avoid numerical  stability problem.
+The Input data for auto gradient checker should be reasonable to avoid numerical stability problem.
 
 
-#### Refs:
+#### References:
 
 - [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
 - [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
diff --git a/doc/design/cpp_data_feeding.md b/doc/design/cpp_data_feeding.md
new file mode 100644
index 0000000000..40205350f9
--- /dev/null
+++ b/doc/design/cpp_data_feeding.md
@@ -0,0 +1,79 @@
+# C++ Data Feeding
+
+In training with Paddle V2 API, data feeding wholly dependents on Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required. 
+
+In this document we show the fundamental design of C++ data feeding process, which includes the data reading, shuffling and batching.
+
+## Reader
+
+A new concept named 'Reader' is introduced. `Reader` is a series of inherited classes which can be hold by our `Variable` and they are used to read or process file data.
+
+
+### `ReaderBase`
+
+`ReaderBase` is the abstract base class of all readers. It defines the all readers' interfaces.
+
+```cpp
+class ReaderBase {
+ public:
+  explicit ReaderBase(const std::vector<DDim>& shapes) : shapes_(shapes) {
+    PADDLE_ENFORCE(!shapes_.empty());
+  }
+  // Read the next batch of data. (A 'batch' can be only one instance)
+  virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
+  // Show whether the next bacth exists.
+  virtual bool HasNext() const = 0;
+  
+  // Reinitialize the reader and read the file from the begin.
+  virtual void ReInit() = 0;
+  
+  // Get a certain read in data's shape.
+  DDim shape(size_t idx) const;
+  // Get shapes of all read in data.
+  std::vector<DDim> shapes() const { return shapes_; }
+  // Set shapes of read in data.
+  void set_shapes(const std::vector<DDim>& shapes) { shapes_ = shapes; }
+
+  virtual ~ReaderBase() {}
+
+ protected:
+  std::vector<DDim> shapes_;
+};
+```
+
+### `FileReader` and `DecoratedReader`
+
+These two classes are derived from the `ReaderBase` and will further be derived by respective specific readers. That is to say, in our design, there are two kinds of readers: file readers and decorated readers. A file reader reads from a file of some specific format, and yield only one instance of data at a time. e.g. RecordIO reader, jpg reader, .... A decorated reader takes another reader(both file reader and decorated reader are OK) as its 'underlying reader'. It gets data from its underlying reader, does some process on them(shuffling, or batching), then yields processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers.
+
+All the readers share exactly the same interfaces defined in `ReaderBase`. So they can be decorated for more than one time: We can **shuffle** a reader's outputs and then **batch** the shuffle outputs. The interface consistency also allows related ops use readers without knowing what they are exactly.
+
+
+### `ReaderHolder`
+
+Different readers belong to different class types. It leads to a problem: How can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we can not get it by the following code:
+
+```cpp
+var->Get<ReaderBase>("batch_reader");
+```
+
+we have to write:
+
+```cpp
+var->Get<BatchReader>("batch_reader");
+```
+
+This requires each time getting a reader from a variable we must know the reader's type exactly. It is nearly impossible.
+
+To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which erases reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get<ReaderHolder>("...")` and regard the obtained object as a reader.
+
+## Related Operators
+
+To create and invoke readers, some now ops are introduced:
+
+### `CreateReaderOp`
+
+Each reader has its creating op. File readers' creating ops have no input and yield the created file reader as its output. Decorated readers' creating ops take the underlying readers as inputs and then yield new decorated readers.
+
+### `ReadOp`
+
+A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader‘s `ReadNext()` function and gets a new batch of data(or only one instance of data, if we use file reader directly). The output data of a reader are in the form of `std::vector<LoDTenosr>`, so the `ReadOp` also needs to split the vector and move LoDTensors to their respective output Variables.
diff --git a/doc/design/csp.md b/doc/design/csp.md
index ba9cacfdea..10d936860f 100644
--- a/doc/design/csp.md
+++ b/doc/design/csp.md
@@ -42,7 +42,7 @@ The type *channel* is conceptually the blocking queue.  In Go, its implemented i
 
 The `select` operation has been in OS kernels long before Go language.  All Unix kernels implement system calls *poll* and *select*.  They monitor multiple file descriptors to see if I/O is possible on any of them.  This takes O(N) time.  Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time.  In BSD systems, there is a similar system call *kqueue*.  Go's Linux implementation uses epoll.
 
-It might be a good idea to implement Fluid's select using epoll too.  In this design doc, we start from the O(N) way, so we could focus on Python binding and the syntax.
+It might be a good idea to implement Fluid's select using epoll too.  In this design doc, we start from the O(N) way so that we could focus on Python binding and the syntax.
 
 ### Type Channel
 
@@ -71,14 +71,14 @@ ch1 := make(chan int, 100)  // a channel that can buffer 100 ints.
 In Fluid, we should be able to do the same:
 
 ```python
-ch  = fluid.make_chan(dtype=INT)
-ch1 = fluid.make_chan(dtype=INT, 100)
+ch  = fluid.make_channel(dtype=INT)
+ch1 = fluid.make_channel(dtype=INT, 100)
 ```
 
 In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
 
 ```python
-ch = fluid.make_chan(dtype=Tensor, etype=float16)
+ch = fluid.make_channel(dtype=Tensor, etype=float16)
 ```
 
 or Tensors of Tensors of float16 etc.
@@ -87,8 +87,136 @@ The point here is that we need a consistent way to compose types, like in C++ we
 
 ### Send and Recv
 
+Go's CSP implementation depends on data type *channel*. There are two types of channels:
+
+1. The unblocked channel, or buffered channel, is a blocking queue with a non-zero sized buffer. The sending to buffered channel blocks if the buffer is full, and the receive operation blocks if the buffer is empty.
+1. blocked channel, or unbuffered channel, is a blocking queue with no buffer.  Both sending and receiving block with unbuffered channels.
+
+There are four types of actions with a channel:
+
+1. Create a channel
+
+   ```go
+   ch := make(chan int) // this is an unbuffered channel
+   ch := make(chan int, 100) // this is a buffered channel of 100 ints.
+   ```
+
+1. Send
+
+   ```go
+   ch <- 111
+   ```
+
+1. Recv
+
+   ```go
+   y, ok <- ch
+   ```
+
+1. Close
+
+   ```go
+   close(ch)
+   ```
+   
+   Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
+   
+There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
+
+1. A send to a nil channel blocks forever
+
+1. A receive from a nil channel blocks forever
+
+1. A send to a closed channel panics
+
+1. A receive from a closed channel returns the residual values and then zeros.
+
+In Fluid, we have [buffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [unbuffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h)
+
+The following program illustrates the Python syntax for accessing Fluid buffers.
+
+```python
+import fluid
+
+buffer_size = 10
+ch = fluid.make_channel(dtype=INT, buffer_size)
+
+# Now write three elements to the channel
+with fluid.while(steps=buffer_size):
+  fluid.send(ch, step)
+
+fluid.close_channel(ch)
+
+with fluid.while(steps=buffer_size):
+  fluid.print(fluid.recv(ch))
+```
+
+The following example shows that to avoid the always-blocking behavior of unbuffered channels, we need to use Fluid's goroutines.
+
+```python
+import fluid
+
+ch = fluid.make_channel(dtype=INT)
+
+with fluid.go():
+  fluid.send(ch)
+
+y = fluid.recv(ch)
+
+fluid.close_channel(ch)
+```
+
 ### Select
 
+In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready.
+
+```go
+
+ch1  := make(chan int)       
+ch2  := make(chan int, 100)
+
+x := 0
+
+for {
+    select {
+    case ch1 <- x:
+      x := x + 1
+    case y <- ch2:
+      fmt.Println("Received on channel")
+    default:
+      fmt.Println("Default")
+    }
+  }
+
+```
+
+In Fluid, we should be able to do the same:
+
+```python
+ch1  = fluid.make_chan(dtype=INT)
+ch2 = fluid.make_chan(dtype=INT, 100)
+
+sel = fluid.select()
+
+with sel.case(ch1, 'w', X):
+    fluid.layers.increment(X)
+
+with sel.case(ch2, 'r', Y):
+    fluid.print("Received on Channel")
+
+with sel.default():
+    fluid.print("Default")
+
+```
+
+In the above code snippet, `X` and `Y` are variables. Now let us look at each of these statements one by one.
+
+- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. The character `w` is used here to make the syntax familiar to write syntax in Python I/O.
+
+- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to make the syntax familiar to read syntax in Python I/O.
+
+- `sel.default()` : This is equivalent to the default in Go `select`. If none of the channels are ready for read or write, then the fluid code in the default block will be executed.
+
 ## Example Programs
 
 ### 1. RPC between Trainers and Parameter Servers
diff --git a/doc/design/switch.md b/doc/design/switch.md
index 9db1b2782a..827d0601c6 100644
--- a/doc/design/switch.md
+++ b/doc/design/switch.md
@@ -10,8 +10,7 @@ The following example shows the usage of `fluid.switch`.
 a = fluid.Var(10)
 b = fluid.Var(0)
 
-switch = fluid.switch()
-with switch.block():
+with switch() as switch:
     with switch.case(fluid.less_equal(a, 10)):
         fluid.print("Case 1")
     with switch.case(fluid.larger(a, 0)):
diff --git a/doc/howto/dev/FullyConnected.jpg b/doc/dev/FullyConnected.jpg
similarity index 100%
rename from doc/howto/dev/FullyConnected.jpg
rename to doc/dev/FullyConnected.jpg
diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/dev/contribute_to_paddle_cn.md
similarity index 100%
rename from doc/howto/dev/contribute_to_paddle_cn.md
rename to doc/dev/contribute_to_paddle_cn.md
diff --git a/doc/dev/contribute_to_paddle_en.md b/doc/dev/contribute_to_paddle_en.md
new file mode 120000
index 0000000000..f939e75f21
--- /dev/null
+++ b/doc/dev/contribute_to_paddle_en.md
@@ -0,0 +1 @@
+../../CONTRIBUTING.md
\ No newline at end of file
diff --git a/doc/dev/index_cn.rst b/doc/dev/index_cn.rst
new file mode 100644
index 0000000000..487db868bb
--- /dev/null
+++ b/doc/dev/index_cn.rst
@@ -0,0 +1,8 @@
+开发标准
+========
+
+..  toctree::
+  :maxdepth: 1
+
+  contribute_to_paddle_cn.md
+  write_docs_cn.rst
diff --git a/doc/dev/index_en.rst b/doc/dev/index_en.rst
new file mode 100644
index 0000000000..5dd12d2233
--- /dev/null
+++ b/doc/dev/index_en.rst
@@ -0,0 +1,9 @@
+Development
+------------
+
+..  toctree::
+  :maxdepth: 1
+
+  new_layer_en.rst
+  contribute_to_paddle_en.md
+  write_docs_en.rst
diff --git a/doc/howto/dev/new_layer_cn.rst b/doc/dev/new_layer_cn.rst
similarity index 100%
rename from doc/howto/dev/new_layer_cn.rst
rename to doc/dev/new_layer_cn.rst
diff --git a/doc/howto/dev/new_layer_en.rst b/doc/dev/new_layer_en.rst
similarity index 100%
rename from doc/howto/dev/new_layer_en.rst
rename to doc/dev/new_layer_en.rst
diff --git a/doc/howto/dev/new_op_cn.md b/doc/dev/new_op_cn.md
similarity index 100%
rename from doc/howto/dev/new_op_cn.md
rename to doc/dev/new_op_cn.md
diff --git a/doc/howto/dev/new_op_en.md b/doc/dev/new_op_en.md
similarity index 100%
rename from doc/howto/dev/new_op_en.md
rename to doc/dev/new_op_en.md
diff --git a/doc/howto/dev/new_op_kernel_en.md b/doc/dev/new_op_kernel_en.md
similarity index 100%
rename from doc/howto/dev/new_op_kernel_en.md
rename to doc/dev/new_op_kernel_en.md
diff --git a/doc/howto/dev/use_eigen_cn.md b/doc/dev/use_eigen_cn.md
similarity index 100%
rename from doc/howto/dev/use_eigen_cn.md
rename to doc/dev/use_eigen_cn.md
diff --git a/doc/howto/dev/use_eigen_en.md b/doc/dev/use_eigen_en.md
similarity index 100%
rename from doc/howto/dev/use_eigen_en.md
rename to doc/dev/use_eigen_en.md
diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/dev/write_docs_cn.rst
similarity index 98%
rename from doc/howto/dev/write_docs_cn.rst
rename to doc/dev/write_docs_cn.rst
index 1bc947c260..f79769b810 100644
--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/dev/write_docs_cn.rst
@@ -1,6 +1,6 @@
-##################
-如何贡献/修改文档
-##################
+#############
+如何贡献文档
+#############
 
 PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成，生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
 也可以利用PaddlePaddle 工具来编译文档，这个情况下所有的文件会存在整理过的的文件目录 .ppo_workspace/content 下
diff --git a/doc/howto/dev/write_docs_en.rst b/doc/dev/write_docs_en.rst
similarity index 98%
rename from doc/howto/dev/write_docs_en.rst
rename to doc/dev/write_docs_en.rst
index b3ef07eb1d..f3408a8426 100644
--- a/doc/howto/dev/write_docs_en.rst
+++ b/doc/dev/write_docs_en.rst
@@ -1,6 +1,6 @@
-##################
+########################
 Contribute Documentation
-##################
+########################
 
 PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``.
 Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories.
diff --git a/doc/getstarted/concepts/use_concepts_cn.rst b/doc/getstarted/concepts/use_concepts_cn.rst
index e695ff283e..608f49f5a9 100644
--- a/doc/getstarted/concepts/use_concepts_cn.rst
+++ b/doc/getstarted/concepts/use_concepts_cn.rst
@@ -4,7 +4,7 @@
 
 PaddlePaddle是源于百度的一个深度学习平台。PaddlePaddle为深度学习研究人员提供了丰富的API，可以轻松地完成神经网络配置，模型训练等任务。
 这里将介绍PaddlePaddle的基本使用概念，并且展示了如何利用PaddlePaddle来解决一个经典的线性回归问题。
-在使用该文档之前，请参考 `安装文档 <../build_and_install/index_cn.html>`_ 完成PaddlePaddle的安装。
+在使用该文档之前，请参考 `安装文档 <../../build_and_install/index_cn.html>`_ 完成PaddlePaddle的安装。
 
 
 配置网络
diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst
index 9f6ee25987..1dc141396b 100644
--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
@@ -1,61 +1,8 @@
 新手入门
 ============
 
-.. _quick_install:
-
-快速安装
-++++++++
-
-PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12，并安装有Python2.7。
-执行下面的命令完成快速安装，版本为cpu_avx_openblas：
-
-  .. code-block:: bash
-
-     pip install paddlepaddle
-
-如果需要安装支持GPU的版本（cuda7.5_cudnn5_avx_openblas），需要执行：
-
-  .. code-block:: bash
-
-     pip install paddlepaddle-gpu
-
-更详细的安装和编译方法参考：
-
-..  toctree::
-  :maxdepth: 1
-
-  build_and_install/index_cn.rst
-
-.. _quick_start:
-
-快速开始
-++++++++
-
-创建一个 housing.py 并粘贴此Python代码：
-
-  .. code-block:: python
-
-     import paddle.v2 as paddle
-
-     # Initialize PaddlePaddle.
-     paddle.init(use_gpu=False, trainer_count=1)
-
-     # Configure the neural network.
-     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
-     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
-
-     # Infer using provided test data.
-     probs = paddle.infer(
-         output_layer=y_predict,
-         parameters=paddle.dataset.uci_housing.model(),
-         input=[item for item in paddle.dataset.uci_housing.test()()])
-
-     for i in xrange(len(probs)):
-         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
-
-执行 :code:`python housing.py` 瞧！ 它应该打印出预测住房数据的清单。
-
 ..  toctree::
   :maxdepth: 1
 
+  quickstart_cn.rst
   concepts/use_concepts_cn.rst
diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst
index 063d9d880c..c680e19037 100644
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
@@ -1,61 +1,7 @@
 GET STARTED
 ============
 
-.. _quick_install:
-
-Quick Install
-----------------------
-
-You can use pip to install PaddlePaddle with a single command, supports
-CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
-Simply run the following command to install, the version is cpu_avx_openblas:
-
-  .. code-block:: bash
-
-     pip install paddlepaddle
-
-If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
-
-  .. code-block:: bash
-
-     pip install paddlepaddle-gpu
-
-For more details about installation and build:
-
 ..  toctree::
   :maxdepth: 1
 
-  build_and_install/index_en.rst
-
-
-.. _quick_start:
-
-Quick Start
-++++++++
-
-Create a new file called housing.py, and paste this Python
-code:
-
-
-  .. code-block:: python
-
-     import paddle.v2 as paddle
-
-     # Initialize PaddlePaddle.
-     paddle.init(use_gpu=False, trainer_count=1)
-
-     # Configure the neural network.
-     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
-     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
-
-     # Infer using provided test data.
-     probs = paddle.infer(
-         output_layer=y_predict,
-         parameters=paddle.dataset.uci_housing.model(),
-         input=[item for item in paddle.dataset.uci_housing.test()()])
-
-     for i in xrange(len(probs)):
-         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
-
-Run :code:`python housing.py` and voila! It should print out a list of predictions
-for the test housing data.
+  quickstart_en.rst
diff --git a/doc/getstarted/quickstart_cn.rst b/doc/getstarted/quickstart_cn.rst
new file mode 100644
index 0000000000..d511cead26
--- /dev/null
+++ b/doc/getstarted/quickstart_cn.rst
@@ -0,0 +1,47 @@
+快速开始
+========
+
+快速安装
+--------
+
+PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12，并安装有Python2.7。
+执行下面的命令完成快速安装，版本为cpu_avx_openblas：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+如果需要安装支持GPU的版本（cuda7.5_cudnn5_avx_openblas），需要执行：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+更详细的安装和编译方法参考：:ref:`install_steps` 。
+
+快速使用
+--------
+
+创建一个 housing.py 并粘贴此Python代码：
+
+  .. code-block:: python
+
+     import paddle.v2 as paddle
+
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+执行 :code:`python housing.py` 瞧！ 它应该打印出预测住房数据的清单。
diff --git a/doc/getstarted/quickstart_en.rst b/doc/getstarted/quickstart_en.rst
new file mode 100644
index 0000000000..70f7fe0646
--- /dev/null
+++ b/doc/getstarted/quickstart_en.rst
@@ -0,0 +1,51 @@
+Quick Start
+============
+
+Quick Install
+-------------
+
+You can use pip to install PaddlePaddle with a single command, supports
+CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
+Simply run the following command to install, the version is cpu_avx_openblas:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+For more details about installation and build: :ref:`install_steps` .
+
+Quick Use
+---------
+
+Create a new file called housing.py, and paste this Python
+code:
+
+
+  .. code-block:: python
+
+     import paddle.v2 as paddle
+
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+Run :code:`python housing.py` and voila! It should print out a list of predictions
+for the test housing data.
diff --git a/doc/howto/usage/capi/compile_paddle_lib_cn.md b/doc/howto/capi/compile_paddle_lib_cn.md
similarity index 99%
rename from doc/howto/usage/capi/compile_paddle_lib_cn.md
rename to doc/howto/capi/compile_paddle_lib_cn.md
index ac5ecffe2e..fd8dec8164 100644
--- a/doc/howto/usage/capi/compile_paddle_lib_cn.md
+++ b/doc/howto/capi/compile_paddle_lib_cn.md
@@ -1,4 +1,4 @@
-## 编译 PaddlePaddle 预测库
+## 安装与编译C-API预测库
 
 ### 概述
 
diff --git a/doc/howto/usage/capi/images/csr.png b/doc/howto/capi/images/csr.png
similarity index 100%
rename from doc/howto/usage/capi/images/csr.png
rename to doc/howto/capi/images/csr.png
diff --git a/doc/howto/usage/capi/images/sequence_data.png b/doc/howto/capi/images/sequence_data.png
similarity index 100%
rename from doc/howto/usage/capi/images/sequence_data.png
rename to doc/howto/capi/images/sequence_data.png
diff --git a/doc/howto/usage/capi/images/workflow_of_CAPI.png b/doc/howto/capi/images/workflow_of_CAPI.png
similarity index 100%
rename from doc/howto/usage/capi/images/workflow_of_CAPI.png
rename to doc/howto/capi/images/workflow_of_CAPI.png
diff --git a/doc/howto/usage/capi/index_cn.rst b/doc/howto/capi/index_cn.rst
similarity index 87%
rename from doc/howto/usage/capi/index_cn.rst
rename to doc/howto/capi/index_cn.rst
index fd774fbc74..e589a6d346 100644
--- a/doc/howto/usage/capi/index_cn.rst
+++ b/doc/howto/capi/index_cn.rst
@@ -1,4 +1,4 @@
-PaddlePaddle C-API
+C-API预测库
 ==================
 
 ..  toctree::
diff --git a/doc/howto/usage/capi/organization_of_the_inputs_cn.md b/doc/howto/capi/organization_of_the_inputs_cn.md
similarity index 100%
rename from doc/howto/usage/capi/organization_of_the_inputs_cn.md
rename to doc/howto/capi/organization_of_the_inputs_cn.md
diff --git a/doc/howto/usage/capi/workflow_of_capi_cn.md b/doc/howto/capi/workflow_of_capi_cn.md
similarity index 99%
rename from doc/howto/usage/capi/workflow_of_capi_cn.md
rename to doc/howto/capi/workflow_of_capi_cn.md
index e0a42fff12..a61d2267bf 100644
--- a/doc/howto/usage/capi/workflow_of_capi_cn.md
+++ b/doc/howto/capi/workflow_of_capi_cn.md
@@ -1,4 +1,4 @@
-## C-API 使用流程
+## C-API使用流程
 
 这篇文档介绍 PaddlePaddle C-API 整体使用流程。
 
diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/cluster/cmd_argument_cn.md
similarity index 56%
rename from doc/howto/usage/cluster/cluster_train_cn.md
rename to doc/howto/cluster/cmd_argument_cn.md
index 0f3db59607..5c575dd5b5 100644
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/cluster/cmd_argument_cn.md
@@ -1,41 +1,7 @@
-# 分布式训练
-
-
-## 概述
-
-本文将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示：
-
-<img src="https://user-images.githubusercontent.com/13348433/31772175-5f419eca-b511-11e7-9db7-5231fe3d9ccb.png" width="500">
-
-- 数据分片（Data shard): 用于训练神经网络的数据，被切分成多个部分，每个部分分别给每个trainer使用。
-- 计算节点（Trainer）: 每个trainer启动后读取切分好的一部分数据，开始神经网络的“前馈”和“后馈”计算，并和参数服务器通信。在完成一定量数据的训练后，上传计算得出的梯度（gradients），然后下载优化更新后的神经网络参数（parameters）。
-- 参数服务器（Parameter server）:每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度，并完成参数优化更新，再将更新后的参数下发到每个计算节点。
-
-这样，通过计算节点和参数服务器的分布式协作，可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降（SGD）和异步随机梯度下降。
-
-在使用同步SGD训练神经网络时，PaddlePaddle使用同步屏障（barrier），使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中，则并不会等待所有trainer提交梯度才更新参数，这样极大地提高了计算的并行性：参数服务器之间不相互依赖，并行地接收梯度和更新参数，参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步，计算节点之间也不会相互依赖，并行地执行模型的训练。可以看出，虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新，在任意时间某一台参数服务器上保存的参数可能比另一台要更新，与同步SGD相比，梯度会有噪声。
-
-
-## 环境准备
-
-1. 准备您的计算集群。计算集群通常由一组（几台到几千台规模）的Linux服务器组成。服务器之间可以通过局域网（LAN）联通，每台服务器具有集群中唯一的IP地址（或者可被DNS解析的主机名）。集群中的每台计算机通常被成为一个“节点”。
-1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。
-
-安装完成之后，执行下面的命令可以查看已经安装的版本（docker安装方式可以进入docker容器执行：`docker run -it paddlepaddle/paddle:[tag] /bin/bash`）：
-```bash
-$ paddle version
-PaddlePaddle 0.10.0, compiled with
-    with_avx: ON
-    with_gpu: OFF
-    with_double: OFF
-    with_python: ON
-    with_rdma: OFF
-    with_timer: OFF
-```
+## 启动参数说明
 
-下面以`doc/howto/usage/cluster/src/word2vec`中的代码作为实例，介绍使用PaddlePaddle v2 API完成分布式训练。
+下面以`doc/howto/cluster/src/word2vec`中的代码作为实例，介绍使用PaddlePaddle v2 API完成分布式训练。
 
-## 启动参数说明
 ### 启动参数服务器
 执行以下的命令启动一个参数服务器并等待和计算节点的数据交互
 ```bash
@@ -167,22 +133,3 @@ test.txt-00002
 
 - `train_data_dir`：包含训练数据的目录，可以是从分布式存储挂载过来的，也可以是在任务启动前下载到本地的。
 - `test_data_dir`：包含测试数据集的目录。
-
-## 使用分布式计算平台或工具
-
-PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务，包括：
-- [Kubernetes](http://kubernetes.io) Google开源的容器集群的调度框架，支持大规模集群生产环境的完整集群方案。
-- [OpenMPI](https://www.open-mpi.org) 成熟的高性能并行计算框架。
-- [Fabric](http://www.fabfile.org) 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。
-
-对于不同的集群平台，会分别介绍集群作业的启动和停止方法。这些例子都可以在[cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2)找到。
-
-在使用分布式计算平台进行训练时，任务被调度在集群中时，分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数，比如节点的ID、IP和任务节点个数等。
-
-## 在不同集群中运行
-
-  - [fabric集群](fabric_cn.md)
-  - [openmpi集群](openmpi_cn.md)
-  - [kubernetes单机](k8s_cn.md)
-  - [kubernetes distributed分布式](k8s_distributed_cn.md)
-  - [AWS上运行kubernetes集群训练](k8s_aws_cn.md)
diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/cluster/cmd_argument_en.md
similarity index 58%
rename from doc/howto/usage/cluster/cluster_train_en.md
rename to doc/howto/cluster/cmd_argument_en.md
index f9424f8f1a..06fd571756 100644
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/cluster/cmd_argument_en.md
@@ -1,40 +1,7 @@
-# Distributed Training
-
-## Introduction
-
-In this article, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed trainning job:
-
-<img src="https://user-images.githubusercontent.com/13348433/31772146-41523d84-b511-11e7-8a12-a69fd136c283.png" width="500">
-
-- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job.
-- Trainer: each trainer reads the data shard, and train the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer download optimized parameters and continues its training.
-- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
-
-PaddlePaddle can support both synchronize stochastic gradient descent (SGD) and asynchronous SGD.
-
-When training with synchronize SGD, PaddlePaddle uses an internal "synchronize barrier" which makes gradients update and parameter download in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish upload at a single step, this will increase the parallelism of distributed training: parameter servers do not depend on each other, they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noises in the gradient.
-
-## Preparations
-1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes".
-2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).
-
-After installation, you can check the version by typing the below command (run a docker container  if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
-
-```bash
-$ paddle version
-PaddlePaddle 0.10.0rc, compiled with
-    with_avx: ON
-    with_gpu: OFF
-    with_double: OFF
-    with_python: ON
-    with_rdma: OFF
-    with_timer: OFF
-```
-
-We'll take `doc/howto/usage/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
-
 ## Command-line arguments
 
+We'll take `doc/howto/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
+
 ### Starting parameter server
 
 Type the below command to start a parameter server which will wait for trainers to connect:
@@ -171,21 +138,3 @@ Your workspace may looks like:
 
 - `train_data_dir`: containing training data. Mount from storage service or copy trainning data to here.
 - `test_data_dir`: containing testing data.
-
-## Use cluster platforms or cluster management tools
-
-PaddlePaddle supports running jobs on several platforms including:
-- [Kubernetes](http://kubernetes.io) open-source system for automating deployment, scaling, and management of containerized applications from Google.
-- [OpenMPI](https://www.open-mpi.org) Mature high performance parallel computing framework.
-- [Fabric](http://www.fabfile.org) A cluster management tool. Write scripts to submit jobs or manage the cluster.
-
-We'll introduce cluster job management on these platforms. The examples can be found under [cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2).
-
-These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc.
-
-## Use different clusters
-
-  - [fabric](fabric_en.md)
-  - [openmpi](openmpi_en.md)
-  - [kubernetes](k8s_en.md)
-  - [kubernetes on AWS](k8s_aws_en.md)
diff --git a/doc/howto/usage/cluster/fluid_cluster_train_en.md b/doc/howto/cluster/fluid_cluster_train_en.md
similarity index 100%
rename from doc/howto/usage/cluster/fluid_cluster_train_en.md
rename to doc/howto/cluster/fluid_cluster_train_en.md
diff --git a/doc/howto/cluster/index_cn.rst b/doc/howto/cluster/index_cn.rst
new file mode 100644
index 0000000000..a60521b4a9
--- /dev/null
+++ b/doc/howto/cluster/index_cn.rst
@@ -0,0 +1,22 @@
+分布式训练
+==========
+
+本节将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示：
+
+.. image:: src/ps_cn.png
+   :width: 500
+
+- 数据分片（Data shard): 用于训练神经网络的数据，被切分成多个部分，每个部分分别给每个trainer使用。
+- 计算节点（Trainer）: 每个trainer启动后读取切分好的一部分数据，开始神经网络的“前馈”和“后馈”计算，并和参数服务器通信。在完成一定量数据的训练后，上传计算得出的梯度（gradients），然后下载优化更新后的神经网络参数（parameters）。
+- 参数服务器（Parameter server）:每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度，并完成参数优化更新，再将更新后的参数下发到每个计算节点。
+
+这样，通过计算节点和参数服务器的分布式协作，可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降（SGD）和异步随机梯度下降。
+
+在使用同步SGD训练神经网络时，PaddlePaddle使用同步屏障（barrier），使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中，则并不会等待所有trainer提交梯度才更新参数，这样极大地提高了计算的并行性：参数服务器之间不相互依赖，并行地接收梯度和更新参数，参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步，计算节点之间也不会相互依赖，并行地执行模型的训练。可以看出，虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新，在任意时间某一台参数服务器上保存的参数可能比另一台要更新，与同步SGD相比，梯度会有噪声。
+
+..  toctree::
+  :maxdepth: 1
+
+  preparations_cn.md
+  cmd_argument_cn.md
+  multi_cluster/index_cn.rst
diff --git a/doc/howto/cluster/index_en.rst b/doc/howto/cluster/index_en.rst
new file mode 100644
index 0000000000..2640a09dcc
--- /dev/null
+++ b/doc/howto/cluster/index_en.rst
@@ -0,0 +1,22 @@
+Distributed Training
+====================
+
+In this section, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed trainning job:
+
+.. image:: src/ps_en.png
+   :width: 500
+
+- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job.
+- Trainer: each trainer reads the data shard, and train the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer download optimized parameters and continues its training.
+- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
+
+PaddlePaddle can support both synchronize stochastic gradient descent (SGD) and asynchronous SGD.
+
+When training with synchronize SGD, PaddlePaddle uses an internal "synchronize barrier" which makes gradients update and parameter download in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish upload at a single step, this will increase the parallelism of distributed training: parameter servers do not depend on each other, they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noises in the gradient.
+
+..  toctree::
+  :maxdepth: 1
+
+  preparations_en.md
+  cmd_argument_en.md
+  multi_cluster/index_en.rst
diff --git a/doc/howto/usage/cluster/fabric_cn.md b/doc/howto/cluster/multi_cluster/fabric_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/fabric_cn.md
rename to doc/howto/cluster/multi_cluster/fabric_cn.md
diff --git a/doc/howto/usage/cluster/fabric_en.md b/doc/howto/cluster/multi_cluster/fabric_en.md
similarity index 100%
rename from doc/howto/usage/cluster/fabric_en.md
rename to doc/howto/cluster/multi_cluster/fabric_en.md
diff --git a/doc/howto/cluster/multi_cluster/index_cn.rst b/doc/howto/cluster/multi_cluster/index_cn.rst
new file mode 100644
index 0000000000..ef56b6ddb3
--- /dev/null
+++ b/doc/howto/cluster/multi_cluster/index_cn.rst
@@ -0,0 +1,20 @@
+在不同集群中运行
+================
+
+PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务，包括：
+- `Kubernetes <http://kubernetes.io>`_ Google开源的容器集群的调度框架，支持大规模集群生产环境的完整集群方案。
+- `OpenMPI <https://www.open-mpi.org>`_ 成熟的高性能并行计算框架。
+- `Fabric <http://www.fabfile.org>`_ 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。
+
+对于不同的集群平台，会分别介绍集群作业的启动和停止方法。这些例子都可以在 `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ 找到。
+
+在使用分布式计算平台进行训练时，任务被调度在集群中时，分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数，比如节点的ID、IP和任务节点个数等。
+
+..  toctree::
+  :maxdepth: 1
+
+  fabric_cn.md
+  openmpi_cn.md
+  k8s_cn.md
+  k8s_distributed_cn.md
+  k8s_aws_cn.md
diff --git a/doc/howto/cluster/multi_cluster/index_en.rst b/doc/howto/cluster/multi_cluster/index_en.rst
new file mode 100644
index 0000000000..dac7aaef08
--- /dev/null
+++ b/doc/howto/cluster/multi_cluster/index_en.rst
@@ -0,0 +1,19 @@
+Use different clusters
+======================
+
+PaddlePaddle supports running jobs on several platforms including:
+- `Kubernetes <http://kubernetes.io>`_ open-source system for automating deployment, scaling, and management of containerized applications from Google.
+- `OpenMPI <https://www.open-mpi.org>`_ Mature high performance parallel computing framework.
+- `Fabric <http://www.fabfile.org>`_ A cluster management tool. Write scripts to submit jobs or manage the cluster.
+
+We'll introduce cluster job management on these platforms. The examples can be found under `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ .
+
+These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc.
+
+..  toctree::
+  :maxdepth: 1
+
+  fabric_en.md
+  openmpi_en.md
+  k8s_en.md
+  k8s_aws_en.md
diff --git a/doc/howto/usage/cluster/k8s_aws_cn.md b/doc/howto/cluster/multi_cluster/k8s_aws_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_aws_cn.md
rename to doc/howto/cluster/multi_cluster/k8s_aws_cn.md
diff --git a/doc/howto/usage/cluster/k8s_aws_en.md b/doc/howto/cluster/multi_cluster/k8s_aws_en.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_aws_en.md
rename to doc/howto/cluster/multi_cluster/k8s_aws_en.md
diff --git a/doc/howto/usage/cluster/k8s_cn.md b/doc/howto/cluster/multi_cluster/k8s_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_cn.md
rename to doc/howto/cluster/multi_cluster/k8s_cn.md
diff --git a/doc/howto/usage/cluster/k8s_distributed_cn.md b/doc/howto/cluster/multi_cluster/k8s_distributed_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_distributed_cn.md
rename to doc/howto/cluster/multi_cluster/k8s_distributed_cn.md
diff --git a/doc/howto/usage/cluster/k8s_en.md b/doc/howto/cluster/multi_cluster/k8s_en.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_en.md
rename to doc/howto/cluster/multi_cluster/k8s_en.md
diff --git a/doc/howto/usage/cluster/openmpi_cn.md b/doc/howto/cluster/multi_cluster/openmpi_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/openmpi_cn.md
rename to doc/howto/cluster/multi_cluster/openmpi_cn.md
diff --git a/doc/howto/usage/cluster/openmpi_en.md b/doc/howto/cluster/multi_cluster/openmpi_en.md
similarity index 100%
rename from doc/howto/usage/cluster/openmpi_en.md
rename to doc/howto/cluster/multi_cluster/openmpi_en.md
diff --git a/doc/howto/usage/cluster/src/add_security_group.png b/doc/howto/cluster/multi_cluster/src/add_security_group.png
similarity index 100%
rename from doc/howto/usage/cluster/src/add_security_group.png
rename to doc/howto/cluster/multi_cluster/src/add_security_group.png
diff --git a/doc/howto/usage/cluster/src/create_efs.png b/doc/howto/cluster/multi_cluster/src/create_efs.png
similarity index 100%
rename from doc/howto/usage/cluster/src/create_efs.png
rename to doc/howto/cluster/multi_cluster/src/create_efs.png
diff --git a/doc/howto/usage/cluster/src/k8s-paddle-arch.png b/doc/howto/cluster/multi_cluster/src/k8s-paddle-arch.png
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s-paddle-arch.png
rename to doc/howto/cluster/multi_cluster/src/k8s-paddle-arch.png
diff --git a/doc/howto/usage/cluster/src/k8s_data/Dockerfile b/doc/howto/cluster/multi_cluster/src/k8s_data/Dockerfile
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_data/Dockerfile
rename to doc/howto/cluster/multi_cluster/src/k8s_data/Dockerfile
diff --git a/doc/howto/usage/cluster/src/k8s_data/README.md b/doc/howto/cluster/multi_cluster/src/k8s_data/README.md
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_data/README.md
rename to doc/howto/cluster/multi_cluster/src/k8s_data/README.md
diff --git a/doc/howto/usage/cluster/src/k8s_data/get_data.sh b/doc/howto/cluster/multi_cluster/src/k8s_data/get_data.sh
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_data/get_data.sh
rename to doc/howto/cluster/multi_cluster/src/k8s_data/get_data.sh
diff --git a/doc/howto/usage/cluster/src/k8s_train/Dockerfile b/doc/howto/cluster/multi_cluster/src/k8s_train/Dockerfile
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_train/Dockerfile
rename to doc/howto/cluster/multi_cluster/src/k8s_train/Dockerfile
diff --git a/doc/howto/usage/cluster/src/k8s_train/README.md b/doc/howto/cluster/multi_cluster/src/k8s_train/README.md
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_train/README.md
rename to doc/howto/cluster/multi_cluster/src/k8s_train/README.md
diff --git a/doc/howto/usage/cluster/src/k8s_train/start.sh b/doc/howto/cluster/multi_cluster/src/k8s_train/start.sh
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_train/start.sh
rename to doc/howto/cluster/multi_cluster/src/k8s_train/start.sh
diff --git a/doc/howto/usage/cluster/src/k8s_train/start_paddle.py b/doc/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_train/start_paddle.py
rename to doc/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py
diff --git a/doc/howto/usage/cluster/src/pserver_and_trainer.png b/doc/howto/cluster/multi_cluster/src/pserver_and_trainer.png
similarity index 100%
rename from doc/howto/usage/cluster/src/pserver_and_trainer.png
rename to doc/howto/cluster/multi_cluster/src/pserver_and_trainer.png
diff --git a/doc/howto/usage/cluster/src/route53_create_recordset.png b/doc/howto/cluster/multi_cluster/src/route53_create_recordset.png
similarity index 100%
rename from doc/howto/usage/cluster/src/route53_create_recordset.png
rename to doc/howto/cluster/multi_cluster/src/route53_create_recordset.png
diff --git a/doc/howto/usage/cluster/src/route53_create_zone.png b/doc/howto/cluster/multi_cluster/src/route53_create_zone.png
similarity index 100%
rename from doc/howto/usage/cluster/src/route53_create_zone.png
rename to doc/howto/cluster/multi_cluster/src/route53_create_zone.png
diff --git a/doc/howto/usage/cluster/src/worker_security_group.png b/doc/howto/cluster/multi_cluster/src/worker_security_group.png
similarity index 100%
rename from doc/howto/usage/cluster/src/worker_security_group.png
rename to doc/howto/cluster/multi_cluster/src/worker_security_group.png
diff --git a/doc/howto/cluster/preparations_cn.md b/doc/howto/cluster/preparations_cn.md
new file mode 100644
index 0000000000..ce40697e70
--- /dev/null
+++ b/doc/howto/cluster/preparations_cn.md
@@ -0,0 +1,16 @@
+## 环境准备
+
+1. 准备您的计算集群。计算集群通常由一组（几台到几千台规模）的Linux服务器组成。服务器之间可以通过局域网（LAN）联通，每台服务器具有集群中唯一的IP地址（或者可被DNS解析的主机名）。集群中的每台计算机通常被成为一个“节点”。
+1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。
+
+安装完成之后，执行下面的命令可以查看已经安装的版本（docker安装方式可以进入docker容器执行：`docker run -it paddlepaddle/paddle:[tag] /bin/bash`）：
+```bash
+$ paddle version
+PaddlePaddle 0.10.0, compiled with
+    with_avx: ON
+    with_gpu: OFF
+    with_double: OFF
+    with_python: ON
+    with_rdma: OFF
+    with_timer: OFF
+```
diff --git a/doc/howto/cluster/preparations_en.md b/doc/howto/cluster/preparations_en.md
new file mode 100644
index 0000000000..4b77b29390
--- /dev/null
+++ b/doc/howto/cluster/preparations_en.md
@@ -0,0 +1,17 @@
+## Preparations
+
+1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes".
+2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).
+
+After installation, you can check the version by typing the below command (run a docker container  if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
+
+```bash
+$ paddle version
+PaddlePaddle 0.10.0rc, compiled with
+    with_avx: ON
+    with_gpu: OFF
+    with_double: OFF
+    with_python: ON
+    with_rdma: OFF
+    with_timer: OFF
+```
diff --git a/doc/howto/usage/cluster/src/Dockerfile b/doc/howto/cluster/src/Dockerfile
similarity index 100%
rename from doc/howto/usage/cluster/src/Dockerfile
rename to doc/howto/cluster/src/Dockerfile
diff --git a/doc/howto/usage/cluster/src/efs_mount.png b/doc/howto/cluster/src/efs_mount.png
similarity index 100%
rename from doc/howto/usage/cluster/src/efs_mount.png
rename to doc/howto/cluster/src/efs_mount.png
diff --git a/doc/howto/usage/cluster/src/managed_policy.png b/doc/howto/cluster/src/managed_policy.png
similarity index 100%
rename from doc/howto/usage/cluster/src/managed_policy.png
rename to doc/howto/cluster/src/managed_policy.png
diff --git a/doc/howto/usage/cluster/src/trainer_cn.png b/doc/howto/cluster/src/ps_cn.png
similarity index 100%
rename from doc/howto/usage/cluster/src/trainer_cn.png
rename to doc/howto/cluster/src/ps_cn.png
diff --git a/doc/howto/usage/cluster/src/trainer.png b/doc/howto/cluster/src/ps_en.png
similarity index 100%
rename from doc/howto/usage/cluster/src/trainer.png
rename to doc/howto/cluster/src/ps_en.png
diff --git a/doc/howto/cluster/src/trainer.png b/doc/howto/cluster/src/trainer.png
new file mode 100644
index 0000000000..6537d3d565
Binary files /dev/null and b/doc/howto/cluster/src/trainer.png differ
diff --git a/doc/howto/cluster/src/trainer_cn.png b/doc/howto/cluster/src/trainer_cn.png
new file mode 100644
index 0000000000..f9525739cc
Binary files /dev/null and b/doc/howto/cluster/src/trainer_cn.png differ
diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2.py b/doc/howto/cluster/src/word2vec/api_train_v2.py
similarity index 100%
rename from doc/howto/usage/cluster/src/word2vec/api_train_v2.py
rename to doc/howto/cluster/src/word2vec/api_train_v2.py
diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py b/doc/howto/cluster/src/word2vec/api_train_v2_cluster.py
similarity index 100%
rename from doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
rename to doc/howto/cluster/src/word2vec/api_train_v2_cluster.py
diff --git a/doc/howto/usage/cluster/src/word2vec/prepare.py b/doc/howto/cluster/src/word2vec/prepare.py
similarity index 100%
rename from doc/howto/usage/cluster/src/word2vec/prepare.py
rename to doc/howto/cluster/src/word2vec/prepare.py
diff --git a/doc/howto/usage/cmd_parameter/arguments_cn.md b/doc/howto/cmd_parameter/arguments_cn.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/arguments_cn.md
rename to doc/howto/cmd_parameter/arguments_cn.md
diff --git a/doc/howto/usage/cmd_parameter/arguments_en.md b/doc/howto/cmd_parameter/arguments_en.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/arguments_en.md
rename to doc/howto/cmd_parameter/arguments_en.md
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md b/doc/howto/cmd_parameter/detail_introduction_cn.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/detail_introduction_cn.md
rename to doc/howto/cmd_parameter/detail_introduction_cn.md
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_en.md b/doc/howto/cmd_parameter/detail_introduction_en.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/detail_introduction_en.md
rename to doc/howto/cmd_parameter/detail_introduction_en.md
diff --git a/doc/howto/usage/cmd_parameter/index_cn.rst b/doc/howto/cmd_parameter/index_cn.rst
similarity index 85%
rename from doc/howto/usage/cmd_parameter/index_cn.rst
rename to doc/howto/cmd_parameter/index_cn.rst
index 4c87298211..17b379f629 100644
--- a/doc/howto/usage/cmd_parameter/index_cn.rst
+++ b/doc/howto/cmd_parameter/index_cn.rst
@@ -1,6 +1,6 @@
 ..  _cmd_line_index:
 
-设置命令行参数
+命令行参数设置
 ===============
 
 ..  toctree::
diff --git a/doc/howto/usage/cmd_parameter/index_en.rst b/doc/howto/cmd_parameter/index_en.rst
similarity index 100%
rename from doc/howto/usage/cmd_parameter/index_en.rst
rename to doc/howto/cmd_parameter/index_en.rst
diff --git a/doc/howto/usage/cmd_parameter/use_case_cn.md b/doc/howto/cmd_parameter/use_case_cn.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/use_case_cn.md
rename to doc/howto/cmd_parameter/use_case_cn.md
diff --git a/doc/howto/usage/cmd_parameter/use_case_en.md b/doc/howto/cmd_parameter/use_case_en.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/use_case_en.md
rename to doc/howto/cmd_parameter/use_case_en.md
diff --git a/doc/howto/dev/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md
deleted file mode 120000
index c97564d93a..0000000000
--- a/doc/howto/dev/contribute_to_paddle_en.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../CONTRIBUTING.md
\ No newline at end of file
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index e0c69f7a6a..0c534f107b 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -1,37 +1,11 @@
-进阶指南
+进阶使用
 ========
 
-使用说明
---------
-
-..  toctree::
-  :maxdepth: 1
-
-  usage/cmd_parameter/index_cn.rst
-  usage/cluster/cluster_train_cn.md
-  usage/capi/index_cn.rst
-
-开发标准
---------
-
-..  toctree::
-  :maxdepth: 1
-
-  dev/contribute_to_paddle_cn.md
-  dev/write_docs_cn.rst
-
-模型配置
---------
-
-..  toctree::
-  :maxdepth: 1
-
-  deep_model/rnn/index_cn.rst
-
-性能优化
---------
-
 ..  toctree::
   :maxdepth: 1
 
+  cmd_parameter/index_cn.rst
+  cluster/index_cn.rst
+  capi/index_cn.rst
+  rnn/index_cn.rst
   optimization/gpu_profiling_cn.rst
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index 6d1bf7dfc0..ae8b86f75b 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -1,37 +1,10 @@
 HOW TO
 =======
 
-Usage
--------
-
-..  toctree::
-  :maxdepth: 1
-
-  usage/cmd_parameter/index_en.rst
-  usage/cluster/cluster_train_en.md
-
-Development
-------------
-
-..  toctree::
-  :maxdepth: 1
-
-  dev/new_layer_en.rst
-  dev/contribute_to_paddle_en.md
-  dev/write_docs_en.rst
-
-Configuration
--------------
-
-..  toctree::
-  :maxdepth: 1
-
-  deep_model/rnn/index_en.rst
-
-Optimization
--------------
-
 ..  toctree::
   :maxdepth: 1
 
+  cmd_parameter/index_en.rst
+  cluster/index_en.rst
+  rnn/index_en.rst
   optimization/gpu_profiling_en.rst
diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling_en.md
similarity index 100%
rename from doc/howto/optimization/cpu_profiling.md
rename to doc/howto/optimization/cpu_profiling_en.md
diff --git a/doc/howto/optimization/gpu_profiling_cn.rst b/doc/howto/optimization/gpu_profiling_cn.rst
index e2b0b0396e..0239eef4f1 100644
--- a/doc/howto/optimization/gpu_profiling_cn.rst
+++ b/doc/howto/optimization/gpu_profiling_cn.rst
@@ -1,6 +1,6 @@
-==================
-GPU性能分析与调优
-==================
+============
+GPU性能调优
+============
 
 ..  contents::
 
diff --git a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst b/doc/howto/rnn/hierarchical_layer_cn.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
rename to doc/howto/rnn/hierarchical_layer_cn.rst
diff --git a/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst b/doc/howto/rnn/hrnn_rnn_api_compare_cn.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst
rename to doc/howto/rnn/hrnn_rnn_api_compare_cn.rst
diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/rnn/index_cn.rst
similarity index 90%
rename from doc/howto/deep_model/rnn/index_cn.rst
rename to doc/howto/rnn/index_cn.rst
index 9ecab5594c..bcc8c2f46e 100644
--- a/doc/howto/deep_model/rnn/index_cn.rst
+++ b/doc/howto/rnn/index_cn.rst
@@ -1,4 +1,4 @@
-RNN相关模型
+RNN模型
 ===========
 
 ..  toctree::
diff --git a/doc/howto/deep_model/rnn/index_en.rst b/doc/howto/rnn/index_en.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/index_en.rst
rename to doc/howto/rnn/index_en.rst
diff --git a/doc/howto/deep_model/rnn/recurrent_group_cn.md b/doc/howto/rnn/recurrent_group_cn.md
similarity index 100%
rename from doc/howto/deep_model/rnn/recurrent_group_cn.md
rename to doc/howto/rnn/recurrent_group_cn.md
diff --git a/doc/howto/deep_model/rnn/rnn_config_cn.rst b/doc/howto/rnn/rnn_config_cn.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/rnn_config_cn.rst
rename to doc/howto/rnn/rnn_config_cn.rst
diff --git a/doc/howto/deep_model/rnn/rnn_config_en.rst b/doc/howto/rnn/rnn_config_en.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/rnn_config_en.rst
rename to doc/howto/rnn/rnn_config_en.rst
diff --git a/doc/howto/deep_model/rnn/src/bi_lstm.jpg b/doc/howto/rnn/src/bi_lstm.jpg
similarity index 100%
rename from doc/howto/deep_model/rnn/src/bi_lstm.jpg
rename to doc/howto/rnn/src/bi_lstm.jpg
diff --git a/doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png b/doc/howto/rnn/src/encoder-decoder-attention-model.png
similarity index 100%
rename from doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png
rename to doc/howto/rnn/src/encoder-decoder-attention-model.png
diff --git a/doc/howto/deep_model/rnn/src/glossary_rnn.dot b/doc/howto/rnn/src/glossary_rnn.dot
similarity index 100%
rename from doc/howto/deep_model/rnn/src/glossary_rnn.dot
rename to doc/howto/rnn/src/glossary_rnn.dot
diff --git a/doc/howto/deep_model/rnn/src/glossary_rnn_with_memory.dot b/doc/howto/rnn/src/glossary_rnn_with_memory.dot
similarity index 100%
rename from doc/howto/deep_model/rnn/src/glossary_rnn_with_memory.dot
rename to doc/howto/rnn/src/glossary_rnn_with_memory.dot
diff --git a/doc/howto/deep_model/rnn/src/simple_full_hierarchical_recurrent.dot b/doc/howto/rnn/src/simple_full_hierarchical_recurrent.dot
similarity index 100%
rename from doc/howto/deep_model/rnn/src/simple_full_hierarchical_recurrent.dot
rename to doc/howto/rnn/src/simple_full_hierarchical_recurrent.dot
diff --git a/doc/howto/deep_model/rnn/src/simple_full_recurrent.dot b/doc/howto/rnn/src/simple_full_recurrent.dot
similarity index 100%
rename from doc/howto/deep_model/rnn/src/simple_full_recurrent.dot
rename to doc/howto/rnn/src/simple_full_recurrent.dot
diff --git a/doc/index_cn.rst b/doc/index_cn.rst
index ada51c2d73..0f645db6fc 100644
--- a/doc/index_cn.rst
+++ b/doc/index_cn.rst
@@ -5,7 +5,7 @@ PaddlePaddle 文档
   :maxdepth: 1
 
   getstarted/index_cn.rst
+  build_and_install/index_cn.rst
   howto/index_cn.rst
-  api/index_cn.rst
+  dev/index_cn.rst
   faq/index_cn.rst
-  mobile/index_cn.rst
diff --git a/doc/index_en.rst b/doc/index_en.rst
index 23b64b6cad..166f56c28f 100644
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -5,6 +5,6 @@ PaddlePaddle Documentation
   :maxdepth: 1
 
   getstarted/index_en.rst
+  build_and_install/index_en.rst
   howto/index_en.rst
-  api/index_en.rst
-  mobile/index_en.rst
+  dev/index_en.rst
diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst
deleted file mode 100644
index 1d99666e58..0000000000
--- a/doc/mobile/index_cn.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-MOBILE
-======
-
-..  toctree::
-  :maxdepth: 1
-
-  cross_compiling_for_android_cn.md
-  cross_compiling_for_ios_cn.md
-  cross_compiling_for_raspberry_cn.md
diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst
deleted file mode 100644
index ef421dacad..0000000000
--- a/doc/mobile/index_en.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-MOBILE
-======
-
-..  toctree::
-  :maxdepth: 1
-
-  cross_compiling_for_android_en.md
-  cross_compiling_for_ios_en.md
-  cross_compiling_for_raspberry_en.md
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 3f9c132ef6..c7deba2ab4 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -19,12 +19,7 @@ else()
   endif()
 
   if(NOT ANDROID AND NOT IOS)
-    add_subdirectory(memory)
-    add_subdirectory(platform)
-    add_subdirectory(framework)
-    add_subdirectory(operators)
-    add_subdirectory(pybind)
-    add_subdirectory(inference)
+    add_subdirectory(fluid)
   endif()
 
   if(WITH_SWIG_PY)
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
new file mode 100644
index 0000000000..a6b4191518
--- /dev/null
+++ b/paddle/fluid/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_subdirectory(memory)
+add_subdirectory(platform)
+add_subdirectory(framework)
+add_subdirectory(operators)
+add_subdirectory(pybind)
+add_subdirectory(inference)
diff --git a/paddle/framework/.clang-format b/paddle/fluid/framework/.clang-format
similarity index 100%
rename from paddle/framework/.clang-format
rename to paddle/fluid/framework/.clang-format
diff --git a/paddle/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
similarity index 93%
rename from paddle/framework/CMakeLists.txt
rename to paddle/fluid/framework/CMakeLists.txt
index 8b71f73c36..ef1bc07c2d 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -20,10 +20,13 @@ endif()
 
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
+nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
 
+cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
+
 cc_test(variable_test SRCS variable_test.cc)
 
 cc_library(threadpool SRCS threadpool.cc DEPS enforce)
@@ -92,11 +95,4 @@ cc_test(init_test SRCS init_test.cc DEPS init)
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
       
-if(NOT WITH_C_API AND WITH_FLUID)
-  file(GLOB FRAMEWORK_HEADERS *.h)
-  install(FILES ${FRAMEWORK_HEADERS} DESTINATION include/paddle/framework)
-  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/framework.pb.h DESTINATION include/paddle/framework)
-  install(FILES details/cow_ptr.h details/op_registry.h DESTINATION include/paddle/framework/details)
-endif()
-
 cc_test(channel_test SRCS channel_test.cc)
diff --git a/paddle/framework/attribute.cc b/paddle/fluid/framework/attribute.cc
similarity index 97%
rename from paddle/framework/attribute.cc
rename to paddle/fluid/framework/attribute.cc
index 5074e8f5a0..1d7e7366b0 100644
--- a/paddle/framework/attribute.cc
+++ b/paddle/fluid/framework/attribute.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/attribute.h"
+#include "paddle/fluid/framework/attribute.h"
 
 #include <vector>
 
diff --git a/paddle/framework/attribute.h b/paddle/fluid/framework/attribute.h
similarity index 98%
rename from paddle/framework/attribute.h
rename to paddle/fluid/framework/attribute.h
index bcff9bc4c4..16be42ae71 100644
--- a/paddle/framework/attribute.h
+++ b/paddle/fluid/framework/attribute.h
@@ -20,9 +20,9 @@ limitations under the License. */
 #include <unordered_set>
 #include <vector>
 
-#include "paddle/framework/framework.pb.h"
-#include "paddle/framework/type_defs.h"
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/type_defs.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/backward.cc b/paddle/fluid/framework/backward.cc
similarity index 98%
rename from paddle/framework/backward.cc
rename to paddle/fluid/framework/backward.cc
index 85e693434a..c4795f4fc5 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/fluid/framework/backward.cc
@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/backward.h"
-#include "paddle/operators/net_op.h"
+#include "paddle/fluid/framework/backward.h"
+#include "paddle/fluid/operators/net_op.h"
 
 #include <deque>
 #include <list>
 #include <memory>
 #include <unordered_set>
 
-#include "paddle/framework/block_desc.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/net_op.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/net_op.h"
 
 namespace paddle {
 namespace framework {
@@ -534,7 +534,7 @@ ParamGradInfoMap AppendBackward(
   auto root_block = program_desc.MutableBlock(root_block_idx);
 
   std::string fill_one_op_out = GradVarName(target.Name());
-  bool is_scalar = target.Shape() == std::vector<int64_t>{1};
+  bool is_scalar = target.GetShape() == std::vector<int64_t>{1};
   PADDLE_ENFORCE(is_scalar, "target should be scalar");
   VLOG(3) << "backward from loss=" << target.Name()
           << " data_type=" << target.GetDataType();
@@ -565,7 +565,7 @@ ParamGradInfoMap AppendBackward(
 
   auto var = root_block->Var(fill_one_op_out);
   var->SetDataType(target.GetDataType());
-  var->SetShape(target.Shape());
+  var->SetShape(target.GetShape());
   auto& target_grad = retv[target.Name()];
   target_grad.name_ = fill_one_op_out;
   target_grad.block_idx_ = root_block_idx;
diff --git a/paddle/framework/backward.h b/paddle/fluid/framework/backward.h
similarity index 94%
rename from paddle/framework/backward.h
rename to paddle/fluid/framework/backward.h
index 69ee380236..2ea6922426 100644
--- a/paddle/framework/backward.h
+++ b/paddle/fluid/framework/backward.h
@@ -18,8 +18,8 @@ limitations under the License. */
 #include <unordered_map>
 #include <unordered_set>
 
-#include "paddle/framework/operator.h"
-#include "paddle/framework/program_desc.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/backward_test.cc b/paddle/fluid/framework/backward_test.cc
similarity index 99%
rename from paddle/framework/backward_test.cc
rename to paddle/fluid/framework/backward_test.cc
index 72743b5fd0..f9604c6891 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/fluid/framework/backward_test.cc
@@ -12,14 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/framework/backward.h"
+#include "paddle/fluid/framework/backward.h"
 
 #include <gtest/gtest.h>
-#include "paddle/framework/block_desc.h"
-#include "paddle/framework/op_desc.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/var_desc.h"
-#include "paddle/operators/net_op.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/operators/net_op.h"
 
 USE_NO_KERNEL_OP(fill_constant);
 
diff --git a/paddle/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc
similarity index 96%
rename from paddle/framework/block_desc.cc
rename to paddle/fluid/framework/block_desc.cc
index dd2ed87252..9550159155 100644
--- a/paddle/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/block_desc.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/program_desc.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
 
 namespace paddle {
 namespace framework {
@@ -162,9 +162,8 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
     : prog_(prog), desc_(desc) {
   need_update_ = true;
   for (auto &op : other.ops_) {
-    ops_.emplace_back(new OpDesc(*op, this));
+    ops_.emplace_back(new OpDesc(*op->Proto(), prog, this));
   }
-
   for (auto &it : other.vars_) {
     auto *var = new VarDesc(*it.second);
     vars_[it.first].reset(var);
diff --git a/paddle/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
similarity index 93%
rename from paddle/framework/block_desc.h
rename to paddle/fluid/framework/block_desc.h
index 4b609e4bcb..5f7eca3878 100644
--- a/paddle/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -20,10 +20,10 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
-#include "paddle/framework/op_desc.h"
-#include "paddle/framework/proto_desc.h"
-#include "paddle/framework/var_desc.h"
-#include "paddle/platform/macros.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/proto_desc.h"
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/platform/macros.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/channel.h b/paddle/fluid/framework/channel.h
similarity index 83%
rename from paddle/framework/channel.h
rename to paddle/fluid/framework/channel.h
index 0570980c5a..5acf4fb39b 100644
--- a/paddle/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -23,8 +23,8 @@ namespace framework {
 template <typename T>
 class Channel {
  public:
-  virtual void Send(T*) = 0;
-  virtual void Receive(T*) = 0;
+  virtual bool Send(T*) = 0;
+  virtual bool Receive(T*) = 0;
   virtual size_t Cap() = 0;
   virtual void Close() = 0;
   virtual ~Channel() {}
@@ -54,5 +54,5 @@ void CloseChannel(Channel<T>* ch) {
 }  // namespace framework
 }  // namespace paddle
 
-#include "paddle/framework/details/buffered_channel.h"
-#include "paddle/framework/details/unbuffered_channel.h"
+#include "paddle/fluid/framework/details/buffered_channel.h"
+#include "paddle/fluid/framework/details/unbuffered_channel.h"
diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc
new file mode 100644
index 0000000000..953fa40fec
--- /dev/null
+++ b/paddle/fluid/framework/channel_test.cc
@@ -0,0 +1,510 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/channel.h"
+
+#include <chrono>
+#include <thread>
+
+#include "gtest/gtest.h"
+
+using paddle::framework::Channel;
+using paddle::framework::MakeChannel;
+using paddle::framework::CloseChannel;
+using paddle::framework::details::Buffered;
+using paddle::framework::details::UnBuffered;
+
+void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
+  unsigned sum_send = 0;
+  std::thread t([&]() {
+    for (int i = 0; i < 5; i++) {
+      EXPECT_EQ(ch->Send(&i), true);
+      sum_send += i;
+    }
+  });
+  for (int i = 0; i < 5; i++) {
+    int recv;
+    EXPECT_EQ(ch->Receive(&recv), true);
+    EXPECT_EQ(recv, i);
+  }
+
+  CloseChannel(ch);
+  t.join();
+  EXPECT_EQ(sum_send, 10U);
+  delete ch;
+}
+
+TEST(Channel, MakeAndClose) {
+  using paddle::framework::details::Buffered;
+  using paddle::framework::details::UnBuffered;
+  {
+    // MakeChannel should return a buffered channel is buffer_size > 0.
+    auto ch = MakeChannel<int>(10);
+    EXPECT_NE(dynamic_cast<Buffered<int> *>(ch), nullptr);
+    EXPECT_EQ(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
+    CloseChannel(ch);
+    delete ch;
+  }
+  {
+    // MakeChannel should return an un-buffered channel is buffer_size = 0.
+    auto ch = MakeChannel<int>(0);
+    EXPECT_EQ(dynamic_cast<Buffered<int> *>(ch), nullptr);
+    EXPECT_NE(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
+    CloseChannel(ch);
+    delete ch;
+  }
+}
+
+TEST(Channel, SufficientBufferSizeDoesntBlock) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  for (size_t i = 0; i < buffer_size; ++i) {
+    EXPECT_EQ(ch->Send(&i), true);  // should not block
+  }
+
+  size_t out;
+  for (size_t i = 0; i < buffer_size; ++i) {
+    EXPECT_EQ(ch->Receive(&out), true);  // should not block
+    EXPECT_EQ(out, i);
+  }
+  CloseChannel(ch);
+  delete ch;
+}
+
+// This tests that a  channel must return false
+// on send and receive performed after closing the channel.
+// Receive will only return false after close when queue is empty.
+// By creating separate threads for sending and receiving, we make this
+// function able to test both buffered and unbuffered channels.
+void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
+  const size_t data = 5;
+  std::thread send_thread{[&]() {
+    size_t i = data;
+    EXPECT_EQ(ch->Send(&i), true);  // should not block
+  }};
+
+  std::thread recv_thread{[&]() {
+    size_t i;
+    EXPECT_EQ(ch->Receive(&i), true);  // should not block
+    EXPECT_EQ(i, data);
+  }};
+
+  send_thread.join();
+  recv_thread.join();
+
+  // After closing send should return false. Receive should
+  // also return false as there is no data in queue.
+  CloseChannel(ch);
+  send_thread = std::thread{[&]() {
+    size_t i = data;
+    EXPECT_EQ(ch->Send(&i), false);  // should return false
+  }};
+  recv_thread = std::thread{[&]() {
+    size_t i;
+    // should return false because channel is closed and queue is empty
+    EXPECT_EQ(ch->Receive(&i), false);
+  }};
+
+  send_thread.join();
+  recv_thread.join();
+}
+
+TEST(Channel, SendReceiveClosedBufferedChannelPanics) {
+  size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  SendReceiveWithACloseChannelShouldPanic(ch);
+  delete ch;
+}
+
+TEST(Channel, SendReceiveClosedUnBufferedChannelPanics) {
+  auto ch = MakeChannel<size_t>(0);
+  SendReceiveWithACloseChannelShouldPanic(ch);
+  delete ch;
+}
+
+TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+
+  for (size_t i = 0; i < buffer_size; ++i) {
+    EXPECT_EQ(ch->Send(&i), true);  // sending should not block
+  }
+
+  size_t out;
+  for (size_t i = 0; i < buffer_size / 2; ++i) {
+    EXPECT_EQ(ch->Receive(&out), true);  // receiving should not block
+    EXPECT_EQ(out, i);
+  }
+
+  CloseChannel(ch);
+
+  for (size_t i = buffer_size / 2; i < buffer_size; ++i) {
+    EXPECT_EQ(ch->Receive(&out),
+              true);  // receving should return residual values.
+    EXPECT_EQ(out, i);
+  }
+
+  for (size_t i = 0; i < buffer_size; ++i) {
+    EXPECT_EQ(ch->Receive(&out),
+              false);  // receiving on closed channel should return false
+  }
+  delete ch;
+}
+
+TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  size_t sum = 0;
+  std::thread t([&]() {
+    // Try to write more than buffer size.
+    for (size_t i = 0; i < 2 * buffer_size; ++i) {
+      if (i < buffer_size)
+        EXPECT_EQ(ch->Send(&i), true);  // should block after 10 iterations
+      else
+        EXPECT_EQ(ch->Send(&i), false);
+      sum += i;
+    }
+  });
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+  EXPECT_EQ(sum, 45U);
+
+  CloseChannel(ch);
+  t.join();
+  delete ch;
+}
+
+TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) {
+  auto ch = MakeChannel<int>(0);
+  RecevingOrderEqualToSendingOrder(ch);
+}
+
+TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel) {
+  auto ch = MakeChannel<int>(10);
+  RecevingOrderEqualToSendingOrder(ch);
+}
+
+void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+
+  // Launches threads that try to read and are blocked because of no writers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          EXPECT_EQ(ch->Receive(&data), false);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+
+  // Verify that all the threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+
+  // Explicitly close the channel
+  // This should unblock all receivers
+  CloseChannel(ch);
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+void ChannelCloseUnblocksSendersTest(Channel<int> *ch) {
+  using paddle::framework::details::Buffered;
+  using paddle::framework::details::UnBuffered;
+
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  bool send_success[num_threads];
+
+  // Launches threads that try to write and are blocked because of no readers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    send_success[i] = false;
+    t[i] = std::thread(
+        [&](bool *ended, bool *success) {
+          int data = 10;
+          *success = ch->Send(&data);
+          *ended = true;
+        },
+        &thread_ended[i], &send_success[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  if (dynamic_cast<Buffered<int> *>(ch)) {
+    // If ch is Buffered, atleast 4 threads must be blocked.
+    int ct = 0;
+    for (size_t i = 0; i < num_threads; i++) {
+      if (!thread_ended[i]) ct++;
+    }
+    EXPECT_GE(ct, 4);
+  } else {
+    // If ch is UnBuffered, all the threads should be blocked.
+    for (size_t i = 0; i < num_threads; i++) {
+      EXPECT_EQ(thread_ended[i], false);
+    }
+  }
+  // Explicitly close the thread
+  // This should unblock all senders
+  CloseChannel(ch);
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  if (dynamic_cast<Buffered<int> *>(ch)) {
+    // Verify that only 1 send was successful
+    int ct = 0;
+    for (size_t i = 0; i < num_threads; i++) {
+      if (send_success[i]) ct++;
+    }
+    // Only 1 send must be successful
+    EXPECT_EQ(ct, 1);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+// This tests that closing a buffered channel also unblocks
+//  any receivers waiting on the channel
+TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
+  auto ch = MakeChannel<int>(1);
+  ChannelCloseUnblocksReceiversTest(ch);
+  delete ch;
+}
+
+// This tests that closing a buffered channel also unblocks
+//  any senders waiting for channel to have write space
+TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
+  auto ch = MakeChannel<int>(1);
+  ChannelCloseUnblocksSendersTest(ch);
+  delete ch;
+}
+
+// This tests that closing an unbuffered channel also unblocks
+//  unblocks any receivers waiting for senders
+TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
+  auto ch = MakeChannel<int>(0);
+  ChannelCloseUnblocksReceiversTest(ch);
+  delete ch;
+}
+
+// This tests that closing an unbuffered channel also unblocks
+//  unblocks any senders waiting for senders
+TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
+  auto ch = MakeChannel<int>(0);
+  ChannelCloseUnblocksReceiversTest(ch);
+  delete ch;
+}
+
+TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
+  auto ch = MakeChannel<int>(0);
+  unsigned sum_send = 0;
+  // Send should block after three iterations
+  // since we only have three receivers.
+  std::thread t([&]() {
+    // Try to send more number of times
+    // than receivers
+    for (int i = 0; i < 4; i++) {
+      ch->Send(&i);
+      sum_send += i;
+    }
+  });
+  for (int i = 0; i < 3; i++) {
+    int recv;
+    ch->Receive(&recv);
+    EXPECT_EQ(recv, i);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.5 sec
+  EXPECT_EQ(sum_send, 3U);
+
+  CloseChannel(ch);
+  t.join();
+  delete ch;
+}
+
+TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
+  auto ch = MakeChannel<int>(0);
+  unsigned sum_send = 0;
+  unsigned sum_receive = 0;
+  // The receiver should block after 5
+  // iterations, since there are only 5 senders.
+  std::thread t([&]() {
+    for (int i = 0; i < 8; i++) {
+      int recv;
+      ch->Receive(&recv);  // should block after the fifth iteration.
+      EXPECT_EQ(recv, i);
+      sum_receive += i;
+    }
+  });
+  for (int i = 0; i < 5; i++) {
+    ch->Send(&i);
+    sum_send += i;
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  EXPECT_EQ(sum_send, 10U);
+  EXPECT_EQ(sum_receive, 10U);
+  // send three more elements
+  for (int i = 5; i < 8; i++) {
+    ch->Send(&i);
+    sum_send += i;
+  }
+
+  CloseChannel(ch);
+  t.join();
+  EXPECT_EQ(sum_send, 28U);
+  EXPECT_EQ(sum_receive, 28U);
+  delete ch;
+}
+
+// This tests that destroying a channel unblocks
+//  any senders waiting for channel to have write space
+void ChannelDestroyUnblockSenders(Channel<int> *ch) {
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  bool send_success[num_threads];
+
+  // Launches threads that try to write and are blocked because of no readers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    send_success[i] = false;
+    t[i] = std::thread(
+        [&](bool *ended, bool *success) {
+          int data = 10;
+          *success = ch->Send(&data);
+          *ended = true;
+        },
+        &thread_ended[i], &send_success[i]);
+  }
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  bool is_buffered_channel = false;
+  if (dynamic_cast<Buffered<int> *>(ch)) is_buffered_channel = true;
+
+  if (is_buffered_channel) {
+    // If channel is buffered, verify that atleast 4 threads are blocked
+    int ct = 0;
+    for (size_t i = 0; i < num_threads; i++) {
+      if (thread_ended[i] == false) ct++;
+    }
+    // Atleast 4 threads must be blocked
+    EXPECT_GE(ct, 4);
+  } else {
+    // Verify that all the threads are blocked
+    for (size_t i = 0; i < num_threads; i++) {
+      EXPECT_EQ(thread_ended[i], false);
+    }
+  }
+  // Explicitly destroy the channel
+  delete ch;
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  // Count number of successfuld sends
+  int ct = 0;
+  for (size_t i = 0; i < num_threads; i++) {
+    if (send_success[i]) ct++;
+  }
+
+  if (is_buffered_channel) {
+    // Only 1 send must be successful
+    EXPECT_EQ(ct, 1);
+  } else {
+    // In unbuffered channel, no send should be successful
+    EXPECT_EQ(ct, 0);
+  }
+
+  // Join all threads
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+// This tests that destroying a channel also unblocks
+//  any receivers waiting on the channel
+void ChannelDestroyUnblockReceivers(Channel<int> *ch) {
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+
+  // Launches threads that try to read and are blocked because of no writers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          // All reads should return false
+          EXPECT_EQ(ch->Receive(&data), false);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  // Verify that all threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+  // delete the channel
+  delete ch;
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
+  size_t buffer_size = 1;
+  auto ch = MakeChannel<int>(buffer_size);
+  ChannelDestroyUnblockReceivers(ch);
+}
+
+TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) {
+  size_t buffer_size = 1;
+  auto ch = MakeChannel<int>(buffer_size);
+  ChannelDestroyUnblockSenders(ch);
+}
+
+// This tests that destroying an unbuffered channel also unblocks
+//  unblocks any receivers waiting for senders
+TEST(Channel, UnbufferedChannelDestroyUnblocksReceiversTest) {
+  auto ch = MakeChannel<int>(0);
+  ChannelDestroyUnblockReceivers(ch);
+}
+
+TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) {
+  auto ch = MakeChannel<int>(0);
+  ChannelDestroyUnblockSenders(ch);
+}
diff --git a/paddle/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc
similarity index 96%
rename from paddle/framework/data_device_transform.cc
rename to paddle/fluid/framework/data_device_transform.cc
index 5daf5a4e0a..3c6dd28455 100644
--- a/paddle/framework/data_device_transform.cc
+++ b/paddle/fluid/framework/data_device_transform.cc
@@ -11,7 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/data_device_transform.h"
+#include "paddle/fluid/framework/data_device_transform.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/data_device_transform.h b/paddle/fluid/framework/data_device_transform.h
similarity index 81%
rename from paddle/framework/data_device_transform.h
rename to paddle/fluid/framework/data_device_transform.h
index 39750a85f2..0c4559f586 100644
--- a/paddle/framework/data_device_transform.h
+++ b/paddle/fluid/framework/data_device_transform.h
@@ -13,10 +13,10 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/framework/tensor_util.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu
similarity index 93%
rename from paddle/framework/data_device_transform_test.cu
rename to paddle/fluid/framework/data_device_transform_test.cu
index efc05b3106..f740f9b326 100644
--- a/paddle/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@@ -14,13 +14,13 @@ limitations under the License. */
 
 #include "gtest/gtest.h"
 
-#include "paddle/framework/init.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/op_info.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/elementwise_op_function.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/init.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/data_layout.h b/paddle/fluid/framework/data_layout.h
similarity index 97%
rename from paddle/framework/data_layout.h
rename to paddle/fluid/framework/data_layout.h
index 31817251ed..b72f13f2e8 100644
--- a/paddle/framework/data_layout.h
+++ b/paddle/fluid/framework/data_layout.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <cctype>
 #include <ostream>
 
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
similarity index 96%
rename from paddle/framework/data_layout_transform.cc
rename to paddle/fluid/framework/data_layout_transform.cc
index 9d0a6d5ea3..c546a508fe 100644
--- a/paddle/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/framework/data_layout_transform.h"
+#include "paddle/fluid/framework/data_layout_transform.h"
 
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h
similarity index 87%
rename from paddle/framework/data_layout_transform.h
rename to paddle/fluid/framework/data_layout_transform.h
index 368f7fc989..862405fbf4 100644
--- a/paddle/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -14,9 +14,9 @@
 
 #pragma once
 
-#include "paddle/framework/op_kernel_type.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/framework/variable.h"
+#include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/variable.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/data_layout_transform_test.cc b/paddle/fluid/framework/data_layout_transform_test.cc
similarity index 93%
rename from paddle/framework/data_layout_transform_test.cc
rename to paddle/fluid/framework/data_layout_transform_test.cc
index 093e8d4d34..99eb46bde3 100644
--- a/paddle/framework/data_layout_transform_test.cc
+++ b/paddle/fluid/framework/data_layout_transform_test.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/framework/data_layout_transform.h"
+#include "paddle/fluid/framework/data_layout_transform.h"
 
 #include "gtest/gtest.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/platform/device_context.h"
 
 TEST(DataTransform, DataLayoutFunction) {
   using namespace paddle::framework;
diff --git a/paddle/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc
similarity index 92%
rename from paddle/framework/data_transform.cc
rename to paddle/fluid/framework/data_transform.cc
index b6fd46401f..9575d01af8 100644
--- a/paddle/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/data_transform.h"
+#include "paddle/fluid/framework/data_transform.h"
 
-#include "paddle/framework/data_device_transform.h"
-#include "paddle/framework/data_layout_transform.h"
-#include "paddle/framework/data_type_transform.h"
+#include "paddle/fluid/framework/data_device_transform.h"
+#include "paddle/fluid/framework/data_layout_transform.h"
+#include "paddle/fluid/framework/data_type_transform.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/data_transform.h b/paddle/fluid/framework/data_transform.h
similarity index 73%
rename from paddle/framework/data_transform.h
rename to paddle/fluid/framework/data_transform.h
index a4b7890237..70d3a174ac 100644
--- a/paddle/framework/data_transform.h
+++ b/paddle/fluid/framework/data_transform.h
@@ -18,14 +18,14 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#include "paddle/framework/op_kernel_type.h"
-#include "paddle/framework/selected_rows.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/framework/variable.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/macros.h"
-#include "paddle/platform/transform.h"
+#include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/macros.h"
+#include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/data_type.h b/paddle/fluid/framework/data_type.h
similarity index 97%
rename from paddle/framework/data_type.h
rename to paddle/fluid/framework/data_type.h
index 98eb3e857d..7a527f0d0c 100644
--- a/paddle/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 #include <typeindex>
-#include "paddle/framework/framework.pb.h"
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc
similarity index 95%
rename from paddle/framework/data_type_transform.cc
rename to paddle/fluid/framework/data_type_transform.cc
index 7df1cc6b75..6921927305 100644
--- a/paddle/framework/data_type_transform.cc
+++ b/paddle/fluid/framework/data_type_transform.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/data_type_transform.h"
+#include "paddle/fluid/framework/data_type_transform.h"
 
-#include "paddle/framework/selected_rows.h"
-#include "paddle/platform/transform.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/data_type_transform.h b/paddle/fluid/framework/data_type_transform.h
similarity index 83%
rename from paddle/framework/data_type_transform.h
rename to paddle/fluid/framework/data_type_transform.h
index 067c0c2a5b..830cced093 100644
--- a/paddle/framework/data_type_transform.h
+++ b/paddle/fluid/framework/data_type_transform.h
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/op_kernel_type.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/framework/variable.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/data_type_transform_test.cc b/paddle/fluid/framework/data_type_transform_test.cc
similarity index 96%
rename from paddle/framework/data_type_transform_test.cc
rename to paddle/fluid/framework/data_type_transform_test.cc
index 89d32f5283..88dbc51b21 100644
--- a/paddle/framework/data_type_transform_test.cc
+++ b/paddle/fluid/framework/data_type_transform_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/data_type_transform.h"
+#include "paddle/fluid/framework/data_type_transform.h"
 
 #include "gtest/gtest.h"
 
diff --git a/paddle/framework/ddim.cc b/paddle/fluid/framework/ddim.cc
similarity index 98%
rename from paddle/framework/ddim.cc
rename to paddle/fluid/framework/ddim.cc
index 8b6f42b82d..f063ee2e6d 100644
--- a/paddle/framework/ddim.cc
+++ b/paddle/fluid/framework/ddim.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/ddim.h"
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/ddim.h b/paddle/fluid/framework/ddim.h
similarity index 96%
rename from paddle/framework/ddim.h
rename to paddle/fluid/framework/ddim.h
index 4ca5e49566..750ab787ab 100644
--- a/paddle/framework/ddim.h
+++ b/paddle/fluid/framework/ddim.h
@@ -17,9 +17,9 @@ limitations under the License. */
 #include <initializer_list>
 #include <stdexcept>
 #include <vector>
-#include "paddle/framework/dim.h"
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/variant.h"
+#include "paddle/fluid/framework/dim.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/variant.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/ddim_test.cc b/paddle/fluid/framework/ddim_test.cc
similarity index 98%
rename from paddle/framework/ddim_test.cc
rename to paddle/fluid/framework/ddim_test.cc
index bc259d1f60..18d305a403 100644
--- a/paddle/framework/ddim_test.cc
+++ b/paddle/fluid/framework/ddim_test.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <vector>
 
 #include "gtest/gtest.h"
-#include "paddle/framework/ddim.h"
+#include "paddle/fluid/framework/ddim.h"
 
 TEST(DDim, Equality) {
   // construct a DDim from an initialization list
diff --git a/paddle/framework/details/buffered_channel.h b/paddle/fluid/framework/details/buffered_channel.h
similarity index 58%
rename from paddle/framework/details/buffered_channel.h
rename to paddle/fluid/framework/details/buffered_channel.h
index 9c806461aa..88faf3acf7 100644
--- a/paddle/framework/details/buffered_channel.h
+++ b/paddle/fluid/framework/details/buffered_channel.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,25 +13,34 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <atomic>
 #include <condition_variable>
 #include <deque>
 #include <mutex>
 
-#include "paddle/framework/channel.h"
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
+// Four of the properties of Buffered Channel:
+// - A send to a full channel blocks temporarily until a receive from the
+// channel or the channel is closed.
+// - A receive from an empty channel blocks temporarily until a send to the
+// channel or the channel is closed.
+// - A send to a closed channel returns false immediately.
+// - A receive from a closed channel returns false immediately.
+
 template <typename T>
 class Buffered : public paddle::framework::Channel<T> {
   friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
   friend void paddle::framework::CloseChannel<T>(Channel<T>*);
 
  public:
-  virtual void Send(T*);
-  virtual void Receive(T*);
+  virtual bool Send(T*);
+  virtual bool Receive(T*);
   virtual size_t Cap() { return cap_; }
   virtual void Close();
   virtual ~Buffered();
@@ -41,19 +50,26 @@ class Buffered : public paddle::framework::Channel<T> {
   std::mutex mu_;
   std::condition_variable empty_cond_var_;
   std::condition_variable full_cond_var_;
+  std::condition_variable destructor_cond_var_;
   std::deque<T> channel_;
-  bool closed_;
+  std::atomic<bool> closed_{false};
+  std::atomic<unsigned> send_ctr{0};
+  std::atomic<unsigned> recv_ctr{0};
 
   Buffered(size_t cap) : cap_(cap), closed_(false) {
     PADDLE_ENFORCE_GT(cap, 0);
   }
 
-  void NotifyAllSenders(std::unique_lock<std::mutex>*);
   void NotifyAllParticipants(std::unique_lock<std::mutex>*);
 };
 
 template <typename T>
-void Buffered<T>::Send(T* item) {
+bool Buffered<T>::Send(T* item) {
+  bool ret = false;
+  if (closed_) {
+    return ret;
+  }
+  send_ctr++;
   std::unique_lock<std::mutex> lock(mu_);
   full_cond_var_.wait(lock,
                       [this]() { return channel_.size() < cap_ || closed_; });
@@ -61,24 +77,40 @@ void Buffered<T>::Send(T* item) {
     channel_.push_back(std::move(*item));
     lock.unlock();
     empty_cond_var_.notify_one();
+    ret = true;
   }
+  send_ctr--;
+  destructor_cond_var_.notify_one();
+  return ret;
 }
 
 template <typename T>
-void Buffered<T>::Receive(T* item) {
+bool Buffered<T>::Receive(T* item) {
+  bool ret = false;
+  // Once the channel has been closed and all data has been consumed,
+  // just return false. Don't even try acquiring the mutex.
+  if (closed_ && channel_.empty()) {
+    return false;
+  }
+  recv_ctr++;
   std::unique_lock<std::mutex> lock(mu_);
   empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
-  if (!closed_) {
+  if (!channel_.empty()) {
     *item = std::move(channel_.front());
     channel_.pop_front();
-    NotifyAllSenders(&lock);
-  } else {
-    item = nullptr;
+    full_cond_var_.notify_one();
+    ret = true;
   }
+  recv_ctr--;
+  destructor_cond_var_.notify_one();
+  return ret;
 }
 
 template <typename T>
 void Buffered<T>::Close() {
+  if (closed_) {
+    return;
+  }
   std::unique_lock<std::mutex> lock(mu_);
   closed_ = true;
   NotifyAllParticipants(&lock);
@@ -90,12 +122,12 @@ Buffered<T>::~Buffered() {
   closed_ = true;
   channel_.clear();
   NotifyAllParticipants(&lock);
-}
 
-template <typename T>
-void Buffered<T>::NotifyAllSenders(std::unique_lock<std::mutex>* lock) {
-  lock->unlock();
-  full_cond_var_.notify_all();
+  // The destructor must wait for all readers and writers to complete their task
+  // The channel has been closed, so we will not accept new readers and writers
+  lock.lock();
+  destructor_cond_var_.wait(
+      lock, [this]() { return send_ctr == 0 && recv_ctr == 0; });
 }
 
 template <typename T>
diff --git a/paddle/framework/details/cow_ptr.h b/paddle/fluid/framework/details/cow_ptr.h
similarity index 97%
rename from paddle/framework/details/cow_ptr.h
rename to paddle/fluid/framework/details/cow_ptr.h
index 7e308ffb5a..69bcea6252 100644
--- a/paddle/framework/details/cow_ptr.h
+++ b/paddle/fluid/framework/details/cow_ptr.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
diff --git a/paddle/framework/details/cow_ptr_test.cc b/paddle/fluid/framework/details/cow_ptr_test.cc
similarity index 89%
rename from paddle/framework/details/cow_ptr_test.cc
rename to paddle/fluid/framework/details/cow_ptr_test.cc
index 936954a233..d2142af277 100644
--- a/paddle/framework/details/cow_ptr_test.cc
+++ b/paddle/fluid/framework/details/cow_ptr_test.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/framework/details/cow_ptr.h"
+#include "paddle/fluid/framework/details/cow_ptr.h"
 #include "gtest/gtest.h"
 
 namespace paddle {
diff --git a/paddle/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h
similarity index 93%
rename from paddle/framework/details/op_registry.h
rename to paddle/fluid/framework/details/op_registry.h
index 6d50e820b2..d73604ad18 100644
--- a/paddle/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/grad_op_desc_maker.h"
-#include "paddle/framework/op_info.h"
-#include "paddle/framework/op_proto_maker.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/var_type_inference.h"
+#include "paddle/fluid/framework/grad_op_desc_maker.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/var_type_inference.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/details/unbuffered_channel.h b/paddle/fluid/framework/details/unbuffered_channel.h
similarity index 76%
rename from paddle/framework/details/unbuffered_channel.h
rename to paddle/fluid/framework/details/unbuffered_channel.h
index 0dc5afd7e5..5c9424928c 100644
--- a/paddle/framework/details/unbuffered_channel.h
+++ b/paddle/fluid/framework/details/unbuffered_channel.h
@@ -17,20 +17,27 @@ limitations under the License. */
 #include <condition_variable>
 #include <mutex>
 
-#include "paddle/framework/channel.h"
+#include "paddle/fluid/framework/channel.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
+// Four of the properties of UnBuffered Channel:
+// - A send to a channel blocks temporarily until a receive from the
+// channel or the channel is closed.
+// - A receive from a channel blocks temporarily until a send to the
+// channel or the channel is closed.
+// - A send to a closed channel returns false immediately.
+// - A receive from a closed channel returns false immediately.
 template <typename T>
 class UnBuffered : public paddle::framework::Channel<T> {
   friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
   friend void paddle::framework::CloseChannel<T>(Channel<T>*);
 
  public:
-  virtual void Send(T*);
-  virtual void Receive(T*);
+  virtual bool Send(T*);
+  virtual bool Receive(T*);
   virtual size_t Cap() { return 0; }
   virtual void Close();
   virtual ~UnBuffered();
@@ -45,9 +52,11 @@ class UnBuffered : public paddle::framework::Channel<T> {
   // A transaction occurs only when both are true
   std::atomic<bool> reader_found_{false}, writer_found_{false};
   std::condition_variable cv_channel_;
-  std::condition_variable_any cv_reader_, cv_writer_;
+  std::condition_variable_any cv_reader_, cv_writer_, cv_destructor_;
   T* item{nullptr};
   std::atomic<bool> closed_{false};
+  std::atomic<unsigned> send_ctr{0};
+  std::atomic<unsigned> recv_ctr{0};
 
   UnBuffered() : closed_(false) {}
 
@@ -57,7 +66,12 @@ class UnBuffered : public paddle::framework::Channel<T> {
 // This function implements the concept of how data should
 // be sent from a writer to a reader.
 template <typename T>
-void UnBuffered<T>::Send(T* data) {
+bool UnBuffered<T>::Send(T* data) {
+  bool ret = false;
+  if (closed_) {
+    return ret;
+  }
+  send_ctr++;
   // Prevent other writers from entering
   std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
   writer_found_ = true;
@@ -74,14 +88,24 @@ void UnBuffered<T>::Send(T* data) {
     channel_lock.lock();
     cv_channel_.wait(channel_lock,
                      [this]() { return item == nullptr || closed_; });
+    ret = true;
   }
   writer_found_ = false;
+  send_ctr--;
+  cv_destructor_.notify_one();
+  return ret;
 }
 
 // This function implements the concept of how
 // data that was sent by a writer is read from a reader.
 template <typename T>
-void UnBuffered<T>::Receive(T* data) {
+bool UnBuffered<T>::Receive(T* data) {
+  bool ret = false;
+  // If channel is closed, we don't even want any reader to enter.
+  // Unlike a buffered channel, an unbuffered channel does not allow
+  // readers to read after closing because there is no buffer to be consumed.
+  if (closed_) return ret;
+  recv_ctr++;
   // Prevent other readers from entering
   std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
   reader_found_ = true;
@@ -98,16 +122,23 @@ void UnBuffered<T>::Receive(T* data) {
       *data = std::move(*item);
       item = nullptr;
       lock_ch.unlock();
+      ret = true;
     }
     cv_channel_.notify_one();
   }
   reader_found_ = false;
+  recv_ctr--;
+  cv_destructor_.notify_one();
+  return ret;
 }
 
 // This function implements the sequence of events
 // that take place once the channel is closed.
 template <typename T>
 void UnBuffered<T>::Close() {
+  if (closed_) {
+    return;
+  }
   std::unique_lock<std::mutex> lock(mu_ch_);
   item = nullptr;
   closed_ = true;
@@ -123,6 +154,9 @@ UnBuffered<T>::~UnBuffered() {
   item = nullptr;
   closed_ = true;
   NotifyAllParticipants(&lock);
+  lock.lock();
+  cv_destructor_.wait(lock,
+                      [this]() { return send_ctr == 0 && recv_ctr == 0; });
 }
 
 // This function notifies all the readers, writers and
diff --git a/paddle/framework/dim.h b/paddle/fluid/framework/dim.h
similarity index 99%
rename from paddle/framework/dim.h
rename to paddle/fluid/framework/dim.h
index ec17d7c615..3938fd3df5 100644
--- a/paddle/framework/dim.h
+++ b/paddle/fluid/framework/dim.h
@@ -18,8 +18,8 @@
 #include <stdexcept>
 #include <type_traits>
 
-#include "paddle/platform/assert.h"
-#include "paddle/platform/hostdevice.h"
+#include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/hostdevice.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/dim_test.cu b/paddle/fluid/framework/dim_test.cu
similarity index 98%
rename from paddle/framework/dim_test.cu
rename to paddle/fluid/framework/dim_test.cu
index 2bcab7c5c2..0f1969d797 100644
--- a/paddle/framework/dim_test.cu
+++ b/paddle/fluid/framework/dim_test.cu
@@ -15,7 +15,7 @@
 #include <sstream>
 
 #include "gtest/gtest.h"
-#include "paddle/framework/dim.h"
+#include "paddle/fluid/framework/dim.h"
 
 __global__ void test(paddle::framework::Dim<2>* o) {
   o[0] = paddle::framework::make_dim(5, 6);
diff --git a/paddle/framework/eigen.h b/paddle/fluid/framework/eigen.h
similarity index 98%
rename from paddle/framework/eigen.h
rename to paddle/fluid/framework/eigen.h
index 54bbeafcab..d1b8c701a7 100644
--- a/paddle/framework/eigen.h
+++ b/paddle/fluid/framework/eigen.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/tensor.h"
+#include "paddle/fluid/framework/tensor.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
 namespace paddle {
diff --git a/paddle/framework/eigen_test.cc b/paddle/fluid/framework/eigen_test.cc
similarity index 98%
rename from paddle/framework/eigen_test.cc
rename to paddle/fluid/framework/eigen_test.cc
index 9e368a522c..f9e3abeccb 100644
--- a/paddle/framework/eigen_test.cc
+++ b/paddle/fluid/framework/eigen_test.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/framework/eigen.h"
+#include "paddle/fluid/framework/eigen.h"
 #include <gtest/gtest.h>
 
 namespace paddle {
diff --git a/paddle/framework/executor.cc b/paddle/fluid/framework/executor.cc
similarity index 94%
rename from paddle/framework/executor.cc
rename to paddle/fluid/framework/executor.cc
index 9a232b0843..816ad8d659 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -12,18 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/executor.h"
+#include "paddle/fluid/framework/executor.h"
 
 #include <set>
 
 #include "gflags/gflags.h"
-#include "paddle/framework/feed_fetch_method.h"
-#include "paddle/framework/feed_fetch_type.h"
-#include "paddle/framework/lod_rank_table.h"
-#include "paddle/framework/lod_tensor_array.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/platform/place.h"
-#include "paddle/platform/profiler.h"
+#include "paddle/fluid/framework/feed_fetch_method.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/profiler.h"
 
 DECLARE_bool(benchmark);
 DEFINE_bool(check_nan_inf, false,
@@ -52,11 +53,13 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
     var->GetMutable<LoDTensorArray>();
   } else if (var_type == proto::VarDesc::PLACE_LIST) {
     var->GetMutable<platform::PlaceList>();
+  } else if (var_type == proto::VarDesc::READER) {
+    var->GetMutable<ReaderHolder>();
   } else {
     PADDLE_THROW(
         "Variable type %d is not in "
-        "[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE,"
-        " PLACE_LIST]",
+        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
+        "LOD_RANK_TABLE, PLACE_LIST, READER]",
         var_type);
   }
 }
diff --git a/paddle/framework/executor.h b/paddle/fluid/framework/executor.h
similarity index 87%
rename from paddle/framework/executor.h
rename to paddle/fluid/framework/executor.h
index 035ff48a52..893c949939 100644
--- a/paddle/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/op_info.h"
-#include "paddle/framework/program_desc.h"
-#include "paddle/framework/scope.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
similarity index 95%
rename from paddle/framework/feed_fetch_method.cc
rename to paddle/fluid/framework/feed_fetch_method.cc
index 21201b6755..a9bb17355d 100644
--- a/paddle/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/feed_fetch_method.h"
+#include "paddle/fluid/framework/feed_fetch_method.h"
 #include "glog/logging.h"
-#include "paddle/framework/variable.h"
+#include "paddle/fluid/framework/variable.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h
similarity index 90%
rename from paddle/framework/feed_fetch_method.h
rename to paddle/fluid/framework/feed_fetch_method.h
index b71945fcc8..5355c29047 100644
--- a/paddle/framework/feed_fetch_method.h
+++ b/paddle/fluid/framework/feed_fetch_method.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/feed_fetch_type.h"
-#include "paddle/framework/scope.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/scope.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h
similarity index 95%
rename from paddle/framework/feed_fetch_type.h
rename to paddle/fluid/framework/feed_fetch_type.h
index 168f456675..4281e36b13 100644
--- a/paddle/framework/feed_fetch_type.h
+++ b/paddle/fluid/framework/feed_fetch_type.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include <string>
 #include <vector>
-#include "paddle/framework/lod_tensor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/framework.proto b/paddle/fluid/framework/framework.proto
similarity index 93%
rename from paddle/framework/framework.proto
rename to paddle/fluid/framework/framework.proto
index 5b6ef03f61..d7be1a7352 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -116,6 +116,8 @@ message LoDTensorArrayDesc {
   optional int32 lod_level = 2 [ default = 0 ];
 }
 
+message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; }
+
 message VarDesc {
   enum VarType {
     LOD_TENSOR = 1;
@@ -126,13 +128,15 @@ message VarDesc {
     LOD_RANK_TABLE = 6;
     LOD_TENSOR_ARRAY = 7;
     PLACE_LIST = 8;
+    READER = 9;
   }
   required string name = 1;
   required VarType type = 2;
-  optional LoDTensorDesc lod_tensor = 3;
-  optional TensorDesc selected_rows = 4;
+  optional bool persistable = 3 [ default = false ];
+  optional LoDTensorDesc lod_tensor = 4;
+  optional TensorDesc selected_rows = 5;
   optional LoDTensorArrayDesc tensor_array = 6;
-  optional bool persistable = 5 [ default = false ];
+  optional ReaderDesc reader = 7;
 }
 
 message BlockDesc {
diff --git a/paddle/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h
similarity index 96%
rename from paddle/framework/grad_op_desc_maker.h
rename to paddle/fluid/framework/grad_op_desc_maker.h
index 2082f8bb76..21dd4e8854 100644
--- a/paddle/framework/grad_op_desc_maker.h
+++ b/paddle/fluid/framework/grad_op_desc_maker.h
@@ -16,8 +16,8 @@ limitations under the License. */
 #include <string>
 #include <unordered_set>
 #include <vector>
-#include "paddle/framework/op_desc.h"
-#include "paddle/framework/operator.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace framework {
@@ -122,6 +122,11 @@ class GradOpDescMakerBase {
     return it->second;
   }
 
+  template <typename T>
+  inline const T& Attr(const std::string& name) const {
+    return boost::get<T>(GetAttr(name));
+  }
+
   std::string ForwardOpType() const { return this->fwd_op_.Type(); }
 
  private:
diff --git a/paddle/framework/init.cc b/paddle/fluid/framework/init.cc
similarity index 92%
rename from paddle/framework/init.cc
rename to paddle/fluid/framework/init.cc
index 3f6ea121b3..cb2d740d86 100644
--- a/paddle/framework/init.cc
+++ b/paddle/fluid/framework/init.cc
@@ -16,10 +16,10 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 
-#include "paddle/framework/init.h"
-#include "paddle/framework/operator.h"
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/place.h"
+#include "paddle/fluid/framework/init.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/place.h"
 #include "paddle/string/piece.h"
 
 namespace paddle {
diff --git a/paddle/framework/init.h b/paddle/fluid/framework/init.h
similarity index 100%
rename from paddle/framework/init.h
rename to paddle/fluid/framework/init.h
diff --git a/paddle/framework/init_test.cc b/paddle/fluid/framework/init_test.cc
similarity index 92%
rename from paddle/framework/init_test.cc
rename to paddle/fluid/framework/init_test.cc
index 01e076dd8e..f3018541e2 100644
--- a/paddle/framework/init_test.cc
+++ b/paddle/fluid/framework/init_test.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "gtest/gtest.h"
 
-#include "paddle/framework/init.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/init.h"
+#include "paddle/fluid/platform/device_context.h"
 
 TEST(InitDevices, CPU) {
   using paddle::framework::InitDevices;
diff --git a/paddle/framework/library_type.h b/paddle/fluid/framework/library_type.h
similarity index 100%
rename from paddle/framework/library_type.h
rename to paddle/fluid/framework/library_type.h
diff --git a/paddle/framework/lod_rank_table.cc b/paddle/fluid/framework/lod_rank_table.cc
similarity index 97%
rename from paddle/framework/lod_rank_table.cc
rename to paddle/fluid/framework/lod_rank_table.cc
index 704bce2a0e..31c8749234 100644
--- a/paddle/framework/lod_rank_table.cc
+++ b/paddle/fluid/framework/lod_rank_table.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/lod_rank_table.h b/paddle/fluid/framework/lod_rank_table.h
similarity index 97%
rename from paddle/framework/lod_rank_table.h
rename to paddle/fluid/framework/lod_rank_table.h
index df188709e9..0eaaf49e4c 100644
--- a/paddle/framework/lod_rank_table.h
+++ b/paddle/fluid/framework/lod_rank_table.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <iosfwd>
-#include "paddle/framework/lod_tensor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
similarity index 98%
rename from paddle/framework/lod_tensor.cc
rename to paddle/fluid/framework/lod_tensor.cc
index cb27de6991..05c67e453d 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/data_type.h"
-#include "paddle/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/framework.pb.h"
 
-#include "paddle/memory/memcpy.h"
-#include "paddle/memory/memory.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/memory/memory.h"
 
 #include <stdint.h>
 #include <string.h>
diff --git a/paddle/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
similarity index 93%
rename from paddle/framework/lod_tensor.h
rename to paddle/fluid/framework/lod_tensor.h
index d0ab640485..1509a9fb13 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -21,12 +21,12 @@ limitations under the License. */
 #endif
 
 #include <glog/logging.h>
-#include "paddle/framework/ddim.h"
-#include "paddle/framework/mixed_vector.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/framework/tensor_util.h"
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/place.h"
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/mixed_vector.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace framework {
@@ -46,15 +46,7 @@ namespace framework {
  *    0 2 4 7
  *    0 2 5 7 10 12 15 20
  */
-struct LoD : public std::vector<Vector<size_t>> {
-  using std::vector<Vector<size_t>>::vector;
-
-  void CopyFromCUDA() {
-    for (auto it = this->begin(); it != this->end(); ++it) {
-      it->CopyFromCUDA();
-    }
-  }
-};
+using LoD = std::vector<Vector<size_t>>;
 
 std::ostream& operator<<(std::ostream& os, const LoD& lod);
 std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
diff --git a/paddle/framework/lod_tensor.md b/paddle/fluid/framework/lod_tensor.md
similarity index 100%
rename from paddle/framework/lod_tensor.md
rename to paddle/fluid/framework/lod_tensor.md
diff --git a/paddle/framework/lod_tensor_array.h b/paddle/fluid/framework/lod_tensor_array.h
similarity index 94%
rename from paddle/framework/lod_tensor_array.h
rename to paddle/fluid/framework/lod_tensor_array.h
index 4a8e7f4fa5..652513bd22 100644
--- a/paddle/framework/lod_tensor_array.h
+++ b/paddle/fluid/framework/lod_tensor_array.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <vector>
-#include "paddle/framework/lod_tensor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
similarity index 99%
rename from paddle/framework/lod_tensor_test.cc
rename to paddle/fluid/framework/lod_tensor_test.cc
index 3b63020e68..7e0ed2495d 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/framework/lod_tensor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
diff --git a/paddle/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu
similarity index 70%
rename from paddle/framework/lod_tensor_test.cu
rename to paddle/fluid/framework/lod_tensor_test.cu
index d4c9f00bd9..4dd7810c1b 100644
--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/fluid/framework/lod_tensor_test.cu
@@ -15,11 +15,12 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <stdio.h>
-#include "paddle/framework/init.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/platform/assert.h"
 
-#include <gtest/gtest.h>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/init.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/place.h"
 
 __global__ void test(size_t* a, int size) {
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
@@ -28,28 +29,6 @@ __global__ void test(size_t* a, int size) {
   }
 }
 
-TEST(Vector, Normal) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::memory;
-
-  paddle::framework::InitDevices();
-
-  paddle::framework::Vector<size_t> vec({1, 2, 3});
-  size_t* ptr = vec.data();
-  for (size_t i = 0; i < vec.size(); ++i) {
-    EXPECT_EQ(vec[i], *(ptr + i));
-  }
-
-  vec.clear();
-  vec.CopyFromCUDA();
-
-  std::vector<size_t> v = {1, 2, 3};
-  for (size_t i = 0; i < v.size(); ++i) {
-    EXPECT_EQ(v[i], vec[i]);
-  }
-}
-
 TEST(LoD, data) {
   paddle::framework::InitDevices();
 
@@ -58,10 +37,9 @@ TEST(LoD, data) {
   lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
 
   auto& v = lod[0];
-  test<<<1, 1>>>(v.cuda_data(), v.size());
+  paddle::platform::CUDAPlace gpu(0);
+  test<<<1, 1>>>(v.CUDAMutableData(gpu), v.size());
   cudaDeviceSynchronize();
-
-  v.CopyFromCUDA();
   for (size_t i = 0; i < v.size(); ++i) {
     EXPECT_EQ(v[i], i * 2);
   }
@@ -85,9 +63,8 @@ TEST(LoDTensor, LoDInGPU) {
 
   auto lod = lod_tensor.lod();
 
-  test<<<1, 8>>>(lod[0].cuda_data(), lod[0].size());
+  test<<<1, 8>>>(lod[0].CUDAMutableData(place), lod[0].size());
   cudaDeviceSynchronize();
-  lod.CopyFromCUDA();
 
   for (size_t i = 0; i < src_lod[0].size(); ++i) {
     EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
new file mode 100644
index 0000000000..9756754260
--- /dev/null
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -0,0 +1,363 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <initializer_list>
+#include <vector>
+
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
+
+#include "glog/logging.h"
+
+namespace paddle {
+namespace framework {
+
+// Vector<T> implements the std::vector interface, and can get Data or
+// MutableData from any place. The data will be synced implicitly inside.
+template <typename T>
+class Vector {
+ public:
+  using value_type = T;
+
+  // Default ctor. Create empty Vector
+  Vector() { InitEmpty(); }
+
+  // Fill vector with value. The vector size is `count`.
+  explicit Vector(size_t count, const T& value = T()) {
+    if (count == 0) {
+      InitEmpty();
+    } else {
+      resize(count);
+      T* ptr = begin();
+      for (size_t i = 0; i < count; ++i) {
+        ptr[i] = value;
+      }
+    }
+  }
+
+  // Ctor with init_list
+  Vector(std::initializer_list<T> init) {
+    if (init.size() == 0) {
+      InitEmpty();
+    } else {
+      InitByIter(init.size(), init.begin(), init.end());
+    }
+  }
+
+  // implicit cast from std::vector.
+  template <typename U>
+  Vector(const std::vector<U>& dat) {  // NOLINT
+    if (dat.size() == 0) {
+      InitEmpty();
+    } else {
+      InitByIter(dat.size(), dat.begin(), dat.end());
+    }
+  }
+
+  // Copy ctor
+  Vector(const Vector<T>& other) { this->operator=(other); }
+
+  // Copy operator
+  Vector<T>& operator=(const Vector<T>& other) {
+    if (other.size() != 0) {
+      this->InitByIter(other.size(), other.begin(), other.end());
+    } else {
+      InitEmpty();
+    }
+    return *this;
+  }
+
+  // Move ctor
+  Vector(Vector<T>&& other) {
+    this->size_ = other.size_;
+    this->flag_ = other.flag_;
+    if (other.cuda_vec_.memory_size()) {
+      this->cuda_vec_.ShareDataWith(other.cuda_vec_);
+    }
+    if (other.cpu_vec_.memory_size()) {
+      this->cpu_vec_.ShareDataWith(other.cpu_vec_);
+    }
+  }
+
+  // CPU data access method. Mutable.
+  T& operator[](size_t i) {
+    MutableCPU();
+    return const_cast<T*>(cpu_vec_.data<T>())[i];
+  }
+
+  // CPU data access method. Immutable.
+  const T& operator[](size_t i) const {
+    ImmutableCPU();
+    return cpu_vec_.data<T>()[i];
+  }
+
+  // std::vector iterator methods. Based on CPU data access method
+  size_t size() const { return size_; }
+
+  T* begin() { return &this->operator[](0); }
+
+  T* end() { return &this->operator[](size()); }
+
+  T& front() { return *begin(); }
+
+  T& back() {
+    auto it = end();
+    --it;
+    return *it;
+  }
+
+  const T* begin() const { return &this->operator[](0); }
+  const T* end() const { return &this->operator[](size()); }
+
+  const T& back() const {
+    auto it = end();
+    --it;
+    return *it;
+  }
+
+  T* data() { return begin(); }
+
+  const T* data() const { return begin(); }
+
+  const T& front() const { return *begin(); }
+  // end of std::vector iterator methods
+
+  // assign this from iterator.
+  // NOTE: the iterator must support `end-begin`
+  template <typename Iter>
+  void assign(Iter begin, Iter end) {
+    InitByIter(end - begin, begin, end);
+  }
+
+  // push_back. If the previous capacity is not enough, the memory will
+  // double.
+  void push_back(T elem) {
+    if (size_ + 1 > capacity()) {
+      reserve((size_ + 1) << 1);
+    }
+    *end() = elem;
+    ++size_;
+  }
+
+  // extend a vector by iterator.
+  // NOTE: the iterator must support end-begin
+  template <typename It>
+  void Extend(It begin, It end) {
+    size_t pre_size = size_;
+    resize(pre_size + (end - begin));
+    T* ptr = this->begin() + pre_size;
+    for (; begin < end; ++begin, ++ptr) {
+      *ptr = *begin;
+    }
+  }
+
+  // resize the vector
+  void resize(size_t size) {
+    if (size + 1 < capacity()) {
+      size_ = size;
+    } else {
+      MutableCPU();
+      Tensor cpu_tensor;
+      platform::Place cpu = platform::CPUPlace();
+      T* ptr = cpu_tensor.mutable_data<T>(
+          framework::make_ddim({static_cast<int64_t>(size)}), cpu);
+      const T* old_ptr =
+          cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
+      if (old_ptr != nullptr) {
+        std::copy(old_ptr, old_ptr + size_, ptr);
+      }
+      size_ = size;
+      cpu_vec_.ShareDataWith(cpu_tensor);
+    }
+  }
+
+  // get cuda ptr. immutable
+  const T* CUDAData(platform::Place place) const {
+    PADDLE_ENFORCE(platform::is_gpu_place(place),
+                   "CUDA Data must on CUDA place");
+    ImmutableCUDA(place);
+    return cuda_vec_.data<T>();
+  }
+
+  // get cuda ptr. mutable
+  T* CUDAMutableData(platform::Place place) {
+    const T* ptr = CUDAData(place);
+    flag_ = kDirty | kDataInCUDA;
+    return const_cast<T*>(ptr);
+  }
+
+  // clear
+  void clear() {
+    size_ = 0;
+    flag_ = kDirty | kDataInCPU;
+  }
+
+  size_t capacity() const {
+    return cpu_vec_.memory_size() / SizeOfType(typeid(T));
+  }
+
+  // reserve data
+  void reserve(size_t size) {
+    size_t pre_size = size_;
+    resize(size);
+    resize(pre_size);
+  }
+
+  // the unify method to access CPU or CUDA data. immutable.
+  const T* Data(platform::Place place) const {
+    if (platform::is_gpu_place(place)) {
+      return CUDAData(place);
+    } else {
+      return data();
+    }
+  }
+
+  // the unify method to access CPU or CUDA data. mutable.
+  T* MutableData(platform::Place place) {
+    if (platform::is_gpu_place(place)) {
+      return CUDAMutableData(place);
+    } else {
+      return data();
+    }
+  }
+
+  // implicit cast operator. Vector can be cast to std::vector implicitly.
+  operator std::vector<T>() const {
+    std::vector<T> result;
+    result.resize(size());
+    std::copy(begin(), end(), result.begin());
+    return result;
+  }
+
+  bool operator==(const Vector<T>& other) const {
+    if (size() != other.size()) return false;
+    for (auto it1 = begin(), it2 = other.begin(); it1 < end(); ++it1, ++it2) {
+      if (*it1 != *it2) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+ private:
+  void InitEmpty() {
+    size_ = 0;
+    flag_ = kDataInCPU;
+  }
+
+  template <typename Iter>
+  void InitByIter(size_t size, Iter begin, Iter end) {
+    platform::Place cpu = platform::CPUPlace();
+    T* ptr = this->cpu_vec_.template mutable_data<T>(
+        framework::make_ddim({static_cast<int64_t>(size)}), cpu);
+    for (size_t i = 0; i < size; ++i) {
+      *ptr++ = *begin++;
+    }
+    flag_ = kDataInCPU | kDirty;
+    size_ = size;
+  }
+
+  enum DataFlag {
+    kDataInCPU = 0x01,
+    kDataInCUDA = 0x02,
+    // kDirty means the data has been changed in one device.
+    kDirty = 0x10
+  };
+
+  void CopyToCPU() const {
+    // COPY GPU Data To CPU
+    Copy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
+    WaitPlace(cuda_vec_.place());
+  }
+
+  void MutableCPU() {
+    if (IsInCUDA() && IsDirty()) {
+      CopyToCPU();
+    }
+    flag_ = kDirty | kDataInCPU;
+  }
+
+  void ImmutableCUDA(platform::Place place) const {
+    if (IsDirty()) {
+      if (IsInCPU()) {
+        Copy(cpu_vec_, boost::get<platform::CUDAPlace>(place), &cuda_vec_);
+        WaitPlace(place);
+        UnsetFlag(kDirty);
+        SetFlag(kDataInCUDA);
+      } else if (IsInCUDA() && !(place == cuda_vec_.place())) {
+        framework::Tensor tmp;
+        Copy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
+        WaitPlace(cuda_vec_.place());
+        cuda_vec_.ShareDataWith(tmp);
+        // Still dirty
+      } else {
+        // Dirty && DataInCUDA && Device is same
+        // Do nothing
+      }
+    } else {
+      if (!IsInCUDA()) {
+        // Even data is not dirty. However, data is not in CUDA. Copy data.
+        Copy(cpu_vec_, boost::get<platform::CUDAPlace>(place), &cuda_vec_);
+        WaitPlace(place);
+        SetFlag(kDataInCUDA);
+      } else if (!(place == cuda_vec_.place())) {
+        framework::Tensor tmp;
+        WaitPlace(cuda_vec_.place());
+        Copy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
+        WaitPlace(cuda_vec_.place());
+        WaitPlace(place);
+        cuda_vec_.ShareDataWith(tmp);
+      } else {
+        // Not Dirty && DataInCUDA && Device is same
+        // Do nothing.
+      }
+    }
+  }
+
+  void ImmutableCPU() const {
+    if (IsDirty() &&
+        !IsInCPU()) {  // If data has been changed in CUDA, or CPU has no data.
+      CopyToCPU();
+      UnsetFlag(kDirty);
+    }
+    SetFlag(kDataInCPU);
+  }
+
+  void UnsetFlag(int flag) const { flag_ &= ~flag; }
+  void SetFlag(int flag) const { flag_ |= flag; }
+
+  bool IsDirty() const { return flag_ & kDirty; }
+
+  bool IsInCUDA() const { return flag_ & kDataInCUDA; }
+
+  bool IsInCPU() const { return flag_ & kDataInCPU; }
+
+  static void WaitPlace(const platform::Place place) {
+    if (platform::is_gpu_place(place)) {
+      platform::DeviceContextPool::Instance()
+          .Get(boost::get<platform::CUDAPlace>(place))
+          ->Wait();
+    }
+  }
+
+  mutable int flag_;
+  mutable Tensor cpu_vec_;
+  mutable Tensor cuda_vec_;
+  size_t size_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu
new file mode 100644
index 0000000000..a890645256
--- /dev/null
+++ b/paddle/fluid/framework/mixed_vector_test.cu
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <cuda_runtime.h>
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/mixed_vector.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+template <typename T>
+using vec = paddle::framework::Vector<T>;
+
+TEST(mixed_vector, CPU_VECTOR) {
+  vec<int> tmp;
+  for (int i = 0; i < 10; ++i) {
+    tmp.push_back(i);
+  }
+  ASSERT_EQ(tmp.size(), 10);
+  vec<int> tmp2;
+  tmp2 = tmp;
+  ASSERT_EQ(tmp2.size(), 10);
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_EQ(tmp2[i], i);
+    ASSERT_EQ(tmp2[i], tmp[i]);
+  }
+  int cnt = 0;
+  for (auto& t : tmp2) {
+    ASSERT_EQ(t, cnt);
+    ++cnt;
+  }
+}
+
+static __global__ void multiply_10(int* ptr) {
+  for (int i = 0; i < 10; ++i) {
+    ptr[i] *= 10;
+  }
+}
+
+cudaStream_t GetCUDAStream(paddle::platform::CUDAPlace place) {
+  return reinterpret_cast<const paddle::platform::CUDADeviceContext*>(
+             paddle::platform::DeviceContextPool::Instance().Get(place))
+      ->stream();
+}
+
+TEST(mixed_vector, GPU_VECTOR) {
+  vec<int> tmp;
+  for (int i = 0; i < 10; ++i) {
+    tmp.push_back(i);
+  }
+  ASSERT_EQ(tmp.size(), 10);
+  paddle::platform::CUDAPlace gpu(0);
+
+  multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu));
+
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_EQ(tmp[i], i * 10);
+  }
+}
+
+TEST(mixed_vector, MultiGPU) {
+  if (paddle::platform::GetCUDADeviceCount() < 2) {
+    LOG(WARNING) << "Skip mixed_vector.MultiGPU since there are not multiple "
+                    "GPUs in your machine.";
+    return;
+  }
+
+  vec<int> tmp;
+  for (int i = 0; i < 10; ++i) {
+    tmp.push_back(i);
+  }
+  ASSERT_EQ(tmp.size(), 10);
+  paddle::platform::CUDAPlace gpu0(0);
+  paddle::platform::SetDeviceId(0);
+  multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0));
+  paddle::platform::CUDAPlace gpu1(1);
+  auto* gpu1_ptr = tmp.MutableData(gpu1);
+  paddle::platform::SetDeviceId(1);
+  multiply_10<<<1, 1, 0, GetCUDAStream(gpu1)>>>(gpu1_ptr);
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_EQ(tmp[i], i * 100);
+  }
+}
diff --git a/paddle/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
similarity index 88%
rename from paddle/framework/op_desc.cc
rename to paddle/fluid/framework/op_desc.cc
index f554c77845..cbc15e60b8 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/op_desc.h"
+#include "paddle/fluid/framework/op_desc.h"
 #include <functional>
 #include <mutex>
 #include <unordered_map>
 #include "glog/logging.h"
-#include "paddle/framework/block_desc.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/program_desc.h"
-#include "paddle/framework/shape_inference.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/shape_inference.h"
 
 namespace paddle {
 namespace framework {
@@ -72,6 +72,13 @@ class CompileTimeInferShapeContext : public InferShapeContext {
 
   void SetDim(const std::string &name, const DDim &dim) override;
 
+  std::vector<DDim> GetRepeatedDims(const std::string &name) const override;
+
+  void SetRepeatedDims(const std::string &name,
+                       const std::vector<DDim> &dims) override;
+
+  InferShapeVarPtr GetVarPtr(const std::string &name) override;
+
   const OpDesc &op_;
   const BlockDesc &block_;
 };
@@ -120,11 +127,10 @@ OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block)
   // restore attrs_
   for (const proto::OpDesc::Attr &attr : desc_.attrs()) {
     std::string attr_name = attr.name();
+    // The sub_block referred to by the BLOCK attr hasn't been added
+    // to ProgramDesc class yet, we skip setting BLOCK attr here.
     if (attr.type() != proto::AttrType::BLOCK) {
       attrs_[attr_name] = GetAttrValue(attr);
-    } else {
-      auto bid = attr.block_idx();
-      attrs_[attr_name] = prog->MutableBlock(bid);
     }
   }
   this->block_ = block;
@@ -457,23 +463,48 @@ const std::vector<std::string> &CompileTimeInferShapeContext::Outputs(
 DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
   auto var = block_.FindVarRecursive(name);
   PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  DDim res;
   try {
-    auto shape = var->Shape();
-    if (shape.empty()) {
-      return framework::make_ddim({0UL});
-    } else {
-      return framework::make_ddim(var->Shape());
-    }
+    auto shape = var->GetShape();
+    res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape);
   } catch (...) {
     VLOG(5) << "GetDim of variable " << name << " error";
     std::rethrow_exception(std::current_exception());
   }
+  return res;
+}
+
+std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
+    const std::string &name) const {
+  auto var = block_.FindVarRecursive(name);
+  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  std::vector<DDim> res;
+  try {
+    auto shapes = var->GetShapes();
+    for (const auto &s : shapes) {
+      res.push_back(s.empty() ? make_ddim({0UL}) : make_ddim(s));
+    }
+  } catch (...) {
+    VLOG(5) << "GetRepeatedDim of variable " << name << " error.";
+    std::rethrow_exception(std::current_exception());
+  }
+  return res;
 }
 
 void CompileTimeInferShapeContext::SetDim(const std::string &name,
                                           const DDim &dim) {
-  block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
+  block_.FindVarRecursive(name)->SetShape(vectorize(dim));
+}
+
+void CompileTimeInferShapeContext::SetRepeatedDims(
+    const std::string &name, const std::vector<DDim> &dims) {
+  auto var = block_.FindVarRecursive(name);
+  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  std::vector<std::vector<int64_t>> dim_vec(dims.size());
+  std::transform(dims.begin(), dims.end(), dim_vec.begin(), vectorize);
+  var->SetShapes(dim_vec);
 }
+
 bool CompileTimeInferShapeContext::IsRuntime() const { return false; }
 
 proto::VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
@@ -481,5 +512,10 @@ proto::VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
   return block_.FindVarRecursive(name)->GetType();
 }
 
+InferShapeVarPtr CompileTimeInferShapeContext::GetVarPtr(
+    const std::string &name) {
+  return block_.FindVarRecursive(name);
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
similarity index 96%
rename from paddle/framework/op_desc.h
rename to paddle/fluid/framework/op_desc.h
index 13695cff59..698df829e5 100644
--- a/paddle/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -16,9 +16,9 @@ limitations under the License. */
 
 #include <unordered_map>
 #include <vector>
-#include "paddle/framework/attribute.h"
-#include "paddle/framework/type_defs.h"
-#include "paddle/framework/var_desc.h"
+#include "paddle/fluid/framework/attribute.h"
+#include "paddle/fluid/framework/type_defs.h"
+#include "paddle/fluid/framework/var_desc.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/op_info.cc b/paddle/fluid/framework/op_info.cc
similarity index 95%
rename from paddle/framework/op_info.cc
rename to paddle/fluid/framework/op_info.cc
index b520108109..703c9c3234 100644
--- a/paddle/framework/op_info.cc
+++ b/paddle/fluid/framework/op_info.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/op_info.h"
+#include "paddle/fluid/framework/op_info.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/op_info.h b/paddle/fluid/framework/op_info.h
similarity index 95%
rename from paddle/framework/op_info.h
rename to paddle/fluid/framework/op_info.h
index d9b89f9cac..e6b3ff9e65 100644
--- a/paddle/framework/op_info.h
+++ b/paddle/fluid/framework/op_info.h
@@ -18,9 +18,9 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 
-#include "paddle/framework/attribute.h"
-#include "paddle/framework/type_defs.h"
-#include "paddle/platform/macros.h"
+#include "paddle/fluid/framework/attribute.h"
+#include "paddle/fluid/framework/type_defs.h"
+#include "paddle/fluid/platform/macros.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h
similarity index 93%
rename from paddle/framework/op_kernel_type.h
rename to paddle/fluid/framework/op_kernel_type.h
index 44adb94d2a..b5dbff26d7 100644
--- a/paddle/framework/op_kernel_type.h
+++ b/paddle/fluid/framework/op_kernel_type.h
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/data_layout.h"
-#include "paddle/framework/data_type.h"
-#include "paddle/framework/library_type.h"
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/place.h"
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/library_type.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc
similarity index 97%
rename from paddle/framework/op_kernel_type_test.cc
rename to paddle/fluid/framework/op_kernel_type_test.cc
index cb23bbde01..64096907df 100644
--- a/paddle/framework/op_kernel_type_test.cc
+++ b/paddle/fluid/framework/op_kernel_type_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/op_kernel_type.h"
 #include <gtest/gtest.h>
 #include <iostream>
 
diff --git a/paddle/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
similarity index 97%
rename from paddle/framework/op_proto_maker.cc
rename to paddle/fluid/framework/op_proto_maker.cc
index 151d61d5b1..0a779b10b4 100644
--- a/paddle/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -11,7 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
similarity index 96%
rename from paddle/framework/op_proto_maker.h
rename to paddle/fluid/framework/op_proto_maker.h
index efd3a5ca53..1dbfc7d37b 100644
--- a/paddle/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -13,8 +13,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/attribute.h"
-#include "paddle/framework/framework.pb.h"
+#include "paddle/fluid/framework/attribute.h"
+#include "paddle/fluid/framework/framework.pb.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc
similarity index 97%
rename from paddle/framework/op_proto_maker_test.cc
rename to paddle/fluid/framework/op_proto_maker_test.cc
index f16cb6fa3a..cfefee8dbd 100644
--- a/paddle/framework/op_proto_maker_test.cc
+++ b/paddle/fluid/framework/op_proto_maker_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
 
 #include "gtest/gtest.h"
 
diff --git a/paddle/framework/op_registry.cc b/paddle/fluid/framework/op_registry.cc
similarity index 98%
rename from paddle/framework/op_registry.cc
rename to paddle/fluid/framework/op_registry.cc
index dfa151316d..739ec72ebc 100644
--- a/paddle/framework/op_registry.cc
+++ b/paddle/fluid/framework/op_registry.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <paddle/framework/op_registry.h>
+#include "paddle/fluid/framework/op_registry.h"
 
 #include <vector>
 
diff --git a/paddle/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
similarity index 95%
rename from paddle/framework/op_registry.h
rename to paddle/fluid/framework/op_registry.h
index 5de9ae559c..73faa99668 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -22,14 +22,14 @@ limitations under the License. */
 #include <unordered_set>
 
 #include "glog/logging.h"  // For VLOG()
-#include "paddle/framework/attribute.h"
-#include "paddle/framework/details/op_registry.h"
-#include "paddle/framework/framework.pb.h"
-#include "paddle/framework/grad_op_desc_maker.h"
-#include "paddle/framework/op_desc.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/scope.h"
-#include "paddle/framework/shape_inference.h"
+#include "paddle/fluid/framework/attribute.h"
+#include "paddle/fluid/framework/details/op_registry.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/grad_op_desc_maker.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/shape_inference.h"
 
 namespace paddle {
 namespace framework {
@@ -143,7 +143,7 @@ class OpKernelRegistrar : public Registrar {
 
 /**
  * Macro to register Operator. When the input is duplicable, you should
- * use REGISTER_OP_EX with deop_empty_grad=false instead.
+ * use REGISTER_OP_EX with drop_empty_grad=false instead.
  */
 #define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
                     grad_op_class)                                   \
diff --git a/paddle/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc
similarity index 99%
rename from paddle/framework/op_registry_test.cc
rename to paddle/fluid/framework/op_registry_test.cc
index 341da8befd..bfbb2cfc2c 100644
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/fluid/framework/op_registry_test.cc
@@ -15,7 +15,7 @@
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace pd = paddle::framework;
 
diff --git a/paddle/framework/operator.cc b/paddle/fluid/framework/operator.cc
similarity index 92%
rename from paddle/framework/operator.cc
rename to paddle/fluid/framework/operator.cc
index 81fa8cf477..61529fe38b 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -16,11 +16,11 @@ limitations under the License. */
 
 #include <algorithm>
 
-#include "paddle/framework/data_transform.h"
-#include "paddle/framework/executor.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/shape_inference.h"
-#include "paddle/framework/var_type.h"
+#include "paddle/fluid/framework/data_transform.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/shape_inference.h"
+#include "paddle/fluid/framework/var_type.h"
 
 DECLARE_bool(benchmark);
 
@@ -320,8 +320,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
     if (length == 0) {
       return false;
     }
-    PADDLE_ENFORCE_EQ(length, 1UL, "Input %s should have more than one inputs",
-                      name);
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Input %s should not have more than one inputs", name);
     auto ipt = ins[0];
     auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
     return var != nullptr;
@@ -333,8 +333,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
     if (length == 0) {
       return false;
     }
-    PADDLE_ENFORCE_EQ(length, 1UL, "Output %s should have more than one inputs",
-                      name);
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Output %s should not have more than one inputs", name);
     auto ipt = outs[0];
     auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
     return var != nullptr;
@@ -421,8 +421,22 @@ class RuntimeInferShapeContext : public InferShapeContext {
     } else if (var->IsType<SelectedRows>()) {
       return var->Get<SelectedRows>().GetCompleteDims();
     } else {
-      PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.",
-                   name, var->Type().name());
+      PADDLE_THROW(
+          "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's "
+          "type_id is %s.",
+          name, var->Type().name());
+    }
+  }
+
+  std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
+    Variable* var = scope_.FindVar(name);
+    if (var->IsType<ReaderHolder>()) {
+      return var->Get<ReaderHolder>().shapes();
+    } else {
+      PADDLE_THROW(
+          "Only ReaderHolder support 'GetRepeatedDims', but Variable %s's "
+          "type_id is %s.",
+          name, var->Type().name());
     }
   }
 
@@ -438,11 +452,28 @@ class RuntimeInferShapeContext : public InferShapeContext {
     }
   }
 
+  void SetRepeatedDims(const std::string& name,
+                       const std::vector<DDim>& dims) override {
+    Variable* var = scope_.FindVar(name);
+    if (var->IsType<ReaderHolder>()) {
+      var->GetMutable<ReaderHolder>()->set_shapes(dims);
+    } else {
+      PADDLE_THROW(
+          "Only ReaderHolder support 'SetRepeatedDims', but Variable %s's "
+          "type_id is %s.",
+          name, var->Type().name());
+    }
+  }
+
   proto::VarDesc::VarType GetVarType(const std::string& name) const override {
     auto* var = scope_.FindVar(name);
     return ToVarType(var->Type());
   }
 
+  InferShapeVarPtr GetVarPtr(const std::string& name) override {
+    return scope_.FindVar(name);
+  }
+
  private:
   const OperatorBase& op_;
   const Scope& scope_;
diff --git a/paddle/framework/operator.h b/paddle/fluid/framework/operator.h
similarity index 96%
rename from paddle/framework/operator.h
rename to paddle/fluid/framework/operator.h
index c9140f304c..52300abeb7 100644
--- a/paddle/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -22,17 +22,17 @@ limitations under the License. */
 #include <vector>
 
 #include "glog/logging.h"  // For VLOG
-#include "paddle/framework/attribute.h"
-#include "paddle/framework/block_desc.h"
-#include "paddle/framework/framework.pb.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/op_info.h"
-#include "paddle/framework/op_kernel_type.h"
-#include "paddle/framework/scope.h"
-#include "paddle/framework/selected_rows.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/variant.h"
+#include "paddle/fluid/framework/attribute.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/variant.h"
 #include "paddle/utils/Error.h"
 
 namespace paddle {
diff --git a/paddle/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc
similarity index 98%
rename from paddle/framework/operator_test.cc
rename to paddle/fluid/framework/operator_test.cc
index b69d7c7a74..b90f5538bb 100644
--- a/paddle/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "gtest/gtest.h"
 
-#include "paddle/framework/init.h"
-#include "paddle/framework/op_info.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
+#include "paddle/fluid/framework/init.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc
similarity index 77%
rename from paddle/framework/program_desc.cc
rename to paddle/fluid/framework/program_desc.cc
index 15ea4035c6..b3f2e97cd9 100644
--- a/paddle/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/program_desc.h"
-#include "paddle/framework/block_desc.h"
-#include "paddle/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
 
 namespace paddle {
 namespace framework {
@@ -43,11 +43,20 @@ ProgramDesc::ProgramDesc() {
 
 ProgramDesc::ProgramDesc(const ProgramDesc &o) {
   desc_ = o.desc_;
-
   for (int i = 0; i < desc_.blocks_size(); ++i) {
     auto *block = desc_.mutable_blocks(i);
     blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this));
   }
+  for (auto &block : blocks_) {
+    for (auto *op : block->AllOps()) {
+      for (const auto &attr : op->Proto()->attrs()) {
+        if (attr.type() == proto::AttrType::BLOCK) {
+          size_t blk_idx = attr.block_idx();
+          op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx));
+        }
+      }
+    }
+  }
 }
 
 ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
@@ -55,6 +64,16 @@ ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
   for (auto &block_desc : *desc_.mutable_blocks()) {
     blocks_.emplace_back(new BlockDesc(this, &block_desc));
   }
+  for (auto &block : blocks_) {
+    for (auto *op : block->AllOps()) {
+      for (const auto &attr : op->Proto()->attrs()) {
+        if (attr.type() == proto::AttrType::BLOCK) {
+          size_t blk_idx = attr.block_idx();
+          op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx));
+        }
+      }
+    }
+  }
 }
 
 ProgramDesc::ProgramDesc(const std::string &binary_str) {
diff --git a/paddle/framework/program_desc.h b/paddle/fluid/framework/program_desc.h
similarity index 88%
rename from paddle/framework/program_desc.h
rename to paddle/fluid/framework/program_desc.h
index 8e958eab6e..937de6ba92 100644
--- a/paddle/framework/program_desc.h
+++ b/paddle/fluid/framework/program_desc.h
@@ -16,10 +16,10 @@ limitations under the License. */
 
 #include <memory>
 #include <vector>
-#include "paddle/framework/block_desc.h"
-#include "paddle/framework/framework.pb.h"
-#include "paddle/framework/proto_desc.h"
-#include "paddle/platform/macros.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/proto_desc.h"
+#include "paddle/fluid/platform/macros.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc
similarity index 95%
rename from paddle/framework/program_desc_test.cc
rename to paddle/fluid/framework/program_desc_test.cc
index 59947c9f21..afd5c9dabf 100644
--- a/paddle/framework/program_desc_test.cc
+++ b/paddle/fluid/framework/program_desc_test.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/program_desc.h"
+#include "paddle/fluid/framework/program_desc.h"
 #include "gtest/gtest.h"
-#include "paddle/framework/block_desc.h"
+#include "paddle/fluid/framework/block_desc.h"
 
 namespace paddle {
 namespace framework {
@@ -53,7 +53,7 @@ TEST(ProgramDesc, copy_ctor) {
     ASSERT_NE(copy, var_before);
     ASSERT_EQ(copy->Name(), var_before->Name());
     ASSERT_EQ(copy->GetType(), var_before->GetType());
-    ASSERT_EQ(copy->Shape(), var_before->Shape());
+    ASSERT_EQ(copy->GetShape(), var_before->GetShape());
     ASSERT_EQ(copy->Proto()->SerializeAsString(),
               var_before->Proto()->SerializeAsString());
   };
@@ -117,7 +117,7 @@ TEST(ProgramDescBind, serialize_and_deserialize) {
     ASSERT_NE(restored, var_before);
     ASSERT_EQ(restored->Name(), var_before->Name());
     ASSERT_EQ(restored->GetType(), var_before->GetType());
-    ASSERT_EQ(restored->Shape(), var_before->Shape());
+    ASSERT_EQ(restored->GetShape(), var_before->GetShape());
     ASSERT_EQ(restored->Proto()->SerializeAsString(),
               var_before->Proto()->SerializeAsString());
   };
diff --git a/paddle/framework/proto_desc.h b/paddle/fluid/framework/proto_desc.h
similarity index 100%
rename from paddle/framework/proto_desc.h
rename to paddle/fluid/framework/proto_desc.h
diff --git a/paddle/framework/prune.cc b/paddle/fluid/framework/prune.cc
similarity index 63%
rename from paddle/framework/prune.cc
rename to paddle/fluid/framework/prune.cc
index bff8e0bcea..79dbd3bcab 100644
--- a/paddle/framework/prune.cc
+++ b/paddle/fluid/framework/prune.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/prune.h"
+#include "paddle/fluid/framework/prune.h"
 
 #include <algorithm>
 #include <set>
@@ -49,11 +49,28 @@ bool IsTarget(const proto::OpDesc& op_desc) {
   return false;
 }
 
-void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
-                int block_id) {
-  // TODO(tonyyang-svail):
-  //    - will change to use multiple blocks for RNN op and Cond Op
+int GetSubBlockIndex(const proto::OpDesc& op_desc) {
+  for (auto& attr : op_desc.attrs()) {
+    if (attr.type() == proto::AttrType::BLOCK) {
+      PADDLE_ENFORCE(attr.has_block_idx());
+      return attr.block_idx();
+    }
+  }
+  return -1;
+}
+
+bool HasSubBlock(const proto::OpDesc& op_desc) {
+  return GetSubBlockIndex(op_desc) > 0;
+}
 
+// block_id is the idx of the current block in the input desc
+// parent_block_id is the idx of the parent of the current block
+// in the output desc, -1 means the current block is global block
+// dependent_vars is passed recursively from the parent block to
+// the child block to help pruning
+void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
+                int block_id, int parent_block_id,
+                std::set<std::string>& dependent_vars) {
   auto& block = input.blocks(block_id);
   auto& ops = block.ops();
 
@@ -72,11 +89,9 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
     expect_fetch = (op_desc.type() == kFetchOpType);
   }
 
-  std::set<std::string> dependent_vars;
   std::vector<bool> should_run;
   for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) {
     auto& op_desc = *op_iter;
-
     if (IsTarget(op_desc) || HasDependentVar(op_desc, dependent_vars)) {
       // insert its input to the dependency graph
       for (auto& var : op_desc.inputs()) {
@@ -84,7 +99,6 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
           dependent_vars.insert(argu);
         }
       }
-
       should_run.push_back(true);
     } else {
       should_run.push_back(false);
@@ -95,45 +109,81 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
   // we reverse the should_run vector
   std::reverse(should_run.begin(), should_run.end());
 
-  *output = input;
-  auto* op_field = output->mutable_blocks(block_id)->mutable_ops();
+  // copy the current block from input to output
+  auto* block_field = output->mutable_blocks();
+  *block_field->Add() = input.blocks(block_id);
+
+  int output_block_id = output->blocks_size() - 1;
+  auto* output_block = output->mutable_blocks(output_block_id);
+  output_block->set_idx(output_block_id);
+  output_block->set_parent_idx(parent_block_id);
+
+  auto* op_field = output_block->mutable_ops();
   op_field->Clear();
   for (size_t i = 0; i < should_run.size(); ++i) {
     if (should_run[i]) {
-      *op_field->Add() = input.blocks(block_id).ops(i);
+      auto* op = op_field->Add();
+      *op = input.blocks(block_id).ops(i);
+      if (HasSubBlock(*op)) {
+        // create sub_block_dependent_vars here to help prune the sub block
+        std::set<std::string> sub_block_dependent_vars;
+        for (auto& var : op->inputs()) {
+          for (auto& argu : var.arguments()) {
+            sub_block_dependent_vars.insert(argu);
+          }
+        }
+        for (auto& var : op->outputs()) {
+          for (auto& argu : var.arguments()) {
+            sub_block_dependent_vars.insert(argu);
+          }
+        }
+        // GetSubBlockIndex(*op) is the idx of the sub_block in the input desc
+        // output_block_id is the idx of the current block in the output desc
+        prune_impl(input, output, GetSubBlockIndex(*op), output_block_id,
+                   sub_block_dependent_vars);
+      }
     }
   }
 
   // remove the VarDescs in BlockDesc that are not referenced in
   // the pruned OpDescs
   std::unordered_map<std::string, proto::VarDesc> var_map;
-  auto* var_field = output->mutable_blocks(block_id)->mutable_vars();
+  auto* var_field = output->mutable_blocks(output_block_id)->mutable_vars();
   for (const auto& var : *var_field) {
     var_map[var.name()] = var;
   }
 
-  var_field->Clear();
+  std::set<std::string> var_names;
   for (const auto& op : *op_field) {
-    // add VarDescs of all input arguments for each OpDesc
     auto& input_field = op.inputs();
     for (auto& input_var : input_field) {
       for (auto& arg : input_var.arguments()) {
-        *var_field->Add() = var_map[arg];
+        if (var_map.count(arg) != 0) {
+          var_names.insert(arg);
+        }
       }
     }
-    // add VarDescs of all output arguments for each OpDesc
     auto& output_field = op.outputs();
     for (auto& output_var : output_field) {
       for (auto& arg : output_var.arguments()) {
-        *var_field->Add() = var_map[arg];
+        if (var_map.count(arg) != 0) {
+          var_names.insert(arg);
+        }
       }
     }
   }
+
+  var_field->Clear();
+  for (const auto& name : var_names) {
+    *var_field->Add() = var_map[name];
+  }
 }
 
 // TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies
 void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) {
-  prune_impl(input, output, 0);
+  std::set<std::string> dependent_vars;
+  output->clear_blocks();
+  prune_impl(input, output, 0, -1, dependent_vars);
 }
 
 void inference_optimize_impl(const proto::ProgramDesc& input,
diff --git a/paddle/framework/prune.h b/paddle/fluid/framework/prune.h
similarity index 90%
rename from paddle/framework/prune.h
rename to paddle/fluid/framework/prune.h
index 593292523d..601e66b67a 100644
--- a/paddle/framework/prune.h
+++ b/paddle/fluid/framework/prune.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/framework.pb.h"
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc
similarity index 93%
rename from paddle/framework/prune_test.cc
rename to paddle/fluid/framework/prune_test.cc
index d76c5abca9..36b76f0763 100644
--- a/paddle/framework/prune_test.cc
+++ b/paddle/fluid/framework/prune_test.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/prune.h"
+#include "paddle/fluid/framework/prune.h"
 
-#include "paddle/framework/attribute.h"
-#include "paddle/framework/operator.h"
-#include "paddle/operators/net_op.h"
+#include "paddle/fluid/framework/attribute.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/net_op.h"
 
-#include "paddle/framework/block_desc.h"
-#include "paddle/framework/op_desc.h"
-#include "paddle/framework/program_desc.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/program_desc.h"
 
 #include <gtest/gtest.h>
 
diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc
new file mode 100644
index 0000000000..1ef0c48211
--- /dev/null
+++ b/paddle/fluid/framework/reader.cc
@@ -0,0 +1,116 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/reader.h"
+
+namespace paddle {
+namespace framework {
+
+DDim ReaderBase::shape(size_t idx) const {
+  PADDLE_ENFORCE_LT(
+      idx, shapes_.size(),
+      "Cannot get the %d'th shape, 'shapes_' only has %d elements.", idx,
+      shapes_.size());
+  return shapes_[idx];
+}
+
+void ShuffleReader::ReadNext(std::vector<LoDTensor>* out) {
+  if (iteration_pos_ >= buffer_.size()) {
+    // Reload buffer with new data
+    buffer_.clear();
+    buffer_.reserve(buffer_size_);
+    for (int i = 0; i < buffer_size_; ++i) {
+      if (reader_->HasNext()) {
+        buffer_.push_back(std::vector<LoDTensor>());
+        reader_->ReadNext(&buffer_.back());
+      } else {
+        break;
+      }
+    }
+    // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be
+    // optimize.
+    std::random_shuffle(buffer_.begin(), buffer_.end());
+    iteration_pos_ = 0;
+  }
+  out->clear();
+  if (!buffer_.empty()) {
+    std::swap(*out, buffer_[iteration_pos_++]);
+  }
+  // if buffer_ is empty, the 'out' will return as an empty vector.
+}
+
+void BatchReader::ReadNext(std::vector<LoDTensor>* out) {
+  buffer_.clear();
+  buffer_.reserve(batch_size_);
+  for (int i = 0; i < batch_size_; ++i) {
+    if (reader_->HasNext()) {
+      buffer_.push_back(std::vector<LoDTensor>());
+      reader_->ReadNext(&buffer_.back());
+    } else {
+      break;
+    }
+  }
+  // Concat instances
+  out->clear();
+  if (buffer_.empty()) {
+    // if buffer_ is empty, the 'out' will return as an empty vector.
+    return;
+  }
+  int out_num = buffer_[0].size();
+  out->reserve(out_num);
+  for (int j = 0; j < out_num; ++j) {
+    // Merge shape and check date type
+    std::type_index batch_type = buffer_[0][j].type();
+    DDim batch_shape = buffer_[0][j].dims();
+    for (size_t i = 1; i < buffer_.size(); ++i) {
+      std::type_index ins_type = buffer_[i][j].type();
+      DDim ins_shape = buffer_[i][j].dims();
+      PADDLE_ENFORCE_EQ(batch_type, ins_type);
+      PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
+                        slice_ddim(ins_shape, 1, ins_shape.size()));
+      PADDLE_ENFORCE_GT(ins_shape[0], 0);
+      batch_shape[0] += ins_shape[0];
+    }
+
+    LoDTensor out_tensor;
+    out_tensor.Resize(batch_shape);
+    out_tensor.mutable_data(platform::CPUPlace(), batch_type);
+    int64_t dst_offset = 0;
+
+    // Merge lod and data
+    LoD batch_lod;
+    for (size_t i = 0; i < buffer_.size(); ++i) {
+      DDim ins_shape = buffer_[i][j].dims();
+      LoD ins_lod = buffer_[i][j].lod();
+      if (i == 0) {
+        batch_lod = ins_lod;
+      } else {
+        PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size());
+        for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) {
+          auto& lod_level = batch_lod[level_idx];
+          for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) {
+            lod_level.push_back(ins_lod[level_idx][k] + lod_level.back());
+          }
+        }
+      }
+      Tensor dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
+      Copy(buffer_[i][j], platform::CPUPlace(), &dst);
+      dst_offset += ins_shape[0];
+    }
+    out_tensor.set_lod(batch_lod);
+    out->push_back(out_tensor);
+  }
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h
new file mode 100644
index 0000000000..4a5eba5fb7
--- /dev/null
+++ b/paddle/fluid/framework/reader.h
@@ -0,0 +1,161 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+
+namespace paddle {
+namespace framework {
+
+class ReaderBase {
+ public:
+  explicit ReaderBase(const std::vector<DDim>& shapes) : shapes_(shapes) {
+    PADDLE_ENFORCE(!shapes_.empty());
+  }
+  virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
+  virtual bool HasNext() const = 0;
+
+  virtual void ReInit() = 0;
+
+  DDim shape(size_t idx) const;
+  std::vector<DDim> shapes() const { return shapes_; }
+  void set_shapes(const std::vector<DDim>& shapes) { shapes_ = shapes; }
+
+  virtual ~ReaderBase() {}
+
+ protected:
+  std::vector<DDim> shapes_;
+};
+
+class FileReader : public ReaderBase {
+ public:
+  explicit FileReader(const std::vector<DDim>& shapes) : ReaderBase(shapes) {}
+};
+
+class DecoratedReader : public ReaderBase {
+ public:
+  explicit DecoratedReader(ReaderBase* reader)
+      : ReaderBase(reader->shapes()), reader_(reader) {
+    PADDLE_ENFORCE_NOT_NULL(reader_);
+  }
+
+  bool HasNext() const override { return reader_->HasNext(); }
+
+  void ReInit() override { reader_->ReInit(); }
+
+ protected:
+  ReaderBase* reader_;
+};
+
+// file readers
+
+template <typename T>
+class RandomDataGenerator : public FileReader {
+ public:
+  RandomDataGenerator(const std::vector<DDim>& shapes, float min, float max)
+      : FileReader(shapes), min_(min), max_(max) {
+    PADDLE_ENFORCE_LE(
+        min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max);
+    unsigned int seed = std::random_device()();
+    engine_.seed(seed);
+    dist_ = std::uniform_real_distribution<float>(min_, max_);
+  }
+
+  void ReadNext(std::vector<LoDTensor>* out) override {
+    out->clear();
+    out->reserve(shapes_.size());
+    for (const DDim& shape : shapes_) {
+      PADDLE_ENFORCE_GE(
+          shape.size(), 2,
+          "The rank of reader's output data should be 2 at least.(Now it's %d)",
+          shape.size());
+      LoDTensor out_tensor;
+      out_tensor.Resize(shape);
+      T* data = out_tensor.mutable_data<T>(platform::CPUPlace());
+      int64_t numel = product(shape);
+      for (int64_t i = 0; i < numel; ++i) {
+        data[i] = dist_(engine_);
+      }
+      out->push_back(out_tensor);
+    }
+  }
+
+  bool HasNext() const override { return true; }
+
+  void ReInit() override { return; }
+
+ private:
+  float min_;
+  float max_;
+  std::minstd_rand engine_;
+  std::uniform_real_distribution<float> dist_;
+};
+
+// decorated readers
+
+class ShuffleReader : public DecoratedReader {
+ public:
+  ShuffleReader(ReaderBase* reader, int buffer_size)
+      : DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) {
+    buffer_.reserve(buffer_size);
+  }
+
+  void ReadNext(std::vector<LoDTensor>* out) override;
+
+ private:
+  int buffer_size_;
+  std::vector<std::vector<LoDTensor>> buffer_;
+  size_t iteration_pos_;
+};
+
+class BatchReader : public DecoratedReader {
+ public:
+  BatchReader(ReaderBase* reader, int batch_size)
+      : DecoratedReader(reader), batch_size_(batch_size) {
+    buffer_.reserve(batch_size_);
+  }
+
+  void ReadNext(std::vector<LoDTensor>* out) override;
+
+ private:
+  int batch_size_;
+  std::vector<std::vector<LoDTensor>> buffer_;
+};
+
+// The ReaderHolder is used as readers' unified wrapper,
+// making it easier to access different type readers in Variables.
+class ReaderHolder {
+ public:
+  void Reset(ReaderBase* reader) { reader_.reset(reader); }
+
+  ReaderBase* Get() const { return reader_.get(); }
+
+  void ReadNext(std::vector<LoDTensor>* out) { reader_->ReadNext(out); }
+  bool HasNext() const { return reader_->HasNext(); }
+  void ReInit() { reader_->ReInit(); }
+
+  DDim shape(size_t idx) const { return reader_->shape(idx); }
+  std::vector<DDim> shapes() const { return reader_->shapes(); }
+  void set_shapes(const std::vector<DDim>& shapes) {
+    reader_->set_shapes(shapes);
+  }
+
+ private:
+  std::unique_ptr<ReaderBase> reader_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/scope.cc b/paddle/fluid/framework/scope.cc
similarity index 97%
rename from paddle/framework/scope.cc
rename to paddle/fluid/framework/scope.cc
index af08b2ab81..6006ed16bd 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/scope.h"
+#include "paddle/fluid/framework/scope.h"
 
 #include <memory>  // for unique_ptr
 #include <mutex>   // for call_once
 #include "glog/logging.h"
-#include "paddle/framework/threadpool.h"
+#include "paddle/fluid/framework/threadpool.h"
 #include "paddle/string/printf.h"
 
 DEFINE_bool(benchmark, false,
diff --git a/paddle/framework/scope.h b/paddle/fluid/framework/scope.h
similarity index 96%
rename from paddle/framework/scope.h
rename to paddle/fluid/framework/scope.h
index a1da81cc79..2da9e0716e 100644
--- a/paddle/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -19,8 +19,8 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
-#include "paddle/framework/variable.h"
-#include "paddle/platform/macros.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/macros.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/scope_test.cc b/paddle/fluid/framework/scope_test.cc
similarity index 97%
rename from paddle/framework/scope_test.cc
rename to paddle/fluid/framework/scope_test.cc
index 0f5b86061d..d64acb130c 100644
--- a/paddle/framework/scope_test.cc
+++ b/paddle/fluid/framework/scope_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/scope.h"
+#include "paddle/fluid/framework/scope.h"
 #include "glog/logging.h"
 #include "gtest/gtest.h"
 
diff --git a/paddle/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc
similarity index 98%
rename from paddle/framework/selected_rows.cc
rename to paddle/fluid/framework/selected_rows.cc
index 3b3e60177a..f5d9e9a495 100644
--- a/paddle/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h
similarity index 96%
rename from paddle/framework/selected_rows.h
rename to paddle/fluid/framework/selected_rows.h
index 30d3dfc1e8..f1a263962b 100644
--- a/paddle/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
@@ -10,8 +10,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/tensor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/tensor.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc
similarity index 97%
rename from paddle/framework/selected_rows_test.cc
rename to paddle/fluid/framework/selected_rows_test.cc
index 8ff3fb6a97..d414f2a593 100644
--- a/paddle/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_test.cc
@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows.h"
 #include "gtest/gtest.h"
 
 namespace paddle {
diff --git a/paddle/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc
similarity index 66%
rename from paddle/framework/shape_inference.cc
rename to paddle/fluid/framework/shape_inference.cc
index a0fa467291..cfd2334f1a 100644
--- a/paddle/framework/shape_inference.cc
+++ b/paddle/fluid/framework/shape_inference.cc
@@ -11,9 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/framework/shape_inference.h"
+#include "paddle/fluid/framework/shape_inference.h"
 #include "grad_op_desc_maker.h"
-#include "paddle/framework/operator.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace framework {
@@ -32,6 +32,16 @@ std::vector<DDim> InferShapeContext::GetInputsDim(
   return GetDims(arg_names);
 }
 
+std::vector<DDim> InferShapeContext::GetReaderDims(
+    const std::string &name) const {
+  const std::vector<std::string> &arg_names = Inputs(name);
+  PADDLE_ENFORCE_EQ(
+      arg_names.size(), 1UL,
+      "Reader input '%s' should hold one element, but now it holds %d", name,
+      arg_names.size());
+  return this->GetRepeatedDims(arg_names[0]);
+}
+
 DDim InferShapeContext::GetInputsElementDim(const std::string &name,
                                             int idx) const {
   const std::vector<std::string> &names = Inputs(name);
@@ -52,6 +62,38 @@ void InferShapeContext::SetOutputsDim(const std::string &name,
   SetDims(names, dims);
 }
 
+void InferShapeContext::SetReaderDims(const std::string &name,
+                                      const std::vector<DDim> &dims) {
+  const std::vector<std::string> &arg_names = Outputs(name);
+  PADDLE_ENFORCE_EQ(
+      arg_names.size(), 1UL,
+      "Reader output '%s' should hold one element, but now it holds %d", name,
+      arg_names.size());
+  return this->SetRepeatedDims(arg_names[0], dims);
+}
+
+std::vector<InferShapeVarPtr> InferShapeContext::GetInputVarPtrs(
+    const std::string &name) {
+  const std::vector<std::string> arg_names = Inputs(name);
+  std::vector<InferShapeVarPtr> res;
+  res.reserve(arg_names.size());
+  std::transform(
+      arg_names.begin(), arg_names.end(), std::back_inserter(res),
+      [this](const std::string &name) { return this->GetVarPtr(name); });
+  return res;
+}
+
+std::vector<InferShapeVarPtr> InferShapeContext::GetOutputVarPtrs(
+    const std::string &name) {
+  const std::vector<std::string> arg_names = Outputs(name);
+  std::vector<InferShapeVarPtr> res;
+  res.reserve(arg_names.size());
+  std::transform(
+      arg_names.begin(), arg_names.end(), std::back_inserter(res),
+      [this](const std::string &name) { return this->GetVarPtr(name); });
+  return res;
+}
+
 std::vector<DDim> InferShapeContext::GetDims(
     const std::vector<std::string> &names) const {
   std::vector<DDim> ret;
@@ -61,6 +103,7 @@ std::vector<DDim> InferShapeContext::GetDims(
       [this](const std::string &name) { return this->GetDim(name); });
   return ret;
 }
+
 void InferShapeContext::SetDims(const std::vector<std::string> &names,
                                 const std::vector<DDim> &dims) {
   size_t length = names.size();
@@ -72,14 +115,17 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
     SetDim(names[i], dims[i]);
   }
 }
+
 std::vector<proto::VarDesc::VarType> InferShapeContext::GetInputsVarType(
     const std::string &name) const {
   return GetVarTypes(Inputs(name));
 }
+
 std::vector<proto::VarDesc::VarType> InferShapeContext::GetOutputsVarType(
     const std::string &name) const {
   return GetVarTypes(Outputs(name));
 }
+
 std::vector<proto::VarDesc::VarType> InferShapeContext::GetVarTypes(
     const std::vector<std::string> &names) const {
   std::vector<proto::VarDesc::VarType> retv;
diff --git a/paddle/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h
similarity index 74%
rename from paddle/framework/shape_inference.h
rename to paddle/fluid/framework/shape_inference.h
index 830f199ed1..c907523325 100644
--- a/paddle/framework/shape_inference.h
+++ b/paddle/fluid/framework/shape_inference.h
@@ -14,13 +14,17 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/attribute.h"
-#include "paddle/framework/ddim.h"
-#include "paddle/framework/framework.pb.h"
+#include "paddle/fluid/framework/attribute.h"
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/framework/variable.h"
 
 namespace paddle {
 namespace framework {
 
+using InferShapeVarPtr = boost::variant<VarDesc *, Variable *>;
+
 class InferShapeContext {
  public:
   virtual ~InferShapeContext() = default;
@@ -36,12 +40,13 @@ class InferShapeContext {
   virtual bool HasOutputs(const std::string &name) const = 0;
 
   DDim GetInputDim(const std::string &name) const;
-
   std::vector<DDim> GetInputsDim(const std::string &name) const;
+  std::vector<DDim> GetReaderDims(const std::string &name) const;
   DDim GetInputsElementDim(const std::string &name, int idx) const;
 
   void SetOutputDim(const std::string &name, const DDim &dim);
   void SetOutputsDim(const std::string &name, const std::vector<DDim> &dims);
+  void SetReaderDims(const std::string &name, const std::vector<DDim> &dims);
 
   virtual AttrReader Attrs() const = 0;
   virtual const std::vector<std::string> &Inputs(
@@ -54,6 +59,9 @@ class InferShapeContext {
 
   virtual bool IsRuntime() const = 0;
 
+  std::vector<InferShapeVarPtr> GetInputVarPtrs(const std::string &name);
+  std::vector<InferShapeVarPtr> GetOutputVarPtrs(const std::string &name);
+
   // Note: In while op, we need this to be public
   void SetDims(const std::vector<std::string> &names,
                const std::vector<DDim> &dims);
@@ -61,12 +69,18 @@ class InferShapeContext {
  protected:
   virtual DDim GetDim(const std::string &name) const = 0;
   virtual void SetDim(const std::string &name, const DDim &dim) = 0;
+  virtual std::vector<DDim> GetRepeatedDims(const std::string &name) const = 0;
+  virtual void SetRepeatedDims(const std::string &name,
+                               const std::vector<DDim> &dims) = 0;
 
   std::vector<DDim> GetDims(const std::vector<std::string> &names) const;
+
   std::vector<proto::VarDesc::VarType> GetVarTypes(
       const std::vector<std::string> &names) const;
 
   virtual proto::VarDesc::VarType GetVarType(const std::string &name) const = 0;
+
+  virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0;
 };
 
 }  // namespace framework
diff --git a/paddle/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
similarity index 93%
rename from paddle/framework/tensor.cc
rename to paddle/fluid/framework/tensor.cc
index f922e60624..a56091d3c6 100644
--- a/paddle/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/tensor.h"
+#include "paddle/fluid/framework/tensor.h"
 
 namespace paddle {
 namespace framework {}
diff --git a/paddle/framework/tensor.h b/paddle/fluid/framework/tensor.h
similarity index 94%
rename from paddle/framework/tensor.h
rename to paddle/fluid/framework/tensor.h
index f0ea709a5c..44d2c7dae9 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -20,12 +20,12 @@ limitations under the License. */
 #include <typeindex>
 #include <vector>
 
-#include "paddle/framework/data_layout.h"
-#include "paddle/framework/ddim.h"
-#include "paddle/memory/memory.h"
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/place.h"
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 
@@ -120,6 +120,7 @@ class Tensor {
     return holder_->type();
   }
 
+  // memory size returns the holding memory size in byte.
   size_t memory_size() const;
 
   inline void check_memory_size() const;
@@ -223,4 +224,4 @@ inline void Tensor::switch_place(platform::Place new_place) {
 }  // namespace framework
 }  // namespace paddle
 
-#include "paddle/framework/tensor_impl.h"
+#include "paddle/fluid/framework/tensor_impl.h"
diff --git a/paddle/framework/tensor.md b/paddle/fluid/framework/tensor.md
similarity index 100%
rename from paddle/framework/tensor.md
rename to paddle/fluid/framework/tensor.md
diff --git a/paddle/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
similarity index 95%
rename from paddle/framework/tensor_impl.h
rename to paddle/fluid/framework/tensor_impl.h
index 1340c5e485..e69836292c 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/memory/memcpy.h"
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
@@ -52,7 +52,7 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> {
 };
 
 static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool> functor;
+  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool, size_t> functor;
   size_t size = functor(type);
   PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
   return size;
@@ -61,15 +61,15 @@ static inline size_t SizeOfType(std::type_index type) {
 inline void Tensor::check_memory_size() const {
   PADDLE_ENFORCE_NOT_NULL(
       holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
-  PADDLE_ENFORCE_GE(
-      holder_->size(), memory_size() + offset_,
+  PADDLE_ENFORCE_LE(
+      numel() * SizeOfType(type()), memory_size(),
       "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
       "first to re-allocate memory.\n"
       "or maybe the required data-type mismatches the data already stored.");
 }
 
 inline size_t Tensor::memory_size() const {
-  return holder_ == nullptr ? 0UL : numel() * SizeOfType(type());
+  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
 }
 
 template <typename T>
diff --git a/paddle/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc
similarity index 99%
rename from paddle/framework/tensor_test.cc
rename to paddle/fluid/framework/tensor_test.cc
index 9a387526ac..6ed416e46f 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/framework/tensor.h"
+#include "paddle/fluid/framework/tensor.h"
 #include <gtest/gtest.h>
 #include <string>
 
diff --git a/paddle/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
similarity index 98%
rename from paddle/framework/tensor_util.cc
rename to paddle/fluid/framework/tensor_util.cc
index a5b83eaa07..537fb4614c 100644
--- a/paddle/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -12,7 +12,7 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/framework/tensor_util.h"
+#include "paddle/fluid/framework/tensor_util.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/tensor_util.cu b/paddle/fluid/framework/tensor_util.cu
new file mode 100644
index 0000000000..537fb4614c
--- /dev/null
+++ b/paddle/fluid/framework/tensor_util.cu
@@ -0,0 +1,119 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/framework/tensor_util.h"
+
+namespace paddle {
+namespace framework {
+template <typename Predicate, typename DevCtx>
+struct AnyDTypeVisitor {
+  Predicate predicate_;
+  const Tensor& tensor_;
+  const DevCtx& ctx_;
+  Tensor* out_;
+
+  AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx,
+                  Tensor* out)
+      : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
+
+  template <typename T>
+  void operator()() const {
+    auto t = EigenVector<T>::Flatten(tensor_);
+    auto o = EigenScalar<bool>::From(*out_);
+    // return any of predicate_(t) is true.
+    o.device(*ctx_.eigen_device()) = predicate_(t).any();
+  }
+};
+
+template <typename Predicate, typename DevCtx>
+inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
+                    const DevCtx& ctx, framework::Tensor* out) {
+  VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor<Predicate, DevCtx>(
+                                               predicate, tensor, ctx, out));
+}
+
+template <typename Predicate>
+struct AnyVisitor : public boost::static_visitor<bool> {
+  const framework::Tensor& tensor_;
+  Predicate predicate_;
+
+  AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
+      : tensor_(tensor), predicate_(std::move(predicate)) {}
+
+  template <typename Place>
+  bool operator()(const Place& place) const {
+    framework::Tensor out;
+    out.Resize({1});
+    out.mutable_data<bool>(place);
+    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
+    AnyImpl(predicate_, tensor_, *ctx, &out);
+    return this->GetResult(out, place);
+  }
+
+  bool GetResult(const framework::Tensor& out,
+                 const platform::CUDAPlace& gpu) const {
+    platform::CPUPlace cpu;
+    framework::Tensor tmp;
+    tmp.Resize({1});
+    tmp.mutable_data<bool>(cpu);
+    auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
+    gpuctx->Wait();
+    Copy(out, cpu, *gpuctx, &tmp);
+    gpuctx->Wait();
+    return GetResult(tmp, cpu);
+  }
+
+  bool GetResult(const framework::Tensor& out,
+                 const platform::CPUPlace& cpu) const {
+    return *out.data<bool>();
+  }
+};
+
+template <typename Predicate>
+inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
+  AnyVisitor<Predicate> visitor(tensor, predicate);
+  auto place = tensor.place();
+  return platform::VisitPlace(place, visitor);
+}
+
+struct HasNANPredicate {
+  template <typename T>
+  auto operator()(const T& eigen_vec) const
+      -> decltype(std::declval<T>().isnan()) {
+    // Cast eigen_vector to vector of bool. true if is inf.
+    return eigen_vec.isnan();
+  }
+};
+
+bool HasNAN(const framework::Tensor& tensor) {
+  HasNANPredicate predicate;
+  return Any(tensor, predicate);
+}
+
+struct HasInfPredicate {
+  template <typename T>
+  auto operator()(const T& eigen_vec) const
+      -> decltype(std::declval<T>().isinf()) {
+    // Cast eigen_vector to vector of bool. true if is inf.
+    return eigen_vec.isinf();
+  }
+};
+
+bool HasInf(const framework::Tensor& tensor) {
+  HasInfPredicate predicate;
+  return Any(tensor, predicate);
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
similarity index 98%
rename from paddle/framework/tensor_util.h
rename to paddle/fluid/framework/tensor_util.h
index b49c614499..b7e772b6da 100644
--- a/paddle/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/data_type.h"
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/framework.pb.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc
similarity index 99%
rename from paddle/framework/tensor_util_test.cc
rename to paddle/fluid/framework/tensor_util_test.cc
index 906b0b5656..8764c692e8 100644
--- a/paddle/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/framework/tensor_util.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include <gtest/gtest.h>
 #include <cmath>
 #include <string>
diff --git a/paddle/framework/tensor_util_test.cu b/paddle/fluid/framework/tensor_util_test.cu
similarity index 91%
rename from paddle/framework/tensor_util_test.cu
rename to paddle/fluid/framework/tensor_util_test.cu
index ebd35fdf6c..1982b642bc 100644
--- a/paddle/framework/tensor_util_test.cu
+++ b/paddle/fluid/framework/tensor_util_test.cu
@@ -13,9 +13,9 @@
    limitations under the License. */
 
 #include "gtest/gtest.h"
-#include "paddle/framework/tensor_util.h"
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/place.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc
similarity index 96%
rename from paddle/framework/threadpool.cc
rename to paddle/fluid/framework/threadpool.cc
index b7d7c00bcf..2c4de41b0c 100644
--- a/paddle/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@@ -12,9 +12,9 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/framework/threadpool.h"
+#include "paddle/fluid/framework/threadpool.h"
 
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/threadpool.h b/paddle/fluid/framework/threadpool.h
similarity index 65%
rename from paddle/framework/threadpool.h
rename to paddle/fluid/framework/threadpool.h
index 4e9b58679d..e88e6c01f0 100644
--- a/paddle/framework/threadpool.h
+++ b/paddle/fluid/framework/threadpool.h
@@ -21,8 +21,9 @@ limitations under the License. */
 #include <queue>
 #include <thread>
 #include <vector>
-
-#include "paddle/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
+#include "glog/logging.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 
 namespace paddle {
 namespace framework {
@@ -31,7 +32,7 @@ namespace framework {
 // number of threads.
 class ThreadPool {
  public:
-  typedef std::packaged_task<void()> Task;
+  using Task = std::packaged_task<std::unique_ptr<platform::EnforceNotMet>()>;
 
   // Returns the singleton of ThreadPool.
   static ThreadPool* GetInstance();
@@ -52,9 +53,28 @@ class ThreadPool {
   // std::future::wait().
   template <typename Callback>
   std::future<void> Run(Callback fn) {
+    auto f = this->RunAndGetException(fn);
+    return std::async(std::launch::deferred, ExceptionHandler(std::move(f)));
+  }
+
+  template <typename Callback>
+  std::future<std::unique_ptr<platform::EnforceNotMet>> RunAndGetException(
+      Callback fn) {
     std::unique_lock<std::mutex> lock(mutex_);
-    Task task(std::bind(fn));
-    std::future<void> f = task.get_future();
+    Task task([fn]() -> std::unique_ptr<platform::EnforceNotMet> {
+      try {
+        fn();
+        return nullptr;
+      } catch (platform::EnforceNotMet ex) {
+        return std::unique_ptr<platform::EnforceNotMet>(
+            new platform::EnforceNotMet(ex));
+      } catch (...) {
+        LOG(FATAL)
+            << "Unexpected exception is catched in thread pool. All "
+               "throwable exception in Fluid should be an EnforceNotMet.";
+      }
+    });
+    std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
     tasks_.push(std::move(task));
     lock.unlock();
     scheduled_.notify_one();
@@ -65,6 +85,22 @@ class ThreadPool {
   void Wait();
 
  private:
+  struct ExceptionHandler {
+    mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
+    explicit ExceptionHandler(
+        std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
+        : future_(std::move(f)) {}
+    void operator()() const {
+      auto ex = this->future_.get();
+      if (ex != nullptr) {
+        LOG(FATAL) << "The exception is thrown inside the thread pool. You "
+                      "should use RunAndGetException to handle the exception.\n"
+                      "The default exception handler is LOG(FATAL)."
+                   << ex->what();
+      }
+    }
+  };
+
   DISABLE_COPY_AND_ASSIGN(ThreadPool);
 
   explicit ThreadPool(int num_threads);
diff --git a/paddle/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc
similarity index 100%
rename from paddle/framework/threadpool_test.cc
rename to paddle/fluid/framework/threadpool_test.cc
diff --git a/paddle/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
similarity index 97%
rename from paddle/framework/type_defs.h
rename to paddle/fluid/framework/type_defs.h
index 1eedbbc419..786d78a644 100644
--- a/paddle/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
-#include "paddle/platform/variant.h"
+#include "paddle/fluid/platform/variant.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc
new file mode 100644
index 0000000000..7ec9b2ced9
--- /dev/null
+++ b/paddle/fluid/framework/var_desc.cc
@@ -0,0 +1,259 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+proto::VarDesc::VarType VarDesc::GetType() const { return desc_.type(); }
+
+void VarDesc::SetType(proto::VarDesc::VarType type) { desc_.set_type(type); }
+
+void VarDesc::SetShape(const std::vector<int64_t> &dims) {
+  VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims());
+}
+
+void VarDesc::SetTensorDescNum(size_t num) {
+  switch (desc_.type()) {
+    case proto::VarDesc::READER: {
+      auto *lod_tensors_ptr = desc_.mutable_reader()->mutable_lod_tensor();
+      lod_tensors_ptr->Clear();
+      for (size_t i = 0; i < num; ++i) {
+        lod_tensors_ptr->Add();
+      }
+      return;
+    } break;
+    default:
+      PADDLE_THROW(
+          "Setting 'sub_tensor_number' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+size_t VarDesc::GetTensorDescNum() const {
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      return desc_.reader().lod_tensor_size();
+      break;
+    default:
+      PADDLE_THROW(
+          "Getting 'sub_tensor_number' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+void VarDesc::SetShapes(
+    const std::vector<std::vector<int64_t>> &multiple_dims) {
+  if (multiple_dims.size() != GetTensorDescNum()) {
+    VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size()
+            << ") doesn't match the existing tensor number("
+            << GetTensorDescNum()
+            << "). The Reader is going to be reinitialized.";
+    SetTensorDescNum(multiple_dims.size());
+  }
+  std::vector<proto::TensorDesc *> tensors = mutable_tensor_descs();
+  for (size_t i = 0; i < multiple_dims.size(); ++i) {
+    VectorToRepeated(multiple_dims[i], tensors[i]->mutable_dims());
+  }
+}
+
+std::vector<int64_t> VarDesc::GetShape() const {
+  return RepeatedToVector(tensor_desc().dims());
+}
+
+std::vector<std::vector<int64_t>> VarDesc::GetShapes() const {
+  std::vector<proto::TensorDesc> descs = tensor_descs();
+  std::vector<std::vector<int64_t>> res;
+  res.reserve(descs.size());
+  for (const auto &tensor_desc : descs) {
+    res.push_back(RepeatedToVector(tensor_desc.dims()));
+  }
+  return res;
+}
+
+void VarDesc::SetDataType(proto::DataType data_type) {
+  mutable_tensor_desc()->set_data_type(data_type);
+}
+
+void VarDesc::SetDataTypes(
+    const std::vector<proto::DataType> &multiple_data_type) {
+  if (multiple_data_type.size() != GetTensorDescNum()) {
+    VLOG(3) << "WARNING: The number of given data types("
+            << multiple_data_type.size()
+            << ") doesn't match the existing tensor number("
+            << GetTensorDescNum()
+            << "). The Reader is going to be reinitialized.";
+    SetTensorDescNum(multiple_data_type.size());
+  }
+  std::vector<proto::TensorDesc *> tensor_descs = mutable_tensor_descs();
+  for (size_t i = 0; i < multiple_data_type.size(); ++i) {
+    tensor_descs[i]->set_data_type(multiple_data_type[i]);
+  }
+}
+
+proto::DataType VarDesc::GetDataType() const {
+  return tensor_desc().data_type();
+}
+
+std::vector<proto::DataType> VarDesc::GetDataTypes() const {
+  std::vector<proto::TensorDesc> descs = tensor_descs();
+  std::vector<proto::DataType> res;
+  res.reserve(descs.size());
+  for (const auto &tensor_desc : descs) {
+    res.push_back(tensor_desc.data_type());
+  }
+  return res;
+}
+
+void VarDesc::SetLoDLevel(int32_t lod_level) {
+  switch (desc_.type()) {
+    case proto::VarDesc::LOD_TENSOR:
+      desc_.mutable_lod_tensor()->set_lod_level(lod_level);
+      break;
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
+      desc_.mutable_tensor_array()->set_lod_level(lod_level);
+      break;
+    default:
+      PADDLE_THROW(
+          "Setting 'lod_level' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
+  if (multiple_lod_level.size() != GetTensorDescNum()) {
+    VLOG(3) << "WARNING: The number of given lod_levels("
+            << multiple_lod_level.size()
+            << ") doesn't match the existing tensor number("
+            << GetTensorDescNum()
+            << "). The Reader is going to be reinitialized.";
+    SetTensorDescNum(multiple_lod_level.size());
+  }
+  switch (desc_.type()) {
+    case proto::VarDesc::READER: {
+      size_t i = 0;
+      for (auto &lod_tensor : *desc_.mutable_reader()->mutable_lod_tensor()) {
+        lod_tensor.set_lod_level(multiple_lod_level[i++]);
+      }
+    } break;
+    default:
+      PADDLE_THROW(
+          "Setting 'lod_levels' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+int32_t VarDesc::GetLoDLevel() const {
+  switch (desc_.type()) {
+    case proto::VarDesc::LOD_TENSOR:
+      return desc_.lod_tensor().lod_level();
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
+      return desc_.tensor_array().lod_level();
+    default:
+      PADDLE_THROW(
+          "Getting 'lod_level' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+std::vector<int32_t> VarDesc::GetLoDLevels() const {
+  std::vector<int32_t> res;
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      res.reserve(desc_.reader().lod_tensor_size());
+      for (auto &lod_tensor : desc_.reader().lod_tensor()) {
+        res.push_back(lod_tensor.lod_level());
+      }
+      return res;
+      break;
+    default:
+      PADDLE_THROW(
+          "Getting 'lod_levels' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+const proto::TensorDesc &VarDesc::tensor_desc() const {
+  PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
+  switch (desc_.type()) {
+    case proto::VarDesc::SELECTED_ROWS:
+      return desc_.selected_rows();
+    case proto::VarDesc::LOD_TENSOR:
+      return desc_.lod_tensor().tensor();
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
+      return desc_.tensor_array().tensor();
+    default:
+      PADDLE_THROW(
+          "Getting 'tensor_desc' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+std::vector<proto::TensorDesc> VarDesc::tensor_descs() const {
+  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
+  std::vector<proto::TensorDesc> res;
+  res.reserve(GetTensorDescNum());
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      for (const auto &lod_tensor : desc_.reader().lod_tensor()) {
+        res.push_back(lod_tensor.tensor());
+      }
+      return res;
+    default:
+      PADDLE_THROW(
+          "Getting 'tensor_descs' is not supported by the type of var "
+          "%s.",
+          this->Name());
+  }
+}
+
+proto::TensorDesc *VarDesc::mutable_tensor_desc() {
+  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
+  switch (desc_.type()) {
+    case proto::VarDesc::SELECTED_ROWS:
+      return desc_.mutable_selected_rows();
+    case proto::VarDesc::LOD_TENSOR:
+      return desc_.mutable_lod_tensor()->mutable_tensor();
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
+      return desc_.mutable_tensor_array()->mutable_tensor();
+    default:
+      PADDLE_THROW(
+          "Getting 'mutable_tensor_desc' is not supported by the type of var "
+          "%s.",
+          this->Name());
+  }
+}
+
+std::vector<proto::TensorDesc *> VarDesc::mutable_tensor_descs() {
+  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
+  std::vector<proto::TensorDesc *> res;
+  res.reserve(GetTensorDescNum());
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      for (auto &lod_tensor : *desc_.mutable_reader()->mutable_lod_tensor()) {
+        res.push_back(lod_tensor.mutable_tensor());
+      }
+      return res;
+    default:
+      PADDLE_THROW(
+          "Getting 'tensor_descs' is not supported by the type of var "
+          "%s.",
+          this->Name());
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/var_desc.h b/paddle/fluid/framework/var_desc.h
similarity index 80%
rename from paddle/framework/var_desc.h
rename to paddle/fluid/framework/var_desc.h
index 9316b14bb6..cdb1bc3ec0 100644
--- a/paddle/framework/var_desc.h
+++ b/paddle/fluid/framework/var_desc.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <vector>
 #include "glog/logging.h"
-#include "paddle/framework/framework.pb.h"
+#include "paddle/fluid/framework/framework.pb.h"
 
 namespace paddle {
 namespace framework {
@@ -68,18 +68,34 @@ class VarDesc {
 
   void SetName(std::string name) { desc_.set_name(name); }
 
+  void SetTensorDescNum(size_t num);
+
+  size_t GetTensorDescNum() const;
+
   void SetShape(const std::vector<int64_t> &dims);
 
+  void SetShapes(const std::vector<std::vector<int64_t>> &multiple_dims);
+
+  std::vector<int64_t> GetShape() const;
+
+  std::vector<std::vector<int64_t>> GetShapes() const;
+
   void SetDataType(proto::DataType data_type);
 
-  std::vector<int64_t> Shape() const;
+  void SetDataTypes(const std::vector<proto::DataType> &multiple_data_type);
 
   proto::DataType GetDataType() const;
 
+  std::vector<proto::DataType> GetDataTypes() const;
+
   void SetLoDLevel(int32_t lod_level);
 
+  void SetLoDLevels(const std::vector<int32_t> &multiple_lod_level);
+
   int32_t GetLoDLevel() const;
 
+  std::vector<int32_t> GetLoDLevels() const;
+
   proto::VarDesc::VarType GetType() const;
 
   void SetType(proto::VarDesc::VarType type);
@@ -90,7 +106,9 @@ class VarDesc {
 
  private:
   const proto::TensorDesc &tensor_desc() const;
+  std::vector<proto::TensorDesc> tensor_descs() const;
   proto::TensorDesc *mutable_tensor_desc();
+  std::vector<proto::TensorDesc *> mutable_tensor_descs();
 
   proto::VarDesc desc_;
 };
diff --git a/paddle/framework/var_type.h b/paddle/fluid/framework/var_type.h
similarity index 76%
rename from paddle/framework/var_type.h
rename to paddle/fluid/framework/var_type.h
index 5b7a08a087..2dc4de5298 100644
--- a/paddle/framework/var_type.h
+++ b/paddle/fluid/framework/var_type.h
@@ -13,12 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/framework.pb.h"
-#include "paddle/framework/lod_rank_table.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/lod_tensor_array.h"
-#include "paddle/framework/selected_rows.h"
-#include "paddle/framework/variable.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/variable.h"
 
 namespace paddle {
 namespace framework {
@@ -31,6 +32,8 @@ inline proto::VarDesc::VarType ToVarType(std::type_index type) {
     return proto::VarDesc_VarType_LOD_TENSOR_ARRAY;
   } else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
     return proto::VarDesc_VarType_SELECTED_ROWS;
+  } else if (type.hash_code() == typeid(ReaderHolder).hash_code()) {
+    return proto::VarDesc_VarType_READER;
   } else {
     PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
   }
@@ -40,7 +43,7 @@ template <typename Visitor>
 inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
   switch (ToVarType(var.Type())) {
     case proto::VarDesc_VarType_LOD_TENSOR:
-      visitor(var.Get<framework::LoDTensor>());
+      visitor(var.Get<LoDTensor>());
       return;
     case proto::VarDesc_VarType_LOD_RANK_TABLE:
       visitor(var.Get<LoDRankTable>());
@@ -51,6 +54,9 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
     case proto::VarDesc_VarType_SELECTED_ROWS:
       visitor(var.Get<SelectedRows>());
       return;
+    case proto::VarDesc_VarType_READER:
+      visitor(var.Get<ReaderHolder>());
+      return;
     default:
       PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type()));
   }
diff --git a/paddle/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h
similarity index 94%
rename from paddle/framework/var_type_inference.h
rename to paddle/fluid/framework/var_type_inference.h
index 6c11f2fee7..44fd4cd622 100644
--- a/paddle/framework/var_type_inference.h
+++ b/paddle/fluid/framework/var_type_inference.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/type_defs.h"
+#include "paddle/fluid/framework/type_defs.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc
similarity index 94%
rename from paddle/framework/var_type_inference_test.cc
rename to paddle/fluid/framework/var_type_inference_test.cc
index fa6018b1c5..0ee589c821 100644
--- a/paddle/framework/var_type_inference_test.cc
+++ b/paddle/fluid/framework/var_type_inference_test.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/var_type_inference.h"
+#include "paddle/fluid/framework/var_type_inference.h"
 #include "gtest/gtest.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/program_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/variable.h b/paddle/fluid/framework/variable.h
similarity index 98%
rename from paddle/framework/variable.h
rename to paddle/fluid/framework/variable.h
index 3b7ec0a2a9..9fb8ca92d6 100644
--- a/paddle/framework/variable.h
+++ b/paddle/fluid/framework/variable.h
@@ -17,7 +17,7 @@
 #include <typeindex>
 #include <typeinfo>
 
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/variable.md b/paddle/fluid/framework/variable.md
similarity index 100%
rename from paddle/framework/variable.md
rename to paddle/fluid/framework/variable.md
diff --git a/paddle/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc
similarity index 96%
rename from paddle/framework/variable_test.cc
rename to paddle/fluid/framework/variable_test.cc
index e5585c8724..8c14e506fd 100644
--- a/paddle/framework/variable_test.cc
+++ b/paddle/fluid/framework/variable_test.cc
@@ -16,7 +16,7 @@
 #include <string>
 
 #include "gtest/gtest.h"
-#include "paddle/framework/variable.h"
+#include "paddle/fluid/framework/variable.h"
 
 TEST(Variable, GetMutable) {
   using paddle::framework::Variable;
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
new file mode 100644
index 0000000000..bdb147955c
--- /dev/null
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -0,0 +1,18 @@
+set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init)
+
+cc_library(paddle_fluid_api
+    SRCS io.cc
+    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+
+# Create static library
+cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+
+# Create shared library
+cc_library(paddle_fluid_shared SHARED
+    SRCS io.cc
+    DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END)
+set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
+
+if(WITH_TESTING)
+  add_subdirectory(tests/book)
+endif()
diff --git a/paddle/inference/io.cc b/paddle/fluid/inference/io.cc
similarity index 58%
rename from paddle/inference/io.cc
rename to paddle/fluid/inference/io.cc
index 60ad7af1c0..58d7ab40bf 100644
--- a/paddle/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -12,15 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/inference/io.h"
+#include "paddle/fluid/inference/io.h"
 
 #include <fstream>
-#include "paddle/framework/block_desc.h"
-#include "paddle/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
 
 namespace paddle {
 namespace inference {
 
+void ReadBinaryFile(const std::string& filename, std::string& contents) {
+  VLOG(3) << "loading model from " << filename;
+  std::ifstream inputfs(filename, std::ios::in | std::ios::binary);
+  inputfs.seekg(0, std::ios::end);
+  contents.clear();
+  contents.resize(inputfs.tellg());
+  inputfs.seekg(0, std::ios::beg);
+  inputfs.read(&contents[0], contents.size());
+  inputfs.close();
+}
+
 bool IsParameter(const framework::VarDesc* var,
                  const framework::ProgramDesc& main_program) {
   if (var->Persistable()) {
@@ -44,32 +55,53 @@ bool IsParameter(const framework::VarDesc* var,
 
 void LoadPersistables(framework::Executor& executor,
                       framework::Scope& scope,
+                      const framework::ProgramDesc& main_program,
                       const std::string& dirname,
-                      const framework::ProgramDesc& main_program) {
+                      const std::string& param_filename) {
   const framework::BlockDesc& global_block = main_program.Block(0);
 
   framework::ProgramDesc* load_program = new framework::ProgramDesc();
   framework::BlockDesc* load_block = load_program->MutableBlock(0);
+  std::vector<std::string> paramlist;
+
   for (auto* var : global_block.AllVars()) {
     if (IsParameter(var, main_program)) {
       VLOG(3) << "parameter's name: " << var->Name();
 
       framework::VarDesc* new_var = load_block->Var(var->Name());
-      new_var->SetShape(var->Shape());
+      new_var->SetShape(var->GetShape());
       new_var->SetDataType(var->GetDataType());
       new_var->SetType(var->GetType());
       new_var->SetLoDLevel(var->GetLoDLevel());
       new_var->SetPersistable(true);
 
-      // append_op
-      framework::OpDesc* op = load_block->AppendOp();
-      op->SetType("load");
-      op->SetOutput("Out", {new_var->Name()});
-      op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
-      op->CheckAttrs();
+      if (!param_filename.empty()) {
+        paramlist.push_back(new_var->Name());
+      } else {
+        // append_op
+        framework::OpDesc* op = load_block->AppendOp();
+        op->SetType("load");
+        op->SetOutput("Out", {new_var->Name()});
+        op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
+        op->CheckAttrs();
+      }
     }
   }
+
+  if (!param_filename.empty()) {
+    // sort paramlist to have consistent ordering
+    std::sort(paramlist.begin(), paramlist.end());
+    // append just the load_combine op
+    framework::OpDesc* op = load_block->AppendOp();
+    op->SetType("load_combine");
+    op->SetOutput("Out", paramlist);
+    op->SetAttr("file_path", {param_filename});
+    op->CheckAttrs();
+  }
+
   executor.Run(*load_program, &scope, 0, true, true);
+
+  VLOG(3) << "Ran loading successfully";
   delete load_program;
 }
 
@@ -77,20 +109,29 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
                                              framework::Scope& scope,
                                              const std::string& dirname) {
   std::string model_filename = dirname + "/__model__";
-  LOG(INFO) << "loading model from " << model_filename;
-  std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
   std::string program_desc_str;
-  inputfs.seekg(0, std::ios::end);
-  program_desc_str.resize(inputfs.tellg());
-  inputfs.seekg(0, std::ios::beg);
-  LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
-  inputfs.read(&program_desc_str[0], program_desc_str.size());
-  inputfs.close();
+  ReadBinaryFile(model_filename, program_desc_str);
+
+  std::unique_ptr<framework::ProgramDesc> main_program(
+      new framework::ProgramDesc(program_desc_str));
+
+  LoadPersistables(executor, scope, *main_program, dirname, "");
+  return main_program;
+}
+
+std::unique_ptr<framework::ProgramDesc> Load(
+    framework::Executor& executor,
+    framework::Scope& scope,
+    const std::string& prog_filename,
+    const std::string& param_filename) {
+  std::string model_filename = prog_filename;
+  std::string program_desc_str;
+  ReadBinaryFile(model_filename, program_desc_str);
 
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
 
-  LoadPersistables(executor, scope, dirname, *main_program);
+  LoadPersistables(executor, scope, *main_program, "", param_filename);
   return main_program;
 }
 
diff --git a/paddle/inference/io.h b/paddle/fluid/inference/io.h
similarity index 70%
rename from paddle/inference/io.h
rename to paddle/fluid/inference/io.h
index 962b6c4e20..9d78640606 100644
--- a/paddle/inference/io.h
+++ b/paddle/fluid/inference/io.h
@@ -17,21 +17,27 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
-#include "paddle/framework/executor.h"
-#include "paddle/framework/program_desc.h"
-#include "paddle/framework/scope.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
 
 namespace paddle {
 namespace inference {
 
 void LoadPersistables(framework::Executor& executor,
                       framework::Scope& scope,
+                      const framework::ProgramDesc& main_program,
                       const std::string& dirname,
-                      const framework::ProgramDesc& main_program);
+                      const std::string& param_filename);
 
 std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
                                              framework::Scope& scope,
                                              const std::string& dirname);
 
+std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
+                                             framework::Scope& scope,
+                                             const std::string& prog_filename,
+                                             const std::string& param_filename);
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
new file mode 100644
index 0000000000..9fe76afb58
--- /dev/null
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -0,0 +1,34 @@
+function(inference_test TARGET_NAME)
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs ARGS)
+  cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
+  set(arg_list "")
+  if(inference_test_ARGS)
+    foreach(arg ${inference_test_ARGS})
+      list(APPEND arg_list "_${arg}")
+    endforeach()
+  else()
+    list(APPEND arg_list "_")
+  endif()
+  foreach(arg ${arg_list})
+    string(REGEX REPLACE "^_$" "" arg "${arg}")
+    cc_test(test_inference_${TARGET_NAME}${arg}
+        SRCS test_inference_${TARGET_NAME}.cc
+        DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+        ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model)
+    set_tests_properties(test_inference_${TARGET_NAME}${arg}
+        PROPERTIES DEPENDS test_${TARGET_NAME})
+  endforeach()
+endfunction(inference_test)
+
+inference_test(fit_a_line)
+inference_test(image_classification ARGS vgg resnet)
+inference_test(label_semantic_roles)
+inference_test(recognize_digits ARGS mlp)
+inference_test(recommender_system)
+inference_test(rnn_encoder_decoder)
+inference_test(understand_sentiment)
+inference_test(word2vec)
diff --git a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
new file mode 100644
index 0000000000..fa18e69b3a
--- /dev/null
+++ b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
@@ -0,0 +1,57 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "gflags/gflags.h"
+#include "paddle/fluid/inference/tests/test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+TEST(inference, fit_a_line) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  paddle::framework::LoDTensor input;
+  // The second dim of the input tensor should be 13
+  // The input data should be >= 0
+  int64_t batch_size = 10;
+  SetupTensor<float>(
+      input, {batch_size, 13}, static_cast<float>(0), static_cast<float>(10));
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&input);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.dims();
+
+  CheckError<float>(output1, output2);
+#endif
+}
diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
new file mode 100644
index 0000000000..27f17712bc
--- /dev/null
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
@@ -0,0 +1,63 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "gflags/gflags.h"
+#include "paddle/fluid/inference/tests/test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+TEST(inference, image_classification) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  int64_t batch_size = 1;
+
+  paddle::framework::LoDTensor input;
+  // Use normilized image pixels as input data,
+  // which should be in the range [0.0, 1.0].
+  SetupTensor<float>(input,
+                     {batch_size, 3, 32, 32},
+                     static_cast<float>(0),
+                     static_cast<float>(1));
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&input);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.dims();
+
+  CheckError<float>(output1, output2);
+#endif
+}
diff --git a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
new file mode 100644
index 0000000000..55acd95f50
--- /dev/null
+++ b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
@@ -0,0 +1,77 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "gflags/gflags.h"
+#include "paddle/fluid/inference/tests/test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+TEST(inference, label_semantic_roles) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  paddle::framework::LoDTensor word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1,
+      ctx_p2, mark;
+  paddle::framework::LoD lod{{0, 4, 10}};
+
+  SetupLoDTensor(word, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(
+      predicate, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(ctx_n2, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(ctx_n1, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(ctx_0, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(ctx_p1, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(ctx_p2, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(mark, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&word);
+  cpu_feeds.push_back(&predicate);
+  cpu_feeds.push_back(&ctx_n2);
+  cpu_feeds.push_back(&ctx_n1);
+  cpu_feeds.push_back(&ctx_0);
+  cpu_feeds.push_back(&ctx_p1);
+  cpu_feeds.push_back(&ctx_p2);
+  cpu_feeds.push_back(&mark);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.lod();
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.lod();
+  LOG(INFO) << output2.dims();
+
+  CheckError<float>(output1, output2);
+#endif
+}
diff --git a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
new file mode 100644
index 0000000000..99cf0f3095
--- /dev/null
+++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
@@ -0,0 +1,105 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "gflags/gflags.h"
+#include "paddle/fluid/inference/tests/test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+TEST(inference, recognize_digits) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  int64_t batch_size = 1;
+
+  paddle::framework::LoDTensor input;
+  // Use normilized image pixels as input data,
+  // which should be in the range [-1.0, 1.0].
+  SetupTensor<float>(input,
+                     {batch_size, 1, 28, 28},
+                     static_cast<float>(-1),
+                     static_cast<float>(1));
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&input);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.dims();
+
+  CheckError<float>(output1, output2);
+#endif
+}
+
+TEST(inference, recognize_digits_combine) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  paddle::framework::LoDTensor input;
+  // Use normilized image pixels as input data,
+  // which should be in the range [-1.0, 1.0].
+  SetupTensor<float>(
+      input, {1, 28, 28}, static_cast<float>(-1), static_cast<float>(1));
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&input);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace, true>(
+      dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace, true>(
+      dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.dims();
+
+  CheckError<float>(output1, output2);
+#endif
+}
diff --git a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
new file mode 100644
index 0000000000..9208c2a599
--- /dev/null
+++ b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
@@ -0,0 +1,87 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "gflags/gflags.h"
+#include "paddle/fluid/inference/tests/test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+TEST(inference, recommender_system) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  int64_t batch_size = 1;
+
+  paddle::framework::LoDTensor user_id, gender_id, age_id, job_id, movie_id,
+      category_id, movie_title;
+
+  // Use the first data from paddle.dataset.movielens.test() as input
+  std::vector<int64_t> user_id_data = {1};
+  SetupTensor<int64_t>(user_id, {batch_size, 1}, user_id_data);
+
+  std::vector<int64_t> gender_id_data = {1};
+  SetupTensor<int64_t>(gender_id, {batch_size, 1}, gender_id_data);
+
+  std::vector<int64_t> age_id_data = {0};
+  SetupTensor<int64_t>(age_id, {batch_size, 1}, age_id_data);
+
+  std::vector<int64_t> job_id_data = {10};
+  SetupTensor<int64_t>(job_id, {batch_size, 1}, job_id_data);
+
+  std::vector<int64_t> movie_id_data = {783};
+  SetupTensor<int64_t>(movie_id, {batch_size, 1}, movie_id_data);
+
+  std::vector<int64_t> category_id_data = {10, 8, 9};
+  SetupLoDTensor<int64_t>(category_id, {3, 1}, {{0, 3}}, category_id_data);
+
+  std::vector<int64_t> movie_title_data = {1069, 4140, 2923, 710, 988};
+  SetupLoDTensor<int64_t>(movie_title, {5, 1}, {{0, 5}}, movie_title_data);
+
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&user_id);
+  cpu_feeds.push_back(&gender_id);
+  cpu_feeds.push_back(&age_id);
+  cpu_feeds.push_back(&job_id);
+  cpu_feeds.push_back(&movie_id);
+  cpu_feeds.push_back(&category_id);
+  cpu_feeds.push_back(&movie_title);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.dims();
+
+  CheckError<float>(output1, output2);
+#endif
+}
diff --git a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
new file mode 100644
index 0000000000..c88ca30cb7
--- /dev/null
+++ b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "gflags/gflags.h"
+#include "paddle/fluid/inference/tests/test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+TEST(inference, rnn_encoder_decoder) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  paddle::framework::LoDTensor word_data, trg_word;
+  paddle::framework::LoD lod{{0, 4, 10}};
+
+  SetupLoDTensor(
+      word_data, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(
+      trg_word, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&word_data);
+  cpu_feeds.push_back(&trg_word);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.lod();
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.lod();
+  LOG(INFO) << output2.dims();
+
+  CheckError<float>(output1, output2);
+#endif
+}
diff --git a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
new file mode 100644
index 0000000000..3b29d52880
--- /dev/null
+++ b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
@@ -0,0 +1,60 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "gflags/gflags.h"
+#include "paddle/fluid/inference/tests/test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+TEST(inference, understand_sentiment) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  paddle::framework::LoDTensor words;
+  paddle::framework::LoD lod{{0, 4, 10}};
+  SetupLoDTensor(words, lod, static_cast<int64_t>(0), static_cast<int64_t>(10));
+
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&words);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.lod();
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.lod();
+  LOG(INFO) << output2.dims();
+
+  CheckError<float>(output1, output2);
+#endif
+}
diff --git a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
new file mode 100644
index 0000000000..93376b6824
--- /dev/null
+++ b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
@@ -0,0 +1,68 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "gflags/gflags.h"
+#include "paddle/fluid/inference/tests/test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+TEST(inference, word2vec) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  paddle::framework::LoDTensor first_word, second_word, third_word, fourth_word;
+  paddle::framework::LoD lod{{0, 1}};
+  int64_t dict_size = 2072;  // Hard-coding the size of dictionary
+
+  SetupLoDTensor(first_word, lod, static_cast<int64_t>(0), dict_size);
+  SetupLoDTensor(second_word, lod, static_cast<int64_t>(0), dict_size);
+  SetupLoDTensor(third_word, lod, static_cast<int64_t>(0), dict_size);
+  SetupLoDTensor(fourth_word, lod, static_cast<int64_t>(0), dict_size);
+
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&first_word);
+  cpu_feeds.push_back(&second_word);
+  cpu_feeds.push_back(&third_word);
+  cpu_feeds.push_back(&fourth_word);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.lod();
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.lod();
+  LOG(INFO) << output2.dims();
+
+  CheckError<float>(output1, output2);
+#endif
+}
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
new file mode 100644
index 0000000000..a6c93aa073
--- /dev/null
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -0,0 +1,140 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <time.h>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/inference/io.h"
+
+template <typename T>
+void SetupTensor(paddle::framework::LoDTensor& input,
+                 paddle::framework::DDim dims,
+                 T lower,
+                 T upper) {
+  srand(time(0));
+  T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
+  for (int i = 0; i < input.numel(); ++i) {
+    input_ptr[i] =
+        (static_cast<T>(rand()) / static_cast<T>(RAND_MAX)) * (upper - lower) +
+        lower;
+  }
+}
+
+template <typename T>
+void SetupTensor(paddle::framework::LoDTensor& input,
+                 paddle::framework::DDim dims,
+                 std::vector<T>& data) {
+  CHECK_EQ(paddle::framework::product(dims), static_cast<int64_t>(data.size()));
+  T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
+  memcpy(input_ptr, data.data(), input.numel() * sizeof(T));
+}
+
+template <typename T>
+void SetupLoDTensor(paddle::framework::LoDTensor& input,
+                    paddle::framework::LoD& lod,
+                    T lower,
+                    T upper) {
+  input.set_lod(lod);
+  int dim = lod[0][lod[0].size() - 1];
+  SetupTensor<T>(input, {dim, 1}, lower, upper);
+}
+
+template <typename T>
+void SetupLoDTensor(paddle::framework::LoDTensor& input,
+                    paddle::framework::DDim dims,
+                    paddle::framework::LoD lod,
+                    std::vector<T>& data) {
+  const size_t level = lod.size() - 1;
+  CHECK_EQ(dims[0], static_cast<int64_t>((lod[level]).back()));
+  input.set_lod(lod);
+  SetupTensor<T>(input, dims, data);
+}
+
+template <typename T>
+void CheckError(paddle::framework::LoDTensor& output1,
+                paddle::framework::LoDTensor& output2) {
+  // Check lod information
+  EXPECT_EQ(output1.lod(), output2.lod());
+
+  EXPECT_EQ(output1.dims(), output2.dims());
+  EXPECT_EQ(output1.numel(), output2.numel());
+
+  T err = static_cast<T>(0);
+  if (typeid(T) == typeid(float)) {
+    err = 1E-3;
+  } else if (typeid(T) == typeid(double)) {
+    err = 1E-6;
+  } else {
+    err = 0;
+  }
+
+  size_t count = 0;
+  for (int64_t i = 0; i < output1.numel(); ++i) {
+    if (fabs(output1.data<T>()[i] - output2.data<T>()[i]) > err) {
+      count++;
+    }
+  }
+  EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
+}
+
+template <typename Place, bool IsCombined = false>
+void TestInference(const std::string& dirname,
+                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
+                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
+  // 1. Define place, executor, scope
+  auto place = Place();
+  auto executor = paddle::framework::Executor(place);
+  auto* scope = new paddle::framework::Scope();
+
+  // 2. Initialize the inference_program and load parameters
+  std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
+  if (IsCombined) {
+    // All parameters are saved in a single file.
+    // Hard-coding the file names of program and parameters in unittest.
+    // Users are free to specify different filename
+    // (provided: the filenames are changed in the python api as well: io.py)
+    std::string prog_filename = "__model_combined__";
+    std::string param_filename = "__params_combined__";
+    inference_program = paddle::inference::Load(executor,
+                                                *scope,
+                                                dirname + "/" + prog_filename,
+                                                dirname + "/" + param_filename);
+  } else {
+    // Parameters are saved in separate files sited in the specified `dirname`.
+    inference_program = paddle::inference::Load(executor, *scope, dirname);
+  }
+
+  // 3. Get the feed_target_names and fetch_target_names
+  const std::vector<std::string>& feed_target_names =
+      inference_program->GetFeedTargetNames();
+  const std::vector<std::string>& fetch_target_names =
+      inference_program->GetFetchTargetNames();
+
+  // 4. Prepare inputs: set up maps for feed targets
+  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+  for (size_t i = 0; i < feed_target_names.size(); ++i) {
+    // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
+    feed_targets[feed_target_names[i]] = cpu_feeds[i];
+  }
+
+  // 5. Define Tensor to get the outputs: set up maps for fetch targets
+  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+  for (size_t i = 0; i < fetch_target_names.size(); ++i) {
+    fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
+  }
+
+  // 6. Run the inference program
+  executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+
+  delete scope;
+}
diff --git a/paddle/platform/.clang-format b/paddle/fluid/memory/.clang-format
similarity index 100%
rename from paddle/platform/.clang-format
rename to paddle/fluid/memory/.clang-format
diff --git a/paddle/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt
similarity index 55%
rename from paddle/memory/CMakeLists.txt
rename to paddle/fluid/memory/CMakeLists.txt
index 496098f804..1a61c48482 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/fluid/memory/CMakeLists.txt
@@ -14,10 +14,3 @@ cc_library(paddle_memory
     system_allocator)
 
 cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
-
-if(NOT WITH_C_API AND WITH_FLUID)
-  file(GLOB MEMORY_HEADERS *.h)
-  file(GLOB MEMORY_DETAIL_HEADERS detail/*.h)
-  install(FILES ${MEMORY_HEADERS} DESTINATION include/paddle/memory)
-  install(FILES ${MEMORY_DETAIL_HEADERS} DESTINATION include/paddle/memory/detail)
-endif()
diff --git a/paddle/memory/README.md b/paddle/fluid/memory/README.md
similarity index 100%
rename from paddle/memory/README.md
rename to paddle/fluid/memory/README.md
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt
similarity index 100%
rename from paddle/memory/detail/CMakeLists.txt
rename to paddle/fluid/memory/detail/CMakeLists.txt
diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
similarity index 99%
rename from paddle/memory/detail/buddy_allocator.cc
rename to paddle/fluid/memory/detail/buddy_allocator.cc
index 2bc2c06a15..2cee8271d2 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/memory/detail/buddy_allocator.h"
+#include "paddle/fluid/memory/detail/buddy_allocator.h"
 #include "glog/logging.h"
 
 namespace paddle {
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h
similarity index 91%
rename from paddle/memory/detail/buddy_allocator.h
rename to paddle/fluid/memory/detail/buddy_allocator.h
index 4e0135dd65..644d793306 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
@@ -14,12 +14,12 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/memory/detail/meta_cache.h"
-#include "paddle/memory/detail/meta_data.h"
-#include "paddle/memory/detail/system_allocator.h"
-#include "paddle/platform/assert.h"
-#include "paddle/platform/cpu_info.h"
-#include "paddle/platform/gpu_info.h"
+#include "paddle/fluid/memory/detail/meta_cache.h"
+#include "paddle/fluid/memory/detail/meta_data.h"
+#include "paddle/fluid/memory/detail/system_allocator.h"
+#include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/gpu_info.h"
 
 #include <mutex>
 #include <set>
diff --git a/paddle/memory/detail/memory_block.cc b/paddle/fluid/memory/detail/memory_block.cc
similarity index 95%
rename from paddle/memory/detail/memory_block.cc
rename to paddle/fluid/memory/detail/memory_block.cc
index f50eceba09..23388cdd5b 100644
--- a/paddle/memory/detail/memory_block.cc
+++ b/paddle/fluid/memory/detail/memory_block.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/memory/detail/memory_block.h"
-#include "paddle/memory/detail/meta_cache.h"
-#include "paddle/memory/detail/meta_data.h"
-#include "paddle/platform/assert.h"
+#include "paddle/fluid/memory/detail/memory_block.h"
+#include "paddle/fluid/memory/detail/meta_cache.h"
+#include "paddle/fluid/memory/detail/meta_data.h"
+#include "paddle/fluid/platform/assert.h"
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/memory/detail/memory_block.h b/paddle/fluid/memory/detail/memory_block.h
similarity index 100%
rename from paddle/memory/detail/memory_block.h
rename to paddle/fluid/memory/detail/memory_block.h
diff --git a/paddle/memory/detail/meta_cache.cc b/paddle/fluid/memory/detail/meta_cache.cc
similarity index 91%
rename from paddle/memory/detail/meta_cache.cc
rename to paddle/fluid/memory/detail/meta_cache.cc
index 2bacca7510..7d78811c77 100644
--- a/paddle/memory/detail/meta_cache.cc
+++ b/paddle/fluid/memory/detail/meta_cache.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/memory/detail/meta_cache.h"
+#include "paddle/fluid/memory/detail/meta_cache.h"
 #include "glog/logging.h"
-#include "paddle/memory/detail/memory_block.h"
-#include "paddle/platform/assert.h"
+#include "paddle/fluid/memory/detail/memory_block.h"
+#include "paddle/fluid/platform/assert.h"
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/memory/detail/meta_cache.h b/paddle/fluid/memory/detail/meta_cache.h
similarity index 94%
rename from paddle/memory/detail/meta_cache.h
rename to paddle/fluid/memory/detail/meta_cache.h
index db8ffd49ae..635d6398e6 100644
--- a/paddle/memory/detail/meta_cache.h
+++ b/paddle/fluid/memory/detail/meta_cache.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/memory/detail/memory_block.h"
-#include "paddle/memory/detail/meta_data.h"
+#include "paddle/fluid/memory/detail/memory_block.h"
+#include "paddle/fluid/memory/detail/meta_data.h"
 
 #include <unordered_map>
 
diff --git a/paddle/memory/detail/meta_data.cc b/paddle/fluid/memory/detail/meta_data.cc
similarity index 97%
rename from paddle/memory/detail/meta_data.cc
rename to paddle/fluid/memory/detail/meta_data.cc
index dc57d4d237..eae49ebdcf 100644
--- a/paddle/memory/detail/meta_data.cc
+++ b/paddle/fluid/memory/detail/meta_data.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/memory/detail/meta_data.h"
+#include "paddle/fluid/memory/detail/meta_data.h"
 
 #include <functional>
 
diff --git a/paddle/memory/detail/meta_data.h b/paddle/fluid/memory/detail/meta_data.h
similarity index 96%
rename from paddle/memory/detail/meta_data.h
rename to paddle/fluid/memory/detail/meta_data.h
index 6b83c42eb8..368523701e 100644
--- a/paddle/memory/detail/meta_data.h
+++ b/paddle/fluid/memory/detail/meta_data.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/memory/detail/memory_block.h"
+#include "paddle/fluid/memory/detail/memory_block.h"
 
 #include <stddef.h>
 
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
similarity index 95%
rename from paddle/memory/detail/system_allocator.cc
rename to paddle/fluid/memory/detail/system_allocator.cc
index 509250debc..1f07c5e789 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/memory/detail/system_allocator.h"
-#include "paddle/platform/assert.h"
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/gpu_info.h"
+#include "paddle/fluid/memory/detail/system_allocator.h"
+#include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/gpu_info.h"
 
 #include <stdlib.h>    // for malloc and free
 #include <sys/mman.h>  // for mlock and munlock
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h
similarity index 100%
rename from paddle/memory/detail/system_allocator.h
rename to paddle/fluid/memory/detail/system_allocator.h
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc
similarity index 96%
rename from paddle/memory/detail/system_allocator_test.cc
rename to paddle/fluid/memory/detail/system_allocator_test.cc
index 6a8558937b..a850e480ec 100644
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/fluid/memory/detail/system_allocator_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/memory/detail/system_allocator.h"
+#include "paddle/fluid/memory/detail/system_allocator.h"
 
 #include <memory>
 #include <vector>
diff --git a/paddle/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
similarity index 98%
rename from paddle/memory/memcpy.cc
rename to paddle/fluid/memory/memcpy.cc
index b46141aafd..8938b36133 100644
--- a/paddle/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/memory/memcpy.h"
+#include "paddle/fluid/memory/memcpy.h"
 
 #include <cstring>  // for memcpy
 
diff --git a/paddle/memory/memcpy.h b/paddle/fluid/memory/memcpy.h
similarity index 95%
rename from paddle/memory/memcpy.h
rename to paddle/fluid/memory/memcpy.h
index 29c20e1860..77d209c3fb 100644
--- a/paddle/memory/memcpy.h
+++ b/paddle/fluid/memory/memcpy.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/platform/gpu_info.h"
-#include "paddle/platform/place.h"
+#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/memory/memory.cc b/paddle/fluid/memory/memory.cc
similarity index 95%
rename from paddle/memory/memory.cc
rename to paddle/fluid/memory/memory.cc
index 1a73a94567..6eedab5d03 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/fluid/memory/memory.cc
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/memory/memory.h"
+#include "paddle/fluid/memory/memory.h"
 
 #include "glog/logging.h"
 
-#include "paddle/memory/detail/buddy_allocator.h"
-#include "paddle/memory/detail/system_allocator.h"
-#include "paddle/platform/gpu_info.h"
+#include "paddle/fluid/memory/detail/buddy_allocator.h"
+#include "paddle/fluid/memory/detail/system_allocator.h"
+#include "paddle/fluid/platform/gpu_info.h"
 
 DECLARE_double(fraction_of_gpu_memory_to_use);
 
diff --git a/paddle/memory/memory.h b/paddle/fluid/memory/memory.h
similarity index 82%
rename from paddle/memory/memory.h
rename to paddle/fluid/memory/memory.h
index 7012b6d331..a9166a6746 100644
--- a/paddle/memory/memory.h
+++ b/paddle/fluid/memory/memory.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/platform/place.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace memory {
@@ -81,5 +81,23 @@ class PODDeleter {
   Place place_;
 };
 
+/**
+ * \brief   Free memory block in one place does not meet POD
+ *
+ * \note    In some cases, custom deleter is used to
+ *          deallocate the memory automatically for
+ *          std::unique_ptr<T> in tensor.h.
+ *
+ */
+template <typename T, typename Place>
+class PlainDeleter {
+ public:
+  explicit PlainDeleter(Place place) : place_(place) {}
+  void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
+
+ private:
+  Place place_;
+};
+
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/memory/memory_test.cc b/paddle/fluid/memory/memory_test.cc
similarity index 93%
rename from paddle/memory/memory_test.cc
rename to paddle/fluid/memory/memory_test.cc
index b3f699f9b7..d7505ef0f3 100644
--- a/paddle/memory/memory_test.cc
+++ b/paddle/fluid/memory/memory_test.cc
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/memory/memory.h"
-#include "paddle/memory/detail/memory_block.h"
-#include "paddle/memory/detail/meta_data.h"
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/memory/detail/memory_block.h"
+#include "paddle/fluid/memory/detail/meta_data.h"
 
-#include "paddle/platform/cpu_info.h"
-#include "paddle/platform/gpu_info.h"
-#include "paddle/platform/place.h"
+#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/place.h"
 
 #include <gtest/gtest.h>
 #include <unordered_map>
diff --git a/paddle/fluid/operators/.clang-format b/paddle/fluid/operators/.clang-format
new file mode 100644
index 0000000000..29282dc87e
--- /dev/null
+++ b/paddle/fluid/operators/.clang-format
@@ -0,0 +1,5 @@
+---
+Language:        Cpp
+BasedOnStyle:  Google
+Standard:  Cpp11 
+...
diff --git a/paddle/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
similarity index 97%
rename from paddle/operators/CMakeLists.txt
rename to paddle/fluid/operators/CMakeLists.txt
index 000c2089c1..cadfd735d7 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -1,7 +1,7 @@
 file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
 string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
 set(DEPS_OPS "")
-set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/pybind/pybind.h)
+set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h)
 file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt.  DO NOT EDIT!\n\n")
 function(op_library TARGET)
     # op_library is a function to create op library. The interface is same as
@@ -62,7 +62,7 @@ function(op_library TARGET)
     endif()
 
     # Define operators that don't need pybind here.
-    foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
+    foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "create_reader_op")
         if ("${TARGET}" STREQUAL "${manual_pybind_op}")
             set(pybind_flag 1)
         endif()
@@ -155,6 +155,7 @@ op_library(recurrent_op DEPS executor)
 op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function)
 op_library(cos_sim_op DEPS cos_sim_functor)
 op_library(parallel_do_op DEPS executor)
+op_library(create_reader_op DEPS reader)
 
 # Regist multiple Kernel to pybind
 if (WITH_GPU)
@@ -185,7 +186,7 @@ list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
     op_library(${src})
 endforeach()
-file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
+file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(create_random_data_generator);\n")
 
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 
diff --git a/paddle/operators/accuracy_op.cc b/paddle/fluid/operators/accuracy_op.cc
similarity index 98%
rename from paddle/operators/accuracy_op.cc
rename to paddle/fluid/operators/accuracy_op.cc
index 8e8a3c7dd3..43689b3b7d 100644
--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/fluid/operators/accuracy_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/accuracy_op.h"
+#include "paddle/fluid/operators/accuracy_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/accuracy_op.cu b/paddle/fluid/operators/accuracy_op.cu
similarity index 96%
rename from paddle/operators/accuracy_op.cu
rename to paddle/fluid/operators/accuracy_op.cu
index 0aadd5af41..4462b9ba5c 100644
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/fluid/operators/accuracy_op.cu
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #include <thrust/execution_policy.h>
 #include <thrust/reduce.h>
-#include "paddle/operators/accuracy_op.h"
-#include "paddle/platform/cuda_helper.h"
-#include "paddle/platform/gpu_info.h"
+#include "paddle/fluid/operators/accuracy_op.h"
+#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/gpu_info.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/accuracy_op.h b/paddle/fluid/operators/accuracy_op.h
similarity index 97%
rename from paddle/operators/accuracy_op.h
rename to paddle/fluid/operators/accuracy_op.h
index 04104a695f..b3ed1d3fe0 100644
--- a/paddle/operators/accuracy_op.h
+++ b/paddle/fluid/operators/accuracy_op.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <algorithm>
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
similarity index 99%
rename from paddle/operators/activation_op.cc
rename to paddle/fluid/operators/activation_op.cc
index 4188858a90..c04dd8cb91 100644
--- a/paddle/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/activation_op.h"
+#include "paddle/fluid/operators/activation_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
similarity index 97%
rename from paddle/operators/activation_op.cu
rename to paddle/fluid/operators/activation_op.cu
index b9ccdf639c..b86a7926a9 100644
--- a/paddle/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/activation_op.h"
+#include "paddle/fluid/operators/activation_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
similarity index 99%
rename from paddle/operators/activation_op.h
rename to paddle/fluid/operators/activation_op.h
index c0809abc05..7a6ae2224c 100644
--- a/paddle/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/detail/safe_ref.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/adadelta_op.cc b/paddle/fluid/operators/adadelta_op.cc
similarity index 98%
rename from paddle/operators/adadelta_op.cc
rename to paddle/fluid/operators/adadelta_op.cc
index d8a9491c82..ececd47e6a 100644
--- a/paddle/operators/adadelta_op.cc
+++ b/paddle/fluid/operators/adadelta_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/adadelta_op.h"
+#include "paddle/fluid/operators/adadelta_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/adadelta_op.cu b/paddle/fluid/operators/adadelta_op.cu
similarity index 94%
rename from paddle/operators/adadelta_op.cu
rename to paddle/fluid/operators/adadelta_op.cu
index 91294a0d5d..733482f788 100644
--- a/paddle/operators/adadelta_op.cu
+++ b/paddle/fluid/operators/adadelta_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/adadelta_op.h"
+#include "paddle/fluid/operators/adadelta_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/adadelta_op.h b/paddle/fluid/operators/adadelta_op.h
similarity index 96%
rename from paddle/operators/adadelta_op.h
rename to paddle/fluid/operators/adadelta_op.h
index 819d0845db..82ced08710 100644
--- a/paddle/operators/adadelta_op.h
+++ b/paddle/fluid/operators/adadelta_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/adagrad_op.cc b/paddle/fluid/operators/adagrad_op.cc
similarity index 97%
rename from paddle/operators/adagrad_op.cc
rename to paddle/fluid/operators/adagrad_op.cc
index c83318a272..61c0ecd019 100644
--- a/paddle/operators/adagrad_op.cc
+++ b/paddle/fluid/operators/adagrad_op.cc
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/adagrad_op.h"
+#include "paddle/fluid/operators/adagrad_op.h"
 
 #include <cmath>
 
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/adagrad_op.cu b/paddle/fluid/operators/adagrad_op.cu
similarity index 91%
rename from paddle/operators/adagrad_op.cu
rename to paddle/fluid/operators/adagrad_op.cu
index 00cb6e9caf..1117363c13 100644
--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/fluid/operators/adagrad_op.cu
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/adagrad_op.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/selected_rows_functor.h"
-#include "paddle/platform/cuda_helper.h"
+#include "paddle/fluid/operators/adagrad_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 
 namespace paddle {
 namespace operators {
@@ -101,9 +101,9 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
     SparseAdagradFunctorKernel<
         T, 256><<<grid2, threads, 0,
                   reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(grad_merge_data, merge_rows.cuda_data(), lr,
-                                   param_data, moment_data, grad_width,
-                                   epsilon);
+                      .stream()>>>(
+        grad_merge_data, merge_rows.CUDAMutableData(context.GetPlace()), lr,
+        param_data, moment_data, grad_width, epsilon);
   }
 };
 
diff --git a/paddle/operators/adagrad_op.h b/paddle/fluid/operators/adagrad_op.h
similarity index 97%
rename from paddle/operators/adagrad_op.h
rename to paddle/fluid/operators/adagrad_op.h
index 66f5b0f449..ee503b2c36 100644
--- a/paddle/operators/adagrad_op.h
+++ b/paddle/fluid/operators/adagrad_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/adam_op.cc b/paddle/fluid/operators/adam_op.cc
similarity index 99%
rename from paddle/operators/adam_op.cc
rename to paddle/fluid/operators/adam_op.cc
index 03527de936..25da9336b2 100644
--- a/paddle/operators/adam_op.cc
+++ b/paddle/fluid/operators/adam_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/adam_op.h"
+#include "paddle/fluid/operators/adam_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/adam_op.cu b/paddle/fluid/operators/adam_op.cu
similarity index 94%
rename from paddle/operators/adam_op.cu
rename to paddle/fluid/operators/adam_op.cu
index 94f840c188..85b806eb6a 100644
--- a/paddle/operators/adam_op.cu
+++ b/paddle/fluid/operators/adam_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/adam_op.h"
+#include "paddle/fluid/operators/adam_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/adam_op.h b/paddle/fluid/operators/adam_op.h
similarity index 96%
rename from paddle/operators/adam_op.h
rename to paddle/fluid/operators/adam_op.h
index bf536687d3..a51b46ef15 100644
--- a/paddle/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 #include <math.h>  // for sqrt in CPU and CUDA
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/detail/safe_ref.h"
-#include "paddle/operators/math/selected_rows_functor.h"
-#include "paddle/platform/for_range.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/platform/for_range.h"
 
 namespace paddle {
 namespace operators {
@@ -201,7 +201,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
       const T* grad_data = grad_tensor.template data<T>();
       int64_t* rows = nullptr;
       if (platform::is_gpu_place(ctx.GetPlace())) {
-        rows = grad_merge.mutable_rows()->cuda_data();
+        rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace());
       } else {
         rows = grad_merge.mutable_rows()->data();
       }
diff --git a/paddle/operators/adamax_op.cc b/paddle/fluid/operators/adamax_op.cc
similarity index 99%
rename from paddle/operators/adamax_op.cc
rename to paddle/fluid/operators/adamax_op.cc
index 3b0b714184..b2249b8f96 100644
--- a/paddle/operators/adamax_op.cc
+++ b/paddle/fluid/operators/adamax_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/adamax_op.h"
+#include "paddle/fluid/operators/adamax_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/adamax_op.cu b/paddle/fluid/operators/adamax_op.cu
similarity index 94%
rename from paddle/operators/adamax_op.cu
rename to paddle/fluid/operators/adamax_op.cu
index 8f87bb2867..44a5d6c7bd 100644
--- a/paddle/operators/adamax_op.cu
+++ b/paddle/fluid/operators/adamax_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/adamax_op.h"
+#include "paddle/fluid/operators/adamax_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/adamax_op.h b/paddle/fluid/operators/adamax_op.h
similarity index 96%
rename from paddle/operators/adamax_op.h
rename to paddle/fluid/operators/adamax_op.h
index 172c179c5f..124453c0ec 100644
--- a/paddle/operators/adamax_op.h
+++ b/paddle/fluid/operators/adamax_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/array_operator.h b/paddle/fluid/operators/array_operator.h
similarity index 92%
rename from paddle/operators/array_operator.h
rename to paddle/fluid/operators/array_operator.h
index 3fdad5ad9b..4ffb414ece 100644
--- a/paddle/operators/array_operator.h
+++ b/paddle/fluid/operators/array_operator.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/lod_tensor_array.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc
similarity index 96%
rename from paddle/operators/array_to_lod_tensor_op.cc
rename to paddle/fluid/operators/array_to_lod_tensor_op.cc
index ba5c6bd3c6..bf8e11bd8c 100644
--- a/paddle/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <numeric>
 
-#include "paddle/framework/lod_rank_table.h"
-#include "paddle/framework/lod_tensor_array.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/memory/memcpy.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc
similarity index 96%
rename from paddle/operators/assign_op.cc
rename to paddle/fluid/operators/assign_op.cc
index e04aa2d28c..f99f9af427 100644
--- a/paddle/operators/assign_op.cc
+++ b/paddle/fluid/operators/assign_op.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/data_type.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/var_type.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/assign_value_op.cc b/paddle/fluid/operators/assign_value_op.cc
similarity index 98%
rename from paddle/operators/assign_value_op.cc
rename to paddle/fluid/operators/assign_value_op.cc
index 8e3a530489..835043d9ab 100644
--- a/paddle/operators/assign_value_op.cc
+++ b/paddle/fluid/operators/assign_value_op.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/operators/assign_value_op.h"
+#include "paddle/fluid/operators/assign_value_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/assign_value_op.cu.cc b/paddle/fluid/operators/assign_value_op.cu.cc
similarity index 93%
rename from paddle/operators/assign_value_op.cu.cc
rename to paddle/fluid/operators/assign_value_op.cu.cc
index b17e201500..616163f97b 100644
--- a/paddle/operators/assign_value_op.cu.cc
+++ b/paddle/fluid/operators/assign_value_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/assign_value_op.h"
+#include "paddle/fluid/operators/assign_value_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel<int>,
diff --git a/paddle/operators/assign_value_op.h b/paddle/fluid/operators/assign_value_op.h
similarity index 92%
rename from paddle/operators/assign_value_op.h
rename to paddle/fluid/operators/assign_value_op.h
index ec98c53513..33a344cad5 100644
--- a/paddle/operators/assign_value_op.h
+++ b/paddle/fluid/operators/assign_value_op.h
@@ -14,9 +14,9 @@
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc
similarity index 98%
rename from paddle/operators/auc_op.cc
rename to paddle/fluid/operators/auc_op.cc
index b6494f9509..8ac08ea4a1 100644
--- a/paddle/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/auc_op.h"
+#include "paddle/fluid/operators/auc_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/auc_op.h b/paddle/fluid/operators/auc_op.h
similarity index 98%
rename from paddle/operators/auc_op.h
rename to paddle/fluid/operators/auc_op.h
index b80509e2a9..e648db7097 100644
--- a/paddle/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
similarity index 99%
rename from paddle/operators/batch_norm_op.cc
rename to paddle/fluid/operators/batch_norm_op.cc
index 0e984c38ba..506c25d50d 100644
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/batch_norm_op.h"
-#include "paddle/framework/data_layout.h"
+#include "paddle/fluid/operators/batch_norm_op.h"
+#include "paddle/fluid/framework/data_layout.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc
similarity index 98%
rename from paddle/operators/batch_norm_op.cu.cc
rename to paddle/fluid/operators/batch_norm_op.cu.cc
index 3d17725ab4..b9c97211e1 100644
--- a/paddle/operators/batch_norm_op.cu.cc
+++ b/paddle/fluid/operators/batch_norm_op.cu.cc
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/batch_norm_op.h"
-#include "paddle/framework/data_layout.h"
+#include "paddle/fluid/operators/batch_norm_op.h"
+#include "paddle/fluid/framework/data_layout.h"
 
 #include <cfloat>
-#include "paddle/operators/math/math_function.h"
-#include "paddle/platform/cudnn_helper.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h
similarity index 92%
rename from paddle/operators/batch_norm_op.h
rename to paddle/fluid/operators/batch_norm_op.h
index a817ef41fc..fa9942ad09 100644
--- a/paddle/operators/batch_norm_op.h
+++ b/paddle/fluid/operators/batch_norm_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
similarity index 98%
rename from paddle/operators/beam_search_decode_op.cc
rename to paddle/fluid/operators/beam_search_decode_op.cc
index 72e05607b0..7737d4e098 100644
--- a/paddle/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/beam_search_decode_op.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/operators/beam_search_decode_op.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h
similarity index 99%
rename from paddle/operators/beam_search_decode_op.h
rename to paddle/fluid/operators/beam_search_decode_op.h
index 3b1c6cd7a1..aeecb8d39a 100644
--- a/paddle/operators/beam_search_decode_op.h
+++ b/paddle/fluid/operators/beam_search_decode_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/lod_tensor_array.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/beam_search_decode_op_test.cc b/paddle/fluid/operators/beam_search_decode_op_test.cc
similarity index 99%
rename from paddle/operators/beam_search_decode_op_test.cc
rename to paddle/fluid/operators/beam_search_decode_op_test.cc
index 5ac23991f3..24f87279d5 100644
--- a/paddle/operators/beam_search_decode_op_test.cc
+++ b/paddle/fluid/operators/beam_search_decode_op_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/beam_search_decode_op.h"
+#include "paddle/fluid/operators/beam_search_decode_op.h"
 #include "gtest/gtest.h"
 
 using CPUPlace = paddle::platform::CPUPlace;
diff --git a/paddle/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc
similarity index 98%
rename from paddle/operators/beam_search_op.cc
rename to paddle/fluid/operators/beam_search_op.cc
index 844ade40eb..6f4c8c7e06 100644
--- a/paddle/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/beam_search_op.h"
+#include "paddle/fluid/operators/beam_search_op.h"
 
 #include <map>
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h
similarity index 98%
rename from paddle/operators/beam_search_op.h
rename to paddle/fluid/operators/beam_search_op.h
index 7ad85874fc..9e2a05a60c 100644
--- a/paddle/operators/beam_search_op.h
+++ b/paddle/fluid/operators/beam_search_op.h
@@ -18,8 +18,8 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #endif
 
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/operator.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc
similarity index 97%
rename from paddle/operators/beam_search_op_test.cc
rename to paddle/fluid/operators/beam_search_op_test.cc
index d4beb64a85..ea2afda4d4 100644
--- a/paddle/operators/beam_search_op_test.cc
+++ b/paddle/fluid/operators/beam_search_op_test.cc
@@ -12,7 +12,7 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/operators/beam_search_op.h"
+#include "paddle/fluid/operators/beam_search_op.h"
 
 #include <gtest/gtest.h>
 #include <vector>
diff --git a/paddle/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc
similarity index 99%
rename from paddle/operators/bilinear_tensor_product_op.cc
rename to paddle/fluid/operators/bilinear_tensor_product_op.cc
index 7640147a12..cc378b1b45 100644
--- a/paddle/operators/bilinear_tensor_product_op.cc
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/bilinear_tensor_product_op.h"
+#include "paddle/fluid/operators/bilinear_tensor_product_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/bilinear_tensor_product_op.cu b/paddle/fluid/operators/bilinear_tensor_product_op.cu
similarity index 95%
rename from paddle/operators/bilinear_tensor_product_op.cu
rename to paddle/fluid/operators/bilinear_tensor_product_op.cu
index 0f48010716..2cec48ee69 100644
--- a/paddle/operators/bilinear_tensor_product_op.cu
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/bilinear_tensor_product_op.h"
+#include "paddle/fluid/operators/bilinear_tensor_product_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/bilinear_tensor_product_op.h b/paddle/fluid/operators/bilinear_tensor_product_op.h
similarity index 98%
rename from paddle/operators/bilinear_tensor_product_op.h
rename to paddle/fluid/operators/bilinear_tensor_product_op.h
index ba9a2c5ce3..626fa957c4 100644
--- a/paddle/operators/bilinear_tensor_product_op.h
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/bipartite_match_op.cc b/paddle/fluid/operators/bipartite_match_op.cc
similarity index 98%
rename from paddle/operators/bipartite_match_op.cc
rename to paddle/fluid/operators/bipartite_match_op.cc
index 1e6fa2091d..d614bf7043 100644
--- a/paddle/operators/bipartite_match_op.cc
+++ b/paddle/fluid/operators/bipartite_match_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/box_coder_op.cc b/paddle/fluid/operators/box_coder_op.cc
similarity index 99%
rename from paddle/operators/box_coder_op.cc
rename to paddle/fluid/operators/box_coder_op.cc
index 539813d485..8e0fee22d8 100644
--- a/paddle/operators/box_coder_op.cc
+++ b/paddle/fluid/operators/box_coder_op.cc
@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/box_coder_op.h"
+#include "paddle/fluid/operators/box_coder_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/box_coder_op.cu b/paddle/fluid/operators/box_coder_op.cu
similarity index 98%
rename from paddle/operators/box_coder_op.cu
rename to paddle/fluid/operators/box_coder_op.cu
index 98bd93457f..dd9299ceac 100644
--- a/paddle/operators/box_coder_op.cu
+++ b/paddle/fluid/operators/box_coder_op.cu
@@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/box_coder_op.h"
-#include "paddle/platform/cuda_helper.h"
+#include "paddle/fluid/operators/box_coder_op.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/box_coder_op.h b/paddle/fluid/operators/box_coder_op.h
similarity index 98%
rename from paddle/operators/box_coder_op.h
rename to paddle/fluid/operators/box_coder_op.h
index 086251f6e0..c41bcc212b 100644
--- a/paddle/operators/box_coder_op.h
+++ b/paddle/fluid/operators/box_coder_op.h
@@ -10,8 +10,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc
similarity index 96%
rename from paddle/operators/cast_op.cc
rename to paddle/fluid/operators/cast_op.cc
index 446976edaf..364c21f761 100644
--- a/paddle/operators/cast_op.cc
+++ b/paddle/fluid/operators/cast_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/cast_op.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/operators/cast_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu
similarity index 95%
rename from paddle/operators/cast_op.cu
rename to paddle/fluid/operators/cast_op.cu
index d68bbe6e39..fb597be9d9 100644
--- a/paddle/operators/cast_op.cu
+++ b/paddle/fluid/operators/cast_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/cast_op.h"
+#include "paddle/fluid/operators/cast_op.h"
 
 template <typename T>
 using CastOpKernel =
diff --git a/paddle/operators/cast_op.h b/paddle/fluid/operators/cast_op.h
similarity index 91%
rename from paddle/operators/cast_op.h
rename to paddle/fluid/operators/cast_op.h
index 9f39d91edd..9ab4961cef 100644
--- a/paddle/operators/cast_op.h
+++ b/paddle/fluid/operators/cast_op.h
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/data_type.h"
-#include "paddle/framework/framework.pb.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/platform/transform.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/chunk_eval_op.cc b/paddle/fluid/operators/chunk_eval_op.cc
similarity index 99%
rename from paddle/operators/chunk_eval_op.cc
rename to paddle/fluid/operators/chunk_eval_op.cc
index 44f667aead..080e4d80da 100644
--- a/paddle/operators/chunk_eval_op.cc
+++ b/paddle/fluid/operators/chunk_eval_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/chunk_eval_op.h"
+#include "paddle/fluid/operators/chunk_eval_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/chunk_eval_op.h b/paddle/fluid/operators/chunk_eval_op.h
similarity index 99%
rename from paddle/operators/chunk_eval_op.h
rename to paddle/fluid/operators/chunk_eval_op.h
index 300aff90c0..3dca3d2c0f 100644
--- a/paddle/operators/chunk_eval_op.h
+++ b/paddle/fluid/operators/chunk_eval_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 #include <set>
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/clip_by_norm_op.cc b/paddle/fluid/operators/clip_by_norm_op.cc
similarity index 98%
rename from paddle/operators/clip_by_norm_op.cc
rename to paddle/fluid/operators/clip_by_norm_op.cc
index b90921d79b..89df118c06 100644
--- a/paddle/operators/clip_by_norm_op.cc
+++ b/paddle/fluid/operators/clip_by_norm_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/clip_by_norm_op.h"
+#include "paddle/fluid/operators/clip_by_norm_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/clip_by_norm_op.cu b/paddle/fluid/operators/clip_by_norm_op.cu
similarity index 93%
rename from paddle/operators/clip_by_norm_op.cu
rename to paddle/fluid/operators/clip_by_norm_op.cu
index cbf8fa4413..a466b33591 100644
--- a/paddle/operators/clip_by_norm_op.cu
+++ b/paddle/fluid/operators/clip_by_norm_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/clip_by_norm_op.h"
+#include "paddle/fluid/operators/clip_by_norm_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h
similarity index 93%
rename from paddle/operators/clip_by_norm_op.h
rename to paddle/fluid/operators/clip_by_norm_op.h
index 87956a707c..82bcf07657 100644
--- a/paddle/operators/clip_by_norm_op.h
+++ b/paddle/fluid/operators/clip_by_norm_op.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/platform/transform.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc
similarity index 98%
rename from paddle/operators/clip_op.cc
rename to paddle/fluid/operators/clip_op.cc
index 7adb74eab7..76b2cefbf9 100644
--- a/paddle/operators/clip_op.cc
+++ b/paddle/fluid/operators/clip_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/clip_op.h"
+#include "paddle/fluid/operators/clip_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/clip_op.cu b/paddle/fluid/operators/clip_op.cu
similarity index 94%
rename from paddle/operators/clip_op.cu
rename to paddle/fluid/operators/clip_op.cu
index 5ccbc96434..7b044d6e69 100644
--- a/paddle/operators/clip_op.cu
+++ b/paddle/fluid/operators/clip_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/clip_op.h"
+#include "paddle/fluid/operators/clip_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/clip_op.h b/paddle/fluid/operators/clip_op.h
similarity index 95%
rename from paddle/operators/clip_op.h
rename to paddle/fluid/operators/clip_op.h
index 51db185dff..aecd6f83bf 100644
--- a/paddle/operators/clip_op.h
+++ b/paddle/fluid/operators/clip_op.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/platform/transform.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/compare_op.cc b/paddle/fluid/operators/compare_op.cc
similarity index 94%
rename from paddle/operators/compare_op.cc
rename to paddle/fluid/operators/compare_op.cc
index 930c295a9c..f3414c33b5 100644
--- a/paddle/operators/compare_op.cc
+++ b/paddle/fluid/operators/compare_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/compare_op.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/operators/compare_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -58,8 +58,8 @@ class CompareOpInferShape : public framework::InferShapeBase {
                    comment.type);
     auto dim_x = context->GetInputDim("X");
     auto dim_y = context->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(framework::product(dim_x), framework::product(dim_y),
-                      "The number of elements in X and Y should be same");
+    PADDLE_ENFORCE_GE(dim_x.size(), dim_y.size(),
+                      "The size of dim_y should not be greater than dim_x's.");
 
     context->SetOutputDim("Out", context->GetInputDim("X"));
     context->ShareLoD("X", "Out");
diff --git a/paddle/operators/compare_op.cu b/paddle/fluid/operators/compare_op.cu
similarity index 94%
rename from paddle/operators/compare_op.cu
rename to paddle/fluid/operators/compare_op.cu
index f625824dbc..3507af2ae3 100644
--- a/paddle/operators/compare_op.cu
+++ b/paddle/fluid/operators/compare_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/compare_op.h"
+#include "paddle/fluid/operators/compare_op.h"
 
 REGISTER_LOGICAL_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor);
 REGISTER_LOGICAL_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor);
diff --git a/paddle/operators/compare_op.h b/paddle/fluid/operators/compare_op.h
similarity index 92%
rename from paddle/operators/compare_op.h
rename to paddle/fluid/operators/compare_op.h
index b275fd75b3..4b2ee5a9d6 100644
--- a/paddle/operators/compare_op.h
+++ b/paddle/fluid/operators/compare_op.h
@@ -15,9 +15,9 @@ limitations under the License. */
 #pragma once
 #include <math.h>
 #include <type_traits>
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/elementwise_op_function.h"
-#include "paddle/platform/transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
 namespace operators {
@@ -62,7 +62,7 @@ class CompareOpKernel
     z->mutable_data<T>(context.GetPlace());
     int axis = context.Attr<int>("axis");
     ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, axis,
-                                                          z);
+                                                          Functor(), z);
   }
 };
 
diff --git a/paddle/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
similarity index 98%
rename from paddle/operators/concat_op.cc
rename to paddle/fluid/operators/concat_op.cc
index 32b61edfd0..68eb5412be 100644
--- a/paddle/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/concat_op.h"
+#include "paddle/fluid/operators/concat_op.h"
 #include <vector>
 
 namespace paddle {
diff --git a/paddle/operators/concat_op.cu.cc b/paddle/fluid/operators/concat_op.cu.cc
similarity index 94%
rename from paddle/operators/concat_op.cu.cc
rename to paddle/fluid/operators/concat_op.cu.cc
index 7b46452d3d..143bda6116 100644
--- a/paddle/operators/concat_op.cu.cc
+++ b/paddle/fluid/operators/concat_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/concat_op.h"
+#include "paddle/fluid/operators/concat_op.h"
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     concat, ops::ConcatKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/concat_op.h b/paddle/fluid/operators/concat_op.h
similarity index 96%
rename from paddle/operators/concat_op.h
rename to paddle/fluid/operators/concat_op.h
index de4011585a..72b3e225bf 100644
--- a/paddle/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/strided_memcpy.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/strided_memcpy.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/cond_op.cc b/paddle/fluid/operators/cond_op.cc
similarity index 98%
rename from paddle/operators/cond_op.cc
rename to paddle/fluid/operators/cond_op.cc
index e333002bfd..dd93790d5b 100644
--- a/paddle/operators/cond_op.cc
+++ b/paddle/fluid/operators/cond_op.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/cond_op.h"
-#include "paddle/operators/gather.h"
-#include "paddle/operators/scatter.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/operators/cond_op.h"
+#include "paddle/fluid/operators/gather.h"
+#include "paddle/fluid/operators/scatter.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/cond_op.h b/paddle/fluid/operators/cond_op.h
similarity index 92%
rename from paddle/operators/cond_op.h
rename to paddle/fluid/operators/cond_op.h
index 7dcdc47e0b..695af44906 100644
--- a/paddle/operators/cond_op.h
+++ b/paddle/fluid/operators/cond_op.h
@@ -15,11 +15,11 @@ limitations under the License. */
 #pragma once
 #include <vector>
 #include "glog/logging.h"
-#include "paddle/framework/ddim.h"
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/operators/net_op.h"
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/net_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc
similarity index 84%
rename from paddle/operators/conditional_block_op.cc
rename to paddle/fluid/operators/conditional_block_op.cc
index 3cae61a438..30435c6cca 100644
--- a/paddle/operators/conditional_block_op.cc
+++ b/paddle/fluid/operators/conditional_block_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <algorithm>
-#include "paddle/framework/executor.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -41,6 +41,21 @@ class ConditionalOp : public framework::OperatorBase {
         });
     return retv;
   }
+
+  bool ScalarCondition(
+      const std::vector<const framework::LoDTensor *> &ips) const {
+    if (!(ips.size() == 1UL && ips[0]->IsInitialized())) {
+      PADDLE_THROW("should have one initialized input as condition");
+    }
+    if (!(ips[0]->type().hash_code() == typeid(bool).hash_code() &&
+          ips[0]->numel() == 1)) {
+      PADDLE_THROW(
+          "condition input's data type should be bool, "
+          "numel should be 1, actual numel is %d",
+          ips[0]->numel());
+    }
+    return ips[0]->data<bool>()[0];
+  }
 };
 
 class ConditionalBlockOp : public ConditionalOp {
@@ -53,9 +68,15 @@ class ConditionalBlockOp : public ConditionalOp {
   void Run(const framework::Scope &scope,
            const platform::Place &dev_place) const override {
     auto xs = InputTensors(scope);
-    bool need_run = std::all_of(
-        xs.begin(), xs.end(),
-        [](const framework::LoDTensor *t) { return t->numel() != 0; });
+
+    bool need_run;
+    if (Attr<bool>("is_scalar_condition")) {
+      need_run = ScalarCondition(xs);
+    } else {
+      need_run = std::all_of(
+          xs.begin(), xs.end(),
+          [](const framework::LoDTensor *t) { return t->numel() != 0; });
+    }
 
     if (need_run) {
       auto *scope_var = scope.FindVar(Output("Scope"));
@@ -88,6 +109,10 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
               "scope is std::vector<Scope*>");
     AddAttr<framework::BlockDesc *>(
         "sub_block", "The step block of conditional block operator");
+    AddAttr<bool>("is_scalar_condition",
+                  "the input X is used as scalar "
+                  "condition")
+        .SetDefault(false);
     AddComment(R"DOC(Conditional block operator
 
 Run the sub-block if X is not empty. Params is the other inputs and Out is the
@@ -106,9 +131,15 @@ class ConditionalBlockGradOp : public ConditionalOp {
   void Run(const framework::Scope &scope,
            const platform::Place &dev_place) const override {
     auto xs = this->InputTensors(scope);
-    bool need_run = std::all_of(
-        xs.begin(), xs.end(),
-        [](const framework::LoDTensor *t) { return t->numel() != 0; });
+
+    bool need_run;
+    if (Attr<bool>("is_scalar_condition")) {
+      need_run = ScalarCondition(xs);
+    } else {
+      need_run = std::all_of(
+          xs.begin(), xs.end(),
+          [](const framework::LoDTensor *t) { return t->numel() != 0; });
+    }
 
     if (need_run) {
       auto *scope_var = scope.FindVar(Input("Scope"));
@@ -182,6 +213,7 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
     grad_op->SetOutput(framework::GradVarName("Params"),
                        InputGrad("Params", false));
     grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]);
+    grad_op->SetAttr("is_scalar_condition", GetAttr("is_scalar_condition"));
     return std::unique_ptr<framework::OpDesc>(grad_op);
   }
 };
diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
similarity index 98%
rename from paddle/operators/conv_cudnn_op.cu.cc
rename to paddle/fluid/operators/conv_cudnn_op.cu.cc
index 3a5409a7e3..a729d376ac 100644
--- a/paddle/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/memory/memory.h"
-#include "paddle/operators/conv_op.h"
-#include "paddle/platform/assert.h"
-#include "paddle/platform/cudnn_helper.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/operators/conv_op.h"
+#include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
similarity index 99%
rename from paddle/operators/conv_op.cc
rename to paddle/fluid/operators/conv_op.cc
index cef7ddd5fe..a047e57916 100644
--- a/paddle/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/conv_op.h"
+#include "paddle/fluid/operators/conv_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/conv_op.cu.cc b/paddle/fluid/operators/conv_op.cu.cc
similarity index 97%
rename from paddle/operators/conv_op.cu.cc
rename to paddle/fluid/operators/conv_op.cu.cc
index d0bd40ee95..b2129d3b46 100644
--- a/paddle/operators/conv_op.cu.cc
+++ b/paddle/fluid/operators/conv_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/conv_op.h"
+#include "paddle/fluid/operators/conv_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/conv_op.h b/paddle/fluid/operators/conv_op.h
similarity index 98%
rename from paddle/operators/conv_op.h
rename to paddle/fluid/operators/conv_op.h
index 3c1d0e9c1c..1156e6c8fe 100644
--- a/paddle/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -14,12 +14,12 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/depthwise_conv.h"
-#include "paddle/operators/math/im2col.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/vol2col.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/depthwise_conv.h"
+#include "paddle/fluid/operators/math/im2col.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/vol2col.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc
similarity index 98%
rename from paddle/operators/conv_shift_op.cc
rename to paddle/fluid/operators/conv_shift_op.cc
index 106b68a0a0..a96aac63e0 100644
--- a/paddle/operators/conv_shift_op.cc
+++ b/paddle/fluid/operators/conv_shift_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/conv_shift_op.h"
-#include "paddle/framework/eigen.h"
+#include "paddle/fluid/operators/conv_shift_op.h"
+#include "paddle/fluid/framework/eigen.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu
similarity index 97%
rename from paddle/operators/conv_shift_op.cu
rename to paddle/fluid/operators/conv_shift_op.cu
index cf7abc196e..9818707ce3 100644
--- a/paddle/operators/conv_shift_op.cu
+++ b/paddle/fluid/operators/conv_shift_op.cu
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/conv_shift_op.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/platform/cuda_helper.h"
+#include "paddle/fluid/operators/conv_shift_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/conv_shift_op.h b/paddle/fluid/operators/conv_shift_op.h
similarity index 95%
rename from paddle/operators/conv_shift_op.h
rename to paddle/fluid/operators/conv_shift_op.h
index 6781d87ef0..987a690895 100644
--- a/paddle/operators/conv_shift_op.h
+++ b/paddle/fluid/operators/conv_shift_op.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
similarity index 97%
rename from paddle/operators/conv_transpose_cudnn_op.cu.cc
rename to paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
index 23bc97e13c..0aed4ebeff 100644
--- a/paddle/operators/conv_transpose_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/memory/memory.h"
-#include "paddle/operators/conv_transpose_op.h"
-#include "paddle/platform/assert.h"
-#include "paddle/platform/cudnn_helper.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/operators/conv_transpose_op.h"
+#include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
similarity index 99%
rename from paddle/operators/conv_transpose_op.cc
rename to paddle/fluid/operators/conv_transpose_op.cc
index 089290a506..974cffad92 100644
--- a/paddle/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/conv_transpose_op.h"
+#include "paddle/fluid/operators/conv_transpose_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/conv_transpose_op.cu.cc b/paddle/fluid/operators/conv_transpose_op.cu.cc
similarity index 96%
rename from paddle/operators/conv_transpose_op.cu.cc
rename to paddle/fluid/operators/conv_transpose_op.cu.cc
index f1d827c606..ed90c6ec62 100644
--- a/paddle/operators/conv_transpose_op.cu.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/conv_transpose_op.h"
+#include "paddle/fluid/operators/conv_transpose_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h
similarity index 98%
rename from paddle/operators/conv_transpose_op.h
rename to paddle/fluid/operators/conv_transpose_op.h
index 8c0d57afcd..f512575468 100644
--- a/paddle/operators/conv_transpose_op.h
+++ b/paddle/fluid/operators/conv_transpose_op.h
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/im2col.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/vol2col.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/im2col.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/vol2col.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc
similarity index 99%
rename from paddle/operators/cos_sim_op.cc
rename to paddle/fluid/operators/cos_sim_op.cc
index 9019a1edb3..57c5a6025a 100644
--- a/paddle/operators/cos_sim_op.cc
+++ b/paddle/fluid/operators/cos_sim_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/cos_sim_op.h"
+#include "paddle/fluid/operators/cos_sim_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/cos_sim_op.cu b/paddle/fluid/operators/cos_sim_op.cu
similarity index 94%
rename from paddle/operators/cos_sim_op.cu
rename to paddle/fluid/operators/cos_sim_op.cu
index 9e5d1b6e4f..c8cf363cdc 100644
--- a/paddle/operators/cos_sim_op.cu
+++ b/paddle/fluid/operators/cos_sim_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/cos_sim_op.h"
+#include "paddle/fluid/operators/cos_sim_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/cos_sim_op.h b/paddle/fluid/operators/cos_sim_op.h
similarity index 96%
rename from paddle/operators/cos_sim_op.h
rename to paddle/fluid/operators/cos_sim_op.h
index eadcca55f9..9cd8b196da 100644
--- a/paddle/operators/cos_sim_op.h
+++ b/paddle/fluid/operators/cos_sim_op.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/cos_sim_functor.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/platform/for_range.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/cos_sim_functor.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/for_range.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/create_reader_op.cc b/paddle/fluid/operators/create_reader_op.cc
new file mode 100644
index 0000000000..d1ba51f2c0
--- /dev/null
+++ b/paddle/fluid/operators/create_reader_op.cc
@@ -0,0 +1,240 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/reader.h"
+
+namespace paddle {
+namespace operators {
+
+static std::vector<framework::DDim> RestoreShapes(
+    const std::vector<int>& shape_concat, const std::vector<int>& ranks) {
+  std::vector<framework::DDim> res;
+  int offset = 0;
+  for (int len : ranks) {
+    auto start_it = shape_concat.begin() + offset;
+    auto end_it = start_it + len;
+    res.push_back(framework::make_ddim(std::vector<int>(start_it, end_it)));
+    offset += len;
+  }
+  return res;
+}
+
+// general infershape for file readers
+class CreateFileReaderInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "The output file reader should not be null.");
+    const auto shape_concat =
+        ctx->Attrs().Get<std::vector<int>>("shape_concat");
+    const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
+    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
+    ctx->SetReaderDims("Out", shapes);
+
+    if (ctx->IsRuntime()) {
+      const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
+      PADDLE_ENFORCE_EQ(
+          lod_levels.size(), shapes.size(),
+          "The number of 'lod_levels'(%d) doesn't match the number "
+          "of 'shapes'(%d).",
+          lod_levels.size(), shapes.size());
+      framework::VarDesc* reader =
+          boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
+      reader->SetLoDLevels(lod_levels);
+    }
+  }
+};
+
+// general infershape for decorated readers
+class CreateDecoratedReaderInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"),
+                   "Input(UnderlyingReader) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "The output decorated reader should not be null.");
+    ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader"));
+
+    if (ctx->IsRuntime()) {
+      framework::VarDesc* in_reader = boost::get<framework::VarDesc*>(
+          ctx->GetInputVarPtrs("UnderlyingReader")[0]);
+      framework::VarDesc* out_reader =
+          boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
+      out_reader->SetLoDLevels(in_reader->GetLoDLevels());
+    }
+  }
+};
+
+// general var type inference for file readers
+class CreateFileReaderInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    std::string reader_name = op_desc.Output("Out")[0];
+    framework::VarDesc* reader = block->FindVarRecursive(reader_name);
+    reader->SetType(framework::proto::VarDesc::READER);
+  }
+};
+
+// general var type inference for decorated readers
+class CreateDecoratedReaderInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
+    framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name);
+    std::string out_reader_name = op_desc.Output("Out")[0];
+    framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name);
+    out_reader->SetType(framework::proto::VarDesc::READER);
+    out_reader->SetDataTypes(in_reader->GetDataTypes());
+  }
+};
+
+template <typename T>
+class CreateRandomDataGeneratorOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+  void Run(const framework::Scope& scope,
+           const platform::Place& dev_place) const override {
+    const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
+    const auto& ranks = Attr<std::vector<int>>("ranks");
+    PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
+    PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
+                      int(shape_concat.size()),
+                      "The accumulate of all ranks should be equal to the "
+                      "shape concat's length.");
+    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    out->Reset(new framework::RandomDataGenerator<T>(shapes, Attr<float>("min"),
+                                                     Attr<float>("max")));
+  }
+};
+
+class CreateRandomDataGeneratorOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(op_proto, op_checker) {
+    AddOutput("Out", "(ReaderHolder) The created random reader.");
+    AddAttr<std::vector<int>>("shape_concat",
+                              "The concat of all data's shapes.");
+    AddAttr<std::vector<int>>(
+        "ranks",
+        "The ranks of each data."
+        "e.g."
+        "shape_concat = [2,3,4,5,6]"
+        "ranks = [3,2]"
+        "It means the reader will generate two data each time,"
+        "whose shapes are [2,3,4] and [5,6] respectively.");
+    AddAttr<std::vector<int>>("lod_levels", "The LoD levels of each data.");
+    AddAttr<float>("min", "The lower bound of reader's uniform distribution.");
+    AddAttr<float>("max", "The upper bound of reader's uniform distribution.");
+    AddComment(R"DOC(
+      CreateRandomDataGenerator Operator
+
+      This Op creates a random reader. 
+      The reader generates random data instead of really reading from files.
+      Generated data follow an uniform distribution between 'min' and 'max'.
+    )DOC");
+  }
+};
+
+class CreateShuffleReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+  void Run(const framework::Scope& scope,
+           const platform::Place& dev_place) const override {
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    out->Reset(new framework::ShuffleReader(underlying_reader.Get(),
+                                            Attr<int>("buffer_size")));
+  }
+};
+
+class CreateShuffleReaderOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(op_proto, op_checker) {
+    AddInput(
+        "UnderlyingReader",
+        "(ReaderHolder) The underlying reader for creating a shuffle reader.");
+    AddOutput("Out", "(ReaderHolder) The created shuffle reader.");
+    AddAttr<int>("buffer_size", "The shuffle buffer size.").GreaterThan(0);
+    AddComment(R"DOC(
+      CreateShuffleReader Operator
+
+      A shuffle reader takes another reader as its 'underlying reader'
+      and yields the underlying reader's outputs in a shuffled order. 
+    )DOC");
+  }
+};
+
+class CreateBatchReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+  void Run(const framework::Scope& scope,
+           const platform::Place& dev_place) const override {
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    out->Reset(new framework::BatchReader(underlying_reader.Get(),
+                                          Attr<int>("batch_size")));
+  }
+};
+
+class CreateBatchReaderOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(op_proto, op_checker) {
+    AddInput(
+        "UnderlyingReader",
+        "(ReaderHolder) The underlying reader for creating a batch reader.");
+    AddOutput("Out", "(ReaderHolder) The created batch reader.");
+    AddAttr<int>("batch_size",
+                 "How many instances the batch reader yields each time.")
+        .GreaterThan(0);
+    AddComment(R"DOC(
+      CreateBatchReader Operator
+
+      A batch reader takes another reader as its 'underlying reader', 
+      gathers the underlying reader's outputs and then yields them in batches. 
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(create_random_data_generator,
+                  ops::CreateRandomDataGeneratorOp<float>,
+                  ops::CreateFileReaderInferShape,
+                  ops::CreateRandomDataGeneratorOpMaker,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::CreateFileReaderInferVarType);
+REGISTER_OPERATOR(create_shuffle_reader, ops::CreateShuffleReaderOp,
+                  ops::CreateDecoratedReaderInferShape,
+                  ops::CreateShuffleReaderOpMaker,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::CreateDecoratedReaderInferVarType);
+REGISTER_OPERATOR(create_batch_reader, ops::CreateBatchReaderOp,
+                  ops::CreateDecoratedReaderInferShape,
+                  ops::CreateBatchReaderOpMaker,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::CreateDecoratedReaderInferVarType);
diff --git a/paddle/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc
similarity index 99%
rename from paddle/operators/crf_decoding_op.cc
rename to paddle/fluid/operators/crf_decoding_op.cc
index 30626028c1..e3c1fc95a3 100644
--- a/paddle/operators/crf_decoding_op.cc
+++ b/paddle/fluid/operators/crf_decoding_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/crf_decoding_op.h"
+#include "paddle/fluid/operators/crf_decoding_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h
similarity index 96%
rename from paddle/operators/crf_decoding_op.h
rename to paddle/fluid/operators/crf_decoding_op.h
index ce2f4e6622..c3c161eec5 100644
--- a/paddle/operators/crf_decoding_op.h
+++ b/paddle/fluid/operators/crf_decoding_op.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc
similarity index 99%
rename from paddle/operators/crop_op.cc
rename to paddle/fluid/operators/crop_op.cc
index 310e351443..8e80f77e49 100644
--- a/paddle/operators/crop_op.cc
+++ b/paddle/fluid/operators/crop_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/crop_op.h"
+#include "paddle/fluid/operators/crop_op.h"
 #include <boost/lexical_cast.hpp>
 
 namespace paddle {
diff --git a/paddle/operators/crop_op.cu b/paddle/fluid/operators/crop_op.cu
similarity index 94%
rename from paddle/operators/crop_op.cu
rename to paddle/fluid/operators/crop_op.cu
index bba5db4c6c..f3610675aa 100644
--- a/paddle/operators/crop_op.cu
+++ b/paddle/fluid/operators/crop_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/crop_op.h"
+#include "paddle/fluid/operators/crop_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel<float>);
diff --git a/paddle/operators/crop_op.h b/paddle/fluid/operators/crop_op.h
similarity index 96%
rename from paddle/operators/crop_op.h
rename to paddle/fluid/operators/crop_op.h
index 69d1a92977..9c7c0446d4 100644
--- a/paddle/operators/crop_op.h
+++ b/paddle/fluid/operators/crop_op.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/strided_memcpy.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/strided_memcpy.h"
 
 namespace paddle {
 namespace operators {  // Internal
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
similarity index 99%
rename from paddle/operators/cross_entropy_op.cc
rename to paddle/fluid/operators/cross_entropy_op.cc
index 7abd5b1c61..5e34b248b6 100644
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/cross_entropy_op.h"
+#include "paddle/fluid/operators/cross_entropy_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu
similarity index 98%
rename from paddle/operators/cross_entropy_op.cu
rename to paddle/fluid/operators/cross_entropy_op.cu
index 3b04894e6c..de0976c69f 100644
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/fluid/operators/cross_entropy_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/cross_entropy_op.h"
+#include "paddle/fluid/operators/cross_entropy_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h
similarity index 94%
rename from paddle/operators/cross_entropy_op.h
rename to paddle/fluid/operators/cross_entropy_op.h
index 5623d2ded1..4a5b20ecb7 100644
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/cross_entropy.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/cross_entropy.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc
similarity index 98%
rename from paddle/operators/ctc_align_op.cc
rename to paddle/fluid/operators/ctc_align_op.cc
index eeecbd3212..3c7db78813 100644
--- a/paddle/operators/ctc_align_op.cc
+++ b/paddle/fluid/operators/ctc_align_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/ctc_align_op.h"
+#include "paddle/fluid/operators/ctc_align_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu
similarity index 85%
rename from paddle/operators/ctc_align_op.cu
rename to paddle/fluid/operators/ctc_align_op.cu
index 2a970cd9fa..f629e0a9f1 100644
--- a/paddle/operators/ctc_align_op.cu
+++ b/paddle/fluid/operators/ctc_align_op.cu
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <stdio.h>
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
-#include "paddle/operators/ctc_align_op.h"
+#include "paddle/fluid/operators/ctc_align_op.h"
 
 namespace paddle {
 namespace operators {
@@ -69,8 +69,9 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
 
     auto stream = ctx.cuda_device_context().stream();
     MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
-        num_tokens, tokens, num_seq, input_lod[level].cuda_data(), blank,
-        merge_repeated, dev_out_lod0_ptr, output_data);
+        num_tokens, tokens, num_seq,
+        input_lod[level].CUDAMutableData(ctx.GetPlace()), blank, merge_repeated,
+        dev_out_lod0_ptr, output_data);
 
     // set output lod
     std::vector<size_t> host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end());
@@ -80,6 +81,14 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
 
     // resize output dims
     output->Resize({static_cast<int64_t>(host_out_lod0.back()), 1});
+
+    if (host_out_lod0.back() == 0) {
+      output->Resize({1, 1});
+      output->mutable_data<T>(ctx.GetPlace());
+      math::SetConstant<platform::CUDADeviceContext, T> set_constant;
+      set_constant(ctx.template device_context<platform::CUDADeviceContext>(),
+                   output, -1);
+    }
   }
 };
 
diff --git a/paddle/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h
similarity index 90%
rename from paddle/operators/ctc_align_op.h
rename to paddle/fluid/operators/ctc_align_op.h
index fed89aa1e8..1ef034c2f5 100644
--- a/paddle/operators/ctc_align_op.h
+++ b/paddle/fluid/operators/ctc_align_op.h
@@ -15,7 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <string.h>
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
 namespace paddle {
 namespace operators {
 
@@ -65,9 +67,14 @@ class CTCAlignKernel : public framework::OpKernel<T> {
     framework::LoD output_lod;
     output_lod.push_back(output_lod0);
     output->set_lod(output_lod);
-
     // resize output dims
     output->Resize({static_cast<int64_t>(output_lod0.back()), 1});
+    // for empty sequence
+    if (output_lod0.back() == 0) {
+      output->Resize({1, 1});
+      output_data = output->mutable_data<T>(ctx.GetPlace());
+      output_data[0] = -1;
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h
new file mode 100644
index 0000000000..3b22491478
--- /dev/null
+++ b/paddle/fluid/operators/cum_op.h
@@ -0,0 +1,111 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename Functor>
+class CumKernel : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  using T = typename Functor::ELEMENT_TYPE;
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto& X = detail::Ref(context.Input<framework::Tensor>("X"),
+                          "Cannot get input tensor X, variable name = %s",
+                          context.op().Input("X"));
+
+    auto& Out = detail::Ref(context.Output<framework::Tensor>("Out"),
+                            "Cannot get output tensor Out, variable name = %s",
+                            context.op().Output("Out"));
+    int axis = context.Attr<int>("axis");
+    bool exclusive = context.Attr<bool>("exclusive");
+    bool reverse = context.Attr<bool>("reverse");
+    auto x_dims = X.dims();
+    if (axis == -1) {
+      axis = x_dims.size() - 1;
+    }
+    PADDLE_ENFORCE_LT(
+        axis, x_dims.size(),
+        "axis should be less than the dimensiotn of the input tensor");
+    Out.mutable_data<T>(context.GetPlace());
+
+    int pre = 1;
+    int post = 1;
+    int mid = x_dims[axis];
+    for (int i = 0; i < axis; ++i) {
+      pre *= x_dims[i];
+    }
+    for (int i = axis + 1; i < x_dims.size(); ++i) {
+      post *= x_dims[i];
+    }
+
+    auto x = framework::EigenVector<T>::Flatten(X);
+    auto out = framework::EigenVector<T>::Flatten(Out);
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
+
+    using IndexT = Eigen::DenseIndex;
+    if (pre == 1) {
+      if (post == 1) {
+        ComputeImp(*place, Eigen::DSizes<IndexT, 1>(mid), x, out,
+                   /* axis= */ 0, reverse, exclusive);
+      } else {
+        ComputeImp(*place, Eigen::DSizes<IndexT, 2>(mid, post), x, out,
+                   /* axis= */ 0, reverse, exclusive);
+      }
+    } else {
+      if (post == 1) {
+        ComputeImp(*place, Eigen::DSizes<IndexT, 2>(pre, mid), x, out,
+                   /* axis= */ 1, reverse, exclusive);
+      } else {
+        ComputeImp(*place, Eigen::DSizes<IndexT, 3>(pre, mid, post), x, out,
+                   /* axis= */ 1, reverse, exclusive);
+      }
+    }
+  }
+
+ private:
+  template <typename Device, typename Dim, typename X, typename Out>
+  void ComputeImp(Device d, const Dim& dims, X x, Out out, int axis,
+                  bool reverse, bool exclusive) const {
+    if (!reverse) {
+      out.reshape(dims).device(d) = Functor()(x.reshape(dims), axis, exclusive);
+    } else {
+      std::array<bool, Dim::count> rev;
+      rev.fill(false);
+      rev[axis] = reverse;
+      out.reshape(dims).device(d) =
+          Functor()(x.reshape(dims).reverse(rev), axis, exclusive).reverse(rev);
+    }
+  }
+};
+
+template <typename T>
+struct CumsumFunctor {
+  using ELEMENT_TYPE = T;
+  template <typename X>
+  const typename X::TensorScanSumOp operator()(X x, int axis,
+                                               bool exclusive) const {
+    return x.cumsum(axis, exclusive);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc
new file mode 100644
index 0000000000..d15d4e3db3
--- /dev/null
+++ b/paddle/fluid/operators/cumsum_op.cc
@@ -0,0 +1,82 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/cum_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CumOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class CumsumOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CumsumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Cumsum operator");
+    AddOutput("Out", "Output of Cumsum operator");
+    AddAttr<int>("axis",
+                 "(int, default -1). The dimenstion to accumulate along. "
+                 "-1 means the last dimenstion")
+        .SetDefault(-1)
+        .EqualGreaterThan(-1);
+    AddAttr<bool>("exclusive",
+                  "bool, default false). Whether to perform exclusive cumsum")
+        .SetDefault(false);
+    AddAttr<bool>("reverse",
+                  "bool, default false). If true, the cumsum is performed in "
+                  "the reversed direction")
+        .SetDefault(false);
+    AddComment(R"DOC(
+The cumulative sum of the elements along a given axis.
+By default, the first element of the result is the same of the first element of
+the input. If exlusive is true, the first element of the result is 0.
+)DOC");
+  }
+};
+
+class CumsumGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("cumsum");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttr("axis", Attr<int>("axis"));
+    grad_op->SetAttr("reverse", !Attr<bool>("reverse"));
+    grad_op->SetAttr("exclusive", Attr<bool>("exclusive"));
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+using CPU = paddle::platform::CPUDeviceContext;
+
+REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker);
+REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel<CPU, ops::CumsumFunctor<float>>,
+                       ops::CumKernel<CPU, ops::CumsumFunctor<double>>,
+                       ops::CumKernel<CPU, ops::CumsumFunctor<int>>)
diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu
new file mode 100644
index 0000000000..e063cc0f65
--- /dev/null
+++ b/paddle/fluid/operators/cumsum_op.cu
@@ -0,0 +1,22 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/cum_op.h"
+
+namespace ops = paddle::operators;
+using CUDA = paddle::platform::CUDADeviceContext;
+
+REGISTER_OP_CUDA_KERNEL(cumsum, ops::CumKernel<CUDA, ops::CumsumFunctor<float>>,
+                        ops::CumKernel<CUDA, ops::CumsumFunctor<double>>,
+                        ops::CumKernel<CUDA, ops::CumsumFunctor<int>>)
diff --git a/paddle/operators/decayed_adagrad_op.cc b/paddle/fluid/operators/decayed_adagrad_op.cc
similarity index 98%
rename from paddle/operators/decayed_adagrad_op.cc
rename to paddle/fluid/operators/decayed_adagrad_op.cc
index 739a8d881c..d827155919 100644
--- a/paddle/operators/decayed_adagrad_op.cc
+++ b/paddle/fluid/operators/decayed_adagrad_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/decayed_adagrad_op.h"
+#include "paddle/fluid/operators/decayed_adagrad_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/decayed_adagrad_op.cu b/paddle/fluid/operators/decayed_adagrad_op.cu
similarity index 93%
rename from paddle/operators/decayed_adagrad_op.cu
rename to paddle/fluid/operators/decayed_adagrad_op.cu
index 7bc8161f23..215d6dbc7d 100644
--- a/paddle/operators/decayed_adagrad_op.cu
+++ b/paddle/fluid/operators/decayed_adagrad_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/decayed_adagrad_op.h"
+#include "paddle/fluid/operators/decayed_adagrad_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/decayed_adagrad_op.h b/paddle/fluid/operators/decayed_adagrad_op.h
similarity index 95%
rename from paddle/operators/decayed_adagrad_op.h
rename to paddle/fluid/operators/decayed_adagrad_op.h
index fec9705cfc..52b67586ea 100644
--- a/paddle/operators/decayed_adagrad_op.h
+++ b/paddle/fluid/operators/decayed_adagrad_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt
similarity index 100%
rename from paddle/operators/detail/CMakeLists.txt
rename to paddle/fluid/operators/detail/CMakeLists.txt
diff --git a/paddle/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
similarity index 99%
rename from paddle/operators/detail/grpc_client.cc
rename to paddle/fluid/operators/detail/grpc_client.cc
index 9b5f7afc6a..0d395d347b 100644
--- a/paddle/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "grpc_client.h"
-#include "paddle/framework/threadpool.h"
+#include "paddle/fluid/framework/threadpool.h"
 namespace paddle {
 namespace operators {
 namespace detail {
diff --git a/paddle/operators/detail/grpc_client.h b/paddle/fluid/operators/detail/grpc_client.h
similarity index 93%
rename from paddle/operators/detail/grpc_client.h
rename to paddle/fluid/operators/detail/grpc_client.h
index f9499f6dc7..314fe8168f 100644
--- a/paddle/operators/detail/grpc_client.h
+++ b/paddle/fluid/operators/detail/grpc_client.h
@@ -25,12 +25,12 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
-#include "paddle/framework/data_type.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/scope.h"
-#include "paddle/framework/selected_rows.h"
-#include "paddle/operators/detail/sendrecvop_utils.h"
-#include "paddle/operators/detail/simple_block_queue.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/operators/detail/simple_block_queue.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
similarity index 99%
rename from paddle/operators/detail/grpc_server.cc
rename to paddle/fluid/operators/detail/grpc_server.cc
index 4f94e1315f..96f4ea797b 100644
--- a/paddle/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/detail/grpc_server.h"
+#include "paddle/fluid/operators/detail/grpc_server.h"
 
 using grpc::ServerAsyncResponseWriter;
 
diff --git a/paddle/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h
similarity index 85%
rename from paddle/operators/detail/grpc_server.h
rename to paddle/fluid/operators/detail/grpc_server.h
index 3f8b9d9317..1382d17318 100644
--- a/paddle/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
@@ -14,19 +14,19 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/scope.h"
-#include "paddle/framework/selected_rows.h"
-#include "paddle/framework/var_type.h"
-#include "paddle/operators/detail/simple_block_queue.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/detail/simple_block_queue.h"
 
-#include "paddle/operators/detail/send_recv.grpc.pb.h"
-#include "paddle/operators/detail/send_recv.pb.h"
+#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/detail/send_recv.pb.h"
 
 #include <grpc++/grpc++.h>
 #include <grpc/support/log.h>
 #include <thread>
-#include "paddle/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/detail/safe_ref.h b/paddle/fluid/operators/detail/safe_ref.h
similarity index 100%
rename from paddle/operators/detail/safe_ref.h
rename to paddle/fluid/operators/detail/safe_ref.h
diff --git a/paddle/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto
similarity index 100%
rename from paddle/operators/detail/send_recv.proto
rename to paddle/fluid/operators/detail/send_recv.proto
diff --git a/paddle/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc
similarity index 97%
rename from paddle/operators/detail/sendrecvop_utils.cc
rename to paddle/fluid/operators/detail/sendrecvop_utils.cc
index 7635b9e8db..ba3ae6add6 100644
--- a/paddle/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/detail/sendrecvop_utils.h
similarity index 78%
rename from paddle/operators/detail/sendrecvop_utils.h
rename to paddle/fluid/operators/detail/sendrecvop_utils.h
index 8e66f7299c..fed887c027 100644
--- a/paddle/operators/detail/sendrecvop_utils.h
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.h
@@ -17,14 +17,14 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
-#include "paddle/framework/data_type.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/scope.h"
-#include "paddle/framework/selected_rows.h"
-#include "paddle/framework/var_type.h"
-
-#include "paddle/operators/detail/send_recv.grpc.pb.h"
-#include "paddle/operators/detail/send_recv.pb.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/var_type.h"
+
+#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/detail/send_recv.pb.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/detail/simple_block_queue.h b/paddle/fluid/operators/detail/simple_block_queue.h
similarity index 100%
rename from paddle/operators/detail/simple_block_queue.h
rename to paddle/fluid/operators/detail/simple_block_queue.h
diff --git a/paddle/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h
similarity index 95%
rename from paddle/operators/detail/strided_memcpy.h
rename to paddle/fluid/operators/detail/strided_memcpy.h
index 9ed524d4dc..d7a7eed50b 100644
--- a/paddle/operators/detail/strided_memcpy.h
+++ b/paddle/fluid/operators/detail/strided_memcpy.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/ddim.h"
-#include "paddle/memory/memcpy.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/detection_output_op.cc b/paddle/fluid/operators/detection_output_op.cc
similarity index 98%
rename from paddle/operators/detection_output_op.cc
rename to paddle/fluid/operators/detection_output_op.cc
index ea44cd3267..6dee522295 100644
--- a/paddle/operators/detection_output_op.cc
+++ b/paddle/fluid/operators/detection_output_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/detection_output_op.h"
+#include "paddle/fluid/operators/detection_output_op.h"
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/operators/detection_output_op.cu.cc b/paddle/fluid/operators/detection_output_op.cu.cc
similarity index 93%
rename from paddle/operators/detection_output_op.cu.cc
rename to paddle/fluid/operators/detection_output_op.cu.cc
index 4a6560e049..309e03a25b 100644
--- a/paddle/operators/detection_output_op.cu.cc
+++ b/paddle/fluid/operators/detection_output_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/detection_output_op.h"
+#include "paddle/fluid/operators/detection_output_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/detection_output_op.h b/paddle/fluid/operators/detection_output_op.h
similarity index 96%
rename from paddle/operators/detection_output_op.h
rename to paddle/fluid/operators/detection_output_op.h
index 86285b748a..05e5b72bd3 100644
--- a/paddle/operators/detection_output_op.h
+++ b/paddle/fluid/operators/detection_output_op.h
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/operators/math/detection_util.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/softmax.h"
-#include "paddle/operators/strided_memcpy.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/math/detection_util.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/softmax.h"
+#include "paddle/fluid/operators/strided_memcpy.h"
 namespace paddle {
 namespace operators {
 template <typename DeviceContext, typename T>
diff --git a/paddle/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc
similarity index 98%
rename from paddle/operators/dropout_op.cc
rename to paddle/fluid/operators/dropout_op.cc
index 5274aa204e..e1dc900512 100644
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/dropout_op.h"
+#include "paddle/fluid/operators/dropout_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu
similarity index 98%
rename from paddle/operators/dropout_op.cu
rename to paddle/fluid/operators/dropout_op.cu
index 84d78445a4..4ae9f4ce54 100644
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/random.h>
 #include <thrust/transform.h>
-#include "paddle/operators/dropout_op.h"
+#include "paddle/fluid/operators/dropout_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h
similarity index 97%
rename from paddle/operators/dropout_op.h
rename to paddle/fluid/operators/dropout_op.h
index 46e5dbc64f..9dd1f33669 100644
--- a/paddle/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 #include <random>
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/edit_distance_op.cc b/paddle/fluid/operators/edit_distance_op.cc
similarity index 98%
rename from paddle/operators/edit_distance_op.cc
rename to paddle/fluid/operators/edit_distance_op.cc
index 7e7dfc79eb..ae82408da7 100644
--- a/paddle/operators/edit_distance_op.cc
+++ b/paddle/fluid/operators/edit_distance_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/edit_distance_op.h"
+#include "paddle/fluid/operators/edit_distance_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu
similarity index 96%
rename from paddle/operators/edit_distance_op.cu
rename to paddle/fluid/operators/edit_distance_op.cu
index c3e116af08..bdfead75e7 100644
--- a/paddle/operators/edit_distance_op.cu
+++ b/paddle/fluid/operators/edit_distance_op.cu
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/platform/cuda_helper.h"
-#include "paddle/platform/gpu_info.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/gpu_info.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/edit_distance_op.h b/paddle/fluid/operators/edit_distance_op.h
similarity index 97%
rename from paddle/operators/edit_distance_op.h
rename to paddle/fluid/operators/edit_distance_op.h
index 974299e604..205e16e6bf 100644
--- a/paddle/operators/edit_distance_op.h
+++ b/paddle/fluid/operators/edit_distance_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 #include <algorithm>
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/operators/elementwise_add_op.cc b/paddle/fluid/operators/elementwise_add_op.cc
similarity index 94%
rename from paddle/operators/elementwise_add_op.cc
rename to paddle/fluid/operators/elementwise_add_op.cc
index 37951fa758..5b9947b8c9 100644
--- a/paddle/operators/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise_add_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/elementwise_add_op.h"
-#include "paddle/operators/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise_add_op.h"
+#include "paddle/fluid/operators/elementwise_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/elementwise_add_op.cu b/paddle/fluid/operators/elementwise_add_op.cu
similarity index 96%
rename from paddle/operators/elementwise_add_op.cu
rename to paddle/fluid/operators/elementwise_add_op.cu
index 641cea323a..2ac3a998ec 100644
--- a/paddle/operators/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise_add_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/elementwise_add_op.h"
+#include "paddle/fluid/operators/elementwise_add_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h
similarity index 96%
rename from paddle/operators/elementwise_add_op.h
rename to paddle/fluid/operators/elementwise_add_op.h
index c32288d698..248e3b9d61 100644
--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise_add_op.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise_op_function.h"
 
 namespace paddle {
 namespace operators {
@@ -35,7 +35,8 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
     auto* z = ctx.Output<Tensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
     int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
+    ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          AddFunctor<T>(), z);
   }
 };
 
diff --git a/paddle/operators/elementwise_div_op.cc b/paddle/fluid/operators/elementwise_div_op.cc
similarity index 94%
rename from paddle/operators/elementwise_div_op.cc
rename to paddle/fluid/operators/elementwise_div_op.cc
index 6ebd58b1b3..818ae82f44 100644
--- a/paddle/operators/elementwise_div_op.cc
+++ b/paddle/fluid/operators/elementwise_div_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/elementwise_div_op.h"
-#include "paddle/operators/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise_div_op.h"
+#include "paddle/fluid/operators/elementwise_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/elementwise_div_op.cu b/paddle/fluid/operators/elementwise_div_op.cu
similarity index 96%
rename from paddle/operators/elementwise_div_op.cu
rename to paddle/fluid/operators/elementwise_div_op.cu
index a0372123d6..d1bb7a474c 100644
--- a/paddle/operators/elementwise_div_op.cu
+++ b/paddle/fluid/operators/elementwise_div_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/elementwise_div_op.h"
+#include "paddle/fluid/operators/elementwise_div_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/elementwise_div_op.h b/paddle/fluid/operators/elementwise_div_op.h
similarity index 97%
rename from paddle/operators/elementwise_div_op.h
rename to paddle/fluid/operators/elementwise_div_op.h
index 07ebade31f..8e0726d946 100644
--- a/paddle/operators/elementwise_div_op.h
+++ b/paddle/fluid/operators/elementwise_div_op.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise_op_function.h"
 
 namespace paddle {
 namespace operators {
@@ -35,7 +35,8 @@ class ElementwiseDivKernel : public framework::OpKernel<T> {
     auto* z = ctx.Output<Tensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
     int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
+    ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          DivFunctor<T>(), z);
   }
 };
 
diff --git a/paddle/operators/elementwise_max_op.cc b/paddle/fluid/operators/elementwise_max_op.cc
similarity index 94%
rename from paddle/operators/elementwise_max_op.cc
rename to paddle/fluid/operators/elementwise_max_op.cc
index 53c27ae5be..1331bcadc8 100644
--- a/paddle/operators/elementwise_max_op.cc
+++ b/paddle/fluid/operators/elementwise_max_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/elementwise_max_op.h"
-#include "paddle/operators/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise_max_op.h"
+#include "paddle/fluid/operators/elementwise_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/elementwise_max_op.cu b/paddle/fluid/operators/elementwise_max_op.cu
similarity index 96%
rename from paddle/operators/elementwise_max_op.cu
rename to paddle/fluid/operators/elementwise_max_op.cu
index 5ff4af1747..7f0259ad00 100644
--- a/paddle/operators/elementwise_max_op.cu
+++ b/paddle/fluid/operators/elementwise_max_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/elementwise_max_op.h"
+#include "paddle/fluid/operators/elementwise_max_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/elementwise_max_op.h b/paddle/fluid/operators/elementwise_max_op.h
similarity index 97%
rename from paddle/operators/elementwise_max_op.h
rename to paddle/fluid/operators/elementwise_max_op.h
index 717e45ab31..e1db9bcc01 100644
--- a/paddle/operators/elementwise_max_op.h
+++ b/paddle/fluid/operators/elementwise_max_op.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise_op_function.h"
 
 namespace paddle {
 namespace operators {
@@ -35,7 +35,8 @@ class ElementwiseMaxKernel : public framework::OpKernel<T> {
     auto* z = ctx.Output<Tensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
     int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
+    ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          MaxFunctor<T>(), z);
   }
 };
 
diff --git a/paddle/operators/elementwise_min_op.cc b/paddle/fluid/operators/elementwise_min_op.cc
similarity index 94%
rename from paddle/operators/elementwise_min_op.cc
rename to paddle/fluid/operators/elementwise_min_op.cc
index 99482e1bf6..1d69099c8e 100644
--- a/paddle/operators/elementwise_min_op.cc
+++ b/paddle/fluid/operators/elementwise_min_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/elementwise_min_op.h"
-#include "paddle/operators/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise_min_op.h"
+#include "paddle/fluid/operators/elementwise_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/elementwise_min_op.cu b/paddle/fluid/operators/elementwise_min_op.cu
similarity index 96%
rename from paddle/operators/elementwise_min_op.cu
rename to paddle/fluid/operators/elementwise_min_op.cu
index 3547e6ccb7..ed53204735 100644
--- a/paddle/operators/elementwise_min_op.cu
+++ b/paddle/fluid/operators/elementwise_min_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/elementwise_min_op.h"
+#include "paddle/fluid/operators/elementwise_min_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/elementwise_min_op.h b/paddle/fluid/operators/elementwise_min_op.h
similarity index 97%
rename from paddle/operators/elementwise_min_op.h
rename to paddle/fluid/operators/elementwise_min_op.h
index 0de9a91c52..bfe213dd43 100644
--- a/paddle/operators/elementwise_min_op.h
+++ b/paddle/fluid/operators/elementwise_min_op.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise_op_function.h"
 
 namespace paddle {
 namespace operators {
@@ -35,7 +35,8 @@ class ElementwiseMinKernel : public framework::OpKernel<T> {
     auto* z = ctx.Output<Tensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
     int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
+    ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          MinFunctor<T>(), z);
   }
 };
 
diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise_mul_op.cc
similarity index 94%
rename from paddle/operators/elementwise_mul_op.cc
rename to paddle/fluid/operators/elementwise_mul_op.cc
index 450dd05c79..0cb96f21d1 100644
--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/fluid/operators/elementwise_mul_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/elementwise_mul_op.h"
-#include "paddle/operators/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise_mul_op.h"
+#include "paddle/fluid/operators/elementwise_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise_mul_op.cu
similarity index 96%
rename from paddle/operators/elementwise_mul_op.cu
rename to paddle/fluid/operators/elementwise_mul_op.cu
index f73e8afda9..d72b6250ee 100644
--- a/paddle/operators/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise_mul_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/elementwise_mul_op.h"
+#include "paddle/fluid/operators/elementwise_mul_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/fluid/operators/elementwise_mul_op.h
similarity index 97%
rename from paddle/operators/elementwise_mul_op.h
rename to paddle/fluid/operators/elementwise_mul_op.h
index ae7a71e024..dc292eb1e7 100644
--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise_mul_op.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise_op_function.h"
 
 namespace paddle {
 namespace operators {
@@ -34,7 +34,8 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
     auto* z = ctx.Output<Tensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
     int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
+    ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          MulFunctor<T>(), z);
   }
 };
 
diff --git a/paddle/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h
similarity index 98%
rename from paddle/operators/elementwise_op.h
rename to paddle/fluid/operators/elementwise_op.h
index 1a0131d8b9..38f83d7ad3 100644
--- a/paddle/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h
similarity index 97%
rename from paddle/operators/elementwise_op_function.h
rename to paddle/fluid/operators/elementwise_op_function.h
index 213fe1f5a8..c1269382a4 100644
--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
-#include "paddle/platform/transform.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/transform.h"
 
 #ifdef __NVCC__
 #include <thrust/iterator/iterator_adaptor.h>
 #endif
 
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -365,10 +365,10 @@ template <typename Functor, typename DeviceContext, typename T,
           typename OutType = T>
 void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
                           const framework::Tensor* x,
-                          const framework::Tensor* y, int axis,
+                          const framework::Tensor* y, int axis, Functor func,
                           framework::Tensor* z) {
   TransformFunctor<Functor, T, DeviceContext, OutType> functor(
-      x, y, z, ctx.template device_context<DeviceContext>(), Functor());
+      x, y, z, ctx.template device_context<DeviceContext>(), func);
 
   auto x_dims = x->dims();
   auto y_dims = y->dims();
diff --git a/paddle/operators/elementwise_pow_op.cc b/paddle/fluid/operators/elementwise_pow_op.cc
similarity index 92%
rename from paddle/operators/elementwise_pow_op.cc
rename to paddle/fluid/operators/elementwise_pow_op.cc
index 5293cc7dd3..911b5dbd25 100644
--- a/paddle/operators/elementwise_pow_op.cc
+++ b/paddle/fluid/operators/elementwise_pow_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/elementwise_pow_op.h"
-#include "paddle/operators/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise_pow_op.h"
+#include "paddle/fluid/operators/elementwise_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/elementwise_pow_op.cu b/paddle/fluid/operators/elementwise_pow_op.cu
similarity index 93%
rename from paddle/operators/elementwise_pow_op.cu
rename to paddle/fluid/operators/elementwise_pow_op.cu
index 643c978e63..2996600738 100644
--- a/paddle/operators/elementwise_pow_op.cu
+++ b/paddle/fluid/operators/elementwise_pow_op.cu
@@ -10,7 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/elementwise_pow_op.h"
+#include "paddle/fluid/operators/elementwise_pow_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/elementwise_pow_op.h b/paddle/fluid/operators/elementwise_pow_op.h
similarity index 89%
rename from paddle/operators/elementwise_pow_op.h
rename to paddle/fluid/operators/elementwise_pow_op.h
index 874fd3f09f..b793c1eae0 100644
--- a/paddle/operators/elementwise_pow_op.h
+++ b/paddle/fluid/operators/elementwise_pow_op.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <cmath>
-#include "paddle/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise_op_function.h"
 
 namespace paddle {
 namespace operators {
@@ -36,7 +36,8 @@ class ElementwisePowKernel : public framework::OpKernel<T> {
     auto* z = ctx.Output<Tensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
     int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
+    ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          PowFunctor<T>(), z);
   }
 };
 
diff --git a/paddle/operators/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise_sub_op.cc
similarity index 94%
rename from paddle/operators/elementwise_sub_op.cc
rename to paddle/fluid/operators/elementwise_sub_op.cc
index d3c51f0a69..46ce01c7cf 100644
--- a/paddle/operators/elementwise_sub_op.cc
+++ b/paddle/fluid/operators/elementwise_sub_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/elementwise_sub_op.h"
-#include "paddle/operators/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise_sub_op.h"
+#include "paddle/fluid/operators/elementwise_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise_sub_op.cu
similarity index 96%
rename from paddle/operators/elementwise_sub_op.cu
rename to paddle/fluid/operators/elementwise_sub_op.cu
index 7a2516ef6a..eb09d6c5ed 100644
--- a/paddle/operators/elementwise_sub_op.cu
+++ b/paddle/fluid/operators/elementwise_sub_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/elementwise_sub_op.h"
+#include "paddle/fluid/operators/elementwise_sub_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/fluid/operators/elementwise_sub_op.h
similarity index 96%
rename from paddle/operators/elementwise_sub_op.h
rename to paddle/fluid/operators/elementwise_sub_op.h
index c2749a8e6b..af2d497b9a 100644
--- a/paddle/operators/elementwise_sub_op.h
+++ b/paddle/fluid/operators/elementwise_sub_op.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise_op_function.h"
 
 namespace paddle {
 namespace operators {
@@ -34,7 +34,8 @@ class ElementwiseSubKernel : public framework::OpKernel<T> {
     auto* z = ctx.Output<Tensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
     int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
+    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          SubFunctor<T>(), z);
   }
 };
 
diff --git a/paddle/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc
similarity index 99%
rename from paddle/operators/expand_op.cc
rename to paddle/fluid/operators/expand_op.cc
index 043c93654d..ccb9a94856 100644
--- a/paddle/operators/expand_op.cc
+++ b/paddle/fluid/operators/expand_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/expand_op.h"
+#include "paddle/fluid/operators/expand_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/expand_op.cu b/paddle/fluid/operators/expand_op.cu
similarity index 94%
rename from paddle/operators/expand_op.cu
rename to paddle/fluid/operators/expand_op.cu
index 84e8fa567b..8a9f39708b 100644
--- a/paddle/operators/expand_op.cu
+++ b/paddle/fluid/operators/expand_op.cu
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 
-#include "paddle/operators/expand_op.h"
+#include "paddle/fluid/operators/expand_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/expand_op.h b/paddle/fluid/operators/expand_op.h
similarity index 98%
rename from paddle/operators/expand_op.h
rename to paddle/fluid/operators/expand_op.h
index a4994cf3a5..8df1cd34d7 100644
--- a/paddle/operators/expand_op.h
+++ b/paddle/fluid/operators/expand_op.h
@@ -21,9 +21,9 @@ limitations under the License. */
 #include <boost/preprocessor/control/if.hpp>
 #include <boost/preprocessor/repetition/repeat.hpp>
 #include <iostream>
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
 
 #define MAX_RANK_SUPPORTED 6
 
diff --git a/paddle/operators/feed_op.cc b/paddle/fluid/operators/feed_op.cc
similarity index 95%
rename from paddle/operators/feed_op.cc
rename to paddle/fluid/operators/feed_op.cc
index 789d01e002..0b3f5f0d1d 100644
--- a/paddle/operators/feed_op.cc
+++ b/paddle/fluid/operators/feed_op.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/feed_fetch_type.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc
similarity index 95%
rename from paddle/operators/fetch_op.cc
rename to paddle/fluid/operators/fetch_op.cc
index 7205ee2a87..54e5892016 100644
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/fluid/operators/fetch_op.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/feed_fetch_type.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
similarity index 98%
rename from paddle/operators/fill_constant_batch_size_like_op.cc
rename to paddle/fluid/operators/fill_constant_batch_size_like_op.cc
index c74a5b6ced..e6992ba371 100644
--- a/paddle/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/fill_constant_batch_size_like_op.h"
+#include "paddle/fluid/operators/fill_constant_batch_size_like_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc
similarity index 91%
rename from paddle/operators/fill_constant_batch_size_like_op.cu.cc
rename to paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc
index 608f4b9162..b4f4d2a503 100644
--- a/paddle/operators/fill_constant_batch_size_like_op.cu.cc
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/fill_constant_batch_size_like_op.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/operators/fill_constant_batch_size_like_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/fill_constant_batch_size_like_op.h b/paddle/fluid/operators/fill_constant_batch_size_like_op.h
similarity index 92%
rename from paddle/operators/fill_constant_batch_size_like_op.h
rename to paddle/fluid/operators/fill_constant_batch_size_like_op.h
index 66da9d0307..da4a20d99a 100644
--- a/paddle/operators/fill_constant_batch_size_like_op.h
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
similarity index 94%
rename from paddle/operators/fill_constant_op.cc
rename to paddle/fluid/operators/fill_constant_op.cc
index dcd43a30c8..d4bf6406e5 100644
--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/data_type.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc
similarity index 95%
rename from paddle/operators/fill_op.cc
rename to paddle/fluid/operators/fill_op.cc
index 4f5a2ed169..8e318f37cf 100644
--- a/paddle/operators/fill_op.cc
+++ b/paddle/fluid/operators/fill_op.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/data_type.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/detail/safe_ref.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/fluid/operators/fill_zeros_like_op.cc
similarity index 97%
rename from paddle/operators/fill_zeros_like_op.cc
rename to paddle/fluid/operators/fill_zeros_like_op.cc
index b4ae1de876..958bfb1557 100644
--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/fluid/operators/fill_zeros_like_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/fill_zeros_like_op.h"
+#include "paddle/fluid/operators/fill_zeros_like_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc
similarity index 91%
rename from paddle/operators/fill_zeros_like_op.cu.cc
rename to paddle/fluid/operators/fill_zeros_like_op.cu.cc
index b7048e8f58..07078573d8 100644
--- a/paddle/operators/fill_zeros_like_op.cu.cc
+++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/fill_zeros_like_op.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/operators/fill_zeros_like_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/fluid/operators/fill_zeros_like_op.h
similarity index 91%
rename from paddle/operators/fill_zeros_like_op.h
rename to paddle/fluid/operators/fill_zeros_like_op.h
index 351ecf8b2f..141c3809e9 100644
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/fluid/operators/fill_zeros_like_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/ftrl_op.cc b/paddle/fluid/operators/ftrl_op.cc
similarity index 99%
rename from paddle/operators/ftrl_op.cc
rename to paddle/fluid/operators/ftrl_op.cc
index d00700823d..e72a173751 100644
--- a/paddle/operators/ftrl_op.cc
+++ b/paddle/fluid/operators/ftrl_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/ftrl_op.h"
+#include "paddle/fluid/operators/ftrl_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/ftrl_op.cu b/paddle/fluid/operators/ftrl_op.cu
similarity index 94%
rename from paddle/operators/ftrl_op.cu
rename to paddle/fluid/operators/ftrl_op.cu
index abbbe7adbe..dbdfcb927e 100644
--- a/paddle/operators/ftrl_op.cu
+++ b/paddle/fluid/operators/ftrl_op.cu
@@ -12,7 +12,7 @@ CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/ftrl_op.h"
+#include "paddle/fluid/operators/ftrl_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/ftrl_op.h b/paddle/fluid/operators/ftrl_op.h
similarity index 97%
rename from paddle/operators/ftrl_op.h
rename to paddle/fluid/operators/ftrl_op.h
index 4eea04cd8d..0a9405fcef 100644
--- a/paddle/operators/ftrl_op.h
+++ b/paddle/fluid/operators/ftrl_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h
similarity index 96%
rename from paddle/operators/gather.cu.h
rename to paddle/fluid/operators/gather.cu.h
index 9840c066f0..af5898e29e 100644
--- a/paddle/operators/gather.cu.h
+++ b/paddle/fluid/operators/gather.cu.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/tensor.h"
-#include "paddle/platform/place.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/gather.h b/paddle/fluid/operators/gather.h
similarity index 91%
rename from paddle/operators/gather.h
rename to paddle/fluid/operators/gather.h
index 052db49cb3..287732eeb6 100644
--- a/paddle/operators/gather.h
+++ b/paddle/fluid/operators/gather.h
@@ -16,10 +16,10 @@ limitations under the License. */
 #include <memory.h>
 #include <cstring>
 
-#include "paddle/framework/ddim.h"
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/platform/place.h"
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc
similarity index 97%
rename from paddle/operators/gather_op.cc
rename to paddle/fluid/operators/gather_op.cc
index 597fdad079..dceeb71ee3 100644
--- a/paddle/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/gather_op.h"
-#include "paddle/framework/ddim.h"
+#include "paddle/fluid/operators/gather_op.h"
+#include "paddle/fluid/framework/ddim.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu
similarity index 96%
rename from paddle/operators/gather_op.cu
rename to paddle/fluid/operators/gather_op.cu
index eec2415e1d..484f423262 100644
--- a/paddle/operators/gather_op.cu
+++ b/paddle/fluid/operators/gather_op.cu
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "gather.cu.h"
-#include "paddle/framework/eigen.h"
-#include "paddle/operators/gather_op.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/gather_op.h"
 #include "scatter.cu.h"
 
 namespace paddle {
diff --git a/paddle/operators/gather_op.h b/paddle/fluid/operators/gather_op.h
similarity index 95%
rename from paddle/operators/gather_op.h
rename to paddle/fluid/operators/gather_op.h
index 1a1ba0c41a..7ba4a31c81 100644
--- a/paddle/operators/gather_op.h
+++ b/paddle/fluid/operators/gather_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 #include "gather.h"
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "scatter.h"
 
 namespace paddle {
diff --git a/paddle/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc
similarity index 90%
rename from paddle/operators/gather_test.cc
rename to paddle/fluid/operators/gather_test.cc
index cbd86b8796..4d86cf5ce3 100644
--- a/paddle/operators/gather_test.cc
+++ b/paddle/fluid/operators/gather_test.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/gather.h"
-#include "paddle/framework/ddim.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/platform/place.h"
+#include "paddle/fluid/operators/gather.h"
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/place.h"
 
 #include <gtest/gtest.h>
 #include <iostream>
diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc
similarity index 98%
rename from paddle/operators/gaussian_random_op.cc
rename to paddle/fluid/operators/gaussian_random_op.cc
index 2dca05760e..b090f87597 100644
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/fluid/operators/gaussian_random_op.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <random>
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu
similarity index 95%
rename from paddle/operators/gaussian_random_op.cu
rename to paddle/fluid/operators/gaussian_random_op.cu
index 8a70db17e1..70d655d4bb 100644
--- a/paddle/operators/gaussian_random_op.cu
+++ b/paddle/fluid/operators/gaussian_random_op.cu
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <thrust/random.h>
 #include <thrust/transform.h>
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/get_places_op.cc b/paddle/fluid/operators/get_places_op.cc
similarity index 95%
rename from paddle/operators/get_places_op.cc
rename to paddle/fluid/operators/get_places_op.cc
index 24fafb2307..ba908e472b 100644
--- a/paddle/operators/get_places_op.cc
+++ b/paddle/fluid/operators/get_places_op.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <thread>
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/detail/safe_ref.h"
-#include "paddle/platform/place.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
-#include "paddle/platform/gpu_info.h"
+#include "paddle/fluid/platform/gpu_info.h"
 #endif
 
 namespace paddle {
diff --git a/paddle/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc
similarity index 99%
rename from paddle/operators/gru_op.cc
rename to paddle/fluid/operators/gru_op.cc
index fb901b6394..1436e55b0e 100644
--- a/paddle/operators/gru_op.cc
+++ b/paddle/fluid/operators/gru_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/gru_op.h"
+#include "paddle/fluid/operators/gru_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc
similarity index 95%
rename from paddle/operators/gru_op.cu.cc
rename to paddle/fluid/operators/gru_op.cu.cc
index 9cb0cc42d5..e908d01d29 100644
--- a/paddle/operators/gru_op.cu.cc
+++ b/paddle/fluid/operators/gru_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/gru_op.h"
+#include "paddle/fluid/operators/gru_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/gru_op.h b/paddle/fluid/operators/gru_op.h
similarity index 97%
rename from paddle/operators/gru_op.h
rename to paddle/fluid/operators/gru_op.h
index a08bd4233b..37f3ae1a83 100644
--- a/paddle/operators/gru_op.h
+++ b/paddle/fluid/operators/gru_op.h
@@ -14,13 +14,13 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/operators/math/detail/activation_functions.h"
-#include "paddle/operators/math/gru_compute.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/sequence2batch.h"
+#include "paddle/fluid/operators/math/detail/activation_functions.h"
+#include "paddle/fluid/operators/math/gru_compute.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/sequence2batch.h"
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc
similarity index 99%
rename from paddle/operators/gru_unit_op.cc
rename to paddle/fluid/operators/gru_unit_op.cc
index c354293be7..21ad3aeb49 100644
--- a/paddle/operators/gru_unit_op.cc
+++ b/paddle/fluid/operators/gru_unit_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/gru_unit_op.h"
+#include "paddle/fluid/operators/gru_unit_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/gru_unit_op.cu b/paddle/fluid/operators/gru_unit_op.cu
similarity index 95%
rename from paddle/operators/gru_unit_op.cu
rename to paddle/fluid/operators/gru_unit_op.cu
index 95c8c23dad..88b707fd13 100644
--- a/paddle/operators/gru_unit_op.cu
+++ b/paddle/fluid/operators/gru_unit_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/gru_unit_op.h"
+#include "paddle/fluid/operators/gru_unit_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h
similarity index 98%
rename from paddle/operators/gru_unit_op.h
rename to paddle/fluid/operators/gru_unit_op.h
index a77be46718..c4031a5a57 100644
--- a/paddle/operators/gru_unit_op.h
+++ b/paddle/fluid/operators/gru_unit_op.h
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/operators/activation_op.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc
similarity index 98%
rename from paddle/operators/hinge_loss_op.cc
rename to paddle/fluid/operators/hinge_loss_op.cc
index 19d2e9dc56..f644c22c9f 100644
--- a/paddle/operators/hinge_loss_op.cc
+++ b/paddle/fluid/operators/hinge_loss_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/hinge_loss_op.h"
+#include "paddle/fluid/operators/hinge_loss_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/hinge_loss_op.cu b/paddle/fluid/operators/hinge_loss_op.cu
similarity index 94%
rename from paddle/operators/hinge_loss_op.cu
rename to paddle/fluid/operators/hinge_loss_op.cu
index b9cfbc50c4..cb53a9b7f4 100644
--- a/paddle/operators/hinge_loss_op.cu
+++ b/paddle/fluid/operators/hinge_loss_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/hinge_loss_op.h"
+#include "paddle/fluid/operators/hinge_loss_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/hinge_loss_op.h b/paddle/fluid/operators/hinge_loss_op.h
similarity index 96%
rename from paddle/operators/hinge_loss_op.h
rename to paddle/fluid/operators/hinge_loss_op.h
index 91369cfb8a..1e924d236e 100644
--- a/paddle/operators/hinge_loss_op.h
+++ b/paddle/fluid/operators/hinge_loss_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc
similarity index 98%
rename from paddle/operators/huber_loss_op.cc
rename to paddle/fluid/operators/huber_loss_op.cc
index 5c92f2c7b2..dc1f609dcf 100644
--- a/paddle/operators/huber_loss_op.cc
+++ b/paddle/fluid/operators/huber_loss_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/huber_loss_op.h"
+#include "paddle/fluid/operators/huber_loss_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/huber_loss_op.cu b/paddle/fluid/operators/huber_loss_op.cu
similarity index 94%
rename from paddle/operators/huber_loss_op.cu
rename to paddle/fluid/operators/huber_loss_op.cu
index ccc83a16ba..ef5120c69d 100644
--- a/paddle/operators/huber_loss_op.cu
+++ b/paddle/fluid/operators/huber_loss_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/huber_loss_op.h"
+#include "paddle/fluid/operators/huber_loss_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h
similarity index 96%
rename from paddle/operators/huber_loss_op.h
rename to paddle/fluid/operators/huber_loss_op.h
index 4dd20e8b08..caca89fcf6 100644
--- a/paddle/operators/huber_loss_op.h
+++ b/paddle/fluid/operators/huber_loss_op.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/platform/hostdevice.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc
similarity index 99%
rename from paddle/operators/im2sequence_op.cc
rename to paddle/fluid/operators/im2sequence_op.cc
index 31baaedf69..936e5fe49e 100644
--- a/paddle/operators/im2sequence_op.cc
+++ b/paddle/fluid/operators/im2sequence_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/im2sequence_op.h"
+#include "paddle/fluid/operators/im2sequence_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/im2sequence_op.cu b/paddle/fluid/operators/im2sequence_op.cu
similarity index 94%
rename from paddle/operators/im2sequence_op.cu
rename to paddle/fluid/operators/im2sequence_op.cu
index 9db7529112..1e7bf46312 100644
--- a/paddle/operators/im2sequence_op.cu
+++ b/paddle/fluid/operators/im2sequence_op.cu
@@ -13,7 +13,7 @@
    limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/im2sequence_op.h"
+#include "paddle/fluid/operators/im2sequence_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h
similarity index 95%
rename from paddle/operators/im2sequence_op.h
rename to paddle/fluid/operators/im2sequence_op.h
index f33aec71a9..59456f0ea2 100644
--- a/paddle/operators/im2sequence_op.h
+++ b/paddle/fluid/operators/im2sequence_op.h
@@ -14,11 +14,11 @@
 
 #pragma once
 
-#include "paddle/framework/data_layout.h"
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/im2col.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/im2col.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/images/batch_norm_fork.dot b/paddle/fluid/operators/images/batch_norm_fork.dot
similarity index 100%
rename from paddle/operators/images/batch_norm_fork.dot
rename to paddle/fluid/operators/images/batch_norm_fork.dot
diff --git a/paddle/operators/images/batch_norm_fork.png b/paddle/fluid/operators/images/batch_norm_fork.png
similarity index 100%
rename from paddle/operators/images/batch_norm_fork.png
rename to paddle/fluid/operators/images/batch_norm_fork.png
diff --git a/paddle/operators/images/batch_norm_op_kernel.png b/paddle/fluid/operators/images/batch_norm_op_kernel.png
similarity index 100%
rename from paddle/operators/images/batch_norm_op_kernel.png
rename to paddle/fluid/operators/images/batch_norm_op_kernel.png
diff --git a/paddle/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc
similarity index 98%
rename from paddle/operators/increment_op.cc
rename to paddle/fluid/operators/increment_op.cc
index e0b80cc4e7..3d488067b2 100644
--- a/paddle/operators/increment_op.cc
+++ b/paddle/fluid/operators/increment_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/iou_similarity_op.cc b/paddle/fluid/operators/iou_similarity_op.cc
similarity index 98%
rename from paddle/operators/iou_similarity_op.cc
rename to paddle/fluid/operators/iou_similarity_op.cc
index c520b28b83..c2e452cdfa 100755
--- a/paddle/operators/iou_similarity_op.cc
+++ b/paddle/fluid/operators/iou_similarity_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/iou_similarity_op.h"
+#include "paddle/fluid/operators/iou_similarity_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/iou_similarity_op.cu b/paddle/fluid/operators/iou_similarity_op.cu
similarity index 93%
rename from paddle/operators/iou_similarity_op.cu
rename to paddle/fluid/operators/iou_similarity_op.cu
index fa50526246..f8df1f4aa4 100755
--- a/paddle/operators/iou_similarity_op.cu
+++ b/paddle/fluid/operators/iou_similarity_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/iou_similarity_op.h"
+#include "paddle/fluid/operators/iou_similarity_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/iou_similarity_op.h b/paddle/fluid/operators/iou_similarity_op.h
similarity index 97%
rename from paddle/operators/iou_similarity_op.h
rename to paddle/fluid/operators/iou_similarity_op.h
index e36177069d..2fb1b5f707 100644
--- a/paddle/operators/iou_similarity_op.h
+++ b/paddle/fluid/operators/iou_similarity_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/platform/for_range.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/for_range.h"
 
 template <typename T>
 inline HOSTDEVICE T IOUSimilarity(T xmin1, T ymin1, T xmax1, T ymax1, T xmin2,
diff --git a/paddle/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc
similarity index 96%
rename from paddle/operators/is_empty_op.cc
rename to paddle/fluid/operators/is_empty_op.cc
index 492ae48845..ea424018d6 100644
--- a/paddle/operators/is_empty_op.cc
+++ b/paddle/fluid/operators/is_empty_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc
similarity index 98%
rename from paddle/operators/l1_norm_op.cc
rename to paddle/fluid/operators/l1_norm_op.cc
index 1a5d6e1926..974ee404f8 100644
--- a/paddle/operators/l1_norm_op.cc
+++ b/paddle/fluid/operators/l1_norm_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/l1_norm_op.h"
+#include "paddle/fluid/operators/l1_norm_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/l1_norm_op.cu b/paddle/fluid/operators/l1_norm_op.cu
similarity index 94%
rename from paddle/operators/l1_norm_op.cu
rename to paddle/fluid/operators/l1_norm_op.cu
index 7ecc774670..5e9e864a34 100644
--- a/paddle/operators/l1_norm_op.cu
+++ b/paddle/fluid/operators/l1_norm_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/l1_norm_op.h"
+#include "paddle/fluid/operators/l1_norm_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/l1_norm_op.h b/paddle/fluid/operators/l1_norm_op.h
similarity index 96%
rename from paddle/operators/l1_norm_op.h
rename to paddle/fluid/operators/l1_norm_op.h
index 086d42705d..7ddf2ac6a9 100644
--- a/paddle/operators/l1_norm_op.h
+++ b/paddle/fluid/operators/l1_norm_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc
similarity index 99%
rename from paddle/operators/label_smooth_op.cc
rename to paddle/fluid/operators/label_smooth_op.cc
index c89082f44b..c018965bee 100644
--- a/paddle/operators/label_smooth_op.cc
+++ b/paddle/fluid/operators/label_smooth_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/label_smooth_op.h"
+#include "paddle/fluid/operators/label_smooth_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/label_smooth_op.cu b/paddle/fluid/operators/label_smooth_op.cu
similarity index 95%
rename from paddle/operators/label_smooth_op.cu
rename to paddle/fluid/operators/label_smooth_op.cu
index 5a0cec12bc..4a40a4e9ec 100644
--- a/paddle/operators/label_smooth_op.cu
+++ b/paddle/fluid/operators/label_smooth_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/label_smooth_op.h"
+#include "paddle/fluid/operators/label_smooth_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/label_smooth_op.h b/paddle/fluid/operators/label_smooth_op.h
similarity index 96%
rename from paddle/operators/label_smooth_op.h
rename to paddle/fluid/operators/label_smooth_op.h
index 87bc9f793e..15752377f6 100644
--- a/paddle/operators/label_smooth_op.h
+++ b/paddle/fluid/operators/label_smooth_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc
new file mode 100644
index 0000000000..60e37ed01b
--- /dev/null
+++ b/paddle/fluid/operators/layer_norm_op.cc
@@ -0,0 +1,173 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/layer_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;
+
+class LayerNormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"),
+                   "Output(Y) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Mean"),
+                   "Output(Mean) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Variance"),
+                   "Output(Variance) of LayerNormOp should not be null.");
+
+    auto x_dim = ctx->GetInputDim("X");
+    auto begin_norm_axis = ctx->Attrs().Get<int>("begin_norm_axis");
+    PADDLE_ENFORCE_LT(begin_norm_axis, x_dim.size(),
+                      "'begin_norm_axis' must be less than the rank of X.");
+
+    auto matrix_dim = framework::flatten_to_2d(x_dim, begin_norm_axis);
+    int left = static_cast<int>(matrix_dim[0]);
+    int right = static_cast<int>(matrix_dim[1]);
+    if (ctx->HasInput("Scale")) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right);
+    }
+    if (ctx->HasInput("Bias")) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right);
+    }
+
+    ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
+    ctx->SetOutputDim("Mean", {left});
+    ctx->SetOutputDim("Variance", {left});
+    ctx->ShareLoD("X", "Y");
+  }
+};
+
+class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LayerNormOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensor) The input tensor.");
+    AddInput("Scale",
+             "(Tensor, optional) Scale is a 1-dimensional tensor of size "
+             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
+             "It is applied to the output.")
+        .AsDispensable();
+    AddInput("Bias",
+             "(Tensor, optional) Bias is a 1-dimensional tensor of size "
+             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
+             "It is applied to the output.")
+        .AsDispensable();
+    AddOutput("Y", "(LoDTensor) Result after normalization.");
+    AddOutput("Mean", "(Tensor) Mean of the current mini batch.")
+        .AsIntermediate();
+    AddOutput("Variance", "(Tensor) Variance of the current mini batch.")
+        .AsIntermediate();
+
+    AddAttr<float>("epsilon",
+                   "(float, default 1e-5) Constant for "
+                   "numerical stability")
+        .SetDefault(1e-5)
+        .AddCustomChecker([](const float &epsilon) {
+          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
+                         "'epsilon' should be between 0.0 and 0.001.");
+        });
+    AddAttr<int>("begin_norm_axis",
+                 "(int default:1), the "
+                 "axis of `begin_norm_axis ... Rank(X) - 1` will be "
+                 "normalized. `begin_norm_axis` splits the tensor(`X`) to a "
+                 "matrix [N,H].")
+        .SetDefault(1)
+        .AddCustomChecker([](const int &begin_norm_axis) {
+          PADDLE_ENFORCE_GT(begin_norm_axis, 0,
+                            "'begin_norm_axis' should be greater than zero.");
+        });
+
+    AddComment(R"DOC(
+Layer Normalization.
+Layer Norm has been implemented as discussed in the paper:
+https://arxiv.org/abs/1607.06450
+...
+)DOC");
+  }
+};
+
+class LayerNormGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // check input
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Mean"),
+                   "Input(Mean) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Variance"),
+                   "Input(Variance) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) of LayerNormOp should not be null.");
+
+    // check output
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    }
+    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
+      ctx->SetOutputDim(framework::GradVarName("Scale"),
+                        ctx->GetInputDim("Scale"));
+    }
+    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
+      ctx->SetOutputDim(framework::GradVarName("Bias"),
+                        ctx->GetInputDim("Bias"));
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
+    if (var == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    const Tensor *t = nullptr;
+    if (var->IsType<Tensor>()) {
+      t = &var->Get<Tensor>();
+    } else if (var->IsType<LoDTensor>()) {
+      t = &var->Get<LoDTensor>();
+    }
+    if (t == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    return framework::OpKernelType(framework::ToDataType(t->type()),
+                                   ctx.GetPlace());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
+            layer_norm_grad, ops::LayerNormGradOp);
+REGISTER_OP_CPU_KERNEL(
+    layer_norm, ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LayerNormKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    layer_norm_grad,
+    ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.cu
similarity index 51%
rename from paddle/operators/layer_norm_op.h
rename to paddle/fluid/operators/layer_norm_op.cu
index bca35b91e6..aa54fd5415 100644
--- a/paddle/operators/layer_norm_op.h
+++ b/paddle/fluid/operators/layer_norm_op.cu
@@ -12,24 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class LayerNormKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override;
-};
-
-template <typename DeviceContext, typename T>
-class LayerNormGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override;
-};
-
-}  // namespace operators
-}  // namespace paddle
+#include "paddle/fluid/operators/layer_norm_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    layer_norm,
+    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    layer_norm_grad,
+    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h
new file mode 100644
index 0000000000..60c0b07add
--- /dev/null
+++ b/paddle/fluid/operators/layer_norm_op.h
@@ -0,0 +1,238 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct SubAndSquareFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); }
+};
+
+template <typename T>
+struct DivAndSqrtFunctor {
+  explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; }
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    return a / (sqrt(b + epsilon_));
+  }
+
+ private:
+  T epsilon_;
+};
+
+template <typename T>
+struct MulFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a * b; }
+};
+
+template <typename T>
+struct AddFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
+};
+
+template <typename T>
+struct SubFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a - b; }
+};
+
+template <typename T>
+struct MulInvVarFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    return a * std::sqrt(1.0 / b);
+  }
+};
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;
+
+template <typename DeviceContext, typename T>
+class LayerNormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    auto *scale = ctx.Input<Tensor>("Scale");
+    auto *bias = ctx.Input<Tensor>("Bias");
+    auto x = *ctx.Input<Tensor>("X");
+
+    auto *y = ctx.Output<Tensor>("Y");
+    auto *mean = ctx.Output<Tensor>("Mean");
+    auto *var = ctx.Output<Tensor>("Variance");
+    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
+
+    const auto x_dims = x.dims();
+
+    y->mutable_data<T>(ctx.GetPlace());
+    mean->mutable_data<T>(ctx.GetPlace());
+    var->mutable_data<T>(ctx.GetPlace());
+
+    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
+    int left = static_cast<int>(matrix_dim[0]);
+    int right = static_cast<int>(matrix_dim[1]);
+    framework::DDim matrix_shape({left, right});
+
+    x.Resize(matrix_shape);
+    Tensor out;
+    out.ShareDataWith(*y);
+    out.Resize(matrix_shape);
+
+    auto &dev_ctx = ctx.template device_context<DeviceContext>();
+    math::RowwiseMean<DeviceContext, T> row_mean;
+
+    // get mean
+    row_mean(dev_ctx, x, mean);
+
+    // get variance
+    ElementwiseComputeEx<SubAndSquareFunctor<T>, DeviceContext, T>(
+        ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor<T>(), &out);
+    row_mean(dev_ctx, out, var);
+
+    // get x_norm
+    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
+        ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &out);
+    ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
+        ctx, &out, var, /*axis*/ 0,
+        DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &out);
+
+    if (scale) {
+      ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
+          ctx, &out, scale, /*axis*/ 1, MulFunctor<T>(), &out);
+    }
+    if (bias) {
+      ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
+          ctx, &out, bias, /*axis*/ 1, AddFunctor<T>(), &out);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class LayerNormGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    auto x = *ctx.Input<Tensor>("X");
+    auto *y = ctx.Input<Tensor>("Y");
+    auto *mean = ctx.Input<Tensor>("Mean");
+    auto *var = ctx.Input<Tensor>("Variance");
+    auto *scale = ctx.Input<Tensor>("Scale");
+    auto *bias = ctx.Input<Tensor>("Bias");
+    auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
+
+    // init output
+    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    const auto &x_dims = x.dims();
+    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
+    int left = static_cast<int>(matrix_dim[0]);
+    int right = static_cast<int>(matrix_dim[1]);
+    framework::DDim matrix_shape({left, right});
+
+    d_y.Resize(matrix_shape);
+    auto &dev_ctx = ctx.template device_context<DeviceContext>();
+    math::ColwiseSum<DeviceContext, T> colwise_sum;
+
+    Tensor temp;
+    Tensor temp_norm;
+    if (d_scale || d_x) {
+      x.Resize(matrix_shape);
+      temp.mutable_data<T>(matrix_shape, ctx.GetPlace());
+
+      if (!(bias && scale)) {
+        temp_norm.ShareDataWith(*y);
+        temp_norm.Resize(matrix_shape);
+      } else {
+        temp_norm.mutable_data<T>(matrix_shape, ctx.GetPlace());
+        // get x_norm
+        ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
+            ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &temp_norm);
+        ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
+            ctx, &temp_norm, var, /*axis*/ 0,
+            DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &temp_norm);
+      }
+    }
+
+    if (d_bias) {
+      d_bias->mutable_data<T>(ctx.GetPlace());
+      colwise_sum(dev_ctx, d_y, d_bias);
+    }
+    if (d_scale) {
+      d_scale->mutable_data<T>(ctx.GetPlace());
+      ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
+          ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor<T>(), &temp);
+      colwise_sum(dev_ctx, temp, d_scale);
+    }
+
+    if (d_x) {
+      framework::DDim vec_shape({left});
+      d_x->mutable_data<T>(ctx.GetPlace());
+      auto dx_dim = d_x->dims();
+      Tensor temp_vec;
+      temp_vec.mutable_data<T>(vec_shape, ctx.GetPlace());
+
+      math::RowwiseMean<DeviceContext, T> row_mean;
+
+      if (d_scale) {
+        // dy_dx
+        ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
+            ctx, &d_y, scale, /*axis*/ 1, MulFunctor<T>(), &temp);
+        framework::Copy(temp, ctx.GetPlace(), ctx.device_context(), d_x);
+
+        // dy_dmean_dx
+        row_mean(dev_ctx, temp, &temp_vec);
+        ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
+            ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor<T>(), d_x);
+
+        // dy_var_dx
+        ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
+            ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor<T>(), &temp);
+      } else {
+        // dy_dx
+        framework::Copy(d_y, ctx.GetPlace(), ctx.device_context(), d_x);
+
+        // dy_dmean_dx
+        row_mean(dev_ctx, d_y, &temp_vec);
+        ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
+            ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor<T>(), d_x);
+
+        // dy_var_dx
+        ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
+            ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor<T>(), &temp);
+      }
+      // dy_var_dx
+      row_mean(dev_ctx, temp, &temp_vec);
+      ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
+          ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor<T>(), &temp);
+      ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
+          ctx, d_x, &temp, /*axis*/ 0, SubFunctor<T>(), d_x);
+
+      ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
+          ctx, d_x, var, /*axis*/ 0,
+          DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), d_x);
+      d_x->Resize(dx_dim);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc
similarity index 99%
rename from paddle/operators/linear_chain_crf_op.cc
rename to paddle/fluid/operators/linear_chain_crf_op.cc
index e24bf622b7..3e1dfa4948 100644
--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/linear_chain_crf_op.h"
+#include "paddle/fluid/operators/linear_chain_crf_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/linear_chain_crf_op.cu b/paddle/fluid/operators/linear_chain_crf_op.cu
similarity index 95%
rename from paddle/operators/linear_chain_crf_op.cu
rename to paddle/fluid/operators/linear_chain_crf_op.cu
index da612510b4..6e04e76eeb 100644
--- a/paddle/operators/linear_chain_crf_op.cu
+++ b/paddle/fluid/operators/linear_chain_crf_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/linear_chain_crf_op.h"
+#include "paddle/fluid/operators/linear_chain_crf_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h
similarity index 99%
rename from paddle/operators/linear_chain_crf_op.h
rename to paddle/fluid/operators/linear_chain_crf_op.h
index afc197a1c3..15b64c09bf 100644
--- a/paddle/operators/linear_chain_crf_op.h
+++ b/paddle/fluid/operators/linear_chain_crf_op.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
similarity index 94%
rename from paddle/operators/listen_and_serv_op.cc
rename to paddle/fluid/operators/listen_and_serv_op.cc
index 099f6b2373..a72708d9ba 100644
--- a/paddle/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -19,14 +19,14 @@ limitations under the License. */
 
 #include <unistd.h>
 
-#include "paddle/framework/executor.h"
-#include "paddle/framework/framework.pb.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/proto_desc.h"
-#include "paddle/operators/detail/grpc_server.h"
-#include "paddle/operators/detail/sendrecvop_utils.h"
-#include "paddle/operators/detail/simple_block_queue.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/proto_desc.h"
+#include "paddle/fluid/operators/detail/grpc_server.h"
+#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/operators/detail/simple_block_queue.h"
 #include "paddle/string/printf.h"
 
 namespace paddle {
diff --git a/paddle/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc
similarity index 97%
rename from paddle/operators/load_combine_op.cc
rename to paddle/fluid/operators/load_combine_op.cc
index f4be793d7b..1948063d88 100644
--- a/paddle/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <fstream>
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
similarity index 96%
rename from paddle/operators/load_op.cc
rename to paddle/fluid/operators/load_op.cc
index f886b423ac..c9bf5d72b2 100644
--- a/paddle/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <fstream>
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/lod_array_length_op.cc b/paddle/fluid/operators/lod_array_length_op.cc
similarity index 96%
rename from paddle/operators/lod_array_length_op.cc
rename to paddle/fluid/operators/lod_array_length_op.cc
index d2c52745cf..f11f5a89f5 100644
--- a/paddle/operators/lod_array_length_op.cc
+++ b/paddle/fluid/operators/lod_array_length_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/lod_tensor_array.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc
similarity index 97%
rename from paddle/operators/lod_rank_table_op.cc
rename to paddle/fluid/operators/lod_rank_table_op.cc
index 692b9bf371..0b9426a9f8 100644
--- a/paddle/operators/lod_rank_table_op.cc
+++ b/paddle/fluid/operators/lod_rank_table_op.cc
@@ -11,8 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/framework/lod_rank_table.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/op_registry.h"
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc
similarity index 98%
rename from paddle/operators/lod_reset_op.cc
rename to paddle/fluid/operators/lod_reset_op.cc
index 3d7b15edcf..55ae71c181 100644
--- a/paddle/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/lod_reset_op.h"
+#include "paddle/fluid/operators/lod_reset_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/lod_reset_op.cu b/paddle/fluid/operators/lod_reset_op.cu
similarity index 95%
rename from paddle/operators/lod_reset_op.cu
rename to paddle/fluid/operators/lod_reset_op.cu
index 910866ea63..8bfc8bd3bf 100644
--- a/paddle/operators/lod_reset_op.cu
+++ b/paddle/fluid/operators/lod_reset_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/lod_reset_op.h"
+#include "paddle/fluid/operators/lod_reset_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h
similarity index 96%
rename from paddle/operators/lod_reset_op.h
rename to paddle/fluid/operators/lod_reset_op.h
index c1bbba7a83..a10efee0bd 100644
--- a/paddle/operators/lod_reset_op.h
+++ b/paddle/fluid/operators/lod_reset_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc
similarity index 96%
rename from paddle/operators/lod_tensor_to_array_op.cc
rename to paddle/fluid/operators/lod_tensor_to_array_op.cc
index 685a807a8a..edc32bcec1 100644
--- a/paddle/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
@@ -11,11 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/framework/lod_rank_table.h"
-#include "paddle/framework/lod_tensor_array.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/detail/safe_ref.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc
similarity index 98%
rename from paddle/operators/log_loss_op.cc
rename to paddle/fluid/operators/log_loss_op.cc
index f714945354..6c5cd29568 100644
--- a/paddle/operators/log_loss_op.cc
+++ b/paddle/fluid/operators/log_loss_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/log_loss_op.h"
+#include "paddle/fluid/operators/log_loss_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/log_loss_op.cu b/paddle/fluid/operators/log_loss_op.cu
similarity index 94%
rename from paddle/operators/log_loss_op.cu
rename to paddle/fluid/operators/log_loss_op.cu
index be283e4700..c164a6d040 100644
--- a/paddle/operators/log_loss_op.cu
+++ b/paddle/fluid/operators/log_loss_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/log_loss_op.h"
+#include "paddle/fluid/operators/log_loss_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/log_loss_op.h b/paddle/fluid/operators/log_loss_op.h
similarity index 96%
rename from paddle/operators/log_loss_op.h
rename to paddle/fluid/operators/log_loss_op.h
index 743eddb740..67fac7cfe5 100644
--- a/paddle/operators/log_loss_op.h
+++ b/paddle/fluid/operators/log_loss_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/logical_op.cc b/paddle/fluid/operators/logical_op.cc
similarity index 98%
rename from paddle/operators/logical_op.cc
rename to paddle/fluid/operators/logical_op.cc
index fedd325cf4..ff49895df1 100644
--- a/paddle/operators/logical_op.cc
+++ b/paddle/fluid/operators/logical_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/logical_op.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/operators/logical_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/logical_op.cu b/paddle/fluid/operators/logical_op.cu
similarity index 95%
rename from paddle/operators/logical_op.cu
rename to paddle/fluid/operators/logical_op.cu
index 87f2287b8f..2b17444061 100644
--- a/paddle/operators/logical_op.cu
+++ b/paddle/fluid/operators/logical_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/logical_op.h"
+#include "paddle/fluid/operators/logical_op.h"
 
 REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CUDA,
                                paddle::operators::LogicalAndFunctor);
diff --git a/paddle/operators/logical_op.h b/paddle/fluid/operators/logical_op.h
similarity index 97%
rename from paddle/operators/logical_op.h
rename to paddle/fluid/operators/logical_op.h
index 4138576856..f6d5866c2c 100644
--- a/paddle/operators/logical_op.h
+++ b/paddle/fluid/operators/logical_op.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 #include <math.h>
 #include <type_traits>
-#include "paddle/framework/op_registry.h"
-#include "paddle/platform/transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
similarity index 98%
rename from paddle/operators/lookup_table_op.cc
rename to paddle/fluid/operators/lookup_table_op.cc
index 2405852f53..2c555f1a3f 100644
--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/lookup_table_op.h"
-#include "paddle/framework/var_type_inference.h"
+#include "paddle/fluid/operators/lookup_table_op.h"
+#include "paddle/fluid/framework/var_type_inference.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu
similarity index 93%
rename from paddle/operators/lookup_table_op.cu
rename to paddle/fluid/operators/lookup_table_op.cu
index 07372808bb..801adba5a4 100644
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/lookup_table_op.h"
-#include "paddle/platform/assert.h"
-#include "paddle/platform/cuda_helper.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/lookup_table_op.h"
+#include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 
 namespace paddle {
 namespace operators {
@@ -125,7 +125,9 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
       new_rows.resize(ids_dim[0]);
       auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
 
-      memory::Copy(platform::CPUPlace(), new_rows.cuda_data(), gpu_place,
+      // TODO(yuyang18): Strange code here.
+      memory::Copy(platform::CPUPlace(),
+                   new_rows.CUDAMutableData(context.GetPlace()), gpu_place,
                    ids_data, ids_dim[0] * sizeof(int64_t), stream);
 
       d_table->set_rows(new_rows);
diff --git a/paddle/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
similarity index 95%
rename from paddle/operators/lookup_table_op.h
rename to paddle/fluid/operators/lookup_table_op.h
index 0842c422f7..d264496882 100644
--- a/paddle/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/selected_rows.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/selected_rows.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc
similarity index 99%
rename from paddle/operators/lrn_op.cc
rename to paddle/fluid/operators/lrn_op.cc
index 95673ba19e..c84507f231 100644
--- a/paddle/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/lrn_op.h"
+#include "paddle/fluid/operators/lrn_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/lrn_op.cu b/paddle/fluid/operators/lrn_op.cu
similarity index 99%
rename from paddle/operators/lrn_op.cu
rename to paddle/fluid/operators/lrn_op.cu
index eb9d66a73d..03112bf3e0 100644
--- a/paddle/operators/lrn_op.cu
+++ b/paddle/fluid/operators/lrn_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/lrn_op.h"
+#include "paddle/fluid/operators/lrn_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h
similarity index 96%
rename from paddle/operators/lrn_op.h
rename to paddle/fluid/operators/lrn_op.h
index ef3a2883a8..b7b78b4591 100644
--- a/paddle/operators/lrn_op.h
+++ b/paddle/fluid/operators/lrn_op.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc
similarity index 99%
rename from paddle/operators/lstm_op.cc
rename to paddle/fluid/operators/lstm_op.cc
index afb095a04e..d1f1b5f235 100644
--- a/paddle/operators/lstm_op.cc
+++ b/paddle/fluid/operators/lstm_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/lstm_op.h"
+#include "paddle/fluid/operators/lstm_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/lstm_op.cu.cc b/paddle/fluid/operators/lstm_op.cu.cc
similarity index 95%
rename from paddle/operators/lstm_op.cu.cc
rename to paddle/fluid/operators/lstm_op.cu.cc
index cfcc1fc92a..679d02b1f9 100644
--- a/paddle/operators/lstm_op.cu.cc
+++ b/paddle/fluid/operators/lstm_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/lstm_op.h"
+#include "paddle/fluid/operators/lstm_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h
similarity index 98%
rename from paddle/operators/lstm_op.h
rename to paddle/fluid/operators/lstm_op.h
index 72e95b75e2..1c48495533 100644
--- a/paddle/operators/lstm_op.h
+++ b/paddle/fluid/operators/lstm_op.h
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/detail/activation_functions.h"
-#include "paddle/operators/math/lstm_compute.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/sequence2batch.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/detail/activation_functions.h"
+#include "paddle/fluid/operators/math/lstm_compute.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/sequence2batch.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/lstm_unit_op.cc b/paddle/fluid/operators/lstm_unit_op.cc
similarity index 98%
rename from paddle/operators/lstm_unit_op.cc
rename to paddle/fluid/operators/lstm_unit_op.cc
index c2d2c43982..3d33d47e0c 100644
--- a/paddle/operators/lstm_unit_op.cc
+++ b/paddle/fluid/operators/lstm_unit_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/lstm_unit_op.h"
+#include "paddle/fluid/operators/lstm_unit_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu
similarity index 97%
rename from paddle/operators/lstm_unit_op.cu
rename to paddle/fluid/operators/lstm_unit_op.cu
index 5ee5ddd280..12ebffca37 100644
--- a/paddle/operators/lstm_unit_op.cu
+++ b/paddle/fluid/operators/lstm_unit_op.cu
@@ -16,10 +16,10 @@ limitations under the License. */
 https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.cu
 */
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/cross_entropy_op.h"
-#include "paddle/platform/assert.h"
-#include "paddle/platform/hostdevice.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/cross_entropy_op.h"
+#include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/lstm_unit_op.h b/paddle/fluid/operators/lstm_unit_op.h
similarity index 99%
rename from paddle/operators/lstm_unit_op.h
rename to paddle/fluid/operators/lstm_unit_op.h
index fa8d141bcb..9f2370fe69 100644
--- a/paddle/operators/lstm_unit_op.h
+++ b/paddle/fluid/operators/lstm_unit_op.h
@@ -18,7 +18,7 @@ https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op.h
 
 #pragma once
 #include "glog/logging.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc
similarity index 99%
rename from paddle/operators/lstmp_op.cc
rename to paddle/fluid/operators/lstmp_op.cc
index c96b30ba35..2d30edf5c3 100644
--- a/paddle/operators/lstmp_op.cc
+++ b/paddle/fluid/operators/lstmp_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/lstmp_op.h"
+#include "paddle/fluid/operators/lstmp_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/lstmp_op.cu b/paddle/fluid/operators/lstmp_op.cu
similarity index 95%
rename from paddle/operators/lstmp_op.cu
rename to paddle/fluid/operators/lstmp_op.cu
index 7fcbcfecc8..bcefb94c75 100644
--- a/paddle/operators/lstmp_op.cu
+++ b/paddle/fluid/operators/lstmp_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/lstmp_op.h"
+#include "paddle/fluid/operators/lstmp_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h
similarity index 98%
rename from paddle/operators/lstmp_op.h
rename to paddle/fluid/operators/lstmp_op.h
index e064a155df..22ef472186 100644
--- a/paddle/operators/lstmp_op.h
+++ b/paddle/fluid/operators/lstmp_op.h
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/operators/activation_op.h"
-#include "paddle/operators/math/detail/activation_functions.h"
-#include "paddle/operators/math/lstm_compute.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/sequence2batch.h"
-
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/operators/math/detail/activation_functions.h"
+#include "paddle/fluid/operators/math/lstm_compute.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/sequence2batch.h"
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc
similarity index 98%
rename from paddle/operators/margin_rank_loss_op.cc
rename to paddle/fluid/operators/margin_rank_loss_op.cc
index e0df307774..fc31befb20 100644
--- a/paddle/operators/margin_rank_loss_op.cc
+++ b/paddle/fluid/operators/margin_rank_loss_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/margin_rank_loss_op.h"
+#include "paddle/fluid/operators/margin_rank_loss_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/margin_rank_loss_op.cu b/paddle/fluid/operators/margin_rank_loss_op.cu
similarity index 94%
rename from paddle/operators/margin_rank_loss_op.cu
rename to paddle/fluid/operators/margin_rank_loss_op.cu
index 798c3ed182..ca4593a48d 100644
--- a/paddle/operators/margin_rank_loss_op.cu
+++ b/paddle/fluid/operators/margin_rank_loss_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/margin_rank_loss_op.h"
+#include "paddle/fluid/operators/margin_rank_loss_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/margin_rank_loss_op.h b/paddle/fluid/operators/margin_rank_loss_op.h
similarity index 97%
rename from paddle/operators/margin_rank_loss_op.h
rename to paddle/fluid/operators/margin_rank_loss_op.h
index 7438e881e1..934a5da0f8 100644
--- a/paddle/operators/margin_rank_loss_op.h
+++ b/paddle/fluid/operators/margin_rank_loss_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
similarity index 100%
rename from paddle/operators/math/CMakeLists.txt
rename to paddle/fluid/operators/math/CMakeLists.txt
diff --git a/paddle/operators/math/context_project.cc b/paddle/fluid/operators/math/context_project.cc
similarity index 93%
rename from paddle/operators/math/context_project.cc
rename to paddle/fluid/operators/math/context_project.cc
index 980dd90df8..b73d976d1b 100644
--- a/paddle/operators/math/context_project.cc
+++ b/paddle/fluid/operators/math/context_project.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/context_project.h"
+#include "paddle/fluid/operators/math/context_project.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/context_project.cu b/paddle/fluid/operators/math/context_project.cu
similarity index 94%
rename from paddle/operators/math/context_project.cu
rename to paddle/fluid/operators/math/context_project.cu
index 934e3df645..bbd36a6e8f 100644
--- a/paddle/operators/math/context_project.cu
+++ b/paddle/fluid/operators/math/context_project.cu
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 
-#include "paddle/operators/math/context_project.h"
+#include "paddle/fluid/operators/math/context_project.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h
similarity index 98%
rename from paddle/operators/math/context_project.h
rename to paddle/fluid/operators/math/context_project.h
index 218de9fb95..2fe593ec3a 100644
--- a/paddle/operators/math/context_project.h
+++ b/paddle/fluid/operators/math/context_project.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/operators/math/im2col.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/operators/math/im2col.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/cos_sim_functor.cc b/paddle/fluid/operators/math/cos_sim_functor.cc
similarity index 96%
rename from paddle/operators/math/cos_sim_functor.cc
rename to paddle/fluid/operators/math/cos_sim_functor.cc
index 6af9f0fcd9..701a9c23c0 100644
--- a/paddle/operators/math/cos_sim_functor.cc
+++ b/paddle/fluid/operators/math/cos_sim_functor.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/cos_sim_functor.h"
+#include "paddle/fluid/operators/math/cos_sim_functor.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/cos_sim_functor.cu b/paddle/fluid/operators/math/cos_sim_functor.cu
similarity index 95%
rename from paddle/operators/math/cos_sim_functor.cu
rename to paddle/fluid/operators/math/cos_sim_functor.cu
index 6eb0a4ea4c..0323680870 100644
--- a/paddle/operators/math/cos_sim_functor.cu
+++ b/paddle/fluid/operators/math/cos_sim_functor.cu
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/cos_sim_functor.h"
-#include "paddle/platform/cuda_helper.h"
+#include "paddle/fluid/operators/math/cos_sim_functor.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/cos_sim_functor.h b/paddle/fluid/operators/math/cos_sim_functor.h
similarity index 97%
rename from paddle/operators/math/cos_sim_functor.h
rename to paddle/fluid/operators/math/cos_sim_functor.h
index aae8ab5b7a..445d94f975 100644
--- a/paddle/operators/math/cos_sim_functor.h
+++ b/paddle/fluid/operators/math/cos_sim_functor.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 #include <math.h>
 #include <stdlib.h>
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/hostdevice.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc
similarity index 97%
rename from paddle/operators/math/cross_entropy.cc
rename to paddle/fluid/operators/math/cross_entropy.cc
index d9cb016fb4..76abd03ff8 100644
--- a/paddle/operators/math/cross_entropy.cc
+++ b/paddle/fluid/operators/math/cross_entropy.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/cross_entropy.h"
+#include "paddle/fluid/operators/math/cross_entropy.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu
similarity index 98%
rename from paddle/operators/math/cross_entropy.cu
rename to paddle/fluid/operators/math/cross_entropy.cu
index 16c9e7b28e..39222c484c 100644
--- a/paddle/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/cross_entropy.h"
+#include "paddle/fluid/operators/math/cross_entropy.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h
similarity index 90%
rename from paddle/operators/math/cross_entropy.h
rename to paddle/fluid/operators/math/cross_entropy.h
index b3b6d767a8..2fe216a805 100644
--- a/paddle/operators/math/cross_entropy.h
+++ b/paddle/fluid/operators/math/cross_entropy.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/platform/hostdevice.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu
similarity index 99%
rename from paddle/operators/math/depthwise_conv.cu
rename to paddle/fluid/operators/math/depthwise_conv.cu
index b212e78208..7b75e59307 100644
--- a/paddle/operators/math/depthwise_conv.cu
+++ b/paddle/fluid/operators/math/depthwise_conv.cu
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/depthwise_conv.h"
-#include "paddle/platform/cuda_helper.h"
+#include "paddle/fluid/operators/math/depthwise_conv.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h
similarity index 93%
rename from paddle/operators/math/depthwise_conv.h
rename to paddle/fluid/operators/math/depthwise_conv.h
index 4708920bb4..c3081e7a0d 100644
--- a/paddle/operators/math/depthwise_conv.h
+++ b/paddle/fluid/operators/math/depthwise_conv.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/tensor.h"
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/hostdevice.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/detail/CMakeLists.txt b/paddle/fluid/operators/math/detail/CMakeLists.txt
similarity index 100%
rename from paddle/operators/math/detail/CMakeLists.txt
rename to paddle/fluid/operators/math/detail/CMakeLists.txt
diff --git a/paddle/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h
similarity index 98%
rename from paddle/operators/math/detail/activation_functions.h
rename to paddle/fluid/operators/math/detail/activation_functions.h
index 585a012343..3af7ba790c 100644
--- a/paddle/operators/math/detail/activation_functions.h
+++ b/paddle/fluid/operators/math/detail/activation_functions.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 #include <math.h>
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/hostdevice.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/hostdevice.h"
 
 #ifdef __AVX__
 #include <immintrin.h>
diff --git a/paddle/operators/math/detail/avx_functions.cc b/paddle/fluid/operators/math/detail/avx_functions.cc
similarity index 97%
rename from paddle/operators/math/detail/avx_functions.cc
rename to paddle/fluid/operators/math/detail/avx_functions.cc
index 921364788c..838cd30e3d 100644
--- a/paddle/operators/math/detail/avx_functions.cc
+++ b/paddle/fluid/operators/math/detail/avx_functions.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef __AVX__
 
 #include <immintrin.h>
-#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/fluid/operators/math/detail/activation_functions.h"
 // TODO(qingqing) refine this dependence
 #include "paddle/cuda/src/avx_mathfun.h"
 
diff --git a/paddle/operators/math/detail/gru_cpu_kernel.h b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
similarity index 99%
rename from paddle/operators/math/detail/gru_cpu_kernel.h
rename to paddle/fluid/operators/math/detail/gru_cpu_kernel.h
index a61b232f42..75c5c8eb29 100644
--- a/paddle/operators/math/detail/gru_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 #include <type_traits>
-#include "paddle/operators/math/detail/activation_functions.h"
-#include "paddle/operators/math/gru_compute.h"
+#include "paddle/fluid/operators/math/detail/activation_functions.h"
+#include "paddle/fluid/operators/math/gru_compute.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/detail/gru_gpu_kernel.h b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h
similarity index 97%
rename from paddle/operators/math/detail/gru_gpu_kernel.h
rename to paddle/fluid/operators/math/detail/gru_gpu_kernel.h
index 1783d46096..fbf69d4a85 100644
--- a/paddle/operators/math/detail/gru_gpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 #include <type_traits>
-#include "paddle/operators/math/detail/activation_functions.h"
-#include "paddle/operators/math/gru_compute.h"
-#include "paddle/platform/cuda_helper.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/operators/math/detail/activation_functions.h"
+#include "paddle/fluid/operators/math/gru_compute.h"
+#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/detail/gru_kernel.h b/paddle/fluid/operators/math/detail/gru_kernel.h
similarity index 98%
rename from paddle/operators/math/detail/gru_kernel.h
rename to paddle/fluid/operators/math/detail/gru_kernel.h
index 4d8245cb5d..705787e2ff 100644
--- a/paddle/operators/math/detail/gru_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_kernel.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/detail/activation_functions.h"
-#include "paddle/platform/hostdevice.h"
+#include "paddle/fluid/operators/math/detail/activation_functions.h"
+#include "paddle/fluid/platform/hostdevice.h"
 
 #include <type_traits>
 
diff --git a/paddle/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
similarity index 98%
rename from paddle/operators/math/detail/lstm_cpu_kernel.h
rename to paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
index 42888fcdb0..bf26509ba1 100644
--- a/paddle/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 #include <type_traits>
-#include "paddle/operators/math/detail/activation_functions.h"
-#include "paddle/operators/math/lstm_compute.h"
+#include "paddle/fluid/operators/math/detail/activation_functions.h"
+#include "paddle/fluid/operators/math/lstm_compute.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/detail/lstm_gpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
similarity index 97%
rename from paddle/operators/math/detail/lstm_gpu_kernel.h
rename to paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
index e31e657e8b..7865d0c0ba 100644
--- a/paddle/operators/math/detail/lstm_gpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/operators/math/detail/activation_functions.h"
-#include "paddle/operators/math/lstm_compute.h"
-#include "paddle/platform/cuda_helper.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/operators/math/detail/activation_functions.h"
+#include "paddle/fluid/operators/math/lstm_compute.h"
+#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/device_context.h"
 
 #include <type_traits>
 
diff --git a/paddle/operators/math/detail/lstm_kernel.h b/paddle/fluid/operators/math/detail/lstm_kernel.h
similarity index 98%
rename from paddle/operators/math/detail/lstm_kernel.h
rename to paddle/fluid/operators/math/detail/lstm_kernel.h
index fed8f9c4ca..0679cc62ba 100644
--- a/paddle/operators/math/detail/lstm_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_kernel.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/detail/activation_functions.h"
-#include "paddle/platform/hostdevice.h"
+#include "paddle/fluid/operators/math/detail/activation_functions.h"
+#include "paddle/fluid/platform/hostdevice.h"
 
 #include <type_traits>
 
diff --git a/paddle/operators/math/detection_util.h b/paddle/fluid/operators/math/detection_util.h
similarity index 99%
rename from paddle/operators/math/detection_util.h
rename to paddle/fluid/operators/math/detection_util.h
index e3a3ef2bad..13e5d406c1 100644
--- a/paddle/operators/math/detection_util.h
+++ b/paddle/fluid/operators/math/detection_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include <map>
-#include "paddle/framework/selected_rows.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc
similarity index 94%
rename from paddle/operators/math/gru_compute.cc
rename to paddle/fluid/operators/math/gru_compute.cc
index 101ab85962..1003180416 100644
--- a/paddle/operators/math/gru_compute.cc
+++ b/paddle/fluid/operators/math/gru_compute.cc
@@ -9,10 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/gru_compute.h"
-#include "paddle/operators/math/detail/gru_cpu_kernel.h"
-#include "paddle/operators/math/detail/gru_kernel.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/gru_compute.h"
+#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
+#include "paddle/fluid/operators/math/detail/gru_kernel.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/gru_compute.cu b/paddle/fluid/operators/math/gru_compute.cu
similarity index 97%
rename from paddle/operators/math/gru_compute.cu
rename to paddle/fluid/operators/math/gru_compute.cu
index d5a0e630ea..0d5d5d7a74 100644
--- a/paddle/operators/math/gru_compute.cu
+++ b/paddle/fluid/operators/math/gru_compute.cu
@@ -9,10 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/detail/gru_gpu_kernel.h"
-#include "paddle/operators/math/detail/gru_kernel.h"
-#include "paddle/operators/math/gru_compute.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/detail/gru_gpu_kernel.h"
+#include "paddle/fluid/operators/math/detail/gru_kernel.h"
+#include "paddle/fluid/operators/math/gru_compute.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/gru_compute.h b/paddle/fluid/operators/math/gru_compute.h
similarity index 91%
rename from paddle/operators/math/gru_compute.h
rename to paddle/fluid/operators/math/gru_compute.h
index bf69147b50..93e19cf557 100644
--- a/paddle/operators/math/gru_compute.h
+++ b/paddle/fluid/operators/math/gru_compute.h
@@ -11,9 +11,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/operators/math/detail/activation_functions.h"
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/operators/math/detail/activation_functions.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc
similarity index 99%
rename from paddle/operators/math/im2col.cc
rename to paddle/fluid/operators/math/im2col.cc
index c2633b2e16..c298b00bb4 100644
--- a/paddle/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/im2col.h"
+#include "paddle/fluid/operators/math/im2col.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu
similarity index 99%
rename from paddle/operators/math/im2col.cu
rename to paddle/fluid/operators/math/im2col.cu
index a88e837b03..c26343aacf 100644
--- a/paddle/operators/math/im2col.cu
+++ b/paddle/fluid/operators/math/im2col.cu
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/im2col.h"
-#include "paddle/platform/cuda_helper.h"
+#include "paddle/fluid/operators/math/im2col.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/im2col.h b/paddle/fluid/operators/math/im2col.h
similarity index 96%
rename from paddle/operators/math/im2col.h
rename to paddle/fluid/operators/math/im2col.h
index 38f2c9fe0a..525c0f5dda 100644
--- a/paddle/operators/math/im2col.h
+++ b/paddle/fluid/operators/math/im2col.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/tensor.h"
-#include "paddle/framework/tensor_util.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc
similarity index 99%
rename from paddle/operators/math/im2col_test.cc
rename to paddle/fluid/operators/math/im2col_test.cc
index 1ba24325ff..59d6a84b89 100644
--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/fluid/operators/math/im2col_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/im2col.h"
+#include "paddle/fluid/operators/math/im2col.h"
 #include <gtest/gtest.h>
 
 template <typename DeviceContext, typename Place>
diff --git a/paddle/operators/math/lstm_compute.cc b/paddle/fluid/operators/math/lstm_compute.cc
similarity index 94%
rename from paddle/operators/math/lstm_compute.cc
rename to paddle/fluid/operators/math/lstm_compute.cc
index d453102ece..09eb89ec58 100644
--- a/paddle/operators/math/lstm_compute.cc
+++ b/paddle/fluid/operators/math/lstm_compute.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/lstm_compute.h"
-#include "paddle/operators/math/detail/lstm_cpu_kernel.h"
-#include "paddle/operators/math/detail/lstm_kernel.h"
+#include "paddle/fluid/operators/math/lstm_compute.h"
+#include "paddle/fluid/operators/math/detail/lstm_cpu_kernel.h"
+#include "paddle/fluid/operators/math/detail/lstm_kernel.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/lstm_compute.cu b/paddle/fluid/operators/math/lstm_compute.cu
similarity index 92%
rename from paddle/operators/math/lstm_compute.cu
rename to paddle/fluid/operators/math/lstm_compute.cu
index 82065d699f..adedee28bd 100644
--- a/paddle/operators/math/lstm_compute.cu
+++ b/paddle/fluid/operators/math/lstm_compute.cu
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/detail/lstm_gpu_kernel.h"
-#include "paddle/operators/math/detail/lstm_kernel.h"
-#include "paddle/operators/math/lstm_compute.h"
+#include "paddle/fluid/operators/math/detail/lstm_gpu_kernel.h"
+#include "paddle/fluid/operators/math/detail/lstm_kernel.h"
+#include "paddle/fluid/operators/math/lstm_compute.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/lstm_compute.h b/paddle/fluid/operators/math/lstm_compute.h
similarity index 92%
rename from paddle/operators/math/lstm_compute.h
rename to paddle/fluid/operators/math/lstm_compute.h
index e1ad6b64d2..8610e96cf1 100644
--- a/paddle/operators/math/lstm_compute.h
+++ b/paddle/fluid/operators/math/lstm_compute.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/operators/math/detail/activation_functions.h"
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/operators/math/detail/activation_functions.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
similarity index 96%
rename from paddle/operators/math/math_function.cc
rename to paddle/fluid/operators/math/math_function.cc
index dcf4b85e1a..2636dbddde 100644
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/math_function.h"
-#include "paddle/framework/data_type.h"
-#include "paddle/operators/math/math_function_impl.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/operators/math/math_function_impl.h"
 
 namespace paddle {
 namespace operators {
@@ -331,6 +331,12 @@ template struct RowwiseAdd<platform::CPUDeviceContext, double>;
 template struct ColwiseSum<platform::CPUDeviceContext, float>;
 template struct ColwiseSum<platform::CPUDeviceContext, double>;
 
+template struct RowwiseSum<platform::CPUDeviceContext, float>;
+template struct RowwiseSum<platform::CPUDeviceContext, double>;
+
+template struct RowwiseMean<platform::CPUDeviceContext, float>;
+template struct RowwiseMean<platform::CPUDeviceContext, double>;
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
similarity index 91%
rename from paddle/operators/math/math_function.cu
rename to paddle/fluid/operators/math/math_function.cu
index d47a7f818d..5764da71c8 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/framework/data_type.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/math_function_impl.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/math_function_impl.h"
 
 namespace paddle {
 namespace operators {
@@ -325,6 +325,31 @@ void ColwiseSum<platform::CUDADeviceContext, double>::operator()(
       vector->data<double>());
 }
 
+template struct RowwiseSum<platform::CUDADeviceContext, float>;
+// template struct RowwiseSum<platform::CUDADeviceContext, double>;
+// TODO(zcd): Following ColwiseSum format, need to confirm.
+// The RowwiseSum<platform::CUDADeviceContext, double> failed in debug mode,
+// and only failed for this case. So reimplemented it.
+template <>
+void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
+    const platform::CUDADeviceContext& context, const framework::Tensor& input,
+    framework::Tensor* vector) {
+  auto in_dims = input.dims();
+  auto size = input.numel() / in_dims[0];
+  PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]);
+  framework::Tensor one;
+  one.mutable_data<double>({size}, context.GetPlace());
+  SetConstant<platform::CUDADeviceContext, double> set;
+  set(context, &one, static_cast<double>(1.0));
+  gemv<platform::CUDADeviceContext, double>(
+      context, true, static_cast<int>(in_dims[1]), static_cast<int>(in_dims[0]),
+      1.0, one.data<double>(), input.data<double>(), 0.0,
+      vector->data<double>());
+}
+
+template struct RowwiseMean<platform::CUDADeviceContext, float>;
+template struct RowwiseMean<platform::CUDADeviceContext, double>;
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h
similarity index 88%
rename from paddle/operators/math/math_function.h
rename to paddle/fluid/operators/math/math_function.h
index 8cc03c2ba0..84916af1f8 100644
--- a/paddle/operators/math/math_function.h
+++ b/paddle/fluid/operators/math/math_function.h
@@ -47,11 +47,11 @@ int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
 
 #include <cmath>
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/framework/tensor_util.h"
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace operators {
@@ -128,6 +128,18 @@ struct ColwiseSum {
                   framework::Tensor* vec);
 };
 
+template <typename DeviceContext, typename T>
+struct RowwiseSum {
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  framework::Tensor* vec);
+};
+
+template <typename DeviceContext, typename T>
+struct RowwiseMean {
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  framework::Tensor* vec);
+};
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h
similarity index 50%
rename from paddle/operators/math/math_function_impl.h
rename to paddle/fluid/operators/math/math_function_impl.h
index de591626df..a55ed6c58b 100644
--- a/paddle/operators/math/math_function_impl.h
+++ b/paddle/fluid/operators/math/math_function_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/data_type.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -87,6 +87,88 @@ class ColwiseSum<platform::CPUDeviceContext, T> {
   }
 };
 
+template <typename DeviceContext, typename T>
+void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context,
+                                               const framework::Tensor& input,
+                                               framework::Tensor* out) {
+  auto in_dims = input.dims();
+  PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
+  PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
+
+  auto in = framework::EigenMatrix<T>::From(input);
+  auto vec = framework::EigenVector<T>::Flatten(*out);
+
+  vec.device(*context.eigen_device()) = in.mean(Eigen::array<int, 1>({{1}}));
+}
+// TODO(zcd): Following ColwiseSum format, need to confirm.
+// Specialize for CPU, since Eigen implement a general reduce. However,
+// rowwise-sum can be easily implemented. General reduce has a huge overhead in
+// CPU
+template <typename T>
+class RowwiseMean<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* out) {
+    auto& in_dims = input.dims();
+    PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
+    auto height = in_dims[0];
+    auto size = in_dims[1];
+    PADDLE_ENFORCE_EQ(out->numel(), height);
+    auto inv_size = 1.0 / size;
+    T* out_buf = out->mutable_data<T>(out->place());
+    const T* in_buf = input.data<T>();
+
+    for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
+      T sum = 0;
+      for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
+        sum += in_buf[i * size + j];
+      }
+      out_buf[i] = sum * inv_size;
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
+                                              const framework::Tensor& input,
+                                              framework::Tensor* out) {
+  auto in_dims = input.dims();
+  PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
+  PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
+
+  auto in = framework::EigenMatrix<T>::From(input);
+  auto vec = framework::EigenVector<T>::Flatten(*out);
+
+  vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{1}}));
+}
+// TODO(zcd): Following ColwiseSum format, need to confirm.
+// Specialize for CPU, since Eigen implement a general reduce. However,
+// rowwise-sum can be easily implemented. General reduce has a huge overhead in
+// CPU
+template <typename T>
+class RowwiseSum<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* out) {
+    auto& in_dims = input.dims();
+    PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
+    auto height = in_dims[0];
+    auto size = in_dims[1];
+    PADDLE_ENFORCE_EQ(out->numel(), size);
+
+    T* out_buf = out->mutable_data<T>(out->place());
+    const T* in_buf = input.data<T>();
+
+    for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
+      T sum = 0;
+      for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
+        sum += in_buf[i * size + j];
+      }
+      out_buf[i] = sum;
+    }
+  }
+};
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc
similarity index 99%
rename from paddle/operators/math/math_function_test.cc
rename to paddle/fluid/operators/math/math_function_test.cc
index c9f322b92e..6cd8e8b35a 100644
--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/fluid/operators/math/math_function_test.cc
@@ -11,7 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/math_function.h"
 #include "gtest/gtest.h"
 
 TEST(math_function, gemm_notrans_cblas) {
diff --git a/paddle/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu
similarity index 99%
rename from paddle/operators/math/math_function_test.cu
rename to paddle/fluid/operators/math/math_function_test.cu
index 6f16d66792..2ef53a8209 100644
--- a/paddle/operators/math/math_function_test.cu
+++ b/paddle/fluid/operators/math/math_function_test.cu
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "gtest/gtest.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 TEST(math_function, notrans_mul_trans) {
   paddle::framework::Tensor input1;
diff --git a/paddle/operators/math/matmul.h b/paddle/fluid/operators/math/matmul.h
similarity index 98%
rename from paddle/operators/math/matmul.h
rename to paddle/fluid/operators/math/matmul.h
index ae7f1fe9be..50f79979d9 100644
--- a/paddle/operators/math/matmul.h
+++ b/paddle/fluid/operators/math/matmul.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/maxouting.cc b/paddle/fluid/operators/math/maxouting.cc
similarity index 98%
rename from paddle/operators/math/maxouting.cc
rename to paddle/fluid/operators/math/maxouting.cc
index fea86675f7..746328cd45 100644
--- a/paddle/operators/math/maxouting.cc
+++ b/paddle/fluid/operators/math/maxouting.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/maxouting.h"
+#include "paddle/fluid/operators/math/maxouting.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu
similarity index 98%
rename from paddle/operators/math/maxouting.cu
rename to paddle/fluid/operators/math/maxouting.cu
index 6056ad251c..68e5dfc3c5 100644
--- a/paddle/operators/math/maxouting.cu
+++ b/paddle/fluid/operators/math/maxouting.cu
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/maxouting.h"
-#include "paddle/platform/cuda_helper.h"
+#include "paddle/fluid/operators/math/maxouting.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h
similarity index 90%
rename from paddle/operators/math/maxouting.h
rename to paddle/fluid/operators/math/maxouting.h
index 68f4743db0..0e81790f0a 100644
--- a/paddle/operators/math/maxouting.h
+++ b/paddle/fluid/operators/math/maxouting.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/tensor.h"
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/hostdevice.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc
similarity index 99%
rename from paddle/operators/math/pooling.cc
rename to paddle/fluid/operators/math/pooling.cc
index 150de6fd59..9adb142f14 100644
--- a/paddle/operators/math/pooling.cc
+++ b/paddle/fluid/operators/math/pooling.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/pooling.h"
+#include "paddle/fluid/operators/math/pooling.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu
similarity index 99%
rename from paddle/operators/math/pooling.cu
rename to paddle/fluid/operators/math/pooling.cu
index 0243cf8316..c65632de90 100644
--- a/paddle/operators/math/pooling.cu
+++ b/paddle/fluid/operators/math/pooling.cu
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/pooling.h"
-#include "paddle/platform/cuda_helper.h"
+#include "paddle/fluid/operators/math/pooling.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h
similarity index 97%
rename from paddle/operators/math/pooling.h
rename to paddle/fluid/operators/math/pooling.h
index 2759f06cb6..1195038f6a 100644
--- a/paddle/operators/math/pooling.h
+++ b/paddle/fluid/operators/math/pooling.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/hostdevice.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc
similarity index 100%
rename from paddle/operators/math/sampler.cc
rename to paddle/fluid/operators/math/sampler.cc
diff --git a/paddle/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h
similarity index 100%
rename from paddle/operators/math/sampler.h
rename to paddle/fluid/operators/math/sampler.h
diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
similarity index 98%
rename from paddle/operators/math/selected_rows_functor.cc
rename to paddle/fluid/operators/math/selected_rows_functor.cc
index 8a1ebb58c2..01aa37ab35 100644
--- a/paddle/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include <set>
 
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
 
 namespace paddle {
 namespace operators {
@@ -128,7 +128,7 @@ struct SelectedRowsAddTo<platform::CPUDeviceContext, T> {
     auto* in2_value = input2->mutable_value();
 
     // concat rows
-    in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end());
+    in2_rows.Extend(in1_rows.begin(), in1_rows.end());
 
     auto in1_place = input1.place();
     PADDLE_ENFORCE(platform::is_cpu_place(in1_place));
diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
similarity index 95%
rename from paddle/operators/math/selected_rows_functor.cu
rename to paddle/fluid/operators/math/selected_rows_functor.cu
index acdd87cb35..ee3b5d5205 100644
--- a/paddle/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #include <set>
 
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/selected_rows_functor.h"
-#include "paddle/platform/cuda_helper.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 
 namespace paddle {
 namespace operators {
@@ -126,7 +126,8 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
     dim3 grid(1, in1_rows.size());
     SelectedRowsAddTensorKernel<
         T, block_size><<<grid, threads, 0, context.stream()>>>(
-        in1_data, in1_rows.cuda_data(), out_data, in1_row_numel);
+        in1_data, in1_rows.CUDAData(context.GetPlace()), out_data,
+        in1_row_numel);
 
     auto out_eigen = framework::EigenVector<T>::Flatten(*output);
     auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
@@ -153,7 +154,9 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
     auto* in2_value = input2->mutable_value();
 
     // concat rows
-    in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end());
+    if (in1_rows.size()) {
+      in2_rows.Extend(in1_rows.begin(), in1_rows.end());
+    }
 
     auto in1_place = input1.place();
     PADDLE_ENFORCE(platform::is_gpu_place(in1_place));
@@ -216,7 +219,8 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
     dim3 grid(1, in1_rows.size());
     SelectedRowsAddToTensorKernel<
         T, block_size><<<grid, threads, 0, context.stream()>>>(
-        in1_data, in1_rows.cuda_data(), in2_data, in1_row_numel);
+        in1_data, in1_rows.CUDAData(context.GetPlace()), in2_data,
+        in1_row_numel);
   }
 };
 
@@ -283,9 +287,10 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
     MergeAddKernel<
         T, 256><<<grid1, threads, 0,
                   reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(input_data, input_rows.cuda_data(), out_data,
-                                   out.mutable_rows()->cuda_data(),
-                                   out.rows().size(), input_width);
+                      .stream()>>>(
+        input_data, input_rows.CUDAData(context.GetPlace()), out_data,
+        out.mutable_rows()->CUDAMutableData(context.GetPlace()),
+        out.rows().size(), input_width);
     return out;
   }
 };
diff --git a/paddle/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h
similarity index 97%
rename from paddle/operators/math/selected_rows_functor.h
rename to paddle/fluid/operators/math/selected_rows_functor.h
index 09d4631905..510a9ed8be 100644
--- a/paddle/operators/math/selected_rows_functor.h
+++ b/paddle/fluid/operators/math/selected_rows_functor.h
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/selected_rows.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/device_context.h"
 
 #define INLINE_FOR2(sizei, sizej)     \
   for (int64_t i = 0; i < sizei; i++) \
diff --git a/paddle/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc
similarity index 98%
rename from paddle/operators/math/selected_rows_functor_test.cc
rename to paddle/fluid/operators/math/selected_rows_functor_test.cc
index 8c74cab0a1..db6b41cd52 100644
--- a/paddle/operators/math/selected_rows_functor_test.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "gtest/gtest.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 TEST(selected_rows_functor, cpu_add) {
   using namespace paddle::framework;
diff --git a/paddle/operators/math/selected_rows_functor_test.cu b/paddle/fluid/operators/math/selected_rows_functor_test.cu
similarity index 98%
rename from paddle/operators/math/selected_rows_functor_test.cu
rename to paddle/fluid/operators/math/selected_rows_functor_test.cu
index 38808e1301..b3c4bc9244 100644
--- a/paddle/operators/math/selected_rows_functor_test.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "gtest/gtest.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
 
 TEST(selected_rows_functor, gpu_add) {
   using namespace paddle::framework;
diff --git a/paddle/operators/math/sequence2batch.cc b/paddle/fluid/operators/math/sequence2batch.cc
similarity index 95%
rename from paddle/operators/math/sequence2batch.cc
rename to paddle/fluid/operators/math/sequence2batch.cc
index 17abce1c2f..0485070fd9 100644
--- a/paddle/operators/math/sequence2batch.cc
+++ b/paddle/fluid/operators/math/sequence2batch.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/sequence2batch.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/sequence2batch.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/sequence2batch.cu b/paddle/fluid/operators/math/sequence2batch.cu
similarity index 94%
rename from paddle/operators/math/sequence2batch.cu
rename to paddle/fluid/operators/math/sequence2batch.cu
index f27631271a..450be80ea2 100644
--- a/paddle/operators/math/sequence2batch.cu
+++ b/paddle/fluid/operators/math/sequence2batch.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/math/sequence2batch.h"
+#include "paddle/fluid/operators/math/sequence2batch.h"
 
 namespace paddle {
 namespace operators {
@@ -45,7 +45,6 @@ class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
                   const framework::Tensor& src,
                   framework::Vector<size_t> index_lod, framework::Tensor& dst,
                   bool is_src_index) {
-    size_t* index = index_lod.cuda_data();
     auto src_dims = src.dims();
     auto dst_dims = dst.dims();
     PADDLE_ENFORCE_EQ(src_dims.size(), 2,
@@ -63,7 +62,8 @@ class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
     dim3 grid(8, 1);
     auto stream = context.stream();
     CopyMatrixRowsKernel<T, 128, 8, 8><<<grid, threads, 0, stream>>>(
-        src_data, dst_data, index, height, width, is_src_index);
+        src_data, dst_data, index_lod.CUDAData(context.GetPlace()), height,
+        width, is_src_index);
   }
 };
 
diff --git a/paddle/operators/math/sequence2batch.h b/paddle/fluid/operators/math/sequence2batch.h
similarity index 97%
rename from paddle/operators/math/sequence2batch.h
rename to paddle/fluid/operators/math/sequence2batch.h
index 6db0427b41..00bd25ab61 100644
--- a/paddle/operators/math/sequence2batch.h
+++ b/paddle/fluid/operators/math/sequence2batch.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc
similarity index 99%
rename from paddle/operators/math/sequence_padding.cc
rename to paddle/fluid/operators/math/sequence_padding.cc
index 2e69aa47eb..ad8cd82567 100644
--- a/paddle/operators/math/sequence_padding.cc
+++ b/paddle/fluid/operators/math/sequence_padding.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/sequence_padding.h"
+#include "paddle/fluid/operators/math/sequence_padding.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu
similarity index 95%
rename from paddle/operators/math/sequence_padding.cu
rename to paddle/fluid/operators/math/sequence_padding.cu
index 65c9cfe4a0..c1a3905778 100644
--- a/paddle/operators/math/sequence_padding.cu
+++ b/paddle/fluid/operators/math/sequence_padding.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/sequence_padding.h"
+#include "paddle/fluid/operators/math/sequence_padding.h"
 
 namespace paddle {
 namespace operators {
@@ -121,12 +121,12 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     if (norm_by_times) {
       SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>(
           padding_data, const_cast<T*>(seq_data),
-          abs_offset_lod[level].cuda_data(), sequence_width,
+          abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
           max_sequence_length, num_sequences);
     } else {
       SequencePaddingKernel<T, 0, 1><<<grid, threads, 0, context.stream()>>>(
           padding_data, const_cast<T*>(seq_data),
-          abs_offset_lod[level].cuda_data(), sequence_width,
+          abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
           max_sequence_length, num_sequences);
     }
   }
@@ -196,12 +196,12 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     if (norm_by_times) {
       SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>(
           const_cast<T*>(padding_data), seq_data,
-          abs_offset_lod[level].cuda_data(), sequence_width,
+          abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
           max_sequence_length, num_sequences);
     } else {
       SequencePaddingKernel<T, 0, 0><<<grid, threads, 0, context.stream()>>>(
           const_cast<T*>(padding_data), seq_data,
-          abs_offset_lod[level].cuda_data(), sequence_width,
+          abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
           max_sequence_length, num_sequences);
     }
   }
diff --git a/paddle/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h
similarity index 96%
rename from paddle/operators/math/sequence_padding.h
rename to paddle/fluid/operators/math/sequence_padding.h
index 8f586c5eb4..0d84f9dcb3 100644
--- a/paddle/operators/math/sequence_padding.h
+++ b/paddle/fluid/operators/math/sequence_padding.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc
similarity index 98%
rename from paddle/operators/math/sequence_padding_test.cc
rename to paddle/fluid/operators/math/sequence_padding_test.cc
index 3e504f4a15..147cb37da2 100644
--- a/paddle/operators/math/sequence_padding_test.cc
+++ b/paddle/fluid/operators/math/sequence_padding_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/sequence_padding.h"
+#include "paddle/fluid/operators/math/sequence_padding.h"
 #include <gtest/gtest.h>
 
 template <typename DeviceContext, typename Place, typename T>
diff --git a/paddle/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc
similarity index 96%
rename from paddle/operators/math/sequence_pooling.cc
rename to paddle/fluid/operators/math/sequence_pooling.cc
index 8fb92b1a13..b3b87ec93e 100644
--- a/paddle/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/sequence_pooling.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/sequence_pooling.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu
similarity index 95%
rename from paddle/operators/math/sequence_pooling.cu
rename to paddle/fluid/operators/math/sequence_pooling.cu
index f66534a681..c4267e992a 100644
--- a/paddle/operators/math/sequence_pooling.cu
+++ b/paddle/fluid/operators/math/sequence_pooling.cu
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/sequence_pooling.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/sequence_pooling.h"
 
 namespace paddle {
 namespace operators {
@@ -73,7 +73,8 @@ class MaxSeqPoolFunctor<platform::CUDADeviceContext, T> {
     dim3 grid(num_seq, 1);
     auto stream = context.stream();
     KeMaxSequencePool<T><<<grid, threads, 0, stream>>>(
-        in_data, starts.cuda_data(), out_data, max_index, num_seq, dim);
+        in_data, starts.CUDAData(context.GetPlace()), out_data, max_index,
+        num_seq, dim);
   }
 };
 
diff --git a/paddle/operators/math/sequence_pooling.h b/paddle/fluid/operators/math/sequence_pooling.h
similarity index 90%
rename from paddle/operators/math/sequence_pooling.h
rename to paddle/fluid/operators/math/sequence_pooling.h
index 13ffb2ebef..9ba9cad74b 100644
--- a/paddle/operators/math/sequence_pooling.h
+++ b/paddle/fluid/operators/math/sequence_pooling.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc
similarity index 96%
rename from paddle/operators/math/sequence_scale.cc
rename to paddle/fluid/operators/math/sequence_scale.cc
index 7e439e9a2c..427689b971 100644
--- a/paddle/operators/math/sequence_scale.cc
+++ b/paddle/fluid/operators/math/sequence_scale.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/sequence_scale.h"
+#include "paddle/fluid/operators/math/sequence_scale.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu
similarity index 89%
rename from paddle/operators/math/sequence_scale.cu
rename to paddle/fluid/operators/math/sequence_scale.cu
index fd4e28f611..7c081ed7f4 100644
--- a/paddle/operators/math/sequence_scale.cu
+++ b/paddle/fluid/operators/math/sequence_scale.cu
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/sequence_scale.h"
-#include "paddle/platform/cuda_helper.h"
+#include "paddle/fluid/operators/math/sequence_scale.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 
 namespace paddle {
 namespace operators {
@@ -46,7 +46,8 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
 
     SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<<
         num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
-        seq_data, abs_offset_lod[level].cuda_data(), scales, seq_width);
+        seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()),
+        scales, seq_width);
   }
 };
 
diff --git a/paddle/operators/math/sequence_scale.h b/paddle/fluid/operators/math/sequence_scale.h
similarity index 94%
rename from paddle/operators/math/sequence_scale.h
rename to paddle/fluid/operators/math/sequence_scale.h
index ecd9a57c3f..e8e07fd315 100644
--- a/paddle/operators/math/sequence_scale.h
+++ b/paddle/fluid/operators/math/sequence_scale.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/softmax.cc b/paddle/fluid/operators/math/softmax.cc
similarity index 90%
rename from paddle/operators/math/softmax.cc
rename to paddle/fluid/operators/math/softmax.cc
index 72f10f35f4..eab31ec567 100644
--- a/paddle/operators/math/softmax.cc
+++ b/paddle/fluid/operators/math/softmax.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/softmax.h"
-#include "paddle/operators/math/softmax_impl.h"
+#include "paddle/fluid/operators/math/softmax.h"
+#include "paddle/fluid/operators/math/softmax_impl.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu
similarity index 90%
rename from paddle/operators/math/softmax.cu
rename to paddle/fluid/operators/math/softmax.cu
index 9e73f6a371..733d7eeee6 100644
--- a/paddle/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 
-#include "paddle/operators/math/softmax.h"
-#include "paddle/operators/math/softmax_impl.h"
+#include "paddle/fluid/operators/math/softmax.h"
+#include "paddle/fluid/operators/math/softmax_impl.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/softmax.h b/paddle/fluid/operators/math/softmax.h
similarity index 96%
rename from paddle/operators/math/softmax.h
rename to paddle/fluid/operators/math/softmax.h
index 471f44d340..b7d67d5f12 100644
--- a/paddle/operators/math/softmax.h
+++ b/paddle/fluid/operators/math/softmax.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/tensor.h"
+#include "paddle/fluid/framework/tensor.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h
similarity index 97%
rename from paddle/operators/math/softmax_impl.h
rename to paddle/fluid/operators/math/softmax_impl.h
index 82f597ff79..f7c61cb647 100644
--- a/paddle/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/tensor.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/tensor.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/unpooling.cc b/paddle/fluid/operators/math/unpooling.cc
similarity index 98%
rename from paddle/operators/math/unpooling.cc
rename to paddle/fluid/operators/math/unpooling.cc
index ecd3a647e0..e02bc02e00 100644
--- a/paddle/operators/math/unpooling.cc
+++ b/paddle/fluid/operators/math/unpooling.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/unpooling.h"
+#include "paddle/fluid/operators/math/unpooling.h"
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu
similarity index 98%
rename from paddle/operators/math/unpooling.cu
rename to paddle/fluid/operators/math/unpooling.cu
index ecbde0f6a7..2e74270fdf 100644
--- a/paddle/operators/math/unpooling.cu
+++ b/paddle/fluid/operators/math/unpooling.cu
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/unpooling.h"
-#include "paddle/platform/cuda_helper.h"
+#include "paddle/fluid/operators/math/unpooling.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/unpooling.h b/paddle/fluid/operators/math/unpooling.h
similarity index 96%
rename from paddle/operators/math/unpooling.h
rename to paddle/fluid/operators/math/unpooling.h
index 0f0ff1371e..f245ba7ba8 100644
--- a/paddle/operators/math/unpooling.h
+++ b/paddle/fluid/operators/math/unpooling.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/tensor.h"
+#include "paddle/fluid/framework/tensor.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc
similarity index 99%
rename from paddle/operators/math/vol2col.cc
rename to paddle/fluid/operators/math/vol2col.cc
index d574ed9234..ded0bbc744 100644
--- a/paddle/operators/math/vol2col.cc
+++ b/paddle/fluid/operators/math/vol2col.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/vol2col.h"
+#include "paddle/fluid/operators/math/vol2col.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu
similarity index 99%
rename from paddle/operators/math/vol2col.cu
rename to paddle/fluid/operators/math/vol2col.cu
index b029442fe4..35ef24c7f5 100644
--- a/paddle/operators/math/vol2col.cu
+++ b/paddle/fluid/operators/math/vol2col.cu
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/vol2col.h"
-#include "paddle/platform/cuda_helper.h"
+#include "paddle/fluid/operators/math/vol2col.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/vol2col.h b/paddle/fluid/operators/math/vol2col.h
similarity index 95%
rename from paddle/operators/math/vol2col.h
rename to paddle/fluid/operators/math/vol2col.h
index dcd80370e8..3ce38b2d11 100644
--- a/paddle/operators/math/vol2col.h
+++ b/paddle/fluid/operators/math/vol2col.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/tensor.h"
-#include "paddle/framework/tensor_util.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc
similarity index 98%
rename from paddle/operators/math/vol2col_test.cc
rename to paddle/fluid/operators/math/vol2col_test.cc
index 7a308ca814..af0a900f80 100644
--- a/paddle/operators/math/vol2col_test.cc
+++ b/paddle/fluid/operators/math/vol2col_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/vol2col.h"
+#include "paddle/fluid/operators/math/vol2col.h"
 #include <gtest/gtest.h>
 #include <iostream>
 
diff --git a/paddle/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
similarity index 99%
rename from paddle/operators/matmul_op.cc
rename to paddle/fluid/operators/matmul_op.cc
index 3336978c8d..267b0057bf 100644
--- a/paddle/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/matmul_op.h"
+#include "paddle/fluid/operators/matmul_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/matmul_op.cu.cc b/paddle/fluid/operators/matmul_op.cu.cc
similarity index 94%
rename from paddle/operators/matmul_op.cu.cc
rename to paddle/fluid/operators/matmul_op.cu.cc
index d28d12164e..988787f0fe 100644
--- a/paddle/operators/matmul_op.cu.cc
+++ b/paddle/fluid/operators/matmul_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/matmul_op.h"
+#include "paddle/fluid/operators/matmul_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/matmul_op.h b/paddle/fluid/operators/matmul_op.h
similarity index 98%
rename from paddle/operators/matmul_op.h
rename to paddle/fluid/operators/matmul_op.h
index fe6a97465f..f4cae3c91c 100644
--- a/paddle/operators/matmul_op.h
+++ b/paddle/fluid/operators/matmul_op.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/matmul.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/matmul.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/max_sequence_len_op.cc b/paddle/fluid/operators/max_sequence_len_op.cc
similarity index 94%
rename from paddle/operators/max_sequence_len_op.cc
rename to paddle/fluid/operators/max_sequence_len_op.cc
index 019150e491..eff8b927e5 100644
--- a/paddle/operators/max_sequence_len_op.cc
+++ b/paddle/fluid/operators/max_sequence_len_op.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/lod_rank_table.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc
similarity index 98%
rename from paddle/operators/maxout_op.cc
rename to paddle/fluid/operators/maxout_op.cc
index 3ee3226941..8ce12cd4c4 100644
--- a/paddle/operators/maxout_op.cc
+++ b/paddle/fluid/operators/maxout_op.cc
@@ -12,7 +12,7 @@
  *     See the License for the specific language governing permissions and
  *     limitations under the License. */
 
-#include "paddle/operators/maxout_op.h"
+#include "paddle/fluid/operators/maxout_op.h"
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/operators/maxout_op.cu.cc b/paddle/fluid/operators/maxout_op.cu.cc
similarity index 95%
rename from paddle/operators/maxout_op.cu.cc
rename to paddle/fluid/operators/maxout_op.cu.cc
index c4a2d676d3..f3f45c90cd 100644
--- a/paddle/operators/maxout_op.cu.cc
+++ b/paddle/fluid/operators/maxout_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/maxout_op.h"
+#include "paddle/fluid/operators/maxout_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h
similarity index 93%
rename from paddle/operators/maxout_op.h
rename to paddle/fluid/operators/maxout_op.h
index e8b12552b9..e5de3e3760 100644
--- a/paddle/operators/maxout_op.h
+++ b/paddle/fluid/operators/maxout_op.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/maxouting.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/maxouting.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
similarity index 98%
rename from paddle/operators/mean_op.cc
rename to paddle/fluid/operators/mean_op.cc
index 411f4d14ef..1043820345 100644
--- a/paddle/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/mean_op.h"
+#include "paddle/fluid/operators/mean_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu
similarity index 95%
rename from paddle/operators/mean_op.cu
rename to paddle/fluid/operators/mean_op.cu
index 212d448113..ccf2248760 100644
--- a/paddle/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 
-#include "paddle/operators/mean_op.h"
+#include "paddle/fluid/operators/mean_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
similarity index 96%
rename from paddle/operators/mean_op.h
rename to paddle/fluid/operators/mean_op.h
index 351b345959..ae162287da 100644
--- a/paddle/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc
similarity index 98%
rename from paddle/operators/merge_lod_tensor_op.cc
rename to paddle/fluid/operators/merge_lod_tensor_op.cc
index 87644d316d..255f553340 100644
--- a/paddle/operators/merge_lod_tensor_op.cc
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/memory/memcpy.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/memcpy.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/mine_hard_examples_op.cc b/paddle/fluid/operators/mine_hard_examples_op.cc
similarity index 99%
rename from paddle/operators/mine_hard_examples_op.cc
rename to paddle/fluid/operators/mine_hard_examples_op.cc
index 051cc24706..73a6c0b679 100644
--- a/paddle/operators/mine_hard_examples_op.cc
+++ b/paddle/fluid/operators/mine_hard_examples_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc
similarity index 97%
rename from paddle/operators/minus_op.cc
rename to paddle/fluid/operators/minus_op.cc
index 3d7742dd4b..8a35d668cc 100644
--- a/paddle/operators/minus_op.cc
+++ b/paddle/fluid/operators/minus_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/minus_op.h"
-#include "paddle/operators/net_op.h"
+#include "paddle/fluid/operators/minus_op.h"
+#include "paddle/fluid/operators/net_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/minus_op.cu b/paddle/fluid/operators/minus_op.cu
similarity index 94%
rename from paddle/operators/minus_op.cu
rename to paddle/fluid/operators/minus_op.cu
index 80cd9f7c16..ce0b1fdc04 100644
--- a/paddle/operators/minus_op.cu
+++ b/paddle/fluid/operators/minus_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/minus_op.h"
+#include "paddle/fluid/operators/minus_op.h"
 
 REGISTER_OP_CUDA_KERNEL(
     minus,
diff --git a/paddle/operators/minus_op.h b/paddle/fluid/operators/minus_op.h
similarity index 93%
rename from paddle/operators/minus_op.h
rename to paddle/fluid/operators/minus_op.h
index 20760b8cd5..dc94cbbeca 100644
--- a/paddle/operators/minus_op.h
+++ b/paddle/fluid/operators/minus_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc
similarity index 98%
rename from paddle/operators/modified_huber_loss_op.cc
rename to paddle/fluid/operators/modified_huber_loss_op.cc
index f5d69071a8..f2d1653165 100644
--- a/paddle/operators/modified_huber_loss_op.cc
+++ b/paddle/fluid/operators/modified_huber_loss_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/modified_huber_loss_op.h"
+#include "paddle/fluid/operators/modified_huber_loss_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu
similarity index 94%
rename from paddle/operators/modified_huber_loss_op.cu
rename to paddle/fluid/operators/modified_huber_loss_op.cu
index 3d2a5562e8..69ac2b1ed5 100644
--- a/paddle/operators/modified_huber_loss_op.cu
+++ b/paddle/fluid/operators/modified_huber_loss_op.cu
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <thrust/for_each.h>
 #include <thrust/tuple.h>
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/modified_huber_loss_op.h"
-#include "paddle/platform/hostdevice.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/modified_huber_loss_op.h"
+#include "paddle/fluid/platform/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/modified_huber_loss_op.h b/paddle/fluid/operators/modified_huber_loss_op.h
similarity index 96%
rename from paddle/operators/modified_huber_loss_op.h
rename to paddle/fluid/operators/modified_huber_loss_op.h
index 6ce86feee5..a470a45e13 100644
--- a/paddle/operators/modified_huber_loss_op.h
+++ b/paddle/fluid/operators/modified_huber_loss_op.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/platform/hostdevice.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc
similarity index 98%
rename from paddle/operators/momentum_op.cc
rename to paddle/fluid/operators/momentum_op.cc
index 15b8b80776..a3950ac99d 100644
--- a/paddle/operators/momentum_op.cc
+++ b/paddle/fluid/operators/momentum_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/momentum_op.h"
+#include "paddle/fluid/operators/momentum_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu
similarity index 98%
rename from paddle/operators/momentum_op.cu
rename to paddle/fluid/operators/momentum_op.cu
index 2b9314162e..28a14cd4b2 100644
--- a/paddle/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/momentum_op.h b/paddle/fluid/operators/momentum_op.h
similarity index 95%
rename from paddle/operators/momentum_op.h
rename to paddle/fluid/operators/momentum_op.h
index da69532ea5..fdab86b24e 100644
--- a/paddle/operators/momentum_op.h
+++ b/paddle/fluid/operators/momentum_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
similarity index 99%
rename from paddle/operators/mul_op.cc
rename to paddle/fluid/operators/mul_op.cc
index c923e988a5..c9375d8ea1 100644
--- a/paddle/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/mul_op.h"
+#include "paddle/fluid/operators/mul_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/mul_op.cu.cc b/paddle/fluid/operators/mul_op.cu.cc
similarity index 95%
rename from paddle/operators/mul_op.cu.cc
rename to paddle/fluid/operators/mul_op.cu.cc
index 43de9a7194..6f605fd84f 100644
--- a/paddle/operators/mul_op.cu.cc
+++ b/paddle/fluid/operators/mul_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/mul_op.h"
+#include "paddle/fluid/operators/mul_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/mul_op.h b/paddle/fluid/operators/mul_op.h
similarity index 97%
rename from paddle/operators/mul_op.h
rename to paddle/fluid/operators/mul_op.h
index 1fb0569b49..745989f07f 100644
--- a/paddle/operators/mul_op.h
+++ b/paddle/fluid/operators/mul_op.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/multiclass_nms_op.cc b/paddle/fluid/operators/multiclass_nms_op.cc
similarity index 97%
rename from paddle/operators/multiclass_nms_op.cc
rename to paddle/fluid/operators/multiclass_nms_op.cc
index 8a65fe69f1..b2934f69cc 100644
--- a/paddle/operators/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/multiclass_nms_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -85,7 +85,7 @@ static inline void GetMaxScoreIndex(
   std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
                    SortScorePairDescend<int>);
   // Keep top_k scores if needed.
-  if (top_k > -1 && top_k < sorted_indices->size()) {
+  if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
     sorted_indices->resize(top_k);
   }
 }
@@ -151,7 +151,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
     while (sorted_indices.size() != 0) {
       const int idx = sorted_indices.front().second;
       bool keep = true;
-      for (int k = 0; k < selected_indices->size(); ++k) {
+      for (size_t k = 0; k < selected_indices->size(); ++k) {
         if (keep) {
           const int kept_idx = (*selected_indices)[k];
           T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
@@ -201,7 +201,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
         int label = it.first;
         const T* sdata = scores_data + label * predict_dim;
         const std::vector<int>& label_indices = it.second;
-        for (int j = 0; j < label_indices.size(); ++j) {
+        for (size_t j = 0; j < label_indices.size(); ++j) {
           int idx = label_indices[j];
           PADDLE_ENFORCE_LT(idx, predict_dim);
           score_index_pairs.push_back(
@@ -215,7 +215,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
 
       // Store the new indices.
       std::map<int, std::vector<int>> new_indices;
-      for (int j = 0; j < score_index_pairs.size(); ++j) {
+      for (size_t j = 0; j < score_index_pairs.size(); ++j) {
         int label = score_index_pairs[j].second.first;
         int idx = score_index_pairs[j].second.second;
         new_indices[label].push_back(idx);
@@ -238,7 +238,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
       int label = it.first;
       const T* sdata = scores_data + label * predict_dim;
       const std::vector<int>& indices = it.second;
-      for (int j = 0; j < indices.size(); ++j) {
+      for (size_t j = 0; j < indices.size(); ++j) {
         int idx = indices[j];
         const T* bdata = bboxes_data + idx * kBBoxSize;
         odata[count * kOutputDim] = label;           // label
diff --git a/paddle/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc
similarity index 99%
rename from paddle/operators/multiplex_op.cc
rename to paddle/fluid/operators/multiplex_op.cc
index d275fa5cbb..f89b00376b 100644
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/fluid/operators/multiplex_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/multiplex_op.h"
+#include "paddle/fluid/operators/multiplex_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu
similarity index 97%
rename from paddle/operators/multiplex_op.cu
rename to paddle/fluid/operators/multiplex_op.cu
index 546e6e7a24..3ef7ef1dfc 100644
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/fluid/operators/multiplex_op.cu
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/multiplex_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/multiplex_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/multiplex_op.h b/paddle/fluid/operators/multiplex_op.h
similarity index 95%
rename from paddle/operators/multiplex_op.h
rename to paddle/fluid/operators/multiplex_op.h
index ef66be5556..682117cb1b 100644
--- a/paddle/operators/multiplex_op.h
+++ b/paddle/fluid/operators/multiplex_op.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/memory/memcpy.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/memcpy.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/nccl/CMakeLists.txt b/paddle/fluid/operators/nccl/CMakeLists.txt
similarity index 100%
rename from paddle/operators/nccl/CMakeLists.txt
rename to paddle/fluid/operators/nccl/CMakeLists.txt
diff --git a/paddle/operators/nccl/nccl_gpu_common.cc b/paddle/fluid/operators/nccl/nccl_gpu_common.cc
similarity index 87%
rename from paddle/operators/nccl/nccl_gpu_common.cc
rename to paddle/fluid/operators/nccl/nccl_gpu_common.cc
index 1602a3d9b5..2a8ce932ec 100644
--- a/paddle/operators/nccl/nccl_gpu_common.cc
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/nccl/nccl_gpu_common.h"
-#include "paddle/platform/gpu_info.h"
+#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
+#include "paddle/fluid/platform/gpu_info.h"
 
 namespace paddle {
 namespace platform {}  // namespace platform
diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/fluid/operators/nccl/nccl_gpu_common.h
similarity index 90%
rename from paddle/operators/nccl/nccl_gpu_common.h
rename to paddle/fluid/operators/nccl/nccl_gpu_common.h
index 5173996f20..6e78613239 100644
--- a/paddle/operators/nccl/nccl_gpu_common.h
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.h
@@ -22,10 +22,10 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/dynload/nccl.h"
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/macros.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/dynload/nccl.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/macros.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/operators/nccl_op.cc b/paddle/fluid/operators/nccl_op.cc
similarity index 98%
rename from paddle/operators/nccl_op.cc
rename to paddle/fluid/operators/nccl_op.cc
index 9d51153b06..52420ceba0 100644
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/fluid/operators/nccl_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/nccl/nccl_gpu_common.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/nccl_op.cu.cc b/paddle/fluid/operators/nccl_op.cu.cc
similarity index 97%
rename from paddle/operators/nccl_op.cu.cc
rename to paddle/fluid/operators/nccl_op.cu.cc
index 1b986a1365..333aed2903 100644
--- a/paddle/operators/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl_op.cu.cc
@@ -11,9 +11,9 @@ limitations under the License. */
 
 #include <functional>
 
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/nccl/nccl_gpu_common.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc
similarity index 93%
rename from paddle/operators/nccl_op_test.cu.cc
rename to paddle/fluid/operators/nccl_op_test.cu.cc
index 072e4eb2ef..212ed2f9b6 100644
--- a/paddle/operators/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl_op_test.cu.cc
@@ -21,17 +21,17 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#include "paddle/framework/block_desc.h"
-#include "paddle/framework/init.h"
-#include "paddle/framework/op_desc.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/program_desc.h"
-#include "paddle/framework/var_desc.h"
-#include "paddle/operators/nccl/nccl_gpu_common.h"
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/gpu_info.h"
-#include "paddle/platform/place.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/init.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/place.h"
 
 USE_NO_KERNEL_OP(ncclInit);
 USE_CUDA_ONLY_OP(ncclAllReduce);
@@ -287,6 +287,9 @@ TEST_F(NCCLTester, ncclBcastOp) {
 }
 
 int main(int argc, char **argv) {
+  // FIXME(tonyyang-svail):
+  //   Due to the driver issue on our CI, disable for now
+  return 0;
   const int dev_count = p::GetCUDADeviceCount();
   if (dev_count <= 1) {
     LOG(WARNING)
diff --git a/paddle/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc
similarity index 99%
rename from paddle/operators/nce_op.cc
rename to paddle/fluid/operators/nce_op.cc
index 994ddf717e..0841313a10 100644
--- a/paddle/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/nce_op.h"
+#include "paddle/fluid/operators/nce_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/nce_op.h b/paddle/fluid/operators/nce_op.h
similarity index 98%
rename from paddle/operators/nce_op.h
rename to paddle/fluid/operators/nce_op.h
index 86fa13a649..624c2d9bbd 100644
--- a/paddle/operators/nce_op.h
+++ b/paddle/fluid/operators/nce_op.h
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #include <math.h>
 #include <random>
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/net_op.cc b/paddle/fluid/operators/net_op.cc
similarity index 97%
rename from paddle/operators/net_op.cc
rename to paddle/fluid/operators/net_op.cc
index 000e029840..c0ca5873ad 100644
--- a/paddle/operators/net_op.cc
+++ b/paddle/fluid/operators/net_op.cc
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/operators/net_op.h"
+#include "paddle/fluid/operators/net_op.h"
 #include <set>
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/net_op.h b/paddle/fluid/operators/net_op.h
similarity index 97%
rename from paddle/operators/net_op.h
rename to paddle/fluid/operators/net_op.h
index b24042f5ef..14e5909851 100644
--- a/paddle/operators/net_op.h
+++ b/paddle/fluid/operators/net_op.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 
 #include <set>
-#include "paddle/framework/framework.pb.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/net_op_test.cc b/paddle/fluid/operators/net_op_test.cc
similarity index 98%
rename from paddle/operators/net_op_test.cc
rename to paddle/fluid/operators/net_op_test.cc
index 9358f29f62..cc20be0c81 100644
--- a/paddle/operators/net_op_test.cc
+++ b/paddle/fluid/operators/net_op_test.cc
@@ -11,7 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/operators/net_op.h"
+#include "paddle/fluid/operators/net_op.h"
 
 #include <gtest/gtest.h>
 
diff --git a/paddle/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc
similarity index 98%
rename from paddle/operators/norm_op.cc
rename to paddle/fluid/operators/norm_op.cc
index 0eeafcaae0..ee85b1a90a 100644
--- a/paddle/operators/norm_op.cc
+++ b/paddle/fluid/operators/norm_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/norm_op.h"
+#include "paddle/fluid/operators/norm_op.h"
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/operators/norm_op.cu b/paddle/fluid/operators/norm_op.cu
similarity index 95%
rename from paddle/operators/norm_op.cu
rename to paddle/fluid/operators/norm_op.cu
index 2941c89b93..438bb3b86e 100644
--- a/paddle/operators/norm_op.cu
+++ b/paddle/fluid/operators/norm_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #define EIGEN_USE_GPU
 
-#include "paddle/operators/norm_op.h"
+#include "paddle/fluid/operators/norm_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/norm_op.h b/paddle/fluid/operators/norm_op.h
similarity index 98%
rename from paddle/operators/norm_op.h
rename to paddle/fluid/operators/norm_op.h
index 5759d6f1f0..db74c9b02a 100644
--- a/paddle/operators/norm_op.h
+++ b/paddle/fluid/operators/norm_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc
similarity index 97%
rename from paddle/operators/one_hot_op.cc
rename to paddle/fluid/operators/one_hot_op.cc
index e78b7468de..2c3a60da72 100644
--- a/paddle/operators/one_hot_op.cc
+++ b/paddle/fluid/operators/one_hot_op.cc
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/operators/one_hot_op.h"
-#include "paddle/framework/framework.pb.h"
+#include "paddle/fluid/operators/one_hot_op.h"
+#include "paddle/fluid/framework/framework.pb.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu
similarity index 95%
rename from paddle/operators/one_hot_op.cu
rename to paddle/fluid/operators/one_hot_op.cu
index 16f6d9433e..6a8061edaa 100644
--- a/paddle/operators/one_hot_op.cu
+++ b/paddle/fluid/operators/one_hot_op.cu
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/operators/one_hot_op.h"
-#include "paddle/platform/cuda_helper.h"
-#include "paddle/platform/gpu_info.h"
+#include "paddle/fluid/operators/one_hot_op.h"
+#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/gpu_info.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/one_hot_op.h b/paddle/fluid/operators/one_hot_op.h
similarity index 95%
rename from paddle/operators/one_hot_op.h
rename to paddle/fluid/operators/one_hot_op.h
index 12031ede2c..ddac6edd0e 100644
--- a/paddle/operators/one_hot_op.h
+++ b/paddle/fluid/operators/one_hot_op.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 
 #pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/op_documentation/batch_norm_op.md b/paddle/fluid/operators/op_documentation/batch_norm_op.md
similarity index 100%
rename from paddle/operators/op_documentation/batch_norm_op.md
rename to paddle/fluid/operators/op_documentation/batch_norm_op.md
diff --git a/paddle/operators/op_documentation/name_convention.md b/paddle/fluid/operators/op_documentation/name_convention.md
similarity index 100%
rename from paddle/operators/op_documentation/name_convention.md
rename to paddle/fluid/operators/op_documentation/name_convention.md
diff --git a/paddle/operators/op_documentation/net_op_design.md b/paddle/fluid/operators/op_documentation/net_op_design.md
similarity index 100%
rename from paddle/operators/op_documentation/net_op_design.md
rename to paddle/fluid/operators/op_documentation/net_op_design.md
diff --git a/paddle/operators/op_documentation/op_markdown_format.md b/paddle/fluid/operators/op_documentation/op_markdown_format.md
similarity index 100%
rename from paddle/operators/op_documentation/op_markdown_format.md
rename to paddle/fluid/operators/op_documentation/op_markdown_format.md
diff --git a/paddle/operators/op_documentation/rnn_design.md b/paddle/fluid/operators/op_documentation/rnn_design.md
similarity index 100%
rename from paddle/operators/op_documentation/rnn_design.md
rename to paddle/fluid/operators/op_documentation/rnn_design.md
diff --git a/paddle/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc
similarity index 99%
rename from paddle/operators/pad_op.cc
rename to paddle/fluid/operators/pad_op.cc
index 90c53bd177..4b021fde7c 100644
--- a/paddle/operators/pad_op.cc
+++ b/paddle/fluid/operators/pad_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/pad_op.h"
+#include "paddle/fluid/operators/pad_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/pad_op.cu b/paddle/fluid/operators/pad_op.cu
similarity index 95%
rename from paddle/operators/pad_op.cu
rename to paddle/fluid/operators/pad_op.cu
index 433b5f1112..203c314403 100644
--- a/paddle/operators/pad_op.cu
+++ b/paddle/fluid/operators/pad_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/pad_op.h"
+#include "paddle/fluid/operators/pad_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/pad_op.h b/paddle/fluid/operators/pad_op.h
similarity index 97%
rename from paddle/operators/pad_op.h
rename to paddle/fluid/operators/pad_op.h
index fdf91a5776..244d8f9b6c 100644
--- a/paddle/operators/pad_op.h
+++ b/paddle/fluid/operators/pad_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc
similarity index 93%
rename from paddle/operators/parallel_do_op.cc
rename to paddle/fluid/operators/parallel_do_op.cc
index 67f9854c02..e25df92479 100644
--- a/paddle/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #include <vector>
 
-#include "paddle/framework/executor.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/threadpool.h"
-#include "paddle/operators/detail/safe_ref.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
 
 namespace paddle {
 namespace operators {
@@ -76,16 +76,17 @@ inline void CopyOrShare(const framework::Variable &src,
   if (src.IsType<LoDTensor>()) {
     if (src.Get<LoDTensor>().place() == dst_place) {
       dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>());
+      dst->GetMutable<LoDTensor>()->set_lod(src.Get<LoDTensor>().lod());
     } else {
       Copy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
     }
   } else if (src.IsType<SelectedRows>()) {
     auto &src_sr = src.Get<SelectedRows>();
     auto *dst_sr = dst->GetMutable<SelectedRows>();
-    dst_sr->set_rows(src_sr.rows());
     dst_sr->set_height(src_sr.height());
     if (src_sr.value().place() == dst_place) {
       dst_sr->mutable_value()->ShareDataWith(src_sr.value());
+      dst_sr->set_rows(src_sr.rows());
     } else {
       Copy(src_sr.value(), dst_place, dst_sr->mutable_value());
     }
@@ -248,17 +249,19 @@ class ParallelDoGradOp : public framework::OperatorBase {
                       const std::vector<framework::Scope *> &sub_scopes,
                       const platform::PlaceList &places) const {
     for (auto &s : Outputs(framework::GradVarName(kParameters))) {
+      VLOG(3) << "Accumulating " << s;
+      if (s == framework::kEmptyVarName) continue;
       std::string tmp_name;
       auto *tmp = sub_scopes[0]->Var(&tmp_name);
 
       for (size_t i = 1; i < sub_scopes.size(); ++i) {
         CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp);
-        WaitOnPlace(places[0]);
+        WaitOnPlaces(places);
 
         auto sum_op = framework::OpRegistry::CreateOp(
             "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
             framework::AttributeMap{});
-        VLOG(3) << sum_op->DebugStringEx(sub_scopes[0]);
+        VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]);
         sum_op->Run(*sub_scopes[0], places[0]);
         WaitOnPlace(places[0]);
       }
@@ -334,16 +337,9 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
 class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext *ctx) const override {
-    std::vector<std::string> input{kParameters, kInputs};
-    std::vector<std::string> output{kOutputs};
-
     PADDLE_ENFORCE(ctx->HasInputs(kParameters));
-    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
     PADDLE_ENFORCE(ctx->HasInputs(kInputs));
-
-    for (auto &s : output) {
-      PADDLE_ENFORCE(ctx->HasInputs(s));
-    }
+    PADDLE_ENFORCE(ctx->HasInputs(kOutputs));
 
     ctx->SetOutputsDim(framework::GradVarName(kParameters),
                        ctx->GetInputsDim(kParameters));
@@ -360,10 +356,14 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
       ctx->SetDims({ig_name}, {i_dims[i]});
     }
 
-    if (ctx->HasInputs(kParameters)) {
-      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
-      ctx->SetOutputsDim(framework::GradVarName(kParameters),
-                         ctx->GetInputsDim(kParameters));
+    auto p_dims = ctx->GetInputsDim(kParameters);
+    auto pg_names = ctx->Outputs(framework::GradVarName(kParameters));
+    for (size_t i = 0; i < pg_names.size(); ++i) {
+      auto &pg_name = pg_names[i];
+      if (pg_name == framework::kEmptyVarName) {
+        continue;
+      }
+      ctx->SetDims({pg_name}, {p_dims[i]});
     }
   }
 };
diff --git a/paddle/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
similarity index 97%
rename from paddle/operators/pool_cudnn_op.cu.cc
rename to paddle/fluid/operators/pool_cudnn_op.cu.cc
index 446fb0819d..75984b7721 100644
--- a/paddle/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/pool_op.h"
-#include "paddle/platform/cudnn_helper.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/pool_op.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
similarity index 99%
rename from paddle/operators/pool_op.cc
rename to paddle/fluid/operators/pool_op.cc
index b97333bb1a..9dd33eefc5 100644
--- a/paddle/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/pool_op.h"
+#include "paddle/fluid/operators/pool_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/pool_op.cu.cc b/paddle/fluid/operators/pool_op.cu.cc
similarity index 96%
rename from paddle/operators/pool_op.cu.cc
rename to paddle/fluid/operators/pool_op.cu.cc
index 39a9dfbf79..14486c0740 100644
--- a/paddle/operators/pool_op.cu.cc
+++ b/paddle/fluid/operators/pool_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/pool_op.h"
+#include "paddle/fluid/operators/pool_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/pool_op.h b/paddle/fluid/operators/pool_op.h
similarity index 97%
rename from paddle/operators/pool_op.h
rename to paddle/fluid/operators/pool_op.h
index d6ba5e298a..4cabd634d6 100644
--- a/paddle/operators/pool_op.h
+++ b/paddle/fluid/operators/pool_op.h
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/pooling.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/pooling.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc
similarity index 99%
rename from paddle/operators/pool_with_index_op.cc
rename to paddle/fluid/operators/pool_with_index_op.cc
index 1d31d813af..ef6d5d867b 100644
--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/pool_with_index_op.h"
+#include "paddle/fluid/operators/pool_with_index_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/pool_with_index_op.cu.cc b/paddle/fluid/operators/pool_with_index_op.cu.cc
similarity index 97%
rename from paddle/operators/pool_with_index_op.cu.cc
rename to paddle/fluid/operators/pool_with_index_op.cu.cc
index 4c9804da63..722a4d1e2a 100644
--- a/paddle/operators/pool_with_index_op.cu.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/pool_with_index_op.h"
+#include "paddle/fluid/operators/pool_with_index_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h
similarity index 95%
rename from paddle/operators/pool_with_index_op.h
rename to paddle/fluid/operators/pool_with_index_op.h
index 4f4087d1dd..da7ef9df73 100644
--- a/paddle/operators/pool_with_index_op.h
+++ b/paddle/fluid/operators/pool_with_index_op.h
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/pooling.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/pooling.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc
similarity index 99%
rename from paddle/operators/positive_negative_pair_op.cc
rename to paddle/fluid/operators/positive_negative_pair_op.cc
index 5aa5167dbb..d237da25a0 100644
--- a/paddle/operators/positive_negative_pair_op.cc
+++ b/paddle/fluid/operators/positive_negative_pair_op.cc
@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/positive_negative_pair_op.h"
+#include "paddle/fluid/operators/positive_negative_pair_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/positive_negative_pair_op.h b/paddle/fluid/operators/positive_negative_pair_op.h
similarity index 97%
rename from paddle/operators/positive_negative_pair_op.h
rename to paddle/fluid/operators/positive_negative_pair_op.h
index 977e59b7d2..f20f33bbeb 100644
--- a/paddle/operators/positive_negative_pair_op.h
+++ b/paddle/fluid/operators/positive_negative_pair_op.h
@@ -12,8 +12,8 @@ limitations under the License. */
 #pragma once
 #include <unordered_map>
 #include <vector>
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/utils/Logging.h"
 
 namespace paddle {
diff --git a/paddle/operators/precision_recall_op.cc b/paddle/fluid/operators/precision_recall_op.cc
similarity index 99%
rename from paddle/operators/precision_recall_op.cc
rename to paddle/fluid/operators/precision_recall_op.cc
index f1598d53ca..30d594719c 100644
--- a/paddle/operators/precision_recall_op.cc
+++ b/paddle/fluid/operators/precision_recall_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/precision_recall_op.h"
+#include "paddle/fluid/operators/precision_recall_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/precision_recall_op.h b/paddle/fluid/operators/precision_recall_op.h
similarity index 98%
rename from paddle/operators/precision_recall_op.h
rename to paddle/fluid/operators/precision_recall_op.h
index c0d55405a3..7dae86b76f 100644
--- a/paddle/operators/precision_recall_op.h
+++ b/paddle/fluid/operators/precision_recall_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc
similarity index 97%
rename from paddle/operators/prelu_op.cc
rename to paddle/fluid/operators/prelu_op.cc
index ddc21a6570..22b970d971 100644
--- a/paddle/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/prelu_op.h"
-#include "paddle/operators/net_op.h"
+#include "paddle/fluid/operators/prelu_op.h"
+#include "paddle/fluid/operators/net_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu
similarity index 95%
rename from paddle/operators/prelu_op.cu
rename to paddle/fluid/operators/prelu_op.cu
index 1718bb5cd6..038b09a493 100644
--- a/paddle/operators/prelu_op.cu
+++ b/paddle/fluid/operators/prelu_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/prelu_op.h"
+#include "paddle/fluid/operators/prelu_op.h"
 
 REGISTER_OP_CUDA_KERNEL(
     prelu,
diff --git a/paddle/operators/prelu_op.h b/paddle/fluid/operators/prelu_op.h
similarity index 95%
rename from paddle/operators/prelu_op.h
rename to paddle/fluid/operators/prelu_op.h
index 56f9a553ec..85ad75d479 100644
--- a/paddle/operators/prelu_op.h
+++ b/paddle/fluid/operators/prelu_op.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/platform/transform.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/print_op.cc b/paddle/fluid/operators/print_op.cc
similarity index 98%
rename from paddle/operators/print_op.cc
rename to paddle/fluid/operators/print_op.cc
index 8b233d64c9..3616545309 100644
--- a/paddle/operators/print_op.cc
+++ b/paddle/fluid/operators/print_op.cc
@@ -15,8 +15,8 @@
 #include <algorithm>
 #include <ctime>
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/variable.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/variable.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc
similarity index 74%
rename from paddle/operators/prior_box_op.cc
rename to paddle/fluid/operators/prior_box_op.cc
index 105ff4ac3e..ed48603e17 100644
--- a/paddle/operators/prior_box_op.cc
+++ b/paddle/fluid/operators/prior_box_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/prior_box_op.h"
+#include "paddle/fluid/operators/prior_box_op.h"
 
 namespace paddle {
 namespace operators {
@@ -44,12 +44,6 @@ class PriorBoxOp : public framework::OperatorWithKernel {
     auto aspect_ratios = ctx->Attrs().Get<std::vector<float>>("aspect_ratios");
     bool flip = ctx->Attrs().Get<bool>("flip");
 
-    PADDLE_ENFORCE_GT(min_sizes.size(), 0,
-                      "Size of min_sizes must be at least 1.");
-    for (size_t i = 0; i < min_sizes.size(); ++i) {
-      PADDLE_ENFORCE_GT(min_sizes[i], 0, "min_sizes[%d] must be positive.", i);
-    }
-
     std::vector<float> aspect_ratios_vec;
     ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec);
 
@@ -65,17 +59,6 @@ class PriorBoxOp : public framework::OperatorWithKernel {
       }
     }
 
-    PADDLE_ENFORCE_EQ(variances.size(), 4, "Must and only provide 4 variance.");
-    for (size_t i = 0; i < variances.size(); ++i) {
-      PADDLE_ENFORCE_GT(variances[i], 0.0,
-                        "variance[%d] must be greater than 0.", i);
-    }
-
-    const float step_h = ctx->Attrs().Get<float>("step_h");
-    PADDLE_ENFORCE_GT(step_h, 0.0, "step_h should be larger than 0.");
-    const float step_w = ctx->Attrs().Get<float>("step_w");
-    PADDLE_ENFORCE_GT(step_w, 0.0, "step_w should be larger than 0.");
-
     std::vector<int64_t> dim_vec(4);
     dim_vec[0] = input_dims[2];
     dim_vec[1] = input_dims[3];
@@ -106,26 +89,54 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
               "PriorBoxOp. The layout is [H, W, num_priors, 4]. "
               "H is the height of input, W is the width of input, num_priors "
               "is the box count of each position.");
-    AddAttr<std::vector<int>>("min_sizes", "(vector<int>) ",
-                              "List of min sizes of generated prior boxes.");
-    AddAttr<std::vector<int>>("max_sizes", "(vector<int>) ",
-                              "List of max sizes of generated prior boxes.");
+
+    AddAttr<std::vector<int>>("min_sizes",
+                              "(vector<int>) List of min sizes "
+                              "of generated prior boxes.")
+        .AddCustomChecker([](const std::vector<int>& min_sizes) {
+          PADDLE_ENFORCE_GT(min_sizes.size(), 0,
+                            "Size of min_sizes must be at least 1.");
+          for (size_t i = 0; i < min_sizes.size(); ++i) {
+            PADDLE_ENFORCE_GT(min_sizes[i], 0,
+                              "min_sizes[%d] must be positive.", i);
+          }
+        });
+    AddAttr<std::vector<int>>(
+        "max_sizes",
+        "(vector<int>) List of max sizes of generated prior boxes.");
     AddAttr<std::vector<float>>(
-        "aspect_ratios", "(vector<float>) ",
-        "List of aspect ratios of generated prior boxes.");
+        "aspect_ratios",
+        "(vector<float>) List of aspect ratios of generated prior boxes.");
+
     AddAttr<std::vector<float>>(
-        "variances", "(vector<float>) ",
-        "List of variances to be encoded in prior boxes.");
-    AddAttr<bool>("flip", "(bool) ", "Whether to flip aspect ratios.")
+        "variances",
+        "(vector<float>) List of variances to be encoded in prior boxes.")
+        .AddCustomChecker([](const std::vector<float>& variances) {
+          PADDLE_ENFORCE_EQ(variances.size(), 4,
+                            "Must and only provide 4 variance.");
+          for (size_t i = 0; i < variances.size(); ++i) {
+            PADDLE_ENFORCE_GT(variances[i], 0.0,
+                              "variance[%d] must be greater than 0.", i);
+          }
+        });
+    AddAttr<bool>("flip", "(bool) Whether to flip aspect ratios.")
         .SetDefault(true);
-    AddAttr<bool>("clip", "(bool) ", "Whether to clip out-of-boundary boxes.")
+    AddAttr<bool>("clip", "(bool) Whether to clip out-of-boundary boxes.")
         .SetDefault(true);
+
     AddAttr<float>("step_w",
                    "Prior boxes step across width, 0 for auto calculation.")
-        .SetDefault(0.0);
+        .SetDefault(0.0)
+        .AddCustomChecker([](const float& step_w) {
+          PADDLE_ENFORCE_GT(step_w, 0.0, "step_w should be larger than 0.");
+        });
     AddAttr<float>("step_h",
                    "Prior boxes step across height, 0 for auto calculation.")
-        .SetDefault(0.0);
+        .SetDefault(0.0)
+        .AddCustomChecker([](const float& step_h) {
+          PADDLE_ENFORCE_GT(step_h, 0.0, "step_h should be larger than 0.");
+        });
+
     AddAttr<float>("offset",
                    "(float) "
                    "Prior boxes center offset.")
diff --git a/paddle/operators/prior_box_op.h b/paddle/fluid/operators/prior_box_op.h
similarity index 78%
rename from paddle/operators/prior_box_op.h
rename to paddle/fluid/operators/prior_box_op.h
index e0a663ace8..fd07041233 100644
--- a/paddle/operators/prior_box_op.h
+++ b/paddle/fluid/operators/prior_box_op.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/platform/transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
 namespace operators {
@@ -25,7 +25,7 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
                                std::vector<float>& output_aspect_ratior) {
   constexpr float epsilon = 1e-6;
   output_aspect_ratior.clear();
-  output_aspect_ratior.push_back(1.);
+  output_aspect_ratior.push_back(1.0f);
   for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
     float ar = input_aspect_ratior[i];
     bool already_exist = false;
@@ -38,7 +38,7 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
     if (!already_exist) {
       output_aspect_ratior.push_back(ar);
       if (flip) {
-        output_aspect_ratior.push_back(1. / ar);
+        output_aspect_ratior.push_back(1.0f / ar);
       }
     }
   }
@@ -46,7 +46,7 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
 
 template <typename T>
 struct ClipFunctor {
-  HOSTDEVICE T operator()(T in) const {
+  HOSTDEVICE inline T operator()(T in) const {
     return std::min<T>(std::max<T>(in, 0.), 1.);
   }
 };
@@ -97,6 +97,9 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
     boxes->mutable_data<T>(ctx.GetPlace());
     vars->mutable_data<T>(ctx.GetPlace());
 
+    T inv_img_width = 1.0 / img_width;
+    T inv_img_height = 1.0 / img_height;
+
     auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);
     for (int h = 0; h < feature_height; ++h) {
       for (int w = 0; w < feature_width; ++w) {
@@ -109,13 +112,15 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
           // first prior: aspect_ratio = 1, size = min_size
           box_width = box_height = min_size;
           // xmin
-          e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+          e_boxes(h, w, idx, 0) = (center_x - box_width * 0.5) * inv_img_width;
           // ymin
-          e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+          e_boxes(h, w, idx, 1) =
+              (center_y - box_height * 0.5) * inv_img_height;
           // xmax
-          e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+          e_boxes(h, w, idx, 2) = (center_x + box_width * 0.5) * inv_img_width;
           // ymax
-          e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+          e_boxes(h, w, idx, 3) =
+              (center_y + box_height * 0.5) * inv_img_height;
 
           idx++;
           if (max_sizes.size() > 0) {
@@ -124,13 +129,17 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
             // size = sqrt(min_size * max_size)
             box_width = box_height = sqrt(min_size * max_size);
             // xmin
-            e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+            e_boxes(h, w, idx, 0) =
+                (center_x - box_width * 0.5) * inv_img_width;
             // ymin
-            e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+            e_boxes(h, w, idx, 1) =
+                (center_y - box_height * 0.5) * inv_img_height;
             // xmax
-            e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+            e_boxes(h, w, idx, 2) =
+                (center_x + box_width * 0.5) * inv_img_width;
             // ymax
-            e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+            e_boxes(h, w, idx, 3) =
+                (center_y + box_height * 0.5) * inv_img_height;
             idx++;
           }
 
@@ -143,13 +152,17 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
             box_width = min_size * sqrt(ar);
             box_height = min_size / sqrt(ar);
             // xmin
-            e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+            e_boxes(h, w, idx, 0) =
+                (center_x - box_width * 0.5) * inv_img_width;
             // ymin
-            e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+            e_boxes(h, w, idx, 1) =
+                (center_y - box_height * 0.5) * inv_img_height;
             // xmax
-            e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+            e_boxes(h, w, idx, 2) =
+                (center_x + box_width * 0.5) * inv_img_width;
             // ymax
-            e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+            e_boxes(h, w, idx, 3) =
+                (center_y + box_height * 0.5) * inv_img_height;
             idx++;
           }
         }
diff --git a/paddle/operators/proximal_adagrad_op.cc b/paddle/fluid/operators/proximal_adagrad_op.cc
similarity index 98%
rename from paddle/operators/proximal_adagrad_op.cc
rename to paddle/fluid/operators/proximal_adagrad_op.cc
index b92f46b5bd..d9e3894c57 100644
--- a/paddle/operators/proximal_adagrad_op.cc
+++ b/paddle/fluid/operators/proximal_adagrad_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/proximal_adagrad_op.h"
+#include "paddle/fluid/operators/proximal_adagrad_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/proximal_adagrad_op.cu b/paddle/fluid/operators/proximal_adagrad_op.cu
similarity index 93%
rename from paddle/operators/proximal_adagrad_op.cu
rename to paddle/fluid/operators/proximal_adagrad_op.cu
index 42a178f94b..54c75b3abb 100644
--- a/paddle/operators/proximal_adagrad_op.cu
+++ b/paddle/fluid/operators/proximal_adagrad_op.cu
@@ -12,7 +12,7 @@ CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/proximal_adagrad_op.h"
+#include "paddle/fluid/operators/proximal_adagrad_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/proximal_adagrad_op.h b/paddle/fluid/operators/proximal_adagrad_op.h
similarity index 96%
rename from paddle/operators/proximal_adagrad_op.h
rename to paddle/fluid/operators/proximal_adagrad_op.h
index 523924d80e..70205a8d11 100644
--- a/paddle/operators/proximal_adagrad_op.h
+++ b/paddle/fluid/operators/proximal_adagrad_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/proximal_gd_op.cc b/paddle/fluid/operators/proximal_gd_op.cc
similarity index 98%
rename from paddle/operators/proximal_gd_op.cc
rename to paddle/fluid/operators/proximal_gd_op.cc
index 2d3bbdaf32..de7c6843c8 100644
--- a/paddle/operators/proximal_gd_op.cc
+++ b/paddle/fluid/operators/proximal_gd_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/proximal_gd_op.h"
+#include "paddle/fluid/operators/proximal_gd_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/proximal_gd_op.cu b/paddle/fluid/operators/proximal_gd_op.cu
similarity index 93%
rename from paddle/operators/proximal_gd_op.cu
rename to paddle/fluid/operators/proximal_gd_op.cu
index b7dd840d19..97b672e872 100644
--- a/paddle/operators/proximal_gd_op.cu
+++ b/paddle/fluid/operators/proximal_gd_op.cu
@@ -12,7 +12,7 @@ CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/proximal_gd_op.h"
+#include "paddle/fluid/operators/proximal_gd_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/proximal_gd_op.h b/paddle/fluid/operators/proximal_gd_op.h
similarity index 95%
rename from paddle/operators/proximal_gd_op.h
rename to paddle/fluid/operators/proximal_gd_op.h
index 64648b3cca..8372380f25 100644
--- a/paddle/operators/proximal_gd_op.h
+++ b/paddle/fluid/operators/proximal_gd_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc
similarity index 99%
rename from paddle/operators/rank_loss_op.cc
rename to paddle/fluid/operators/rank_loss_op.cc
index f2164a0f80..222ca73d2a 100644
--- a/paddle/operators/rank_loss_op.cc
+++ b/paddle/fluid/operators/rank_loss_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/rank_loss_op.h"
+#include "paddle/fluid/operators/rank_loss_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/rank_loss_op.cu b/paddle/fluid/operators/rank_loss_op.cu
similarity index 95%
rename from paddle/operators/rank_loss_op.cu
rename to paddle/fluid/operators/rank_loss_op.cu
index 294b227383..1b182ced70 100644
--- a/paddle/operators/rank_loss_op.cu
+++ b/paddle/fluid/operators/rank_loss_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/rank_loss_op.h"
+#include "paddle/fluid/operators/rank_loss_op.h"
 
 REGISTER_OP_CUDA_KERNEL(rank_loss,
                         paddle::operators::RankLossKernel<
diff --git a/paddle/operators/rank_loss_op.h b/paddle/fluid/operators/rank_loss_op.h
similarity index 97%
rename from paddle/operators/rank_loss_op.h
rename to paddle/fluid/operators/rank_loss_op.h
index bd0c49ca6e..08bb2c2821 100644
--- a/paddle/operators/rank_loss_op.h
+++ b/paddle/fluid/operators/rank_loss_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc
new file mode 100644
index 0000000000..4d562c2919
--- /dev/null
+++ b/paddle/fluid/operators/read_op.cc
@@ -0,0 +1,99 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/reader.h"
+
+namespace paddle {
+namespace operators {
+
+class ReadInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Reader"),
+                   "The ReadOp must take a reader as input.");
+    PADDLE_ENFORCE(ctx->HasOutputs("Out"),
+                   "The ReadOp should be assigned with output.");
+    std::vector<framework::DDim> reader_dims = ctx->GetReaderDims("Reader");
+    std::vector<std::string> out_names = ctx->Outputs("Out");
+    PADDLE_ENFORCE_EQ(
+        reader_dims.size(), out_names.size(),
+        "The reader's dim number doesn't match the output number.");
+    ctx->SetOutputsDim("Out", reader_dims);
+  }
+};
+
+class ReadInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    std::string reader_name = op_desc.Input("Reader")[0];
+    std::vector<std::string> out_names = op_desc.Output("Out");
+    framework::VarDesc* reader = block->FindVarRecursive(reader_name);
+    auto dtypes = reader->GetDataTypes();
+    PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
+    for (size_t i = 0; i < dtypes.size(); ++i) {
+      framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
+      out.SetType(framework::proto::VarDesc::LOD_TENSOR);
+      out.SetDataType(dtypes[i]);
+    }
+  }
+};
+
+class ReadOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+  void Run(const framework::Scope& scope,
+           const platform::Place& dev_place) const override {
+    framework::ReaderHolder* reader =
+        scope.FindVar(Input("Reader"))->GetMutable<framework::ReaderHolder>();
+    if (!reader->HasNext()) {
+      reader->ReInit();
+      PADDLE_ENFORCE(
+          reader->HasNext(),
+          "Reader can not read the next data even it has been re-initialized.");
+    }
+    std::vector<std::string> out_arg_names = Outputs("Out");
+    std::vector<framework::LoDTensor> ins;
+    reader->ReadNext(&ins);
+    PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
+    for (size_t i = 0; i < ins.size(); ++i) {
+      auto* out =
+          scope.FindVar(out_arg_names[i])->GetMutable<framework::LoDTensor>();
+      out->ShareDataWith(ins[i]);
+      out->set_lod(ins[i].lod());
+    }
+  }
+};
+
+class ReadOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ReadOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(op_proto, op_checker) {
+    AddInput("Reader", "(ReaderHolder) The executed reader.");
+    AddOutput("Out", "(LoDTensor) The output data.").AsDuplicable();
+    AddComment(R"DOC(
+      Read Operator
+
+      Execute a given reader once and output data.
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(read, ops::ReadOp, ops::ReadInferShape, ops::ReadOpMaker,
+                  paddle::framework::EmptyGradOpMaker, ops::ReadInferVarType);
diff --git a/paddle/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
similarity index 99%
rename from paddle/operators/recurrent_op.cc
rename to paddle/fluid/operators/recurrent_op.cc
index a136c5b447..e4b9b8dab9 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <vector>
-#include "paddle/framework/executor.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc
similarity index 90%
rename from paddle/operators/recv_op.cc
rename to paddle/fluid/operators/recv_op.cc
index ba71094219..c093f60cee 100644
--- a/paddle/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
@@ -14,13 +14,13 @@ limitations under the License. */
 
 #include <ostream>
 
-#include "paddle/framework/data_type.h"
-#include "paddle/framework/framework.pb.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 #include <future>
-#include "paddle/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/detail/grpc_client.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/reduce_op.cc b/paddle/fluid/operators/reduce_op.cc
similarity index 99%
rename from paddle/operators/reduce_op.cc
rename to paddle/fluid/operators/reduce_op.cc
index 84f24a9095..f4d9d4cc07 100644
--- a/paddle/operators/reduce_op.cc
+++ b/paddle/fluid/operators/reduce_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/reduce_op.h"
+#include "paddle/fluid/operators/reduce_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/reduce_op.cu b/paddle/fluid/operators/reduce_op.cu
similarity index 97%
rename from paddle/operators/reduce_op.cu
rename to paddle/fluid/operators/reduce_op.cu
index 4ed1e051db..1ca107ebfe 100644
--- a/paddle/operators/reduce_op.cu
+++ b/paddle/fluid/operators/reduce_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/reduce_op.h"
+#include "paddle/fluid/operators/reduce_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h
similarity index 99%
rename from paddle/operators/reduce_op.h
rename to paddle/fluid/operators/reduce_op.h
index da5f397776..a153cf272b 100644
--- a/paddle/operators/reduce_op.h
+++ b/paddle/fluid/operators/reduce_op.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 
 #include "glog/logging.h"
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
similarity index 98%
rename from paddle/operators/reorder_lod_tensor_by_rank_op.cc
rename to paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
index 3c30447949..148a65bb4b 100644
--- a/paddle/operators/reorder_lod_tensor_by_rank_op.cc
+++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/lod_rank_table.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/detail/safe_ref.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
similarity index 99%
rename from paddle/operators/reshape_op.cc
rename to paddle/fluid/operators/reshape_op.cc
index b9743a5df1..b4f80cc06a 100644
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/reshape_op.h"
+#include "paddle/fluid/operators/reshape_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/reshape_op.cu b/paddle/fluid/operators/reshape_op.cu
similarity index 94%
rename from paddle/operators/reshape_op.cu
rename to paddle/fluid/operators/reshape_op.cu
index f487e43b99..f9ae6da29e 100644
--- a/paddle/operators/reshape_op.cu
+++ b/paddle/fluid/operators/reshape_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/reshape_op.h"
+#include "paddle/fluid/operators/reshape_op.h"
 
 REGISTER_OP_CUDA_KERNEL(
     reshape,
diff --git a/paddle/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h
similarity index 94%
rename from paddle/operators/reshape_op.h
rename to paddle/fluid/operators/reshape_op.h
index d884b03cad..a17ba7c619 100644
--- a/paddle/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/rmsprop_op.cc b/paddle/fluid/operators/rmsprop_op.cc
similarity index 99%
rename from paddle/operators/rmsprop_op.cc
rename to paddle/fluid/operators/rmsprop_op.cc
index f7c250bf91..06d3ccafef 100644
--- a/paddle/operators/rmsprop_op.cc
+++ b/paddle/fluid/operators/rmsprop_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/rmsprop_op.h"
+#include "paddle/fluid/operators/rmsprop_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/rmsprop_op.cu b/paddle/fluid/operators/rmsprop_op.cu
similarity index 94%
rename from paddle/operators/rmsprop_op.cu
rename to paddle/fluid/operators/rmsprop_op.cu
index 0295dc262f..a909c94279 100644
--- a/paddle/operators/rmsprop_op.cu
+++ b/paddle/fluid/operators/rmsprop_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/rmsprop_op.h"
+#include "paddle/fluid/operators/rmsprop_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/rmsprop_op.h b/paddle/fluid/operators/rmsprop_op.h
similarity index 96%
rename from paddle/operators/rmsprop_op.h
rename to paddle/fluid/operators/rmsprop_op.h
index 16a561835d..469c102a47 100644
--- a/paddle/operators/rmsprop_op.h
+++ b/paddle/fluid/operators/rmsprop_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc
similarity index 98%
rename from paddle/operators/rnn_memory_helper_op.cc
rename to paddle/fluid/operators/rnn_memory_helper_op.cc
index eb55ed6a05..504456c4b0 100644
--- a/paddle/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc
similarity index 99%
rename from paddle/operators/roi_pool_op.cc
rename to paddle/fluid/operators/roi_pool_op.cc
index a7351f11c5..09238f89a7 100644
--- a/paddle/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/roi_pool_op.h"
+#include "paddle/fluid/operators/roi_pool_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu
similarity index 98%
rename from paddle/operators/roi_pool_op.cu
rename to paddle/fluid/operators/roi_pool_op.cu
index a874befe4d..0e8fc9ec7a 100644
--- a/paddle/operators/roi_pool_op.cu
+++ b/paddle/fluid/operators/roi_pool_op.cu
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/roi_pool_op.h"
-#include "paddle/platform/cuda_helper.h"
+#include "paddle/fluid/operators/roi_pool_op.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h
similarity index 98%
rename from paddle/operators/roi_pool_op.h
rename to paddle/fluid/operators/roi_pool_op.h
index 09a9d3d870..15f3b36fcd 100644
--- a/paddle/operators/roi_pool_op.h
+++ b/paddle/fluid/operators/roi_pool_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc
similarity index 99%
rename from paddle/operators/row_conv_op.cc
rename to paddle/fluid/operators/row_conv_op.cc
index 68f4e35315..92661ea971 100644
--- a/paddle/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/row_conv_op.h"
-#include "paddle/framework/eigen.h"
+#include "paddle/fluid/operators/row_conv_op.h"
+#include "paddle/fluid/framework/eigen.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu
similarity index 98%
rename from paddle/operators/row_conv_op.cu
rename to paddle/fluid/operators/row_conv_op.cu
index b3825212e1..832072edf8 100644
--- a/paddle/operators/row_conv_op.cu
+++ b/paddle/fluid/operators/row_conv_op.cu
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/row_conv_op.h"
-#include "paddle/platform/cuda_helper.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/row_conv_op.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 
 namespace paddle {
 namespace operators {
@@ -307,7 +307,7 @@ class RowConvKernel<platform::CUDADeviceContext, T>
     int input_dim = X->dims()[1];
     int num_sequence = batch_indices.size() - 1;
     int future_context = Filter->dims()[0];
-    size_t *idx = batch_indices.cuda_data();
+    size_t *idx = batch_indices.CUDAMutableData(context.GetPlace());
     auto stream = context.cuda_device_context().stream();
 
     if (future_context <= 32) {
@@ -345,7 +345,7 @@ class RowConvGradKernel<platform::CUDADeviceContext, T>
     int input_dim = X->dims()[1];
     int num_sequence = batch_indices.size() - 1;
     int future_context = Filter->dims()[0];
-    size_t *idx = batch_indices.cuda_data();
+    size_t *idx = batch_indices.CUDAMutableData(context.GetPlace());
 
     auto &device_ctx = context.cuda_device_context();
     math::SetConstant<platform::CUDADeviceContext, T> zero;
diff --git a/paddle/operators/row_conv_op.h b/paddle/fluid/operators/row_conv_op.h
similarity index 95%
rename from paddle/operators/row_conv_op.h
rename to paddle/fluid/operators/row_conv_op.h
index 10d435ab08..59164b5215 100644
--- a/paddle/operators/row_conv_op.h
+++ b/paddle/fluid/operators/row_conv_op.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc
similarity index 94%
rename from paddle/operators/save_combine_op.cc
rename to paddle/fluid/operators/save_combine_op.cc
index bffa2908bc..c23de9073e 100644
--- a/paddle/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
@@ -17,11 +17,11 @@ limitations under the License. */
 #include <fstream>
 #include <numeric>
 #include <sstream>
-#include "paddle/framework/data_type.h"
-#include "paddle/framework/framework.pb.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/save_load_combine_op_test.cc b/paddle/fluid/operators/save_load_combine_op_test.cc
similarity index 99%
rename from paddle/operators/save_load_combine_op_test.cc
rename to paddle/fluid/operators/save_load_combine_op_test.cc
index f3ddc4a6c5..f8325bac6b 100644
--- a/paddle/operators/save_load_combine_op_test.cc
+++ b/paddle/fluid/operators/save_load_combine_op_test.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "gtest/gtest.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 USE_NO_KERNEL_OP(save_combine);
 USE_NO_KERNEL_OP(load_combine);
diff --git a/paddle/operators/save_load_op_test.cc b/paddle/fluid/operators/save_load_op_test.cc
similarity index 97%
rename from paddle/operators/save_load_op_test.cc
rename to paddle/fluid/operators/save_load_op_test.cc
index d829d5da17..da4573a8ed 100644
--- a/paddle/operators/save_load_op_test.cc
+++ b/paddle/fluid/operators/save_load_op_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "gtest/gtest.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 USE_NO_KERNEL_OP(save);
 USE_NO_KERNEL_OP(load);
diff --git a/paddle/operators/save_op.cc b/paddle/fluid/operators/save_op.cc
similarity index 94%
rename from paddle/operators/save_op.cc
rename to paddle/fluid/operators/save_op.cc
index 4b1cbe8883..483cdfa4c3 100644
--- a/paddle/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -17,11 +17,11 @@ limitations under the License. */
 #include <fstream>
 #include <numeric>
 
-#include "paddle/framework/data_type.h"
-#include "paddle/framework/framework.pb.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
similarity index 97%
rename from paddle/operators/scale_op.cc
rename to paddle/fluid/operators/scale_op.cc
index c0e614743a..017fc2c00e 100644
--- a/paddle/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/scale_op.h"
-#include "paddle/operators/net_op.h"
+#include "paddle/fluid/operators/scale_op.h"
+#include "paddle/fluid/operators/net_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/scale_op.cu b/paddle/fluid/operators/scale_op.cu
similarity index 95%
rename from paddle/operators/scale_op.cu
rename to paddle/fluid/operators/scale_op.cu
index 7202c0de70..a9b46077aa 100644
--- a/paddle/operators/scale_op.cu
+++ b/paddle/fluid/operators/scale_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/scale_op.h"
+#include "paddle/fluid/operators/scale_op.h"
 
 REGISTER_OP_CUDA_KERNEL(
     scale,
diff --git a/paddle/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
similarity index 93%
rename from paddle/operators/scale_op.h
rename to paddle/fluid/operators/scale_op.h
index 395268c2ee..b1c2964ca6 100644
--- a/paddle/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h
similarity index 96%
rename from paddle/operators/scatter.cu.h
rename to paddle/fluid/operators/scatter.cu.h
index 55555300fc..0f1b9426a7 100644
--- a/paddle/operators/scatter.cu.h
+++ b/paddle/fluid/operators/scatter.cu.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/tensor.h"
-#include "paddle/platform/place.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/scatter.h b/paddle/fluid/operators/scatter.h
similarity index 92%
rename from paddle/operators/scatter.h
rename to paddle/fluid/operators/scatter.h
index c1fb844ebd..70cae1286c 100644
--- a/paddle/operators/scatter.h
+++ b/paddle/fluid/operators/scatter.h
@@ -15,10 +15,10 @@ limitations under the License. */
 #pragma once
 #include <cstring>
 
-#include "paddle/framework/ddim.h"
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/platform/place.h"
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc
similarity index 97%
rename from paddle/operators/scatter_op.cc
rename to paddle/fluid/operators/scatter_op.cc
index b653348906..e35930af53 100644
--- a/paddle/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/scatter_op.h"
-#include "paddle/framework/ddim.h"
+#include "paddle/fluid/operators/scatter_op.h"
+#include "paddle/fluid/framework/ddim.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu
similarity index 98%
rename from paddle/operators/scatter_op.cu
rename to paddle/fluid/operators/scatter_op.cu
index 0c198d2258..f9eaae33a8 100644
--- a/paddle/operators/scatter_op.cu
+++ b/paddle/fluid/operators/scatter_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "gather.cu.h"
-#include "paddle/operators/gather_op.h"
+#include "paddle/fluid/operators/gather_op.h"
 #include "scatter.cu.h"
 
 namespace paddle {
diff --git a/paddle/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h
similarity index 96%
rename from paddle/operators/scatter_op.h
rename to paddle/fluid/operators/scatter_op.h
index 1a4f6f99bf..65d1054632 100644
--- a/paddle/operators/scatter_op.h
+++ b/paddle/fluid/operators/scatter_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 #include "gather.h"
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "scatter.h"
 
 namespace paddle {
diff --git a/paddle/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc
similarity index 91%
rename from paddle/operators/scatter_test.cc
rename to paddle/fluid/operators/scatter_test.cc
index 00dbdacbfe..8fb5ef96af 100644
--- a/paddle/operators/scatter_test.cc
+++ b/paddle/fluid/operators/scatter_test.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/scatter.h"
-#include "paddle/framework/ddim.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/platform/place.h"
+#include "paddle/fluid/operators/scatter.h"
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/place.h"
 
 #include <gtest/gtest.h>
 #include <iostream>
diff --git a/paddle/operators/send_op.cc b/paddle/fluid/operators/send_op.cc
similarity index 93%
rename from paddle/operators/send_op.cc
rename to paddle/fluid/operators/send_op.cc
index ee0f268b0e..a8390aa659 100644
--- a/paddle/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -14,13 +14,13 @@ limitations under the License. */
 
 #include <ostream>
 
-#include "paddle/framework/data_type.h"
-#include "paddle/framework/framework.pb.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 #include <future>
-#include "paddle/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/detail/grpc_client.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc
similarity index 96%
rename from paddle/operators/send_recv_op_test.cc
rename to paddle/fluid/operators/send_recv_op_test.cc
index 31527a906d..716f687044 100644
--- a/paddle/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
@@ -17,11 +17,11 @@ limitations under the License. */
 #include <thread>
 
 #include "gtest/gtest.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/program_desc.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/string/printf.h"
 
 USE_NO_KERNEL_OP(send);
diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc
similarity index 98%
rename from paddle/operators/sequence_concat_op.cc
rename to paddle/fluid/operators/sequence_concat_op.cc
index 2f0aad2003..4ddf800d85 100644
--- a/paddle/operators/sequence_concat_op.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sequence_concat_op.h"
+#include "paddle/fluid/operators/sequence_concat_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sequence_concat_op.cu.cc b/paddle/fluid/operators/sequence_concat_op.cu.cc
similarity index 94%
rename from paddle/operators/sequence_concat_op.cu.cc
rename to paddle/fluid/operators/sequence_concat_op.cu.cc
index 144bdb5af6..c5a280ef9e 100644
--- a/paddle/operators/sequence_concat_op.cu.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sequence_concat_op.h"
+#include "paddle/fluid/operators/sequence_concat_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/sequence_concat_op.h b/paddle/fluid/operators/sequence_concat_op.h
similarity index 98%
rename from paddle/operators/sequence_concat_op.h
rename to paddle/fluid/operators/sequence_concat_op.h
index 8445224f46..9121196369 100644
--- a/paddle/operators/sequence_concat_op.h
+++ b/paddle/fluid/operators/sequence_concat_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/strided_memcpy.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/strided_memcpy.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sequence_conv_op.cc b/paddle/fluid/operators/sequence_conv_op.cc
similarity index 99%
rename from paddle/operators/sequence_conv_op.cc
rename to paddle/fluid/operators/sequence_conv_op.cc
index c5b7c81bd7..af9938b180 100644
--- a/paddle/operators/sequence_conv_op.cc
+++ b/paddle/fluid/operators/sequence_conv_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sequence_conv_op.h"
+#include "paddle/fluid/operators/sequence_conv_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sequence_conv_op.cu.cc b/paddle/fluid/operators/sequence_conv_op.cu.cc
similarity index 95%
rename from paddle/operators/sequence_conv_op.cu.cc
rename to paddle/fluid/operators/sequence_conv_op.cu.cc
index 0b8f2c6955..36f9e8da95 100644
--- a/paddle/operators/sequence_conv_op.cu.cc
+++ b/paddle/fluid/operators/sequence_conv_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sequence_conv_op.h"
+#include "paddle/fluid/operators/sequence_conv_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/sequence_conv_op.h b/paddle/fluid/operators/sequence_conv_op.h
similarity index 97%
rename from paddle/operators/sequence_conv_op.h
rename to paddle/fluid/operators/sequence_conv_op.h
index bb584b7bfa..1c81067fea 100644
--- a/paddle/operators/sequence_conv_op.h
+++ b/paddle/fluid/operators/sequence_conv_op.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/context_project.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/context_project.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sequence_erase_op.cc b/paddle/fluid/operators/sequence_erase_op.cc
similarity index 98%
rename from paddle/operators/sequence_erase_op.cc
rename to paddle/fluid/operators/sequence_erase_op.cc
index aa0c00aa6f..2e0adf8b19 100644
--- a/paddle/operators/sequence_erase_op.cc
+++ b/paddle/fluid/operators/sequence_erase_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sequence_erase_op.h"
+#include "paddle/fluid/operators/sequence_erase_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sequence_erase_op.cu b/paddle/fluid/operators/sequence_erase_op.cu
similarity index 95%
rename from paddle/operators/sequence_erase_op.cu
rename to paddle/fluid/operators/sequence_erase_op.cu
index a5311f15f0..43fc352fe7 100644
--- a/paddle/operators/sequence_erase_op.cu
+++ b/paddle/fluid/operators/sequence_erase_op.cu
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
-#include "paddle/operators/sequence_erase_op.h"
-#include "paddle/platform/cuda_helper.h"
+#include "paddle/fluid/operators/sequence_erase_op.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 
 namespace paddle {
 namespace operators {
@@ -87,8 +87,7 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
     // Copy LoD to GPU
     auto lod0 = lod[0];
     auto lod_len = lod0.size();
-    thrust::device_vector<size_t> dev_in_lod = lod0;
-    size_t* dev_in_lod_ptr = thrust::raw_pointer_cast(dev_in_lod.data());
+    const size_t* dev_in_lod_ptr = lod0.CUDAData(ctx.GetPlace());
 
     // Calc output LoD
     thrust::device_vector<size_t> dev_out_lod(lod_len);
diff --git a/paddle/operators/sequence_erase_op.h b/paddle/fluid/operators/sequence_erase_op.h
similarity index 97%
rename from paddle/operators/sequence_erase_op.h
rename to paddle/fluid/operators/sequence_erase_op.h
index cb2d7be009..e151279c7f 100644
--- a/paddle/operators/sequence_erase_op.h
+++ b/paddle/fluid/operators/sequence_erase_op.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sequence_expand_op.cc b/paddle/fluid/operators/sequence_expand_op.cc
similarity index 98%
rename from paddle/operators/sequence_expand_op.cc
rename to paddle/fluid/operators/sequence_expand_op.cc
index d34dbd35b6..4ebce641d2 100644
--- a/paddle/operators/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_expand_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sequence_expand_op.h"
+#include "paddle/fluid/operators/sequence_expand_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu
similarity index 94%
rename from paddle/operators/sequence_expand_op.cu
rename to paddle/fluid/operators/sequence_expand_op.cu
index 0b9638b2ce..5ac76d83da 100644
--- a/paddle/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/sequence_expand_op.h"
+#include "paddle/fluid/operators/sequence_expand_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h
similarity index 97%
rename from paddle/operators/sequence_expand_op.h
rename to paddle/fluid/operators/sequence_expand_op.h
index 6021526eee..8010627ff6 100644
--- a/paddle/operators/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_expand_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/memory/memcpy.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
 namespace paddle {
diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/fluid/operators/sequence_pool_op.cc
similarity index 99%
rename from paddle/operators/sequence_pool_op.cc
rename to paddle/fluid/operators/sequence_pool_op.cc
index 549d9620ef..2cfb336b2e 100644
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_pool_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sequence_pool_op.h"
+#include "paddle/fluid/operators/sequence_pool_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sequence_pool_op.cu b/paddle/fluid/operators/sequence_pool_op.cu
similarity index 94%
rename from paddle/operators/sequence_pool_op.cu
rename to paddle/fluid/operators/sequence_pool_op.cu
index 265f695935..364769c39b 100644
--- a/paddle/operators/sequence_pool_op.cu
+++ b/paddle/fluid/operators/sequence_pool_op.cu
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 
-#include "paddle/operators/sequence_pool_op.h"
+#include "paddle/fluid/operators/sequence_pool_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/sequence_pool_op.h b/paddle/fluid/operators/sequence_pool_op.h
similarity index 96%
rename from paddle/operators/sequence_pool_op.h
rename to paddle/fluid/operators/sequence_pool_op.h
index 7519aa1d72..7b67e6201e 100644
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/fluid/operators/sequence_pool_op.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/sequence_pooling.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/sequence_pooling.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sequence_reshape_op.cc b/paddle/fluid/operators/sequence_reshape_op.cc
similarity index 98%
rename from paddle/operators/sequence_reshape_op.cc
rename to paddle/fluid/operators/sequence_reshape_op.cc
index d89a46a712..c4e42d3eeb 100644
--- a/paddle/operators/sequence_reshape_op.cc
+++ b/paddle/fluid/operators/sequence_reshape_op.cc
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/operators/sequence_reshape_op.h"
-#include "paddle/framework/ddim.h"
+#include "paddle/fluid/operators/sequence_reshape_op.h"
+#include "paddle/fluid/framework/ddim.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sequence_reshape_op.cu b/paddle/fluid/operators/sequence_reshape_op.cu
similarity index 96%
rename from paddle/operators/sequence_reshape_op.cu
rename to paddle/fluid/operators/sequence_reshape_op.cu
index d9c2f7e9a4..5ca3497396 100644
--- a/paddle/operators/sequence_reshape_op.cu
+++ b/paddle/fluid/operators/sequence_reshape_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sequence_reshape_op.h"
+#include "paddle/fluid/operators/sequence_reshape_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/sequence_reshape_op.h b/paddle/fluid/operators/sequence_reshape_op.h
similarity index 96%
rename from paddle/operators/sequence_reshape_op.h
rename to paddle/fluid/operators/sequence_reshape_op.h
index aaae7ab292..7a5d1261da 100644
--- a/paddle/operators/sequence_reshape_op.h
+++ b/paddle/fluid/operators/sequence_reshape_op.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 
 #pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/fluid/operators/sequence_slice_op.cc
similarity index 98%
rename from paddle/operators/sequence_slice_op.cc
rename to paddle/fluid/operators/sequence_slice_op.cc
index f79106ff0f..87b8eff646 100644
--- a/paddle/operators/sequence_slice_op.cc
+++ b/paddle/fluid/operators/sequence_slice_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sequence_slice_op.h"
+#include "paddle/fluid/operators/sequence_slice_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sequence_slice_op.cu b/paddle/fluid/operators/sequence_slice_op.cu
similarity index 94%
rename from paddle/operators/sequence_slice_op.cu
rename to paddle/fluid/operators/sequence_slice_op.cu
index 43a21d619f..041fabdf9a 100755
--- a/paddle/operators/sequence_slice_op.cu
+++ b/paddle/fluid/operators/sequence_slice_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sequence_slice_op.h"
+#include "paddle/fluid/operators/sequence_slice_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/sequence_slice_op.h b/paddle/fluid/operators/sequence_slice_op.h
similarity index 97%
rename from paddle/operators/sequence_slice_op.h
rename to paddle/fluid/operators/sequence_slice_op.h
index 0e4e4cf65f..65c36a32aa 100644
--- a/paddle/operators/sequence_slice_op.h
+++ b/paddle/fluid/operators/sequence_slice_op.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/strided_memcpy.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/strided_memcpy.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_softmax_op.cc
similarity index 98%
rename from paddle/operators/sequence_softmax_op.cc
rename to paddle/fluid/operators/sequence_softmax_op.cc
index b74766f012..f966b71620 100644
--- a/paddle/operators/sequence_softmax_op.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sequence_softmax_op.h"
+#include "paddle/fluid/operators/sequence_softmax_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sequence_softmax_op.cu.cc b/paddle/fluid/operators/sequence_softmax_op.cu.cc
similarity index 94%
rename from paddle/operators/sequence_softmax_op.cu.cc
rename to paddle/fluid/operators/sequence_softmax_op.cu.cc
index 5f65b4daf9..c42dfd7540 100644
--- a/paddle/operators/sequence_softmax_op.cu.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sequence_softmax_op.h"
+#include "paddle/fluid/operators/sequence_softmax_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/sequence_softmax_op.h b/paddle/fluid/operators/sequence_softmax_op.h
similarity index 97%
rename from paddle/operators/sequence_softmax_op.h
rename to paddle/fluid/operators/sequence_softmax_op.h
index e889e88cb3..e6c21c67b3 100644
--- a/paddle/operators/sequence_softmax_op.h
+++ b/paddle/fluid/operators/sequence_softmax_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/softmax.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/softmax.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc
similarity index 98%
rename from paddle/operators/sgd_op.cc
rename to paddle/fluid/operators/sgd_op.cc
index a11c9624ce..f1e23a62f4 100644
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sgd_op.h"
+#include "paddle/fluid/operators/sgd_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sgd_op.cu b/paddle/fluid/operators/sgd_op.cu
similarity index 95%
rename from paddle/operators/sgd_op.cu
rename to paddle/fluid/operators/sgd_op.cu
index 29f5aa3542..09374e2049 100644
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/fluid/operators/sgd_op.cu
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/sgd_op.h"
-#include "paddle/platform/cuda_helper.h"
+#include "paddle/fluid/operators/sgd_op.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 
 namespace paddle {
 namespace operators {
@@ -102,8 +102,8 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
       dim3 grid(1, in_rows.size());
       SparseSGDFunctorKernel<
           T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
-          in_data, in_rows.cuda_data(), learning_rate->data<T>(), out_data,
-          in_row_numel);
+          in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data<T>(),
+          out_data, in_row_numel);
 
     } else {
       PADDLE_THROW("Unsupported Variable Type of Grad");
diff --git a/paddle/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h
similarity index 95%
rename from paddle/operators/sgd_op.h
rename to paddle/fluid/operators/sgd_op.h
index a6c544591e..f1eaaecdb1 100644
--- a/paddle/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/selected_rows.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/selected_rows.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc
similarity index 97%
rename from paddle/operators/shrink_rnn_memory_op.cc
rename to paddle/fluid/operators/shrink_rnn_memory_op.cc
index bf870115a4..df50a324fd 100644
--- a/paddle/operators/shrink_rnn_memory_op.cc
+++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc
@@ -11,10 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/framework/lod_rank_table.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/operators/array_operator.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/operators/array_operator.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
similarity index 98%
rename from paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
rename to paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
index c526a88a12..3188415a2b 100644
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sigmoid_cross_entropy_with_logits_op.h"
+#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
similarity index 93%
rename from paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
rename to paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
index 3f393265f4..daa9d3e4fa 100644
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/sigmoid_cross_entropy_with_logits_op.h"
+#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits,
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
similarity index 96%
rename from paddle/operators/sigmoid_cross_entropy_with_logits_op.h
rename to paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
index b78bcc436e..977849f762 100644
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc
similarity index 98%
rename from paddle/operators/sign_op.cc
rename to paddle/fluid/operators/sign_op.cc
index f63eaa4464..54b962538b 100644
--- a/paddle/operators/sign_op.cc
+++ b/paddle/fluid/operators/sign_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sign_op.h"
+#include "paddle/fluid/operators/sign_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sign_op.cu b/paddle/fluid/operators/sign_op.cu
similarity index 94%
rename from paddle/operators/sign_op.cu
rename to paddle/fluid/operators/sign_op.cu
index f224880cff..93cdb311eb 100644
--- a/paddle/operators/sign_op.cu
+++ b/paddle/fluid/operators/sign_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sign_op.h"
+#include "paddle/fluid/operators/sign_op.h"
 
 REGISTER_OP_CUDA_KERNEL(
     sign,
diff --git a/paddle/operators/sign_op.h b/paddle/fluid/operators/sign_op.h
similarity index 93%
rename from paddle/operators/sign_op.h
rename to paddle/fluid/operators/sign_op.h
index 9fe49ae1a2..1c2ebebee4 100644
--- a/paddle/operators/sign_op.h
+++ b/paddle/fluid/operators/sign_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc
similarity index 99%
rename from paddle/operators/smooth_l1_loss_op.cc
rename to paddle/fluid/operators/smooth_l1_loss_op.cc
index dcb18d729d..be4c7a56a8 100644
--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/fluid/operators/smooth_l1_loss_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/smooth_l1_loss_op.h"
+#include "paddle/fluid/operators/smooth_l1_loss_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/smooth_l1_loss_op.cu b/paddle/fluid/operators/smooth_l1_loss_op.cu
similarity index 94%
rename from paddle/operators/smooth_l1_loss_op.cu
rename to paddle/fluid/operators/smooth_l1_loss_op.cu
index 213429bc37..94c0d6cd29 100644
--- a/paddle/operators/smooth_l1_loss_op.cu
+++ b/paddle/fluid/operators/smooth_l1_loss_op.cu
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 
-#include "paddle/operators/smooth_l1_loss_op.h"
+#include "paddle/fluid/operators/smooth_l1_loss_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/smooth_l1_loss_op.h b/paddle/fluid/operators/smooth_l1_loss_op.h
similarity index 97%
rename from paddle/operators/smooth_l1_loss_op.h
rename to paddle/fluid/operators/smooth_l1_loss_op.h
index 3facfae116..325ad824e1 100644
--- a/paddle/operators/smooth_l1_loss_op.h
+++ b/paddle/fluid/operators/smooth_l1_loss_op.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/platform/hostdevice.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
similarity index 98%
rename from paddle/operators/softmax_op.cc
rename to paddle/fluid/operators/softmax_op.cc
index cef1f1fc99..1d9462d08b 100644
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/softmax_op.h"
+#include "paddle/fluid/operators/softmax_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/softmax_op.cu.cc b/paddle/fluid/operators/softmax_op.cu.cc
similarity index 94%
rename from paddle/operators/softmax_op.cu.cc
rename to paddle/fluid/operators/softmax_op.cu.cc
index e7da40f3e8..c53d8a2bc8 100644
--- a/paddle/operators/softmax_op.cu.cc
+++ b/paddle/fluid/operators/softmax_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/softmax_op.h"
+#include "paddle/fluid/operators/softmax_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h
similarity index 94%
rename from paddle/operators/softmax_op.h
rename to paddle/fluid/operators/softmax_op.h
index 63e379a3b3..9287f02310 100644
--- a/paddle/operators/softmax_op.h
+++ b/paddle/fluid/operators/softmax_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/softmax.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/softmax.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
similarity index 99%
rename from paddle/operators/softmax_with_cross_entropy_op.cc
rename to paddle/fluid/operators/softmax_with_cross_entropy_op.cc
index 7135780c92..79d56cb97d 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/softmax_with_cross_entropy_op.h"
+#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
similarity index 98%
rename from paddle/operators/softmax_with_cross_entropy_op.cu
rename to paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index 61583c6161..410d9e8887 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 
-#include "paddle/operators/softmax_with_cross_entropy_op.h"
+#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
similarity index 94%
rename from paddle/operators/softmax_with_cross_entropy_op.h
rename to paddle/fluid/operators/softmax_with_cross_entropy_op.h
index 6bde0f37e0..0927efd42c 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/cross_entropy.h"
-#include "paddle/operators/math/softmax.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/cross_entropy.h"
+#include "paddle/fluid/operators/math/softmax.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc
similarity index 98%
rename from paddle/operators/split_lod_tensor_op.cc
rename to paddle/fluid/operators/split_lod_tensor_op.cc
index bd93c49201..f821dc54d7 100644
--- a/paddle/operators/split_lod_tensor_op.cc
+++ b/paddle/fluid/operators/split_lod_tensor_op.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/memory/memcpy.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/split_op.cc b/paddle/fluid/operators/split_op.cc
similarity index 98%
rename from paddle/operators/split_op.cc
rename to paddle/fluid/operators/split_op.cc
index 8d55ae5dd7..f8bc22fe1d 100644
--- a/paddle/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/split_op.h"
-#include "paddle/operators/net_op.h"
+#include "paddle/fluid/operators/split_op.h"
+#include "paddle/fluid/operators/net_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/split_op.cu.cc b/paddle/fluid/operators/split_op.cu.cc
similarity index 94%
rename from paddle/operators/split_op.cu.cc
rename to paddle/fluid/operators/split_op.cu.cc
index dbad0bbf68..279691c759 100644
--- a/paddle/operators/split_op.cu.cc
+++ b/paddle/fluid/operators/split_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/split_op.h"
+#include "paddle/fluid/operators/split_op.h"
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     split, ops::SplitOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/split_op.h b/paddle/fluid/operators/split_op.h
similarity index 94%
rename from paddle/operators/split_op.h
rename to paddle/fluid/operators/split_op.h
index a38c435d53..e78218f2fb 100644
--- a/paddle/operators/split_op.h
+++ b/paddle/fluid/operators/split_op.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/strided_memcpy.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/strided_memcpy.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/split_selected_rows_op.cc b/paddle/fluid/operators/split_selected_rows_op.cc
similarity index 98%
rename from paddle/operators/split_selected_rows_op.cc
rename to paddle/fluid/operators/split_selected_rows_op.cc
index 0515ea13aa..113ce2ce10 100644
--- a/paddle/operators/split_selected_rows_op.cc
+++ b/paddle/fluid/operators/split_selected_rows_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/split_selected_rows_op.h"
+#include "paddle/fluid/operators/split_selected_rows_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/split_selected_rows_op.cu b/paddle/fluid/operators/split_selected_rows_op.cu
similarity index 92%
rename from paddle/operators/split_selected_rows_op.cu
rename to paddle/fluid/operators/split_selected_rows_op.cu
index 983285480f..0bbf1ecfae 100644
--- a/paddle/operators/split_selected_rows_op.cu
+++ b/paddle/fluid/operators/split_selected_rows_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/split_selected_rows_op.h"
+#include "paddle/fluid/operators/split_selected_rows_op.h"
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     split_selected_rows,
diff --git a/paddle/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h
similarity index 96%
rename from paddle/operators/split_selected_rows_op.h
rename to paddle/fluid/operators/split_selected_rows_op.h
index 12e64e2901..527264bd67 100644
--- a/paddle/operators/split_selected_rows_op.h
+++ b/paddle/fluid/operators/split_selected_rows_op.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc
similarity index 98%
rename from paddle/operators/spp_op.cc
rename to paddle/fluid/operators/spp_op.cc
index c0aa87b0f0..e6755b1200 100644
--- a/paddle/operators/spp_op.cc
+++ b/paddle/fluid/operators/spp_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/spp_op.h"
+#include "paddle/fluid/operators/spp_op.h"
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/operators/spp_op.cu.cc b/paddle/fluid/operators/spp_op.cu.cc
similarity index 95%
rename from paddle/operators/spp_op.cu.cc
rename to paddle/fluid/operators/spp_op.cu.cc
index 761e4d6c4a..cad2ca5ef8 100644
--- a/paddle/operators/spp_op.cu.cc
+++ b/paddle/fluid/operators/spp_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/spp_op.h"
+#include "paddle/fluid/operators/spp_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/spp_op.h b/paddle/fluid/operators/spp_op.h
similarity index 97%
rename from paddle/operators/spp_op.h
rename to paddle/fluid/operators/spp_op.h
index f35b305d02..1da1f80580 100644
--- a/paddle/operators/spp_op.h
+++ b/paddle/fluid/operators/spp_op.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/pooling.h"
-#include "paddle/operators/strided_memcpy.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/pooling.h"
+#include "paddle/fluid/operators/strided_memcpy.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/squared_l2_distance_op.cc b/paddle/fluid/operators/squared_l2_distance_op.cc
similarity index 98%
rename from paddle/operators/squared_l2_distance_op.cc
rename to paddle/fluid/operators/squared_l2_distance_op.cc
index 9e097176f3..c1d0c2c7f3 100644
--- a/paddle/operators/squared_l2_distance_op.cc
+++ b/paddle/fluid/operators/squared_l2_distance_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/squared_l2_distance_op.h"
+#include "paddle/fluid/operators/squared_l2_distance_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/squared_l2_distance_op.cu b/paddle/fluid/operators/squared_l2_distance_op.cu
similarity index 94%
rename from paddle/operators/squared_l2_distance_op.cu
rename to paddle/fluid/operators/squared_l2_distance_op.cu
index f2648dde5e..959e7afac9 100644
--- a/paddle/operators/squared_l2_distance_op.cu
+++ b/paddle/fluid/operators/squared_l2_distance_op.cu
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 
-#include "paddle/operators/squared_l2_distance_op.h"
+#include "paddle/fluid/operators/squared_l2_distance_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/squared_l2_distance_op.h b/paddle/fluid/operators/squared_l2_distance_op.h
similarity index 98%
rename from paddle/operators/squared_l2_distance_op.h
rename to paddle/fluid/operators/squared_l2_distance_op.h
index 5bd5f4819a..aab241247e 100644
--- a/paddle/operators/squared_l2_distance_op.h
+++ b/paddle/fluid/operators/squared_l2_distance_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/squared_l2_norm_op.cc b/paddle/fluid/operators/squared_l2_norm_op.cc
similarity index 97%
rename from paddle/operators/squared_l2_norm_op.cc
rename to paddle/fluid/operators/squared_l2_norm_op.cc
index 6626bf0375..a43cc22994 100644
--- a/paddle/operators/squared_l2_norm_op.cc
+++ b/paddle/fluid/operators/squared_l2_norm_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/squared_l2_norm_op.h"
+#include "paddle/fluid/operators/squared_l2_norm_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/squared_l2_norm_op.cu b/paddle/fluid/operators/squared_l2_norm_op.cu
similarity index 94%
rename from paddle/operators/squared_l2_norm_op.cu
rename to paddle/fluid/operators/squared_l2_norm_op.cu
index b222113a8c..52f4ab79b2 100644
--- a/paddle/operators/squared_l2_norm_op.cu
+++ b/paddle/fluid/operators/squared_l2_norm_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/squared_l2_norm_op.h"
+#include "paddle/fluid/operators/squared_l2_norm_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/squared_l2_norm_op.h b/paddle/fluid/operators/squared_l2_norm_op.h
similarity index 96%
rename from paddle/operators/squared_l2_norm_op.h
rename to paddle/fluid/operators/squared_l2_norm_op.h
index 1ce26c775e..56524636b8 100644
--- a/paddle/operators/squared_l2_norm_op.h
+++ b/paddle/fluid/operators/squared_l2_norm_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h
similarity index 96%
rename from paddle/operators/strided_memcpy.h
rename to paddle/fluid/operators/strided_memcpy.h
index 735cabcd97..8a99b405e2 100644
--- a/paddle/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/operators/detail/strided_memcpy.h"
+#include "paddle/fluid/operators/detail/strided_memcpy.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/strided_memcpy_test.cc b/paddle/fluid/operators/strided_memcpy_test.cc
similarity index 97%
rename from paddle/operators/strided_memcpy_test.cc
rename to paddle/fluid/operators/strided_memcpy_test.cc
index 06d8118855..a369941a99 100644
--- a/paddle/operators/strided_memcpy_test.cc
+++ b/paddle/fluid/operators/strided_memcpy_test.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/strided_memcpy.h"
+#include "paddle/fluid/operators/strided_memcpy.h"
 #include "gtest/gtest.h"
-#include "paddle/memory/memory.h"
+#include "paddle/fluid/memory/memory.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
similarity index 98%
rename from paddle/operators/sum_op.cc
rename to paddle/fluid/operators/sum_op.cc
index 88ed67f7ba..96f851720a 100644
--- a/paddle/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -9,10 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sum_op.h"
+#include "paddle/fluid/operators/sum_op.h"
 #include <vector>
-#include "paddle/framework/var_type_inference.h"
-#include "paddle/operators/detail/safe_ref.h"
+#include "paddle/fluid/framework/var_type_inference.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu
similarity index 95%
rename from paddle/operators/sum_op.cu
rename to paddle/fluid/operators/sum_op.cu
index 873155076c..8d8f90d751 100644
--- a/paddle/operators/sum_op.cu
+++ b/paddle/fluid/operators/sum_op.cu
@@ -10,7 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/sum_op.h"
+#include "paddle/fluid/operators/sum_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/sum_op.h b/paddle/fluid/operators/sum_op.h
similarity index 95%
rename from paddle/operators/sum_op.h
rename to paddle/fluid/operators/sum_op.h
index 3d8102c3ae..5e1222c6ef 100644
--- a/paddle/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -10,11 +10,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/lod_tensor_array.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/target_assign_op.cc b/paddle/fluid/operators/target_assign_op.cc
new file mode 100644
index 0000000000..24f1b72523
--- /dev/null
+++ b/paddle/fluid/operators/target_assign_op.cc
@@ -0,0 +1,202 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/target_assign_op.h"
+
+namespace paddle {
+namespace operators {
+
+class TargetAssignOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    // checkout inputs
+    PADDLE_ENFORCE(ctx->HasInput("EncodedGTBBox"),
+                   "Input(EncodedGTBBox) of TargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("GTScoreLabel"),
+                   "Input(GTScoreLabel) of TargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("MatchIndices"),
+                   "Input(MatchIndices) of TargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("NegIndices"),
+                   "Input(NegIndices) of TargetAssignOp should not be null");
+
+    // checkout outputs
+    PADDLE_ENFORCE(
+        ctx->HasOutput("PredBBoxLabel"),
+        "Output(PredBBoxLabel) of TargetAssignOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("PredBBoxWeight"),
+        "Output(PredBBoxWeight) of TargetAssignOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("PredScoreLabel"),
+        "Output(PredScoreLabel) of TargetAssignOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("PredScoreWeight"),
+        "Output(PredScoreWeight) of TargetAssignOp should not be null.");
+
+    auto blabel_dims = ctx->GetInputDim("EncodedGTBBox");
+    auto slabel_dims = ctx->GetInputDim("GTScoreLabel");
+    auto mi_dims = ctx->GetInputDim("MatchIndices");
+    auto neg_dims = ctx->GetInputDim("NegIndices");
+
+    PADDLE_ENFORCE_EQ(blabel_dims.size(), 3UL,
+                      "The rank of Input(EncodedGTBBox) must be 3.");
+    PADDLE_ENFORCE_EQ(slabel_dims.size(), 2UL,
+                      "The rank of Input(GTScoreLabel) must be 2.");
+    PADDLE_ENFORCE_EQ(mi_dims.size(), 2UL,
+                      "The rank of Input(MatchIndices) must be 2.");
+    PADDLE_ENFORCE_EQ(neg_dims.size(), 2UL,
+                      "The rank of Input(NegIndices) must be 2.");
+
+    PADDLE_ENFORCE_EQ(blabel_dims[0], slabel_dims[0],
+                      "The 1st dimension (means the total number of "
+                      "ground-truth bounding boxes) of Input(EncodedGTBBox) "
+                      "and Input(GTScoreLabel) must be the same.");
+    PADDLE_ENFORCE_EQ(blabel_dims[1], mi_dims[1],
+                      "The 2nd dimension (means the number of priod boxes) "
+                      "of Input(EncodedGTBBox) and "
+                      "Input(MatchIndices) must be the same.");
+    PADDLE_ENFORCE_EQ(blabel_dims[2], 4,
+                      "The 3rd dimension of Input(EncodedGTBBox) must be 4.");
+
+    auto n = mi_dims[0];
+    auto np = mi_dims[1];
+    ctx->SetOutputDim("PredBBoxLabel", {n, np, 4});
+    ctx->SetOutputDim("PredBBoxWeight", {n, np, 1});
+    ctx->SetOutputDim("PredScoreLabel", {n, np, 1});
+    ctx->SetOutputDim("PredScoreWeight", {n, np, 1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>("EncodedGTBBox")->type()),
+        ctx.device_context());
+  }
+};
+
+class TargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  TargetAssignOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("EncodedGTBBox",
+             "(LoDTensor), The encoded ground-truth bounding boxes with shape "
+             "[Ng, Np, 4], where Ng is the total number of ground-truth boxes "
+             "in this mini-batch, Np the number of predictions, 4 is the "
+             "number of coordinate in [xmin, ymin, xmax, ymax] layout.");
+    AddInput("GTScoreLabel",
+             "(LoDTensor, default LoDTensor<int>),  The input ground-truth "
+             "labels with shape [Ng, 1], where the Ng is the same as it in "
+             "the input of EncodedGTBBox.");
+    AddInput("MatchIndices",
+             "(Tensor, default Tensor<int>), The input matched indices "
+             "with shape [N, Np], where N is the batch size, Np is the same "
+             "as it in the input of EncodedGTBBox. If MatchIndices[i][j] "
+             "is -1, the j-th prior box is not matched to any ground-truh "
+             "box in i-th instance.");
+    AddInput("NegIndices",
+             "(LoDTensor, default LoDTensor<int>), The input negative example "
+             "indices with shape [Neg, 1], where is the total number of "
+             "negative example indices.");
+    AddAttr<int>("background_label",
+                 "(int, default 0), Label index of background class.")
+        .SetDefault(0);
+    AddOutput("PredBBoxLabel",
+              "(Tensor), The output encoded ground-truth labels "
+              "with shape [N, Np, 4], N is the batch size and Np, 4 is the "
+              "same as they in input of EncodedGTBBox. If MatchIndices[i][j] "
+              "is -1, the PredBBoxLabel[i][j][:] is the encoded ground-truth "
+              "box for background_label in i-th instance.");
+    AddOutput("PredBBoxWeight",
+              "(Tensor), The weight for PredBBoxLabel with the shape "
+              "of [N, Np, 1]");
+    AddOutput("PredScoreLabel",
+              "(Tensor, default Tensor<int>), The output score labels for "
+              "each predictions with shape [N, Np, 1]. If MatchIndices[i][j] "
+              "is -1, PredScoreLabel[i][j] = background_label.");
+    AddOutput("PredScoreWeight",
+              "(Tensor), The weight for PredScoreLabel with the shape "
+              "of [N, Np, 1]");
+    AddComment(R"DOC(
+This operator is, for given the encoded boxes between prior boxes and
+ground-truth boxes and ground-truth class labels, to assign classification
+and regression targets to each prior box as well as weights to each
+prior box. The weights is used to specify which prior box would not contribute
+to training loss.
+
+For each instance, the output `PredBBoxLabel`, `PredBBoxWeight`,
+`PredScoreLabel` and `PredScoreWeight` are assigned based on `MatchIndices`.
+Assumed that the row offset for each instance in `EncodedGTBBox` is called lod,
+this operato assigns classification/regression targets by performing the
+following steps:
+
+1. Assigning all outpts based on `MatchIndices`:
+
+If id = MatchIndices[i][j] > 0,
+
+    PredBBoxLabel[i][j] = EncodedGTBBox[lod[i] + id][j]
+    PredBBoxWeight[i][j] = 1.
+    PredScoreLabel[i][j] = GTScoreLabel[lod[i] + id]
+    PredScoreWeight[i][j] = 1.
+
+Otherwise, 
+
+    PredBBoxLabel[j][j] = [0., 0., 0., 0.]
+    PredBBoxWeight[i][j] = 0.
+    PredScoreLabel[i][j] = background_label
+    PredScoreWeight[i][j] = 0.
+
+2. Assigning PredScoreWeight based on `NegIndices`:
+
+Assumed that the row offset for each instance in `NegIndices` is caleed neg_lod,
+for i-th instance and all ids of NegIndices in this instance:
+
+    PredScoreLabel[i][id] = background_label
+    PredScoreWeight[i][id] = 1.0
+
+    )DOC");
+  }
+};
+
+template <typename T>
+struct NegTargetAssignFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx, const int* neg_indices,
+                  const size_t* lod, const int num, const int num_prior_box,
+                  const int background_label, int* out_label, T* out_label_wt) {
+    for (int i = 0; i < num; ++i) {
+      for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
+        int id = neg_indices[j];
+        out_label[i * num_prior_box + id] = background_label;
+        out_label_wt[i * num_prior_box + id] = static_cast<T>(1.0);
+      }
+    }
+  }
+};
+
+template struct NegTargetAssignFunctor<platform::CPUDeviceContext, float>;
+template struct NegTargetAssignFunctor<platform::CPUDeviceContext, double>;
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(target_assign, ops::TargetAssignOp,
+                             ops::TargetAssignOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    target_assign,
+    ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/target_assign_op.cu b/paddle/fluid/operators/target_assign_op.cu
new file mode 100644
index 0000000000..5c012d27ad
--- /dev/null
+++ b/paddle/fluid/operators/target_assign_op.cu
@@ -0,0 +1,61 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/target_assign_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void NegTargetAssignKernel(const int* neg_indices, const size_t* lod,
+                                      const int num, const int num_prior_box,
+                                      const int background_label,
+                                      int* out_label, T* out_label_wt) {
+  int bidx = blockIdx.x;
+  int st = lod[bidx];
+  int ed = lod[bidx + 1];
+
+  int row_start = bidx * num_prior_box;
+  for (int i = st + threadIdx.x; i < ed; i += blockDim.x) {
+    int id = row_start + neg_indices[i];
+    out_label[id] = background_label;
+    out_label_wt[id] = 1.;
+  }
+}
+
+template <typename T>
+struct NegTargetAssignFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const int* neg_indices, const size_t* lod, const int num,
+                  const int num_prior_box, const int background_label,
+                  int* out_label, T* out_label_wt) {
+    const int block_size = 256;
+    const int grid_size = num;
+    NegTargetAssignKernel<T><<<grid_size, block_size, 0, ctx.stream()>>>(
+        neg_indices, lod, num, num_prior_box, background_label, out_label,
+        out_label_wt);
+  }
+};
+
+template struct NegTargetAssignFunctor<platform::CUDADeviceContext, float>;
+template struct NegTargetAssignFunctor<platform::CUDADeviceContext, double>;
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    target_assign,
+    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/target_assign_op.h b/paddle/fluid/operators/target_assign_op.h
new file mode 100644
index 0000000000..876111523a
--- /dev/null
+++ b/paddle/fluid/operators/target_assign_op.h
@@ -0,0 +1,160 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/for_range.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct TargetAssignFunctor {
+  const T* gt_box_;
+  const int* gt_label_;
+  const int* match_indices_;
+  const size_t* lod_;
+  const int background_label_;
+  const int64_t num_;
+  const int64_t num_prior_box_;
+
+  T* out_box_;
+  T* out_box_wt_;
+  int* out_label_;
+  T* out_label_wt_;
+
+  TargetAssignFunctor(const T* gt_box, const int* gt_label,
+                      const int* match_indices, const size_t* lod,
+                      const int background_label, const int64_t num,
+                      const int64_t np, T* out_box, T* out_box_wt,
+                      int* out_label, T* out_label_wt)
+      : gt_box_(gt_box),
+        gt_label_(gt_label),
+        match_indices_(match_indices),
+        lod_(lod),
+        background_label_(background_label),
+        num_(num),
+        num_prior_box_(np),
+        out_box_(out_box),
+        out_box_wt_(out_box_wt),
+        out_label_(out_label),
+        out_label_wt_(out_label_wt) {}
+
+  HOSTDEVICE void operator()(size_t i) const {
+    int row = i / num_prior_box_;
+    int col = i - row * num_prior_box_;
+
+    size_t row_off = lod_[row];
+    int offset = row * num_prior_box_ + col;
+
+    int id = match_indices_[offset];
+    T* obox = out_box_ + offset * 4;
+    int* olabel = out_label_ + offset;
+    T* obox_wt = out_box_wt_ + offset;
+    T* olabel_wt = out_label_wt_ + offset;
+
+    if (id > -1) {
+      const T* gtbox = gt_box_ + ((row_off + id) * num_prior_box_ + col) * 4;
+
+      obox[0] = gtbox[0];
+      obox[1] = gtbox[1];
+      obox[2] = gtbox[2];
+      obox[3] = gtbox[3];
+
+      olabel[0] = gt_label_[row_off + id];
+      obox_wt[0] = static_cast<T>(1.);
+      olabel_wt[0] = static_cast<T>(1.);
+    } else {
+      obox[0] = static_cast<T>(0.);
+      obox[1] = static_cast<T>(0.);
+      obox[2] = static_cast<T>(0.);
+      obox[3] = static_cast<T>(0.);
+
+      olabel[0] = background_label_;
+      obox_wt[0] = static_cast<T>(0.);
+      olabel_wt[0] = static_cast<T>(0.);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+struct NegTargetAssignFunctor {
+  void operator()(const platform::DeviceContext& ctx, const int* neg_indices,
+                  const size_t* lod, const int num, const int num_prior_box,
+                  const int background_label, int* out_label,
+                  T* out_label_wt) const;
+};
+
+template <typename DeviceContext, typename T>
+class TargetAssignKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* enc_gt_box = ctx.Input<framework::LoDTensor>("EncodedGTBBox");
+    auto* gt_label = ctx.Input<framework::LoDTensor>("GTScoreLabel");
+    auto* match_indices = ctx.Input<framework::Tensor>("MatchIndices");
+    auto* neg_indices = ctx.Input<framework::LoDTensor>("NegIndices");
+
+    auto* out_box = ctx.Output<framework::Tensor>("PredBBoxLabel");
+    auto* out_box_wt = ctx.Output<framework::Tensor>("PredBBoxWeight");
+    auto* out_label = ctx.Output<framework::Tensor>("PredScoreLabel");
+    auto* out_label_wt = ctx.Output<framework::Tensor>("PredScoreWeight");
+
+    PADDLE_ENFORCE_EQ(enc_gt_box->lod().size(), 1UL);
+    PADDLE_ENFORCE_EQ(gt_label->lod().size(), 1UL);
+    PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL);
+
+    int background_label = ctx.Attr<int>("background_label");
+
+    const T* box_data = enc_gt_box->data<T>();
+    const int* label_data = gt_label->data<int>();
+    const int* match_idx_data = match_indices->data<int>();
+    const int* neg_idx_data = neg_indices->data<int>();
+
+    T* obox_data = out_box->mutable_data<T>(ctx.GetPlace());
+    T* obox_wt_data = out_box_wt->mutable_data<T>(ctx.GetPlace());
+    int* olabel_data = out_label->mutable_data<int>(ctx.GetPlace());
+    T* olabel_wt_data = out_label_wt->mutable_data<T>(ctx.GetPlace());
+
+    int64_t num = match_indices->dims()[0];
+    int64_t num_prior_box = match_indices->dims()[1];
+
+    auto gt_lod = enc_gt_box->lod().back();
+    auto gt_label_lod = gt_label->lod().back();
+    auto neg_lod = neg_indices->lod().back();
+    for (size_t i = 0; i < gt_lod.size(); ++i) {
+      PADDLE_ENFORCE_EQ(gt_lod.data()[i], gt_label_lod.data()[i]);
+    }
+
+    size_t* gt_lod_data = gt_lod.MutableData(ctx.GetPlace());
+    size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace());
+
+    TargetAssignFunctor<T> functor(box_data, label_data, match_idx_data,
+                                   gt_lod_data, background_label, num,
+                                   num_prior_box, obox_data, obox_wt_data,
+                                   olabel_data, olabel_wt_data);
+
+    auto& device_ctx = ctx.template device_context<DeviceContext>();
+    platform::ForRange<DeviceContext> for_range(device_ctx,
+                                                num * num_prior_box);
+    for_range(functor);
+
+    NegTargetAssignFunctor<DeviceContext, T> neg_trg_functor;
+    neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, num, num_prior_box,
+                    background_label, olabel_data, olabel_wt_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/fluid/operators/tensor_array_read_write_op.cc
similarity index 98%
rename from paddle/operators/tensor_array_read_write_op.cc
rename to paddle/fluid/operators/tensor_array_read_write_op.cc
index a70be8b875..50811fb224 100644
--- a/paddle/operators/tensor_array_read_write_op.cc
+++ b/paddle/fluid/operators/tensor_array_read_write_op.cc
@@ -11,8 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/operators/array_operator.h"
-#include "paddle/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/array_operator.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc
similarity index 98%
rename from paddle/operators/top_k_op.cc
rename to paddle/fluid/operators/top_k_op.cc
index a8ddd72973..c81ea860d0 100644
--- a/paddle/operators/top_k_op.cc
+++ b/paddle/fluid/operators/top_k_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/top_k_op.h"
+#include "paddle/fluid/operators/top_k_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu
similarity index 99%
rename from paddle/operators/top_k_op.cu
rename to paddle/fluid/operators/top_k_op.cu
index f7bf58e721..5390cb5063 100644
--- a/paddle/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/platform/assert.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/assert.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h
similarity index 96%
rename from paddle/operators/top_k_op.h
rename to paddle/fluid/operators/top_k_op.h
index bf42e15e6b..e32b351500 100644
--- a/paddle/operators/top_k_op.h
+++ b/paddle/fluid/operators/top_k_op.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 #include <algorithm>
 #include <iostream>
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc
similarity index 98%
rename from paddle/operators/transpose_op.cc
rename to paddle/fluid/operators/transpose_op.cc
index c7ae162638..a3d8acffc2 100644
--- a/paddle/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/transpose_op.h"
+#include "paddle/fluid/operators/transpose_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/transpose_op.cu.cc b/paddle/fluid/operators/transpose_op.cu.cc
similarity index 94%
rename from paddle/operators/transpose_op.cu.cc
rename to paddle/fluid/operators/transpose_op.cu.cc
index 281c4468cc..f8667ab369 100644
--- a/paddle/operators/transpose_op.cu.cc
+++ b/paddle/fluid/operators/transpose_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/transpose_op.h"
+#include "paddle/fluid/operators/transpose_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h
similarity index 96%
rename from paddle/operators/transpose_op.h
rename to paddle/fluid/operators/transpose_op.h
index b9686a2db3..1fb419474a 100644
--- a/paddle/operators/transpose_op.h
+++ b/paddle/fluid/operators/transpose_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
similarity index 97%
rename from paddle/operators/uniform_random_op.cc
rename to paddle/fluid/operators/uniform_random_op.cc
index 3a314bdb9b..b6fea1d448 100644
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -11,8 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
similarity index 96%
rename from paddle/operators/uniform_random_op.cu
rename to paddle/fluid/operators/uniform_random_op.cu
index 719d0872a7..9afca68e59 100644
--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <thrust/random.h>
 #include <thrust/transform.h>
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc
similarity index 99%
rename from paddle/operators/unpool_op.cc
rename to paddle/fluid/operators/unpool_op.cc
index 50cee11a7a..2e0b271fed 100644
--- a/paddle/operators/unpool_op.cc
+++ b/paddle/fluid/operators/unpool_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/unpool_op.h"
+#include "paddle/fluid/operators/unpool_op.h"
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/fluid/operators/unpool_op.cu.cc
similarity index 95%
rename from paddle/operators/unpool_op.cu.cc
rename to paddle/fluid/operators/unpool_op.cu.cc
index 9b002e35c4..15d81eb296 100644
--- a/paddle/operators/unpool_op.cu.cc
+++ b/paddle/fluid/operators/unpool_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/unpool_op.h"
+#include "paddle/fluid/operators/unpool_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/unpool_op.h b/paddle/fluid/operators/unpool_op.h
similarity index 95%
rename from paddle/operators/unpool_op.h
rename to paddle/fluid/operators/unpool_op.h
index ee18b118c9..ceed550739 100644
--- a/paddle/operators/unpool_op.h
+++ b/paddle/fluid/operators/unpool_op.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/unpooling.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/unpooling.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc
similarity index 99%
rename from paddle/operators/warpctc_op.cc
rename to paddle/fluid/operators/warpctc_op.cc
index bd0c5f9957..1c05fed0b4 100644
--- a/paddle/operators/warpctc_op.cc
+++ b/paddle/fluid/operators/warpctc_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/warpctc_op.h"
+#include "paddle/fluid/operators/warpctc_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/warpctc_op.cu.cc b/paddle/fluid/operators/warpctc_op.cu.cc
similarity index 94%
rename from paddle/operators/warpctc_op.cu.cc
rename to paddle/fluid/operators/warpctc_op.cu.cc
index 7d8527ac75..9ee7f970a9 100644
--- a/paddle/operators/warpctc_op.cu.cc
+++ b/paddle/fluid/operators/warpctc_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/warpctc_op.h"
+#include "paddle/fluid/operators/warpctc_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h
similarity index 97%
rename from paddle/operators/warpctc_op.h
rename to paddle/fluid/operators/warpctc_op.h
index 8aea061c00..a1de71627e 100644
--- a/paddle/operators/warpctc_op.h
+++ b/paddle/fluid/operators/warpctc_op.h
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
-#include "paddle/operators/math/sequence_padding.h"
-#include "paddle/operators/math/sequence_scale.h"
-#include "paddle/platform/dynload/warpctc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/sequence_padding.h"
+#include "paddle/fluid/operators/math/sequence_scale.h"
+#include "paddle/fluid/platform/dynload/warpctc.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/while_op.cc b/paddle/fluid/operators/while_op.cc
similarity index 98%
rename from paddle/operators/while_op.cc
rename to paddle/fluid/operators/while_op.cc
index a744ebd615..d254c572ac 100644
--- a/paddle/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <vector>
-#include "paddle/framework/executor.h"
-#include "paddle/framework/lod_tensor_array.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
-#include "paddle/operators/detail/safe_ref.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/platform/.clang-format b/paddle/fluid/platform/.clang-format
new file mode 100644
index 0000000000..29282dc87e
--- /dev/null
+++ b/paddle/fluid/platform/.clang-format
@@ -0,0 +1,5 @@
+---
+Language:        Cpp
+BasedOnStyle:  Google
+Standard:  Cpp11 
+...
diff --git a/paddle/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
similarity index 79%
rename from paddle/platform/CMakeLists.txt
rename to paddle/fluid/platform/CMakeLists.txt
index d68caea997..5ce4b3de39 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -39,11 +39,3 @@ nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)
 
 cc_library(profiler SRCS profiler.cc DEPS device_context)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
-
-if(NOT WITH_C_API AND WITH_FLUID)
-  file(GLOB PLATFORM_HEADERS *.h)
-  file(GLOB PLATFORM_dynload_HEADERS dynload/*.h)
-  install(FILES ${PLATFORM_HEADERS} DESTINATION include/paddle/platform)
-  install(FILES ${PLATFORM_HEADERS} DESTINATION include/paddle/platform/dynload)
-  install(FILES details/device_ptr_cast.h DESTINATION include/paddle/platform/details)
-endif()
diff --git a/paddle/platform/assert.h b/paddle/fluid/platform/assert.h
similarity index 67%
rename from paddle/platform/assert.h
rename to paddle/fluid/platform/assert.h
index d813b9529b..1f5a8f6a19 100644
--- a/paddle/platform/assert.h
+++ b/paddle/fluid/platform/assert.h
@@ -1,16 +1,16 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/platform/call_once.h b/paddle/fluid/platform/call_once.h
similarity index 100%
rename from paddle/platform/call_once.h
rename to paddle/fluid/platform/call_once.h
diff --git a/paddle/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
similarity index 97%
rename from paddle/platform/cpu_info.cc
rename to paddle/fluid/platform/cpu_info.cc
index 78e1fa9df5..47473aead0 100644
--- a/paddle/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/platform/cpu_info.h"
+#include "paddle/fluid/platform/cpu_info.h"
 
 #ifdef __APPLE__
 #include <sys/sysctl.h>
diff --git a/paddle/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
similarity index 100%
rename from paddle/platform/cpu_info.h
rename to paddle/fluid/platform/cpu_info.h
diff --git a/paddle/platform/cpu_info_test.cc b/paddle/fluid/platform/cpu_info_test.cc
similarity index 96%
rename from paddle/platform/cpu_info_test.cc
rename to paddle/fluid/platform/cpu_info_test.cc
index 1bfe62c1fb..d1fdba13b8 100644
--- a/paddle/platform/cpu_info_test.cc
+++ b/paddle/fluid/platform/cpu_info_test.cc
@@ -11,7 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/platform/cpu_info.h"
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/string/printf.h"
 
 #include <ostream>
diff --git a/paddle/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h
similarity index 100%
rename from paddle/platform/cuda_helper.h
rename to paddle/fluid/platform/cuda_helper.h
diff --git a/paddle/platform/cuda_profiler.h b/paddle/fluid/platform/cuda_profiler.h
similarity index 100%
rename from paddle/platform/cuda_profiler.h
rename to paddle/fluid/platform/cuda_profiler.h
diff --git a/paddle/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
similarity index 98%
rename from paddle/platform/cudnn_helper.h
rename to paddle/fluid/platform/cudnn_helper.h
index 80a4c9bb4b..f2daa4f4fc 100644
--- a/paddle/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -15,9 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
-#include "paddle/platform/dynload/cudnn.h"
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/macros.h"
+#include "paddle/fluid/platform/dynload/cudnn.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/macros.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/platform/cudnn_helper_test.cc b/paddle/fluid/platform/cudnn_helper_test.cc
similarity index 99%
rename from paddle/platform/cudnn_helper_test.cc
rename to paddle/fluid/platform/cudnn_helper_test.cc
index 427359f697..cd0bd3fe3e 100644
--- a/paddle/platform/cudnn_helper_test.cc
+++ b/paddle/fluid/platform/cudnn_helper_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/platform/cudnn_helper.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
 #include <gtest/gtest.h>
 
 TEST(CudnnHelper, ScopedTensorDescriptor) {
diff --git a/paddle/platform/details/device_ptr_cast.h b/paddle/fluid/platform/details/device_ptr_cast.h
similarity index 100%
rename from paddle/platform/details/device_ptr_cast.h
rename to paddle/fluid/platform/details/device_ptr_cast.h
diff --git a/paddle/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
similarity index 98%
rename from paddle/platform/device_context.cc
rename to paddle/fluid/platform/device_context.cc
index 9d9348079a..c4da846bb1 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/platform/device_context.h"
-#include "paddle/memory/memory.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/memory/memory.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/platform/device_context.h b/paddle/fluid/platform/device_context.h
similarity index 95%
rename from paddle/platform/device_context.h
rename to paddle/fluid/platform/device_context.h
index 9826a64276..10b581f41a 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -15,18 +15,18 @@ limitations under the License. */
 #include <unordered_map>
 
 #ifdef PADDLE_WITH_CUDA
-#include "paddle/platform/dynload/cublas.h"
-#include "paddle/platform/dynload/cudnn.h"
-#include "paddle/platform/gpu_info.h"
+#include "paddle/fluid/platform/dynload/cublas.h"
+#include "paddle/fluid/platform/dynload/cudnn.h"
+#include "paddle/fluid/platform/gpu_info.h"
 #define EIGEN_USE_GPU
 #endif
 
 #ifdef PADDLE_WITH_MKLDNN
-#include "paddle/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
 
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/place.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/place.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
 #include "glog/logging.h"
diff --git a/paddle/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu
similarity index 98%
rename from paddle/platform/device_context_test.cu
rename to paddle/fluid/platform/device_context_test.cu
index 767fe9b24a..f4dae6e90a 100644
--- a/paddle/platform/device_context_test.cu
+++ b/paddle/fluid/platform/device_context_test.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "gtest/gtest.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/platform/device_context.h"
 
 #include "glog/logging.h"
 
diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
similarity index 100%
rename from paddle/platform/dynload/CMakeLists.txt
rename to paddle/fluid/platform/dynload/CMakeLists.txt
diff --git a/paddle/platform/dynload/cublas.cc b/paddle/fluid/platform/dynload/cublas.cc
similarity index 94%
rename from paddle/platform/dynload/cublas.cc
rename to paddle/fluid/platform/dynload/cublas.cc
index 6aca716657..c599712554 100644
--- a/paddle/platform/dynload/cublas.cc
+++ b/paddle/fluid/platform/dynload/cublas.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/platform/dynload/cublas.h"
+#include "paddle/fluid/platform/dynload/cublas.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
similarity index 98%
rename from paddle/platform/dynload/cublas.h
rename to paddle/fluid/platform/dynload/cublas.h
index 61a22d9db3..05f69e5065 100644
--- a/paddle/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <cublas_v2.h>
 #include <dlfcn.h>
 #include <mutex>
-#include "paddle/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc
similarity index 94%
rename from paddle/platform/dynload/cudnn.cc
rename to paddle/fluid/platform/dynload/cudnn.cc
index 701f6240fe..0b1c4c4f96 100644
--- a/paddle/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/platform/dynload/cudnn.h"
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/platform/dynload/cudnn.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
similarity index 99%
rename from paddle/platform/dynload/cudnn.h
rename to paddle/fluid/platform/dynload/cudnn.h
index b926347949..00dfbc8387 100644
--- a/paddle/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <cudnn.h>
 #include <dlfcn.h>
 #include <mutex>
-#include "paddle/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/platform/dynload/curand.cc b/paddle/fluid/platform/dynload/curand.cc
similarity index 94%
rename from paddle/platform/dynload/curand.cc
rename to paddle/fluid/platform/dynload/curand.cc
index d05dd88126..eac690b145 100644
--- a/paddle/platform/dynload/curand.cc
+++ b/paddle/fluid/platform/dynload/curand.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <paddle/platform/dynload/curand.h>
+#include "paddle/fluid/platform/dynload/curand.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h
similarity index 97%
rename from paddle/platform/dynload/curand.h
rename to paddle/fluid/platform/dynload/curand.h
index 7bfe0778c7..ce3115b3ce 100644
--- a/paddle/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <curand.h>
 #include <dlfcn.h>
 #include <mutex>
-#include "paddle/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
similarity index 98%
rename from paddle/platform/dynload/dynamic_loader.cc
rename to paddle/fluid/platform/dynload/dynamic_loader.cc
index c8c09ae608..eb00f93b7c 100644
--- a/paddle/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
 #include <dlfcn.h>
 #include <memory>
 #include <mutex>
 #include <string>
 #include "gflags/gflags.h"
 #include "glog/logging.h"
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/platform/enforce.h"
 
 DEFINE_string(cudnn_dir, "",
               "Specify path for loading libcudnn.so. For instance, "
diff --git a/paddle/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h
similarity index 100%
rename from paddle/platform/dynload/dynamic_loader.h
rename to paddle/fluid/platform/dynload/dynamic_loader.h
diff --git a/paddle/platform/dynload/nccl.cc b/paddle/fluid/platform/dynload/nccl.cc
similarity index 95%
rename from paddle/platform/dynload/nccl.cc
rename to paddle/fluid/platform/dynload/nccl.cc
index 4cec829a8a..1dc3e96f04 100644
--- a/paddle/platform/dynload/nccl.cc
+++ b/paddle/fluid/platform/dynload/nccl.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/platform/dynload/nccl.h"
+#include "paddle/fluid/platform/dynload/nccl.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h
similarity index 96%
rename from paddle/platform/dynload/nccl.h
rename to paddle/fluid/platform/dynload/nccl.h
index 6c776afc97..349a4d0ba3 100644
--- a/paddle/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
@@ -17,8 +17,8 @@ limitations under the License. */
 #include <dlfcn.h>
 #include <nccl.h>
 #include <mutex>
-#include "paddle/platform/call_once.h"
-#include "paddle/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/call_once.h"
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/platform/dynload/warpctc.cc b/paddle/fluid/platform/dynload/warpctc.cc
similarity index 94%
rename from paddle/platform/dynload/warpctc.cc
rename to paddle/fluid/platform/dynload/warpctc.cc
index 9b7d01a6e8..84de2cae94 100644
--- a/paddle/platform/dynload/warpctc.cc
+++ b/paddle/fluid/platform/dynload/warpctc.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/platform/dynload/warpctc.h"
+#include "paddle/fluid/platform/dynload/warpctc.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h
similarity index 97%
rename from paddle/platform/dynload/warpctc.h
rename to paddle/fluid/platform/dynload/warpctc.h
index acafcaff2c..f1955818de 100644
--- a/paddle/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <dlfcn.h>
 #include <mutex>
 #include "ctc.h"
-#include "paddle/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/platform/enforce.cc b/paddle/fluid/platform/enforce.cc
similarity index 94%
rename from paddle/platform/enforce.cc
rename to paddle/fluid/platform/enforce.cc
index e8d31bc782..55cd80943c 100644
--- a/paddle/platform/enforce.cc
+++ b/paddle/fluid/platform/enforce.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace platform {}  // namespace platform
diff --git a/paddle/platform/enforce.h b/paddle/fluid/platform/enforce.h
similarity index 97%
rename from paddle/platform/enforce.h
rename to paddle/fluid/platform/enforce.h
index d1c7be0790..b22893c0a5 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -22,7 +22,7 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 
-#include "paddle/platform/macros.h"
+#include "paddle/fluid/platform/macros.h"
 #include "paddle/string/printf.h"
 #include "paddle/string/to_string.h"
 
@@ -34,10 +34,10 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_CUDA
 
-#include "paddle/platform/dynload/cublas.h"
-#include "paddle/platform/dynload/cudnn.h"
-#include "paddle/platform/dynload/curand.h"
-#include "paddle/platform/dynload/nccl.h"
+#include "paddle/fluid/platform/dynload/cublas.h"
+#include "paddle/fluid/platform/dynload/cudnn.h"
+#include "paddle/fluid/platform/dynload/curand.h"
+#include "paddle/fluid/platform/dynload/nccl.h"
 
 #include <cublas_v2.h>
 #include <cudnn.h>
diff --git a/paddle/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc
similarity index 99%
rename from paddle/platform/enforce_test.cc
rename to paddle/fluid/platform/enforce_test.cc
index 8206a055ea..896a9a04ec 100644
--- a/paddle/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 #include <memory>
 
 #include "gtest/gtest.h"
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/string/piece.h"
 
 using StringPiece = paddle::string::Piece;
diff --git a/paddle/platform/for_range.h b/paddle/fluid/platform/for_range.h
similarity index 97%
rename from paddle/platform/for_range.h
rename to paddle/fluid/platform/for_range.h
index 694a66d9ac..0e695328c3 100644
--- a/paddle/platform/for_range.h
+++ b/paddle/fluid/platform/for_range.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
similarity index 97%
rename from paddle/platform/gpu_info.cc
rename to paddle/fluid/platform/gpu_info.cc
index 7037551d75..1797f59a9c 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/platform/gpu_info.h"
+#include "paddle/fluid/platform/gpu_info.h"
 
 #include "gflags/gflags.h"
 
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/platform/enforce.h"
 
 DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
               "Default use 92% of GPU memory for PaddlePaddle,"
diff --git a/paddle/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
similarity index 100%
rename from paddle/platform/gpu_info.h
rename to paddle/fluid/platform/gpu_info.h
diff --git a/paddle/platform/hostdevice.h b/paddle/fluid/platform/hostdevice.h
similarity index 100%
rename from paddle/platform/hostdevice.h
rename to paddle/fluid/platform/hostdevice.h
diff --git a/paddle/platform/macros.h b/paddle/fluid/platform/macros.h
similarity index 100%
rename from paddle/platform/macros.h
rename to paddle/fluid/platform/macros.h
diff --git a/paddle/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
similarity index 100%
rename from paddle/platform/mkldnn_helper.h
rename to paddle/fluid/platform/mkldnn_helper.h
diff --git a/paddle/platform/nccl_test.cu b/paddle/fluid/platform/nccl_test.cu
similarity index 92%
rename from paddle/platform/nccl_test.cu
rename to paddle/fluid/platform/nccl_test.cu
index ef6d845874..75b95aff1a 100644
--- a/paddle/platform/nccl_test.cu
+++ b/paddle/fluid/platform/nccl_test.cu
@@ -19,11 +19,11 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "gtest/gtest.h"
 
-#include "paddle/framework/init.h"
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/dynload/nccl.h"
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/gpu_info.h"
+#include "paddle/fluid/framework/init.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/dynload/nccl.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/gpu_info.h"
 
 static int dev_count = 0;
 
@@ -127,6 +127,9 @@ TEST(NCCL, all_reduce) {
 }  // namespace paddle
 
 int main(int argc, char** argv) {
+  // FIXME(tonyyang-svail):
+  //   Due to the driver issue on our CI, disable for now
+  return 0;
   dev_count = paddle::platform::GetCUDADeviceCount();
   if (dev_count <= 1) {
     LOG(WARNING)
diff --git a/paddle/platform/place.cc b/paddle/fluid/platform/place.cc
similarity index 97%
rename from paddle/platform/place.cc
rename to paddle/fluid/platform/place.cc
index f05260ccac..e99b75d761 100644
--- a/paddle/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/platform/place.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/platform/place.h b/paddle/fluid/platform/place.h
similarity index 97%
rename from paddle/platform/place.h
rename to paddle/fluid/platform/place.h
index fbb43fa043..2977a41036 100644
--- a/paddle/platform/place.h
+++ b/paddle/fluid/platform/place.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 
 #include <iostream>
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/variant.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/variant.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/platform/place_test.cc b/paddle/fluid/platform/place_test.cc
similarity index 97%
rename from paddle/platform/place_test.cc
rename to paddle/fluid/platform/place_test.cc
index 150b2d3b1f..f248902d91 100644
--- a/paddle/platform/place_test.cc
+++ b/paddle/fluid/platform/place_test.cc
@@ -11,7 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/platform/place.h"
+#include "paddle/fluid/platform/place.h"
 #include <sstream>
 #include "gtest/gtest.h"
 
diff --git a/paddle/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
similarity index 99%
rename from paddle/platform/profiler.cc
rename to paddle/fluid/platform/profiler.cc
index 2a8afc9403..28d2675f79 100644
--- a/paddle/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/platform/profiler.h"
+#include "paddle/fluid/platform/profiler.h"
 #include <iomanip>
 #include <map>
 #include "glog/logging.h"
@@ -233,7 +233,7 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
       };
       break;
     default:
-      sorted_domain = "event end time";
+      sorted_domain = "event first end time";
   }
 
   std::vector<std::vector<EventItem>> events_table;
diff --git a/paddle/platform/profiler.h b/paddle/fluid/platform/profiler.h
similarity index 98%
rename from paddle/platform/profiler.h
rename to paddle/fluid/platform/profiler.h
index 8de1e6ad29..0bc5e666cb 100644
--- a/paddle/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <list>
 #include <mutex>
 #include <vector>
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc
similarity index 98%
rename from paddle/platform/profiler_test.cc
rename to paddle/fluid/platform/profiler_test.cc
index 81f10c9134..d2525c38b6 100644
--- a/paddle/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/platform/profiler.h"
+#include "paddle/fluid/platform/profiler.h"
 #include "gtest/gtest.h"
 
 TEST(Event, CpuElapsedTime) {
diff --git a/paddle/platform/transform.h b/paddle/fluid/platform/transform.h
similarity index 93%
rename from paddle/platform/transform.h
rename to paddle/fluid/platform/transform.h
index a88902b164..879daed191 100644
--- a/paddle/platform/transform.h
+++ b/paddle/fluid/platform/transform.h
@@ -14,17 +14,17 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/hostdevice.h"
-#include "paddle/platform/place.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/fluid/platform/place.h"
 
 #include <algorithm>
 #include <type_traits>
 #ifdef __NVCC__
 #include <thrust/execution_policy.h>
 #include <thrust/transform.h>
-#include "paddle/platform/details/device_ptr_cast.h"
+#include "paddle/fluid/platform/details/device_ptr_cast.h"
 #endif
 
 namespace paddle {
diff --git a/paddle/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu
similarity index 94%
rename from paddle/platform/transform_test.cu
rename to paddle/fluid/platform/transform_test.cu
index af9204a0a7..0e4b9edc2f 100644
--- a/paddle/platform/transform_test.cu
+++ b/paddle/fluid/platform/transform_test.cu
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include "paddle/memory/memcpy.h"
-#include "paddle/memory/memory.h"
-#include "paddle/platform/hostdevice.h"
-#include "paddle/platform/transform.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/fluid/platform/transform.h"
 
 template <typename T>
 class Scale {
diff --git a/paddle/platform/variant.h b/paddle/fluid/platform/variant.h
similarity index 100%
rename from paddle/platform/variant.h
rename to paddle/fluid/platform/variant.h
diff --git a/paddle/fluid/pybind/.clang-format b/paddle/fluid/pybind/.clang-format
new file mode 100644
index 0000000000..29282dc87e
--- /dev/null
+++ b/paddle/fluid/pybind/.clang-format
@@ -0,0 +1,5 @@
+---
+Language:        Cpp
+BasedOnStyle:  Google
+Standard:  Cpp11 
+...
diff --git a/paddle/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
similarity index 76%
rename from paddle/pybind/CMakeLists.txt
rename to paddle/fluid/pybind/CMakeLists.txt
index de53fea0dd..d62f340308 100644
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -7,7 +7,3 @@ if(WITH_PYTHON)
     target_link_libraries(paddle_pybind rt)
   endif(NOT APPLE AND NOT ANDROID)
 endif(WITH_PYTHON)
-
-if(WITH_DOC)
-  cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB})
-endif(WITH_DOC)
diff --git a/paddle/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc
similarity index 95%
rename from paddle/pybind/const_value.cc
rename to paddle/fluid/pybind/const_value.cc
index b13ad42ea2..098252a83d 100644
--- a/paddle/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "const_value.h"
-#include "paddle/framework/operator.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace pybind {
diff --git a/paddle/pybind/const_value.h b/paddle/fluid/pybind/const_value.h
similarity index 95%
rename from paddle/pybind/const_value.h
rename to paddle/fluid/pybind/const_value.h
index 3d57c972a9..67d14ac9ff 100644
--- a/paddle/pybind/const_value.h
+++ b/paddle/fluid/pybind/const_value.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <Python.h>
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "pybind11/pybind11.h"
 
 namespace py = pybind11;
diff --git a/paddle/pybind/exception.cc b/paddle/fluid/pybind/exception.cc
similarity index 96%
rename from paddle/pybind/exception.cc
rename to paddle/fluid/pybind/exception.cc
index e29ac3ebab..7398a88541 100644
--- a/paddle/pybind/exception.cc
+++ b/paddle/fluid/pybind/exception.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/pybind/exception.h"
+#include "paddle/fluid/pybind/exception.h"
 
 namespace paddle {
 namespace pybind {
diff --git a/paddle/pybind/exception.h b/paddle/fluid/pybind/exception.h
similarity index 94%
rename from paddle/pybind/exception.h
rename to paddle/fluid/pybind/exception.h
index 436ddd5707..43e91a7063 100644
--- a/paddle/pybind/exception.h
+++ b/paddle/fluid/pybind/exception.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <Python.h>
-#include "paddle/platform/enforce.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "pybind11/pybind11.h"
 namespace paddle {
 namespace pybind {
diff --git a/paddle/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
similarity index 92%
rename from paddle/pybind/protobuf.cc
rename to paddle/fluid/pybind/protobuf.cc
index 371d6119d4..4aefcf1a1c 100644
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/pybind/protobuf.h"
+#include "paddle/fluid/pybind/protobuf.h"
 #include <deque>
 #include <iostream>
-#include "paddle/framework/backward.h"
-#include "paddle/framework/block_desc.h"
-#include "paddle/framework/op_desc.h"
-#include "paddle/framework/program_desc.h"
-#include "paddle/framework/var_desc.h"
+#include "paddle/fluid/framework/backward.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/var_desc.h"
 
 // Cast boost::variant for PyBind.
 // Copy from
@@ -214,11 +214,18 @@ void BindVarDsec(py::module &m) {
            py::return_value_policy::reference)
       .def("set_name", &VarDesc::SetName)
       .def("set_shape", &VarDesc::SetShape)
+      .def("set_shapes", &VarDesc::SetShapes)
       .def("set_dtype", &VarDesc::SetDataType)
-      .def("shape", &VarDesc::Shape, py::return_value_policy::reference)
+      .def("set_dtypes", &VarDesc::SetDataTypes)
+      .def("shape", &VarDesc::GetShape, py::return_value_policy::reference)
+      .def("shapes", &VarDesc::GetShapes, py::return_value_policy::reference)
       .def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference)
+      .def("dtypes", &VarDesc::GetDataTypes, py::return_value_policy::reference)
       .def("lod_level", &VarDesc::GetLoDLevel)
+      .def("lod_levels", &VarDesc::GetLoDLevels,
+           py::return_value_policy::reference)
       .def("set_lod_level", &VarDesc::SetLoDLevel)
+      .def("set_lod_levels", &VarDesc::SetLoDLevels)
       .def("type", &VarDesc::GetType)
       .def("set_type", &VarDesc::SetType)
       .def("serialize_to_string", SerializeMessage<VarDesc>)
@@ -233,7 +240,8 @@ void BindVarDsec(py::module &m) {
       .value("STEP_SCOPES", proto::VarDesc::STEP_SCOPES)
       .value("LOD_RANK_TABLE", proto::VarDesc::LOD_RANK_TABLE)
       .value("LOD_TENSOR_ARRAY", proto::VarDesc::LOD_TENSOR_ARRAY)
-      .value("PLACE_LIST", proto::VarDesc::PLACE_LIST);
+      .value("PLACE_LIST", proto::VarDesc::PLACE_LIST)
+      .value("READER", proto::VarDesc::READER);
 }
 
 void BindOpDesc(py::module &m) {
diff --git a/paddle/pybind/protobuf.h b/paddle/fluid/pybind/protobuf.h
similarity index 95%
rename from paddle/pybind/protobuf.h
rename to paddle/fluid/pybind/protobuf.h
index 9e747e9ea6..c828e4583d 100644
--- a/paddle/pybind/protobuf.h
+++ b/paddle/fluid/pybind/protobuf.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <Python.h>
 #include <fstream>
 #include <vector>
-#include "paddle/platform/variant.h"
+#include "paddle/fluid/platform/variant.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
diff --git a/paddle/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
similarity index 94%
rename from paddle/pybind/pybind.cc
rename to paddle/fluid/pybind/pybind.cc
index a880d9bdbc..8924aabd17 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -12,35 +12,35 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/pybind/protobuf.h"
+#include "paddle/fluid/pybind/protobuf.h"
 
 #include <mutex>  // for call_once
 #include <unordered_map>
-#include "paddle/framework/backward.h"
-#include "paddle/framework/executor.h"
-#include "paddle/framework/feed_fetch_method.h"
-#include "paddle/framework/framework.pb.h"
-#include "paddle/framework/init.h"
-#include "paddle/framework/lod_rank_table.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/lod_tensor_array.h"
-#include "paddle/framework/prune.h"
-#include "paddle/framework/selected_rows.h"
-#include "paddle/operators/cond_op.h"
-#include "paddle/operators/net_op.h"
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/place.h"
-#include "paddle/platform/profiler.h"
-#include "paddle/pybind/const_value.h"
-#include "paddle/pybind/exception.h"
-#include "paddle/pybind/pybind.h"
-#include "paddle/pybind/tensor_py.h"
+#include "paddle/fluid/framework/backward.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/feed_fetch_method.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/init.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/prune.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/cond_op.h"
+#include "paddle/fluid/operators/net_op.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/profiler.h"
+#include "paddle/fluid/pybind/const_value.h"
+#include "paddle/fluid/pybind/exception.h"
+#include "paddle/fluid/pybind/pybind.h"
+#include "paddle/fluid/pybind/tensor_py.h"
 #include "paddle/string/to_string.h"
 
 #ifdef PADDLE_WITH_CUDA
-#include "paddle/operators/nccl/nccl_gpu_common.h"
-#include "paddle/platform/cuda_profiler.h"
-#include "paddle/platform/gpu_info.h"
+#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
+#include "paddle/fluid/platform/cuda_profiler.h"
+#include "paddle/fluid/platform/gpu_info.h"
 #endif
 
 // disable auto conversion to list in Python
diff --git a/paddle/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
similarity index 97%
rename from paddle/pybind/tensor_py.h
rename to paddle/fluid/pybind/tensor_py.h
index 3b5210e2b9..0261709f1e 100644
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 #include <string>
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/memory/memcpy.h"
-#include "paddle/platform/device_context.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/device_context.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 
diff --git a/paddle/framework/channel_test.cc b/paddle/framework/channel_test.cc
deleted file mode 100644
index c3533bbb1a..0000000000
--- a/paddle/framework/channel_test.cc
+++ /dev/null
@@ -1,243 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/framework/channel.h"
-
-#include <chrono>
-#include <thread>
-
-#include "gtest/gtest.h"
-
-using paddle::framework::Channel;
-using paddle::framework::MakeChannel;
-using paddle::framework::CloseChannel;
-
-TEST(Channel, MakeAndClose) {
-  using paddle::framework::details::Buffered;
-  using paddle::framework::details::UnBuffered;
-  {
-    // MakeChannel should return a buffered channel is buffer_size > 0.
-    auto ch = MakeChannel<int>(10);
-    EXPECT_NE(dynamic_cast<Buffered<int> *>(ch), nullptr);
-    EXPECT_EQ(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
-    CloseChannel(ch);
-    delete ch;
-  }
-  {
-    // MakeChannel should return an un-buffered channel is buffer_size = 0.
-    auto ch = MakeChannel<int>(0);
-    EXPECT_EQ(dynamic_cast<Buffered<int> *>(ch), nullptr);
-    EXPECT_NE(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
-    CloseChannel(ch);
-    delete ch;
-  }
-}
-
-TEST(Channel, SufficientBufferSizeDoesntBlock) {
-  const size_t buffer_size = 10;
-  auto ch = MakeChannel<size_t>(buffer_size);
-  for (size_t i = 0; i < buffer_size; ++i) {
-    ch->Send(&i);  // should not block
-  }
-
-  size_t out;
-  for (size_t i = 0; i < buffer_size; ++i) {
-    ch->Receive(&out);  // should not block
-    EXPECT_EQ(out, i);
-  }
-  CloseChannel(ch);
-  delete ch;
-}
-
-TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
-  const size_t buffer_size = 10;
-  auto ch = MakeChannel<size_t>(buffer_size);
-  size_t sum = 0;
-  std::thread t([&]() {
-    // Try to write more than buffer size.
-    for (size_t i = 0; i < 2 * buffer_size; ++i) {
-      ch->Send(&i);  // should block after 10 iterations
-      sum += i;
-    }
-  });
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.5 sec
-  EXPECT_EQ(sum, 45U);
-
-  CloseChannel(ch);
-  t.join();
-  delete ch;
-}
-
-TEST(Channel, SimpleUnbufferedChannelTest) {
-  auto ch = MakeChannel<int>(0);
-  unsigned sum_send = 0;
-  std::thread t([&]() {
-    for (int i = 0; i < 5; i++) {
-      ch->Send(&i);
-      sum_send += i;
-    }
-  });
-  for (int i = 0; i < 5; i++) {
-    int recv;
-    ch->Receive(&recv);
-    EXPECT_EQ(recv, i);
-  }
-
-  CloseChannel(ch);
-  t.join();
-  EXPECT_EQ(sum_send, 10U);
-  delete ch;
-}
-
-// This tests that closing an unbuffered channel also unblocks
-//  unblocks any receivers waiting for senders
-TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
-  auto ch = MakeChannel<int>(0);
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
-
-  // Launches threads that try to read and are blocked becausew of no writers
-  for (size_t i = 0; i < num_threads; i++) {
-    thread_ended[i] = false;
-    t[i] = std::thread(
-        [&](bool *p) {
-          int data;
-          ch->Receive(&data);
-          *p = true;
-        },
-        &thread_ended[i]);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
-
-  // Verify that all the threads are blocked
-  for (size_t i = 0; i < num_threads; i++) {
-    EXPECT_EQ(thread_ended[i], false);
-  }
-
-  // Explicitly close the thread
-  // This should unblock all receivers
-  CloseChannel(ch);
-
-  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
-
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
-  delete ch;
-}
-
-// This tests that closing an unbuffered channel also unblocks
-//  unblocks any senders waiting for senders
-TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
-  auto ch = MakeChannel<int>(0);
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
-
-  // Launches threads that try to read and are blocked becausew of no writers
-  for (size_t i = 0; i < num_threads; i++) {
-    thread_ended[i] = false;
-    t[i] = std::thread(
-        [&](bool *p) {
-          int data = 10;
-          ch->Send(&data);
-          *p = true;
-        },
-        &thread_ended[i]);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
-
-  // Verify that all the threads are blocked
-  for (size_t i = 0; i < num_threads; i++) {
-    EXPECT_EQ(thread_ended[i], false);
-  }
-
-  // Explicitly close the thread
-  // This should unblock all receivers
-  CloseChannel(ch);
-
-  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
-
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
-  delete ch;
-}
-
-TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
-  auto ch = MakeChannel<int>(0);
-  unsigned sum_send = 0;
-  // Send should block after three iterations
-  // since we only have three receivers.
-  std::thread t([&]() {
-    // Try to send more number of times
-    // than receivers
-    for (int i = 0; i < 4; i++) {
-      ch->Send(&i);
-      sum_send += i;
-    }
-  });
-  for (int i = 0; i < 3; i++) {
-    int recv;
-    ch->Receive(&recv);
-    EXPECT_EQ(recv, i);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.5 sec
-  EXPECT_EQ(sum_send, 3U);
-
-  CloseChannel(ch);
-  t.join();
-  delete ch;
-}
-
-TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
-  auto ch = MakeChannel<int>(0);
-  unsigned sum_send = 0;
-  unsigned sum_receive = 0;
-  // The receiver should block after 5
-  // iterations, since there are only 5 senders.
-  std::thread t([&]() {
-    for (int i = 0; i < 8; i++) {
-      int recv;
-      ch->Receive(&recv);  // should block after the fifth iteration.
-      EXPECT_EQ(recv, i);
-      sum_receive += i;
-    }
-  });
-  for (int i = 0; i < 5; i++) {
-    ch->Send(&i);
-    sum_send += i;
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
-  EXPECT_EQ(sum_send, 10U);
-  EXPECT_EQ(sum_receive, 10U);
-  // send three more elements
-  for (int i = 5; i < 8; i++) {
-    ch->Send(&i);
-    sum_send += i;
-  }
-
-  CloseChannel(ch);
-  t.join();
-  EXPECT_EQ(sum_send, 28U);
-  EXPECT_EQ(sum_receive, 28U);
-  delete ch;
-}
diff --git a/paddle/framework/mixed_vector.h b/paddle/framework/mixed_vector.h
deleted file mode 100644
index 85caac8dcd..0000000000
--- a/paddle/framework/mixed_vector.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-
-#include <initializer_list>
-#include <vector>
-
-#include "paddle/memory/memcpy.h"
-#include "paddle/memory/memory.h"
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/place.h"
-
-namespace paddle {
-namespace framework {
-
-/**
- * @brief Vector support both cpu and gpu.
- * host vector lifetime is same with Vector
- * device vector is lazily malloc and modified.
- */
-
-template <typename T>
-class Vector : public std::vector<T> {
- public:
-  using std::vector<T>::vector;
-
-  Vector() {}
-  Vector(const std::vector<T> &v) : std::vector<T>(v) {}  // NOLINT
-
-  virtual ~Vector() {
-#ifdef PADDLE_WITH_CUDA
-    if (cuda_ptr_ != nullptr) {
-      memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
-    }
-#endif
-  }
-
-  /* Get device vector */
-  T *cuda_data() {
-    CopyToCUDA();
-    PADDLE_ENFORCE_NOT_NULL(
-        cuda_ptr_, "No data or Insufficient CUDA memory to allocation");
-    return static_cast<T *>(cuda_ptr_);
-  }
-
-  /* Get host vector */
-  T *data() { return std::vector<T>::data(); }
-  const T *data() const { return std::vector<T>::data(); }
-
-  /* Synchronize host vector to device vector */
-  void CopyToCUDA();
-  /* Synchronize device vector to host vector */
-  void CopyFromCUDA();
-  /* Switch device vector location */
-  void CopyToPeer(platform::Place);
-
- private:
-  void *cuda_ptr_ = nullptr;
-  size_t cuda_size_ = 0;  // device vector numel
-  platform::CUDAPlace place_;
-};
-
-template <typename T>
-void Vector<T>::CopyToCUDA() {
-#ifdef PADDLE_WITH_CUDA
-  if (cuda_size_ < this->size()) {
-    if (cuda_ptr_ != nullptr) {
-      memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
-    }
-    cuda_ptr_ =
-        memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T));
-  }
-  cuda_size_ = this->size();
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto *ctx = pool.GetByPlace(place_);
-  memory::Copy(place_, cuda_ptr_, platform::CPUPlace(),
-               static_cast<const void *>(this->data()),
-               this->size() * sizeof(T), ctx->stream());
-  ctx->Wait();
-#endif
-}
-
-template <typename T>
-void Vector<T>::CopyFromCUDA() {
-#ifdef PADDLE_WITH_CUDA
-  if (cuda_ptr_ == nullptr) {
-    LOG(WARNING) << "No uncommitted cuda data.";
-    return;
-  }
-  this->resize(cuda_size_);
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto *ctx = pool.GetByPlace(place_);
-  memory::Copy(platform::CPUPlace(), static_cast<void *>(this->data()), place_,
-               static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
-               ctx->stream());
-  ctx->Wait();
-#endif
-}
-
-template <typename T>
-void Vector<T>::CopyToPeer(platform::Place peer_place) {
-#ifdef PADDLE_WITH_CUDA
-  auto *ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
-  void *peer_cuda_ptr = memory::Alloc<platform::CUDAPlace>(
-      boost::get<platform::CUDAPlace>(peer_place), this->size() * sizeof(T));
-  memory::Copy(boost::get<platform::CUDAPlace>(peer_place), peer_cuda_ptr,
-               place_, cuda_ptr_, this->size() * sizeof(T), ctx->stream());
-  ctx->Wait();
-
-  memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
-  place_ = boost::get<platform::CUDAPlace>(peer_place);
-  cuda_ptr_ = peer_cuda_ptr;
-#endif
-}
-
-template class Vector<int>;
-template class Vector<unsigned>;
-template class Vector<size_t>;
-template class Vector<int64_t>;
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/framework/tensor_util.cu b/paddle/framework/tensor_util.cu
deleted file mode 120000
index b00e6e59d9..0000000000
--- a/paddle/framework/tensor_util.cu
+++ /dev/null
@@ -1 +0,0 @@
-./tensor_util.cc
\ No newline at end of file
diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc
deleted file mode 100644
index 62ab6593ef..0000000000
--- a/paddle/framework/var_desc.cc
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/framework/var_desc.h"
-#include "paddle/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-
-proto::VarDesc::VarType VarDesc::GetType() const { return desc_.type(); }
-
-void VarDesc::SetType(proto::VarDesc::VarType type) { desc_.set_type(type); }
-
-void VarDesc::SetShape(const std::vector<int64_t> &dims) {
-  VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims());
-}
-
-void VarDesc::SetDataType(proto::DataType data_type) {
-  mutable_tensor_desc()->set_data_type(data_type);
-}
-
-std::vector<int64_t> VarDesc::Shape() const {
-  return RepeatedToVector(tensor_desc().dims());
-}
-
-proto::DataType VarDesc::GetDataType() const {
-  return tensor_desc().data_type();
-}
-
-void VarDesc::SetLoDLevel(int32_t lod_level) {
-  switch (desc_.type()) {
-    case proto::VarDesc::LOD_TENSOR:
-      desc_.mutable_lod_tensor()->set_lod_level(lod_level);
-      break;
-    case proto::VarDesc::LOD_TENSOR_ARRAY:
-      desc_.mutable_tensor_array()->set_lod_level(lod_level);
-      break;
-    default:
-      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
-                   desc_.tensor_array().lod_level());
-  }
-}
-
-int32_t VarDesc::GetLoDLevel() const {
-  switch (desc_.type()) {
-    case proto::VarDesc::LOD_TENSOR:
-      return desc_.lod_tensor().lod_level();
-    case proto::VarDesc::LOD_TENSOR_ARRAY:
-      return desc_.tensor_array().lod_level();
-    default:
-      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
-                   desc_.tensor_array().lod_level());
-  }
-}
-
-const proto::TensorDesc &VarDesc::tensor_desc() const {
-  PADDLE_ENFORCE(desc_.has_type(), "invoke TensorDesc must after set type");
-  switch (desc_.type()) {
-    case proto::VarDesc::SELECTED_ROWS:
-      return desc_.selected_rows();
-    case proto::VarDesc::LOD_TENSOR:
-      return desc_.lod_tensor().tensor();
-    case proto::VarDesc::LOD_TENSOR_ARRAY:
-      return desc_.tensor_array().tensor();
-    default:
-      PADDLE_THROW("The type of var %s is unsupported.", this->Name());
-  }
-}
-
-proto::TensorDesc *VarDesc::mutable_tensor_desc() {
-  PADDLE_ENFORCE(desc_.has_type(),
-                 "invoke MutableTensorDesc must after set type");
-  switch (desc_.type()) {
-    case proto::VarDesc::SELECTED_ROWS:
-      return desc_.mutable_selected_rows();
-    case proto::VarDesc::LOD_TENSOR:
-      return desc_.mutable_lod_tensor()->mutable_tensor();
-    case proto::VarDesc::LOD_TENSOR_ARRAY:
-      return desc_.mutable_tensor_array()->mutable_tensor();
-    default:
-      PADDLE_THROW("Unexpected branch.");
-  }
-}
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/gserver/tests/test_CompareSparse.cpp b/paddle/gserver/tests/test_CompareSparse.cpp
index c6e07650fc..2495d8b60a 100644
--- a/paddle/gserver/tests/test_CompareSparse.cpp
+++ b/paddle/gserver/tests/test_CompareSparse.cpp
@@ -212,6 +212,10 @@ TEST(compareSparse, NeuralNetwork) {
 }
 
 int main(int argc, char** argv) {
+  // FIXME(tonyyang-svail):
+  //   Turn off this test due CI failure:
+  //   https://paddleci.ngrok.io/viewLog.html?buildId=27608&buildTypeId=Paddle_PrCi&tab=buildLog&_focus=10430
+  return 0;
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
   initPython(argc, argv);
diff --git a/paddle/inference/CMakeLists.txt b/paddle/inference/CMakeLists.txt
deleted file mode 100644
index 2289ddc139..0000000000
--- a/paddle/inference/CMakeLists.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init)
-
-cc_library(paddle_fluid_api
-    SRCS io.cc
-    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
-
-# Merge all modules into a single static library
-cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
-
-# Create shared library
-add_library(paddle_fluid_shared SHARED io.cc)
-
-target_circle_link_libraries(paddle_fluid_shared
-  ARCHIVE_START
-  ${GLOB_OP_LIB}
-  ARCHIVE_END
-  ${FLUID_CORE_MODULES})
-
-SET_TARGET_PROPERTIES(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
-
-# install library & headers
-if(NOT WITH_C_API AND WITH_FLUID)
-  install(FILES io.h DESTINATION include/paddle/inference)
-  install(TARGETS paddle_fluid_shared DESTINATION lib)
-endif()
-
-if(WITH_TESTING)
-  add_subdirectory(tests/book)
-endif()
diff --git a/paddle/inference/tests/book/CMakeLists.txt b/paddle/inference/tests/book/CMakeLists.txt
deleted file mode 100644
index 0e987eb024..0000000000
--- a/paddle/inference/tests/book/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
-cc_test(test_inference_recognize_digits_mlp
-    SRCS test_inference_recognize_digits.cc
-    DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
-    ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
-set_tests_properties(test_inference_recognize_digits_mlp
-    PROPERTIES DEPENDS test_recognize_digits)
diff --git a/paddle/inference/tests/book/test_inference_recognize_digits.cc b/paddle/inference/tests/book/test_inference_recognize_digits.cc
deleted file mode 100644
index 26dc2aee04..0000000000
--- a/paddle/inference/tests/book/test_inference_recognize_digits.cc
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <time.h>
-#include <sstream>
-#include "gflags/gflags.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/inference/io.h"
-
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
-template <typename Place, typename T>
-void TestInference(const std::string& dirname,
-                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
-                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
-  // 1. Define place, executor and scope
-  auto place = Place();
-  auto executor = paddle::framework::Executor(place);
-  auto* scope = new paddle::framework::Scope();
-
-  // 2. Initialize the inference_program and load all parameters from file
-  auto inference_program = paddle::inference::Load(executor, *scope, dirname);
-
-  // 3. Get the feed_target_names and fetch_target_names
-  const std::vector<std::string>& feed_target_names =
-      inference_program->GetFeedTargetNames();
-  const std::vector<std::string>& fetch_target_names =
-      inference_program->GetFetchTargetNames();
-
-  // 4. Prepare inputs: set up maps for feed targets
-  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
-  for (size_t i = 0; i < feed_target_names.size(); ++i) {
-    // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
-    feed_targets[feed_target_names[i]] = cpu_feeds[i];
-  }
-
-  // 5. Define Tensor to get the outputs: set up maps for fetch targets
-  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
-  for (size_t i = 0; i < fetch_target_names.size(); ++i) {
-    fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
-  }
-
-  // 6. Run the inference program
-  executor.Run(*inference_program, scope, feed_targets, fetch_targets);
-
-  delete scope;
-}
-
-TEST(inference, recognize_digits) {
-  if (FLAGS_dirname.empty()) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
-  }
-
-  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  std::string dirname = FLAGS_dirname;
-
-  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
-  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
-
-  paddle::framework::LoDTensor input;
-  srand(time(0));
-  float* input_ptr =
-      input.mutable_data<float>({1, 28, 28}, paddle::platform::CPUPlace());
-  for (int i = 0; i < 784; ++i) {
-    input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
-  }
-  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
-  cpu_feeds.push_back(&input);
-
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-  cpu_fetchs1.push_back(&output1);
-
-  // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace, float>(
-      dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.dims();
-
-#ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
-  cpu_fetchs2.push_back(&output2);
-
-  // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace, float>(
-      dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.dims();
-
-  EXPECT_EQ(output1.dims(), output2.dims());
-  EXPECT_EQ(output1.numel(), output2.numel());
-
-  float err = 1E-3;
-  int count = 0;
-  for (int64_t i = 0; i < output1.numel(); ++i) {
-    if (fabs(output1.data<float>()[i] - output2.data<float>()[i]) > err) {
-      count++;
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
-#endif
-}
diff --git a/paddle/math/float16.h b/paddle/math/float16.h
index efebbce504..63248d36f9 100644
--- a/paddle/math/float16.h
+++ b/paddle/math/float16.h
@@ -22,7 +22,7 @@ limitations under the License. */
 
 #include "unsupported/Eigen/CXX11/Tensor"
 
-#include "paddle/platform/hostdevice.h"
+#include "paddle/fluid/platform/hostdevice.h"
 
 #ifdef __GNUC__
 #define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
diff --git a/paddle/memory/.clang-format b/paddle/memory/.clang-format
deleted file mode 120000
index 7d28cb3924..0000000000
--- a/paddle/memory/.clang-format
+++ /dev/null
@@ -1 +0,0 @@
-../framework/.clang-format
\ No newline at end of file
diff --git a/paddle/operators/.clang-format b/paddle/operators/.clang-format
deleted file mode 120000
index 7d28cb3924..0000000000
--- a/paddle/operators/.clang-format
+++ /dev/null
@@ -1 +0,0 @@
-../framework/.clang-format
\ No newline at end of file
diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc
deleted file mode 100644
index 1c6d2ae4d0..0000000000
--- a/paddle/operators/layer_norm_op.cc
+++ /dev/null
@@ -1,370 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/operators/layer_norm_op.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DataLayout = framework::DataLayout;
-
-template <typename T>
-using EigenMatrixMapRowMajor = Eigen::Map<
-    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
-template <typename T>
-using ConstEigenMatrixMapRowMajor = Eigen::Map<
-    const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
-
-class LayerNormOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of LayerNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Y"),
-                   "Output(Y) of LayerNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Mean"),
-                   "Output(Mean) of LayerNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Variance"),
-                   "Output(Variance) of LayerNormOp should not be null.");
-
-    auto x_dim = ctx->GetInputDim("X");
-    auto begin_norm_axis = ctx->Attrs().Get<int>("begin_norm_axis");
-    PADDLE_ENFORCE_LT(begin_norm_axis, x_dim.size(),
-                      "'begin_norm_axis' must be less than the rank of X.");
-
-    auto matrix_dim = framework::flatten_to_2d(x_dim, begin_norm_axis);
-    int left = static_cast<int>(matrix_dim[0]);
-    int right = static_cast<int>(matrix_dim[1]);
-    if (ctx->HasInput("Scale")) {
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right);
-    }
-    if (ctx->HasInput("Bias")) {
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right);
-    }
-
-    ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
-    ctx->SetOutputDim("Mean", {left});
-    ctx->SetOutputDim("Variance", {left});
-    ctx->ShareLoD("X", "Y");
-  }
-};
-
-class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  LayerNormOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(LoDTensor) The input tensor.");
-    AddInput("Scale",
-             "(Tensor, optional) Scale is a 1-dimensional tensor of size "
-             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
-             "It is applied to the output.")
-        .AsDispensable();
-    AddInput("Bias",
-             "(Tensor, optional) Bias is a 1-dimensional tensor of size "
-             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
-             "It is applied to the output.")
-        .AsDispensable();
-    AddOutput("Y", "(LoDTensor) Result after normalization.");
-    AddOutput("Mean", "(Tensor) Mean of the current mini batch.")
-        .AsIntermediate();
-    AddOutput("Variance", "(Tensor) Variance of the current mini batch.")
-        .AsIntermediate();
-
-    AddAttr<float>("epsilon",
-                   "(float, default 1e-5) Constant for "
-                   "numerical stability")
-        .SetDefault(1e-5)
-        .AddCustomChecker([](const float &epsilon) {
-          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
-                         "'epsilon' should be between 0.0 and 0.001.");
-        });
-    AddAttr<int>("begin_norm_axis",
-                 "(int default:1), the "
-                 "axis of `begin_norm_axis ... Rank(X) - 1` will be "
-                 "normalized. `begin_norm_axis` splits the tensor(`X`) to a "
-                 "matrix [N,H].")
-        .SetDefault(1)
-        .AddCustomChecker([](const int &begin_norm_axis) {
-          PADDLE_ENFORCE_GT(begin_norm_axis, 0,
-                            "'begin_norm_axis' should be greater than zero.");
-        });
-
-    AddComment(R"DOC(
-Layer Normalization.
-
-Layer Norm has been implemented as discussed in the paper:
-https://arxiv.org/abs/1607.06450
-...
-)DOC");
-  }
-};
-
-template <typename T>
-class LayerNormKernel<platform::CPUDeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const float epsilon = ctx.Attr<float>("epsilon");
-    const auto *scale = ctx.Input<Tensor>("Scale");
-    const auto *bias = ctx.Input<Tensor>("Bias");
-    const auto *x = ctx.Input<Tensor>("X");
-    const auto &x_dims = x->dims();
-    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
-
-    auto *output = ctx.Output<Tensor>("Y");
-    auto *mean = ctx.Output<Tensor>("Mean");
-    auto *var = ctx.Output<Tensor>("Variance");
-    output->mutable_data<T>(ctx.GetPlace());
-    mean->mutable_data<T>(ctx.GetPlace());
-    var->mutable_data<T>(ctx.GetPlace());
-
-    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
-    int left = static_cast<int>(matrix_dim[0]);
-    int right = static_cast<int>(matrix_dim[1]);
-
-    auto input_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
-
-    auto mean_map = EigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
-    auto var_map = EigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
-    auto output_map = EigenMatrixMapRowMajor<T>(output->data<T>(), left, right);
-
-    auto squre = [](T ele) { return ele * ele; };
-    auto add_epslion = [epsilon](T ele) { return ele + epsilon; };
-
-    mean_map = input_map.rowwise().mean();
-    var_map = (input_map - mean_map.replicate(1, right))
-                  .unaryExpr(squre)
-                  .rowwise()
-                  .mean()
-                  .unaryExpr(add_epslion);
-
-    auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
-    // TODO(zcd): Some thinking about output_map, is it appropriate that
-    // `output_map` and `input_map` point to the same memory.
-    auto inv_std = var_map.unaryExpr(inv_std_func);
-    if (scale && bias) {
-      auto scale_map =
-          ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
-      auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
-      output_map = (input_map - mean_map.replicate(1, right))
-                       .cwiseProduct(inv_std.replicate(1, right))
-                       .cwiseProduct(scale_map.replicate(left, 1)) +
-                   bias_map.replicate(left, 1);
-    } else if (scale) {
-      auto scale_map =
-          ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
-      output_map = (input_map - mean_map.replicate(1, right))
-                       .cwiseProduct(inv_std.replicate(1, right))
-                       .cwiseProduct(scale_map.replicate(left, 1));
-    } else if (bias) {
-      auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
-      output_map = (input_map - mean_map.replicate(1, right))
-                       .cwiseProduct(inv_std.replicate(1, right)) +
-                   bias_map.replicate(left, 1);
-    } else {
-      output_map = (input_map - mean_map.replicate(1, right))
-                       .cwiseProduct(inv_std.replicate(1, right));
-    }
-  }
-};
-
-class LayerNormGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    // check input
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of LayerNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Scale"),
-                   "Input(Scale) of LayerNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Mean"),
-                   "Input(Mean) of LayerNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Variance"),
-                   "Input(Variance) of LayerNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
-                   "Input(Y@GRAD) of LayerNormOp should not be null.");
-
-    // check output
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-    }
-    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
-      ctx->SetOutputDim(framework::GradVarName("Scale"),
-                        ctx->GetInputDim("Scale"));
-    }
-    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
-      ctx->SetOutputDim(framework::GradVarName("Bias"),
-                        ctx->GetInputDim("Bias"));
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
-    if (var == nullptr) {
-      PADDLE_THROW("can't find Y@GRAD");
-    }
-    const Tensor *t = nullptr;
-    if (var->IsType<Tensor>()) {
-      t = &var->Get<Tensor>();
-    } else if (var->IsType<LoDTensor>()) {
-      t = &var->Get<LoDTensor>();
-    }
-    if (t == nullptr) {
-      PADDLE_THROW("can't find Y@GRAD");
-    }
-    return framework::OpKernelType(framework::ToDataType(t->type()),
-                                   ctx.GetPlace());
-  }
-};
-
-template <typename T>
-class LayerNormGradKernel<platform::CPUDeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *x = ctx.Input<Tensor>("X");
-    const auto *mean = ctx.Input<Tensor>("Mean");
-    const auto *var = ctx.Input<Tensor>("Variance");
-    const auto *scale = ctx.Input<Tensor>("Scale");
-    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
-
-    const auto &x_dims = x->dims();
-
-    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
-    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
-    int left = static_cast<int>(matrix_dim[0]);
-    int right = static_cast<int>(matrix_dim[1]);
-
-    // init output
-    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-
-    auto x_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
-    auto d_y_map = ConstEigenMatrixMapRowMajor<T>(d_y->data<T>(), left, right);
-    auto mean_map = ConstEigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
-    auto var_map = ConstEigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
-
-    if (d_bias) {
-      d_bias->mutable_data<T>(ctx.GetPlace());
-      auto d_bias_map = EigenMatrixMapRowMajor<T>(d_bias->data<T>(), 1, right);
-      d_bias_map = d_y_map.colwise().sum();
-    }
-    if (d_scale) {
-      d_scale->mutable_data<T>(ctx.GetPlace());
-      auto d_scale_map =
-          EigenMatrixMapRowMajor<T>(d_scale->data<T>(), 1, right);
-      auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
-      // There are two equation to compute d_scale. One uses "Y" and the other
-      // does not use "Y"
-      d_scale_map =
-          ((x_map - mean_map.replicate(1, right))
-               .cwiseProduct(
-                   var_map.unaryExpr(inv_std_func).replicate(1, right))
-               .cwiseProduct(d_y_map))
-              .colwise()
-              .sum();
-    }
-
-    if (d_x) {
-      d_x->mutable_data<T>(ctx.GetPlace());
-      auto d_x_map = EigenMatrixMapRowMajor<T>(d_x->data<T>(), left, right);
-      auto triple_product_func = [](T ele) { return ele * ele * ele; };
-      auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
-      // TODO(zcd): these code can be refined
-      if (d_scale) {
-        auto scale_map =
-            ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
-        // dy_dx
-        auto dx_end = var_map.unaryExpr(inv_std_func)
-                          .replicate(1, right)
-                          .cwiseProduct(d_y_map)
-                          .cwiseProduct(scale_map.replicate(left, 1));
-        // dy_dmean_dx
-        auto dx_mean = (T(-1.0) / right) *
-                       var_map.unaryExpr(inv_std_func)
-                           .replicate(1, right)
-                           .cwiseProduct(d_y_map)
-                           .cwiseProduct(scale_map.replicate(left, 1))
-                           .rowwise()
-                           .sum()
-                           .replicate(1, right);
-        // dy_var_dx
-        auto dvar_end_part = (x_map - mean_map.replicate(1, right))
-                                 .cwiseProduct(scale_map.replicate(left, 1))
-                                 .cwiseProduct(d_y_map)
-                                 .rowwise()
-                                 .sum();
-        auto dvar_end = var_map.unaryExpr(inv_std_func)
-                            .unaryExpr(triple_product_func)
-                            .cwiseProduct(dvar_end_part)
-                            .replicate(1, right);
-        auto dx_var =
-            (T(-1.0) / right) *
-            (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
-
-        d_x_map = dx_end + dx_mean + dx_var;
-      } else {
-        // dy_dx
-        auto dx_end = var_map.unaryExpr(inv_std_func)
-                          .replicate(1, right)
-                          .cwiseProduct(d_y_map);
-        // dy_dmean_dx
-        auto dx_mean = (T(-1.0) / right) *
-                       var_map.unaryExpr(inv_std_func)
-                           .replicate(1, right)
-                           .cwiseProduct(d_y_map)
-                           .rowwise()
-                           .sum()
-                           .replicate(1, right);
-        // dy_var_dx
-        auto dvar_end_part = (x_map - mean_map.replicate(1, right))
-                                 .cwiseProduct(d_y_map)
-                                 .rowwise()
-                                 .sum();
-        auto dvar_end = var_map.unaryExpr(inv_std_func)
-                            .unaryExpr(triple_product_func)
-                            .cwiseProduct(dvar_end_part)
-                            .replicate(1, right);
-        auto dx_var =
-            (T(-1.0) / right) *
-            (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
-
-        d_x_map = dx_end + dx_mean + dx_var;
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
-            layer_norm_grad, ops::LayerNormGradOp);
-REGISTER_OP_CPU_KERNEL(
-    layer_norm,
-    ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    layer_norm_grad,
-    ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/pybind/.clang-format b/paddle/pybind/.clang-format
deleted file mode 120000
index 7d28cb3924..0000000000
--- a/paddle/pybind/.clang-format
+++ /dev/null
@@ -1 +0,0 @@
-../framework/.clang-format
\ No newline at end of file
diff --git a/paddle/pybind/print_operators_doc.cc b/paddle/pybind/print_operators_doc.cc
deleted file mode 100644
index b55ddee176..0000000000
--- a/paddle/pybind/print_operators_doc.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <iostream>
-#include <sstream>  // std::stringstream
-#include <string>
-
-#include "paddle/framework/op_info.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/pybind/pybind.h"
-
-std::string Escape(const std::string& s) {
-  std::string r;
-  for (size_t i = 0; i < s.size(); i++) {
-    switch (s[i]) {
-      case '\"':
-        r += "\\\"";
-        break;
-      case '\\':
-        r += "\\\\";
-        break;
-      case '\n':
-        r += "\\n";
-        break;
-      case '\t':
-        r += "\\t";
-      case '\r':
-        break;
-      default:
-        r += s[i];
-        break;
-    }
-  }
-  return r;
-}
-
-std::string AttrType(paddle::framework::proto::AttrType at) {
-  switch (at) {
-    case paddle::framework::proto::INT:
-      return "int";
-    case paddle::framework::proto::FLOAT:
-      return "float";
-    case paddle::framework::proto::STRING:
-      return "string";
-    case paddle::framework::proto::BOOLEAN:
-      return "bool";
-    case paddle::framework::proto::INTS:
-      return "int array";
-    case paddle::framework::proto::FLOATS:
-      return "float array";
-    case paddle::framework::proto::STRINGS:
-      return "string array";
-    case paddle::framework::proto::BOOLEANS:
-      return "bool array";
-    case paddle::framework::proto::BLOCK:
-      return "block id";
-    case paddle::framework::proto::LONG:
-      return "long";
-  }
-  return "UNKNOWN";  // not possible
-}
-
-void PrintVar(const paddle::framework::proto::OpProto::Var& v,
-              std::stringstream& ss) {
-  ss << " { "
-     << "\n"
-     << "   \"name\" : \"" << Escape(v.name()) << "\",\n"
-     << "   \"comment\" : \"" << Escape(v.comment()) << "\",\n"
-     << "   \"duplicable\" : " << v.duplicable() << ",\n"
-     << "   \"intermediate\" : " << v.intermediate() << "\n"
-     << " },";
-}
-
-void PrintAttr(const paddle::framework::proto::OpProto::Attr& a,
-               std::stringstream& ss) {
-  ss << " { "
-     << "\n"
-     << "   \"name\" : \"" << Escape(a.name()) << "\",\n"
-     << "   \"type\" : \"" << AttrType(a.type()) << "\",\n"
-     << "   \"comment\" : \"" << Escape(a.comment()) << "\",\n"
-     << "   \"generated\" : " << a.generated() << "\n"
-     << " },";
-}
-
-void PrintOpProto(const std::string& type,
-                  const paddle::framework::OpInfo& opinfo,
-                  std::stringstream& ss) {
-  std::cerr << "Processing " << type << "\n";
-
-  const paddle::framework::proto::OpProto* p = opinfo.proto_;
-  if (p == nullptr) {
-    return;  // It is possible that an operator doesn't have OpProto.
-  }
-
-  ss << "{\n"
-     << " \"type\" : \"" << Escape(p->type()) << "\",\n"
-     << " \"comment\" : \"" << Escape(p->comment()) << "\",\n";
-
-  ss << " \"inputs\" : [ "
-     << "\n";
-  for (int i = 0; i < p->inputs_size(); i++) {
-    PrintVar(p->inputs(i), ss);
-  }
-  ss.seekp(-1, ss.cur);  // remove the trailing comma
-  ss << " ], "
-     << "\n";
-
-  ss << " \"outputs\" : [ "
-     << "\n";
-  for (int i = 0; i < p->outputs_size(); i++) {
-    PrintVar(p->outputs(i), ss);
-  }
-  ss.seekp(-1, ss.cur);  // remove the trailing comma
-  ss << " ], "
-     << "\n";
-
-  ss << " \"attrs\" : [ "
-     << "\n";
-  for (int i = 0; i < p->attrs_size(); i++) {
-    PrintAttr(p->attrs(i), ss);
-  }
-  ss.seekp(-1, ss.cur);  // remove the trailing comma
-  ss << " ] "
-     << "\n";
-
-  ss << "},";
-}
-
-int main() {
-  std::stringstream ss;
-  ss << "[\n";
-  for (auto& iter : paddle::framework::OpInfoMap::Instance().map()) {
-    PrintOpProto(iter.first, iter.second, ss);
-  }
-  ss.seekp(-1, ss.cur);  // remove the trailing comma
-  ss << "]\n";
-  std::cout << ss.str();
-}
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index df7310d6b7..2f8dd48efe 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -79,6 +79,7 @@ function run_build() {
     Building in /paddle/build ...
     ============================================
 EOF
+    make clean
     make -j `nproc`
 }
 
@@ -116,9 +117,7 @@ EOF
             -DWITH_STYLE_CHECK=OFF
         make -j `nproc` gen_proto_py
         make -j `nproc` paddle_python
-        make -j `nproc` paddle_docs paddle_docs_cn
-        make -j `nproc` print_operators_doc
-        paddle/pybind/print_operators_doc > doc/en/html/operators.json
+        make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
         popd
     fi
 
diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh
index 0db8d33bbc..486c094a6a 100755
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -9,13 +9,12 @@ cd $TRAVIS_BUILD_DIR/build
 cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
 make -j `nproc` gen_proto_py
 make -j `nproc` paddle_python
-make -j `nproc` paddle_docs paddle_docs_cn
-make -j `nproc` print_operators_doc
-paddle/pybind/print_operators_doc > doc/en/html/operators.json
+make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
 
 # check websites for broken links
 linkchecker doc/en/html/index.html
 linkchecker doc/cn/html/index.html
+linkchecker doc/api/en/html/index.html
 
 # Parse Github URL
 REPO=`git config remote.origin.url`
@@ -54,10 +53,11 @@ function deploy_docs() {
   mkdir -p ${DIR}
   # remove old docs. mv new docs.
   set +e
-  rm -rf ${DIR}/doc ${DIR}/doc_cn
+  rm -rf ${DIR}/doc ${DIR}/doc_cn ${DIR}/api_doc
   set -e
   cp -r ../doc/cn/html ${DIR}/doc_cn
   cp -r ../doc/en/html ${DIR}/doc
+  cp -r ../doc/api/en/html ${DIR}/api_doc
   git add .
 }
 
diff --git a/paddle/string/CMakeLists.txt b/paddle/string/CMakeLists.txt
index 751776dbb5..1fe7f42ca1 100644
--- a/paddle/string/CMakeLists.txt
+++ b/paddle/string/CMakeLists.txt
@@ -2,9 +2,3 @@ cc_library(stringpiece SRCS piece.cc)
 cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
 cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
 cc_test(to_string_test SRCS to_string_test.cc)
-
-if(NOT WITH_C_API AND WITH_FLUID)
-  file(GLOB STRING_HEADERS *.h)
-  install(FILES ${STRING_HEADERS} DESTINATION include/paddle/string)
-  install(FILES tinyformat/tinyformat.h DESTINATION include/paddle/string/tinyformat)
-endif()
diff --git a/paddle/string/piece.h b/paddle/string/piece.h
index f2bb6b2c76..dcef9791a7 100644
--- a/paddle/string/piece.h
+++ b/paddle/string/piece.h
@@ -28,7 +28,7 @@ namespace string {
 // its syntax is simple as it doesn't own/manage the string, it is
 // cheap to construct Pieces and pass them around.
 class Piece {
- public:
+public:
   static const size_t npos = static_cast<size_t>(-1);
 
   // We provide non-explicit singleton constructors so users can
@@ -55,7 +55,7 @@ class Piece {
   // Return a string that contains the copy of the referenced data.
   std::string ToString() const { return std::string(data_, size_); }
 
- private:
+private:
   const char* data_;
   size_t size_;
 
diff --git a/paddle/string/printf_test.cc b/paddle/string/printf_test.cc
index b5ad35513b..9815f29bdd 100644
--- a/paddle/string/printf_test.cc
+++ b/paddle/string/printf_test.cc
@@ -24,6 +24,6 @@ TEST(StringPrintf, StringPrintf) {
   long hour = 14;
   int min = 44;
   EXPECT_EQ(std::string("Wednesday, July 27, 14:44"),
-            paddle::string::Sprintf("%s, %s %d, %.2d:%.2d", weekday, month, day,
-                                    hour, min));
+            paddle::string::Sprintf(
+                "%s, %s %d, %.2d:%.2d", weekday, month, day, hour, min));
 }
diff --git a/paddle/string/tinyformat/tinyformat.h b/paddle/string/tinyformat/tinyformat.h
index d1a2c47f1a..270198dc52 100644
--- a/paddle/string/tinyformat/tinyformat.h
+++ b/paddle/string/tinyformat/tinyformat.h
@@ -147,7 +147,7 @@ namespace detail {
 // Test whether type T1 is convertible to type T2
 template <typename T1, typename T2>
 struct is_convertible {
- private:
+private:
   // two types of different size
   struct fail {
     char dummy[2];
@@ -160,7 +160,7 @@ struct is_convertible {
   static succeed tryConvert(const T2 &);
   static const T1 &makeT1();
 
- public:
+public:
   // Standard trick: the (...) version of tryConvert will be chosen from
   // the overload set only if the version taking a T2 doesn't match.
   // Then we compare the sizes of the return types to check which
@@ -170,7 +170,8 @@ struct is_convertible {
 
 // Format the value by casting to type fmtT.  This default implementation
 // should never be called.
-template <typename T, typename fmtT,
+template <typename T,
+          typename fmtT,
           bool convertible = is_convertible<T, fmtT>::value>
 struct formatValueAsType {
   static void invoke(std::ostream & /*out*/, const T & /*value*/) { assert(0); }
@@ -240,8 +241,11 @@ TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(char)
 /// operator<< to format the type T, with special cases for the %c and %p
 /// conversions.
 template <typename T>
-inline void formatValue(std::ostream &out, const char * /*fmtBegin*/,
-                        const char *fmtEnd, int ntrunc, const T &value) {
+inline void formatValue(std::ostream &out,
+                        const char * /*fmtBegin*/,
+                        const char *fmtEnd,
+                        int ntrunc,
+                        const T &value) {
   // The mess here is to support the %c and %p conversions: if these
   // conversions are active we try to convert the type to a char or const
   // void* respectively and format that instead of the value itself.  For the
@@ -263,22 +267,25 @@ inline void formatValue(std::ostream &out, const char * /*fmtBegin*/,
 }
 
 // Overloaded version for char types to support printing as an integer
-#define TINYFORMAT_DEFINE_FORMATVALUE_CHAR(charType)                      \
-  inline void formatValue(std::ostream &out, const char * /*fmtBegin*/,   \
-                          const char *fmtEnd, int /**/, charType value) { \
-    switch (*(fmtEnd - 1)) {                                              \
-      case 'u':                                                           \
-      case 'd':                                                           \
-      case 'i':                                                           \
-      case 'o':                                                           \
-      case 'X':                                                           \
-      case 'x':                                                           \
-        out << static_cast<int>(value);                                   \
-        break;                                                            \
-      default:                                                            \
-        out << value;                                                     \
-        break;                                                            \
-    }                                                                     \
+#define TINYFORMAT_DEFINE_FORMATVALUE_CHAR(charType) \
+  inline void formatValue(std::ostream &out,         \
+                          const char * /*fmtBegin*/, \
+                          const char *fmtEnd,        \
+                          int /**/,                  \
+                          charType value) {          \
+    switch (*(fmtEnd - 1)) {                         \
+      case 'u':                                      \
+      case 'd':                                      \
+      case 'i':                                      \
+      case 'o':                                      \
+      case 'X':                                      \
+      case 'x':                                      \
+        out << static_cast<int>(value);              \
+        break;                                       \
+      default:                                       \
+        out << value;                                \
+        break;                                       \
+    }                                                \
   }
 // per 3.9.1: char, signed char and unsigned char are all distinct types
 TINYFORMAT_DEFINE_FORMATVALUE_CHAR(char)
@@ -475,7 +482,7 @@ namespace detail {
 // each argument to be allocated as a homogenous array inside FormatList
 // whereas a naive implementation based on inheritance does not.
 class FormatArg {
- public:
+public:
   FormatArg() {}
 
   template <typename T>
@@ -484,17 +491,22 @@ class FormatArg {
         m_formatImpl(&formatImpl<T>),
         m_toIntImpl(&toIntImpl<T>) {}
 
-  void format(std::ostream &out, const char *fmtBegin, const char *fmtEnd,
+  void format(std::ostream &out,
+              const char *fmtBegin,
+              const char *fmtEnd,
               int ntrunc) const {
     m_formatImpl(out, fmtBegin, fmtEnd, ntrunc, m_value);
   }
 
   int toInt() const { return m_toIntImpl(m_value); }
 
- private:
+private:
   template <typename T>
-  static void formatImpl(std::ostream &out, const char *fmtBegin,
-                         const char *fmtEnd, int ntrunc, const void *value) {
+  static void formatImpl(std::ostream &out,
+                         const char *fmtBegin,
+                         const char *fmtEnd,
+                         int ntrunc,
+                         const void *value) {
     formatValue(out, fmtBegin, fmtEnd, ntrunc, *static_cast<const T *>(value));
   }
 
@@ -504,8 +516,11 @@ class FormatArg {
   }
 
   const void *m_value;
-  void (*m_formatImpl)(std::ostream &out, const char *fmtBegin,
-                       const char *fmtEnd, int ntrunc, const void *value);
+  void (*m_formatImpl)(std::ostream &out,
+                       const char *fmtBegin,
+                       const char *fmtEnd,
+                       int ntrunc,
+                       const void *value);
   int (*m_toIntImpl)(const void *value);
 };
 
@@ -554,10 +569,12 @@ inline const char *printFormatStringLiteral(std::ostream &out,
 // necessary to pull out variable width and precision .  The function returns a
 // pointer to the character after the end of the current format spec.
 inline const char *streamStateFromFormat(std::ostream &out,
-                                         bool &spacePadPositive, int &ntrunc,
+                                         bool &spacePadPositive,
+                                         int &ntrunc,
                                          const char *fmtStart,
                                          const detail::FormatArg *formatters,
-                                         int &argIndex, int numFormatters) {
+                                         int &argIndex,
+                                         int numFormatters) {
   if (*fmtStart != '%') {
     TINYFORMAT_ERROR(
         "tinyformat: Not enough conversion specifiers in format string");
@@ -733,8 +750,10 @@ inline const char *streamStateFromFormat(std::ostream &out,
 }
 
 //------------------------------------------------------------------------------
-inline void formatImpl(std::ostream &out, const char *fmt,
-                       const detail::FormatArg *formatters, int numFormatters) {
+inline void formatImpl(std::ostream &out,
+                       const char *fmt,
+                       const detail::FormatArg *formatters,
+                       int numFormatters) {
   // Saved stream state
   std::streamsize origWidth = out.width();
   std::streamsize origPrecision = out.precision();
@@ -746,9 +765,13 @@ inline void formatImpl(std::ostream &out, const char *fmt,
     fmt = printFormatStringLiteral(out, fmt);
     bool spacePadPositive = false;
     int ntrunc = -1;
-    const char *fmtEnd =
-        streamStateFromFormat(out, spacePadPositive, ntrunc, fmt, formatters,
-                              argIndex, numFormatters);
+    const char *fmtEnd = streamStateFromFormat(out,
+                                               spacePadPositive,
+                                               ntrunc,
+                                               fmt,
+                                               formatters,
+                                               argIndex,
+                                               numFormatters);
     if (argIndex >= numFormatters) {
       // Check args remain after reading any variable width/precision
       TINYFORMAT_ERROR("tinyformat: Not enough format arguments");
@@ -797,14 +820,15 @@ inline void formatImpl(std::ostream &out, const char *fmt,
 /// information has been stripped from the arguments, leaving just enough of a
 /// common interface to perform formatting as required.
 class FormatList {
- public:
+public:
   FormatList(detail::FormatArg *formatters, int N)
       : m_formatters(formatters), m_N(N) {}
 
-  friend void vformat(std::ostream &out, const char *fmt,
+  friend void vformat(std::ostream &out,
+                      const char *fmt,
                       const FormatList &list);
 
- private:
+private:
   const detail::FormatArg *m_formatters;
   int m_N;
 };
@@ -817,7 +841,7 @@ namespace detail {
 // Format list subclass with fixed storage to avoid dynamic allocation
 template <int N>
 class FormatListN : public FormatList {
- public:
+public:
   template <typename... Args>
   FormatListN(const Args &... args)
       : FormatList(&m_formatterStore[0], N),
@@ -825,14 +849,14 @@ class FormatListN : public FormatList {
     static_assert(sizeof...(args) == N, "Number of args must be N");
   }
 
- private:
+private:
   FormatArg m_formatterStore[N];
 };
 
 // Special 0-arg version - MSVC says zero-sized C array in struct is nonstandard
 template <>
 class FormatListN<0> : public FormatList {
- public:
+public:
   FormatListN() : FormatList(0, 0) {}
 };
 
diff --git a/paddle/string/to_string_test.cc b/paddle/string/to_string_test.cc
index 4956bd96fa..05650ee8f1 100644
--- a/paddle/string/to_string_test.cc
+++ b/paddle/string/to_string_test.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 
 constexpr char kOutputString[] = "User Defined Output";
 class UserDefinedClass {
- public:
+public:
 };
 
 std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) {
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index fd8c4a69da..270f2f4c18 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -16,10 +16,11 @@ limitations under the License. */
 
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
-#include "paddle/framework/init.h"
-#include "paddle/memory/memory.h"
+#include "paddle/fluid/framework/init.h"
+#include "paddle/fluid/memory/memory.h"
 
 int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
   std::vector<char*> new_argv;
   std::string gflags_env;
   for (int i = 0; i < argc; ++i) {
@@ -35,7 +36,6 @@ int main(int argc, char** argv) {
   int new_argc = static_cast<int>(new_argv.size());
   char** new_argv_address = new_argv.data();
   google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
-  testing::InitGoogleTest(&argc, argv);
   paddle::memory::Used(paddle::platform::CPUPlace());
 
 #ifdef PADDLE_WITH_CUDA
diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py
index 3ee58393c7..73acbf3e00 100644
--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
@@ -29,7 +29,7 @@ import optimizer
 import learning_rate_decay
 import backward
 import regularizer
-from param_attr import ParamAttr
+from param_attr import ParamAttr, WeightNormParamAttr
 from data_feeder import DataFeeder
 from core import LoDTensor, CPUPlace, CUDAPlace
 from distribute_transpiler import DistributeTranspiler
@@ -41,11 +41,26 @@ import profiler
 Tensor = LoDTensor
 
 __all__ = framework.__all__ + executor.__all__ + [
-    'io', 'initializer', 'layers', 'nets', 'optimizer', 'learning_rate_decay',
-    'backward', 'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor',
-    'ParamAttr'
-    'DataFeeder', 'clip', 'SimpleDistributeTranspiler', 'DistributeTranspiler',
-    'memory_optimize', 'profiler'
+    'io',
+    'initializer',
+    'layers',
+    'nets',
+    'optimizer',
+    'learning_rate_decay',
+    'backward',
+    'regularizer',
+    'LoDTensor',
+    'CPUPlace',
+    'CUDAPlace',
+    'Tensor',
+    'ParamAttr',
+    'WeightNormParamAttr',
+    'DataFeeder',
+    'clip',
+    'SimpleDistributeTranspiler',
+    'DistributeTranspiler',
+    'memory_optimize',
+    'profiler',
 ]
 
 
diff --git a/python/paddle/v2/fluid/debuger.py b/python/paddle/v2/fluid/debuger.py
index d379352442..db1808c647 100644
--- a/python/paddle/v2/fluid/debuger.py
+++ b/python/paddle/v2/fluid/debuger.py
@@ -12,10 +12,202 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import sys
 import re
 from graphviz import GraphPreviewGenerator
 import proto.framework_pb2 as framework_pb2
 
+_vartype2str_ = [
+    "UNK",
+    "LoDTensor",
+    "SelectedRows",
+    "FeedMinibatch",
+    "FetchList",
+    "StepScopes",
+    "LodRankTable",
+    "LoDTensorArray",
+    "PlaceList",
+]
+_dtype2str_ = [
+    "bool",
+    "int16",
+    "int32",
+    "int64",
+    "float16",
+    "float32",
+    "float64",
+]
+
+
+def repr_data_type(type):
+    return _dtype2str_[type]
+
+
+def repr_tensor(proto):
+    return "tensor(type={}, shape={})".format(_dtype2str_[int(proto.data_type)],
+                                              str(proto.dims))
+
+
+reprtpl = "{ttype} {name} ({reprs})"
+
+
+def repr_lodtensor(proto):
+    if not proto.lod_tensor: return
+    level = proto.lod_tensor.lod_level
+    reprs = repr_tensor(proto.lod_tensor.tensor)
+    return reprtpl.format(
+        ttype="LoDTensor" if level > 0 else "Tensor",
+        name=proto.name,
+        reprs="level=%d, %s" % (level, reprs) if level > 0 else reprs)
+
+
+def repr_selected_rows(proto):
+    if not proto.selected_rows: return
+    return reprtpl.format(
+        ttype="SelectedRows",
+        name=proto.name,
+        reprs=repr_tensor(proto.selected_rows))
+
+
+def repr_tensor_array(proto):
+    if not proto.tensor_array: return
+    return reprtpl.format(
+        ttype="TensorArray",
+        name=proto.name,
+        reprs="level=%d, %s" % (proto.tensor_array.lod_level,
+                                repr_tensor(proto.lod_tensor)))
+
+
+type_handlers = [
+    repr_lodtensor,
+    repr_selected_rows,
+    repr_tensor_array,
+]
+
+
+def repr_var(vardesc):
+    for handler in type_handlers:
+        res = handler(vardesc)
+        if res:
+            return res
+
+
+def pprint_program_codes(program_desc):
+    reprs = []
+    for block_idx in range(program_desc.num_blocks()):
+        block_desc = program_desc.block(block_idx)
+        block_repr = pprint_block_codes(block_desc)
+        reprs.append(block_repr)
+    return '\n'.join(reprs)
+
+
+def pprint_block_codes(block_desc, show_backward=False):
+    def is_op_backward(op_desc):
+        if op_desc.type.endswith('_grad'): return True
+
+        def is_var_backward(var):
+            if "@GRAD" in var.parameter: return True
+            for arg in var.arguments:
+                if "@GRAD" in arg: return True
+
+        for var in op_desc.inputs:
+            if is_var_backward(var): return True
+        for var in op_desc.outputs:
+            if is_var_backward(var): return True
+        return False
+
+    def is_var_backward(var_desc):
+        return "@GRAD" in var_desc.name
+
+    if type(block_desc) is not framework_pb2.BlockDesc:
+        block_desc = framework_pb2.BlockDesc.FromString(
+            block_desc.serialize_to_string())
+    var_reprs = []
+    op_reprs = []
+    for var in block_desc.vars:
+        if not show_backward and is_var_backward(var):
+            continue
+        var_reprs.append(repr_var(var))
+
+    for op in block_desc.ops:
+        if not show_backward and is_op_backward(op): continue
+        op_reprs.append(repr_op(op))
+
+    tpl = "// block-{idx}  parent-{pidx}\n// variables\n{vars}\n\n// operators\n{ops}\n"
+    return tpl.format(
+        idx=block_desc.idx,
+        pidx=block_desc.parent_idx,
+        vars='\n'.join(var_reprs),
+        ops='\n'.join(op_reprs), )
+
+
+def repr_attr(desc):
+    tpl = "{key}={value}"
+    valgetter = [
+        lambda attr: attr.i,
+        lambda attr: attr.f,
+        lambda attr: attr.s,
+        lambda attr: attr.ints,
+        lambda attr: attr.floats,
+        lambda attr: attr.strings,
+        lambda attr: attr.b,
+        lambda attr: attr.bools,
+        lambda attr: attr.block_idx,
+        lambda attr: attr.l,
+    ]
+    key = desc.name
+    value = valgetter[desc.type](desc)
+    if key == "dtype":
+        value = repr_data_type(value)
+    return tpl.format(key=key, value=str(value)), (key, value)
+
+
+def _repr_op_fill_constant(optype, inputs, outputs, attrs):
+    if optype == "fill_constant":
+        return "{output} = {data} [shape={shape}]".format(
+            output=','.join(outputs),
+            data=attrs['value'],
+            shape=str(attrs['shape']))
+
+
+op_repr_handlers = [_repr_op_fill_constant, ]
+
+
+def repr_op(opdesc):
+    optype = None
+    attrs = []
+    attr_dict = {}
+    is_target = None
+    inputs = []
+    outputs = []
+
+    tpl = "{outputs} = {optype}({inputs}{is_target}) [{attrs}]"
+    args2value = lambda args: args[0] if len(args) == 1 else str(list(args))
+    for var in opdesc.inputs:
+        key = var.parameter
+        value = args2value(var.arguments)
+        inputs.append("%s=%s" % (key, value))
+    for var in opdesc.outputs:
+        value = args2value(var.arguments)
+        outputs.append(value)
+    for attr in opdesc.attrs:
+        attr_repr, attr_pair = repr_attr(attr)
+        attrs.append(attr_repr)
+        attr_dict[attr_pair[0]] = attr_pair[1]
+
+    is_target = opdesc.is_target
+
+    for handler in op_repr_handlers:
+        res = handler(opdesc.type, inputs, outputs, attr_dict)
+        if res: return res
+
+    return tpl.format(
+        outputs=', '.join(outputs),
+        optype=opdesc.type,
+        inputs=', '.join(inputs),
+        attrs="{%s}" % ','.join(attrs),
+        is_target=", is_target" if is_target else "")
+
 
 def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
     '''
diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py
index 121b407cae..cd89dba72d 100644
--- a/python/paddle/v2/fluid/distribute_transpiler.py
+++ b/python/paddle/v2/fluid/distribute_transpiler.py
@@ -300,6 +300,9 @@ class DistributeTranspiler:
             pass
         return orig_shape
 
+    def _op_input_var(self, op, varname):
+        pass
+
     def _is_op_on_pserver(self, endpoint, all_ops, idx):
         """
         Recursively check if the op need to run on current server.
@@ -309,44 +312,51 @@ class DistributeTranspiler:
             p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
         ]
         op = all_ops[idx]
-        if op.inputs.has_key("Param"):
-            if op.inputs["Param"].name in param_names:
+        input_names = set(op.input_names)
+        # TODO(typhoonzero): using Param and Grad input name to identify
+        # that the operator is an optimization operator, need a better way.
+        if "Param" in input_names:
+            if op.input("Param")[0] in param_names:
                 return True
             else:
                 for n in param_names:
-                    if same_or_split_var(n, op.inputs[
-                            "Param"].name) and n != op.inputs["Param"].name:
+                    if same_or_split_var(n, op.input("Param")[0]) \
+                            and n != op.input("Param")[0]:
                         return True
                 return False
         else:
             j = idx - 1
             while j >= 0:
                 prev_op = all_ops[j]
-                prev_output_names = [o.name for o in prev_op.outputs.values()]
-                prev_input_names = [o.name for o in prev_op.inputs.values()]
+                # prev_output_names = [o.name for o in prev_op.outputs.values()]
+                # prev_input_names = [o.name for o in prev_op.inputs.values()]
+                # NOTE(typhoonzero): consider list input/output
+                prev_output_names = prev_op.desc.output_arg_names()
+                prev_input_names = prev_op.desc.input_arg_names()
                 found1 = False
                 found2 = False
-                for _, v in op.inputs.iteritems():
-                    if v.name in prev_output_names:
+                for varname in op.desc.input_arg_names():
+                    if varname in prev_output_names:
                         found1 = self._is_op_on_pserver(endpoint, all_ops, j)
                 # later ops may produce output for prev op's next batch use.
-                for _, v in op.outputs.iteritems():
-                    if v.name in prev_input_names:
+                for varname in op.desc.output_arg_names():
+                    if varname in prev_input_names:
                         found2 = self._is_op_on_pserver(endpoint, all_ops, j)
                 if found1 or found2:
                     return True
                 j -= 1
             return False
 
-    def _append_pserver_ops(self, program, pserver_program, opt_op, endpoint):
+    def _append_pserver_ops(self, optimize_block, opt_op, endpoint):
+        program = optimize_block.program
         new_inputs = dict()
         # update param/grad shape first, then other inputs like
         # moment can use the updated shape
-        for key, var in opt_op.inputs.iteritems():
+        for key in opt_op.input_names:
             if key == "Grad":
                 grad_block = None
                 for g in self.param_grad_ep_mapping[endpoint]["grads"]:
-                    if same_or_split_var(g.name, var.name):
+                    if same_or_split_var(g.name, opt_op.input(key)[0]):
                         grad_block = g
                         break
                 if not grad_block:
@@ -362,11 +372,11 @@ class DistributeTranspiler:
                 if self.trainers > 1:
                     vars2merge = self._create_var_for_trainers(
                         program.global_block(), grad_block, self.trainers)
-                    program.global_block().append_op(
+                    optimize_block.append_op(
                         type="sum",
                         inputs={"X": vars2merge},
                         outputs={"Out": merged_var})
-                    program.global_block().append_op(
+                    optimize_block.append_op(
                         type="scale",
                         inputs={"X": merged_var},
                         outputs={"Out": merged_var},
@@ -376,7 +386,7 @@ class DistributeTranspiler:
                 # param is already created on global program
                 param_block = None
                 for p in self.param_grad_ep_mapping[endpoint]["params"]:
-                    if same_or_split_var(p.name, var.name):
+                    if same_or_split_var(p.name, opt_op.input(key)[0]):
                         param_block = p
                         break
                 if not param_block:
@@ -389,11 +399,12 @@ class DistributeTranspiler:
 
                 new_inputs[key] = tmpvar
 
-        for key, var in opt_op.inputs.iteritems():
+        for key in opt_op.input_names:
             if key in ["Param", "Grad"]:
                 continue
             # update accumulator variable shape
             param_shape = new_inputs["Param"].shape
+            var = program.global_block().vars[opt_op.input(key)[0]]
             new_shape = self._get_optimizer_input_shape(opt_op.type, key,
                                                         var.shape, param_shape)
             tmpvar = program.global_block().create_var(
@@ -402,40 +413,41 @@ class DistributeTranspiler:
                 dtype=var.dtype,
                 shape=new_shape)
             new_inputs[key] = tmpvar
-            # create var in pserver program global block.
-            # TODO(typhoonzero): put blocks in one program to avoid create two
-            # variables.
-            pserver_program.global_block().create_var(
-                name=var.name,
-                persistable=var.persistable,
-                dtype=var.dtype,
-                shape=new_shape)
 
         # change output's ParamOut variable
-        opt_op.outputs["ParamOut"] = new_inputs["Param"]
-        program.global_block().append_op(
+        outputs = self._get_output_map_from_op(program.global_block(), opt_op)
+        outputs["ParamOut"] = new_inputs["Param"]
+        optimize_block.append_op(
             type=opt_op.type,
             inputs=new_inputs,
-            outputs=opt_op.outputs,
+            outputs=outputs,
             attrs=opt_op.attrs)
 
-    def _append_pserver_non_opt_ops(self, program, pserver_program, opt_op):
+    def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
+        program = optimize_block.program
         # Append the ops for parameters that do not need to be optimized/updated
-        for _, var in opt_op.inputs.iteritems():
-            program.global_block().create_var(
-                name=var.name,
-                persistable=var.persistable,
-                dtype=var.dtype,
-                shape=var.shape)
-            pserver_program.global_block().create_var(
-                name=var.name,
-                persistable=var.persistable,
-                dtype=var.dtype,
-                shape=var.shape)
-        program.global_block().append_op(
+        inputs = self._get_input_map_from_op(self.program.global_block().vars,
+                                             opt_op)
+        for var in inputs.itervalues():
+            if type(var) == list:
+                varlist = var
+            else:
+                varlist = [var]
+            for var in varlist:
+                if not program.global_block().vars.has_key(var.name):
+                    program.global_block().create_var(
+                        name=var.name,
+                        persistable=var.persistable,
+                        dtype=var.dtype,
+                        shape=var.shape)
+
+        outputs = self._get_output_map_from_op(self.program.global_block().vars,
+                                               opt_op)
+
+        optimize_block.append_op(
             type=opt_op.type,
-            inputs=opt_op.inputs,
-            outputs=opt_op.outputs,
+            inputs=inputs,
+            outputs=outputs,
             attrs=opt_op.attrs)
 
     def get_pserver_program(self, endpoint):
@@ -465,26 +477,25 @@ class DistributeTranspiler:
                     dtype=v.dtype,
                     shape=v.shape)
         # step6
-        optimize_sub_program = Program()
+        optimize_block = pserver_program.create_block(0)
         # Iterate through the ops and append ops as needed
         for idx, opt_op in enumerate(self.optimize_ops):
             is_op_on_pserver = self._is_op_on_pserver(endpoint,
                                                       self.optimize_ops, idx)
             if not is_op_on_pserver:
                 continue
-            if opt_op.inputs.has_key("Grad"):
-                self._append_pserver_ops(optimize_sub_program, pserver_program,
-                                         opt_op, endpoint)
+            if "Grad" in opt_op.desc.input_arg_names():
+                self._append_pserver_ops(optimize_block, opt_op, endpoint)
             else:
-                self._append_pserver_non_opt_ops(optimize_sub_program,
-                                                 pserver_program, opt_op)
+                self._append_pserver_non_opt_ops(optimize_block, opt_op)
+
         # Append the listen_and_serv op
         pserver_program.global_block().append_op(
             type="listen_and_serv",
             inputs={},
             outputs={},
             attrs={
-                "OptimizeBlock": optimize_sub_program.global_block(),
+                "OptimizeBlock": optimize_block,
                 "endpoint": endpoint,
                 "ParamList": [
                     p.name
@@ -499,6 +510,30 @@ class DistributeTranspiler:
         pserver_program.sync_with_cpp()
         return pserver_program
 
+    def _get_input_map_from_op(self, varmap, op):
+        iomap = dict()
+        for key in op.input_names:
+            vars = []
+            for varname in op.input(key):
+                vars.append(varmap[varname])
+            if len(vars) == 1:
+                iomap[key] = vars[0]
+            else:
+                iomap[key] = vars
+        return iomap
+
+    def _get_output_map_from_op(self, varmap, op):
+        iomap = dict()
+        for key in op.output_names:
+            vars = []
+            for varname in op.output(key):
+                vars.append(varmap[varname])
+            if len(vars) == 1:
+                iomap[key] = vars[0]
+            else:
+                iomap[key] = vars
+        return iomap
+
     def get_startup_program(self, endpoint, pserver_program):
         """
         Get startup program for current parameter server.
@@ -529,17 +564,21 @@ class DistributeTranspiler:
 
         # 2. rename op outputs
         for op in orig_s_prog.global_block().ops:
+            new_inputs = dict()
             new_outputs = dict()
             # do not append startup op if var is not on this pserver
             op_on_pserver = False
-            for key, var in op.outputs.iteritems():
-                newname, _ = _get_splited_name_and_shape(var.name)
+            for key in op.output_names:
+                newname, _ = _get_splited_name_and_shape(op.output(key)[0])
                 if newname:
                     op_on_pserver = True
                     new_outputs[key] = created_var_map[newname]
-                elif var.name in pserver_vars:
+                elif op.output(key)[0] in pserver_vars:
                     op_on_pserver = True
-                    new_outputs[key] = pserver_vars[var.name]
+                    new_outputs[key] = pserver_vars[op.output(key)[0]]
+
+            # most startup program ops have no inputs
+            new_inputs = self._get_input_map_from_op(pserver_vars, op)
 
             if op_on_pserver:
                 if op.type in [
@@ -548,7 +587,7 @@ class DistributeTranspiler:
                     op.attrs["shape"] = new_outputs["Out"].shape
                 s_prog.global_block().append_op(
                     type=op.type,
-                    inputs=op.inputs,
+                    inputs=new_inputs,
                     outputs=new_outputs,
                     attrs=op.attrs)
         return s_prog
diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py
index 9f48815b8b..01cbdb3ec4 100644
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
@@ -17,7 +17,9 @@ import contextlib
 from framework import Program, default_main_program
 from . import core
 
-__all__ = ['Executor', 'global_scope', 'scope_guard', 'switch_scope']
+__all__ = [
+    'Executor', 'global_scope', 'scope_guard', 'switch_scope', 'fetch_var'
+]
 
 g_scope = core.Scope()
 
@@ -45,27 +47,13 @@ def as_numpy(tensor):
         return [as_numpy(t) for t in tensor]
     assert isinstance(tensor, core.LoDTensor)
     lod = tensor.lod()
-    tensor_data = np.array(tensor)
-    if len(lod) == 0:
-        ans = tensor_data
-    else:
-        raise RuntimeError("LoD Calculate lacks unit tests and buggy")
-    # elif len(lod) == 1:
-    #     ans = []
-    #     idx = 0
-    #     while idx < len(lod) - 1:
-    #         ans.append(tensor_data[lod[idx]:lod[idx + 1]])
-    #         idx += 1
-    # else:
-    #     for l in reversed(lod):
-    #         ans = []
-    #         idx = 0
-    #         while idx < len(l) - 1:
-    #             ans.append(tensor_data[l[idx]:l[idx + 1]])
-    #             idx += 1
-    #         tensor_data = ans
-    #     ans = tensor_data
-    return ans
+    if len(lod) > 0:
+        raise RuntimeError(
+            "Some of your featched tensors hold LoD information. \
+            They can not be completely cast to Python ndarray. \
+            Please set the parameter 'return_numpy' as 'False' to \
+            return LoDTensor itself directly.")
+    return np.array(tensor)
 
 
 def has_feed_operators(block, feed_targets, feed_holder_name):
@@ -80,12 +68,12 @@ def has_feed_operators(block, feed_targets, feed_holder_name):
     Args:
         block: a block instance (typically global block of a program)
         feed_targets: a dictionary of {feed_target_name: feed_target_data}
-        feed_holder_name: the name of the variable that holds the data of 
-            all feed targets. The type of this feed_holder variable is 
+        feed_holder_name: the name of the variable that holds the data of
+            all feed targets. The type of this feed_holder variable is
             FEED_MINIBATCH, which is essentially vector<LoDTensor>.
 
     Returns:
-        A boolean value that indicates whether a block has feed operators 
+        A boolean value that indicates whether a block has feed operators
         that match the info contained in feed_targets and feed_holder_name.
     """
 
@@ -108,7 +96,7 @@ def has_feed_operators(block, feed_targets, feed_holder_name):
 
 def has_fetch_operators(block, fetch_targets, fetch_holder_name):
     """ Check whether the block already has fetch operators.
-    
+
     Return false if the block does not have any fetch operators.
     If some fetch operators have been appended to the block, check that
     the info contained in these fetch operators matches the fetch_targets
@@ -118,13 +106,13 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name):
     Args:
         block: a block instance (typically global block of a program)
         fetch_targets: a dictionary of {fetch_target_name: fetch_target_data}
-        fetch_holder_name: the name of the variable that holds the data of 
-            all fetch targets. The type of this fetch_holder variable is 
-            FETCH_LIST, which is essentially vector<LoDTensor>.    
+        fetch_holder_name: the name of the variable that holds the data of
+            all fetch targets. The type of this fetch_holder variable is
+            FETCH_LIST, which is essentially vector<LoDTensor>.
 
-    Return:    
-        A boolean value that indicates whether a block has fetch operators 
-        that match the info contained in fetch_targets and fetch_holder_name.     
+    Return:
+        A boolean value that indicates whether a block has fetch operators
+        that match the info contained in fetch_targets and fetch_holder_name.
     """
 
     fetch_count = 0
@@ -146,6 +134,35 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name):
     return fetch_count > 0
 
 
+def fetch_var(name, scope=None, return_numpy=True):
+    """
+    Fetch the value of the variable with the given name from the given scope
+    Args:
+        name(str): name of the variable. Typically, only persistable variables
+            can be found in the scope used for running the program.
+        scope(core.Scope|None): scope object. It should be the scope where
+            you pass to Executor.run() when running your program.
+            If None, global_scope() will be used.
+        return_numpy(bool): whether convert the tensor to numpy.ndarray
+    Returns:
+       LodTensor|numpy.ndarray
+    """
+    assert isinstance(name, str)
+    if scope is None:
+        scope = global_scope()
+    assert isinstance(scope, core.Scope)
+
+    var = global_scope().find_var(name)
+    assert var is not None, (
+        "Cannot find " + name + " in scope. Perhaps you need to make the"
+        " variable persistable by using var.persistable = True in your"
+        " program.")
+    tensor = var.get_tensor()
+    if return_numpy:
+        tensor = as_numpy(tensor)
+    return tensor
+
+
 class Executor(object):
     def __init__(self, places):
         if not isinstance(places, list) and not isinstance(places, tuple):
@@ -275,7 +292,6 @@ class Executor(object):
             core.get_fetch_variable(scope, fetch_var_name, i)
             for i in xrange(len(fetch_list))
         ]
-
         if return_numpy:
             outs = as_numpy(outs)
         return outs
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index 69cbebe41e..a517db68c5 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -31,6 +31,7 @@ __all__ = [
     'program_guard',
     'switch_startup_program',
     'switch_main_program',
+    'get_var',
 ]
 
 EMPTY_VAR_NAME = core.kEmptyVarName()
@@ -739,6 +740,9 @@ class Block(object):
             raise e
         self.desc.remove_op(start, end + 1)
 
+    def slice_ops(self, start, end):
+        return list(self.ops)[start:end]
+
     def prepend_op(self, *args, **kwargs):
         op_desc = self.desc.prepend_op()
         op = Operator(self, op_desc, *args, **kwargs)
@@ -1123,3 +1127,22 @@ def program_guard(main_program, startup_program=None):
     switch_main_program(main_program)
     if startup_program is not None:
         switch_startup_program(startup_program)
+
+
+def get_var(name, program=None):
+    """
+    Get a variable by name from the global block of a program
+    Args:
+        name(str): name of the variable
+        program(Program|None): program object.
+             If None, default_global_program() will be used.
+
+    Returns:
+        Variable
+    """
+    if program is None:
+        program = default_main_program()
+    assert isinstance(name, str)
+    assert isinstance(name, Program)
+
+    return program.global_block().var(name)
diff --git a/python/paddle/v2/fluid/initializer.py b/python/paddle/v2/fluid/initializer.py
index b9c0d12ad6..8c70fd90ef 100644
--- a/python/paddle/v2/fluid/initializer.py
+++ b/python/paddle/v2/fluid/initializer.py
@@ -14,14 +14,37 @@
 
 import framework
 import numpy as np
+import contextlib
 
 __all__ = [
-    'Constant',
-    'Uniform',
-    'Normal',
-    'Xavier',
+    'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu',
+    'init_on_cpu'
 ]
 
+_force_init_on_cpu_ = False
+
+
+def force_init_on_cpu():
+    return _force_init_on_cpu_
+
+
+@contextlib.contextmanager
+def init_on_cpu():
+    """
+    Switch program with `with` statement
+
+    Examples:
+        >>> with init_on_cpu():
+        >>>   step = layers.create_global_var()
+
+    """
+    global _force_init_on_cpu_
+
+    pre_state = force_init_on_cpu()
+    _force_init_on_cpu_ = True
+    yield
+    _force_init_on_cpu_ = pre_state
+
 
 class Initializer(object):
     """Base class for variable initializers
@@ -80,7 +103,7 @@ class ConstantInitializer(Initializer):
     """Implements the constant initializer
     """
 
-    def __init__(self, value=0.0):
+    def __init__(self, value=0.0, force_cpu=False):
         """Constructor for ConstantInitializer
 
         Args:
@@ -89,6 +112,7 @@ class ConstantInitializer(Initializer):
         assert value is not None
         super(ConstantInitializer, self).__init__()
         self._value = value
+        self._force_cpu = force_cpu
 
     def __call__(self, var, block):
         """Add constant initialization ops for a variable
@@ -110,7 +134,8 @@ class ConstantInitializer(Initializer):
             attrs={
                 "shape": var.shape,
                 "dtype": int(var.dtype),
-                "value": self._value
+                "value": float(self._value),
+                'force_cpu': self._force_cpu or force_init_on_cpu()
             })
         var.op = op
         return op
diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py
index 613dc20b6e..0f43e46082 100644
--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
@@ -342,7 +342,11 @@ def save_inference_model(dirname,
     prepend_feed_ops(inference_program, feeded_var_names)
     append_fetch_ops(inference_program, fetch_var_names)
 
-    model_file_name = dirname + "/__model__"
+    if save_file_name == None:
+        model_file_name = dirname + "/__model__"
+    else:
+        model_file_name = dirname + "/__model_combined__"
+
     with open(model_file_name, "wb") as f:
         f.write(inference_program.desc.serialize_to_string())
 
@@ -384,7 +388,11 @@ def load_inference_model(dirname, executor, load_file_name=None):
     if not os.path.isdir(dirname):
         raise ValueError("There is no directory named '%s'", dirname)
 
-    model_file_name = dirname + "/__model__"
+    if load_file_name == None:
+        model_file_name = dirname + "/__model__"
+    else:
+        model_file_name = dirname + "/__model_combined__"
+
     with open(model_file_name, "rb") as f:
         program_desc_str = f.read()
 
diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py
index 0fcbfe0e2f..71a9459d55 100644
--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -18,6 +18,7 @@ from tensor import assign, fill_constant
 from .. import core
 from ..framework import Program, Variable, Operator
 from ..layer_helper import LayerHelper, unique_name
+from ops import logical_and, logical_not, logical_or
 
 __all__ = [
     'split_lod_tensor',
@@ -27,6 +28,7 @@ __all__ = [
     'StaticRNNMemoryLink',
     'WhileGuard',
     'While',
+    'Switch',
     'lod_rank_table',
     'max_sequence_len',
     'topk',
@@ -36,6 +38,7 @@ __all__ = [
     'array_write',
     'create_array',
     'less_than',
+    'equal',
     'array_read',
     'shrink_memory',
     'array_length',
@@ -274,21 +277,20 @@ class ParallelDo(object):
         parent_block = self.parent_block()
 
         local_inputs = set()
-
-        for op in current_block.ops:
-            for oname in op.output_names:
-                for out_var_name in op.output(oname):
-                    local_inputs.add(out_var_name)
-
+        params = list()
         for var in self.inputs:
             local_inputs.add(var.name)
 
-        params = list()
         for op in current_block.ops:
             for iname in op.input_names:
                 for in_var_name in op.input(iname):
                     if in_var_name not in local_inputs:
                         params.append(in_var_name)
+
+            for oname in op.output_names:
+                for out_var_name in op.output(oname):
+                    local_inputs.add(out_var_name)
+
         params = list(set(params))
 
         return [parent_block.var(name) for name in params]
@@ -973,6 +975,36 @@ def less_than(x, y, cond=None, **ignored):
     return cond
 
 
+def equal(x, y, cond=None, **ignored):
+    """
+    **equal**
+
+    This layer returns the truth value of :math:`x == y` elementwise.
+
+    Args:
+        x(Variable): First operand of *equal*
+        y(Variable): Second operand of *equal*
+        cond(Variable|None): Optional output variable to store the result of *equal*
+
+    Returns:
+        Variable: The tensor variable storing the output of *equal*.
+
+    Examples:
+        .. code-block:: python
+
+          less = fluid.layers.equal(x=label, y=limit)
+    """
+    helper = LayerHelper("equal", **locals())
+    if cond is None:
+        cond = helper.create_tmp_variable(dtype='bool')
+        cond.stop_gradient = True
+
+    helper.append_op(
+        type='equal', inputs={'X': [x],
+                              'Y': [y]}, outputs={'Out': [cond]})
+    return cond
+
+
 def array_read(array, i):
     """This function performs the operation to read the data in as an
     LOD_TENSOR_ARRAY.
@@ -1063,11 +1095,12 @@ class ConditionalBlockGuard(BlockGuard):
 
 
 class ConditionalBlock(object):
-    def __init__(self, inputs, name=None):
+    def __init__(self, inputs, is_scalar_condition=False, name=None):
         for each_input in inputs:
             if not isinstance(each_input, Variable):
                 raise TypeError("Each input should be variable")
         self.inputs = inputs
+        self.is_scalar_condition = is_scalar_condition
         self.helper = LayerHelper('conditional_block', name=name)
 
     def block(self):
@@ -1112,7 +1145,66 @@ class ConditionalBlock(object):
             },
             outputs={'Out': out_list,
                      'Scope': [step_scope]},
-            attrs={'sub_block': inside_block})
+            attrs={
+                'sub_block': inside_block,
+                'is_scalar_condition': self.is_scalar_condition
+            })
+
+
+class Switch(object):
+    def __init__(self, name=None):
+        self.helper = LayerHelper('switch', name=name)
+        self.inside_scope = False
+        self.pre_not_conditions = []
+
+    def case(self, condition):
+        """create a new block for this condition
+        """
+        if not self.inside_scope:
+            raise ValueError("case should be called inside with")
+
+        if len(self.pre_not_conditions) == 0:
+            cond_block = ConditionalBlock([condition], is_scalar_condition=True)
+            not_cond = logical_not(x=condition)
+            self.pre_not_conditions.append(not_cond)
+        else:
+            pre_cond_num = len(self.pre_not_conditions)
+            pre_not_cond = self.pre_not_conditions[pre_cond_num - 1]
+            new_not_cond = logical_and(
+                x=pre_not_cond, y=logical_not(x=condition))
+            self.pre_not_conditions.append(new_not_cond)
+            cond_block = ConditionalBlock(
+                [logical_and(
+                    x=pre_not_cond, y=condition)],
+                is_scalar_condition=True)
+
+        return ConditionalBlockGuard(cond_block)
+
+    def default(self):
+        """create a default case for this switch
+        """
+        pre_cond_num = len(self.pre_not_conditions)
+        if pre_cond_num == 0:
+            raise ValueError("there should be at least one condition")
+        cond_block = ConditionalBlock(
+            [self.pre_not_conditions[pre_cond_num - 1]],
+            is_scalar_condition=True)
+        return ConditionalBlockGuard(cond_block)
+
+    def __enter__(self):
+        """
+        set flag that now is inside switch.block {}
+        :return:
+        """
+        self.inside_scope = True
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.inside_scope = False
+        if exc_type is not None:
+            return False  # re-raise exception
+
+        return True
 
 
 class IfElseBlockGuard(object):
diff --git a/python/paddle/v2/fluid/layers/math_op_patch.py b/python/paddle/v2/fluid/layers/math_op_patch.py
index 79a130a3eb..9b5f22759c 100644
--- a/python/paddle/v2/fluid/layers/math_op_patch.py
+++ b/python/paddle/v2/fluid/layers/math_op_patch.py
@@ -14,6 +14,7 @@
 
 from ..framework import Variable, unique_name
 from layer_function_generator import OpProtoHolder
+from ..initializer import force_init_on_cpu
 
 __all__ = ['monkey_patch_variable']
 
@@ -36,9 +37,12 @@ def monkey_patch_variable():
         block.append_op(
             type="fill_constant",
             outputs={'Out': [var]},
-            attrs={'dtype': var.dtype,
-                   'shape': shape,
-                   'value': value})
+            attrs={
+                'dtype': var.dtype,
+                'shape': shape,
+                'value': value,
+                'force_cpu': force_init_on_cpu()
+            })
         return var
 
     def create_scalar(block, value, dtype):
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index a79479f469..5ebd329fc0 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -65,6 +65,7 @@ __all__ = [
     'beam_search',
     'row_conv',
     'multiplex',
+    'layer_norm',
 ]
 
 
@@ -92,7 +93,7 @@ def fc(input,
 
     .. math::
 
-        Out = Act({\sum_{i=0}^{N-1}W_iX_i + b})
+        Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
 
     In the above equation:
 
@@ -184,7 +185,7 @@ def fc(input,
         helper.append_op(
             type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
     # add bias
-    pre_activation = helper.append_bias_op(pre_bias)
+    pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)
     # add activation
     return helper.append_activation(pre_activation)
 
@@ -410,12 +411,12 @@ def dynamic_lstmp(input,
     """
     **Dynamic LSTMP Layer**
 
-    LSTMP (LSTM with recurrent projection) layer has a separate projection 
-    layer after the LSTM layer, projecting the original hidden state to a 
-    lower-dimensional one, which is proposed to reduce the number of total 
-    parameters and furthermore computational complexity for the LSTM, 
-    espeacially for the case that the size of output units is relative 
-    large (https://research.google.com/pubs/archive/43905.pdf). 
+    LSTMP (LSTM with recurrent projection) layer has a separate projection
+    layer after the LSTM layer, projecting the original hidden state to a
+    lower-dimensional one, which is proposed to reduce the number of total
+    parameters and furthermore computational complexity for the LSTM,
+    espeacially for the case that the size of output units is relative
+    large (https://research.google.com/pubs/archive/43905.pdf).
 
     The formula is as follows:
 
@@ -441,27 +442,27 @@ def dynamic_lstmp(input,
           the matrix of weights from the input gate to the input).
     * :math:`W_{ic}`, :math:`W_{fc}`, :math:`W_{oc}`: Diagonal weight \
           matrices for peephole connections. In our implementation, \
-          we use vectors to reprenset these diagonal weight matrices. 
+          we use vectors to reprenset these diagonal weight matrices.
     * :math:`b`: Denotes bias vectors (e.g. :math:`b_i` is the input gate \
-          bias vector). 
+          bias vector).
     * :math:`\sigma`: The activation, such as logistic sigmoid function.
     * :math:`i, f, o` and :math:`c`: The input gate, forget gate, output \
           gate, and cell activation vectors, respectively, all of which have \
-          the same size as the cell output activation vector :math:`h`. 
+          the same size as the cell output activation vector :math:`h`.
     * :math:`h`: The hidden state.
-    * :math:`r`: The recurrent projection of the hidden state. 
+    * :math:`r`: The recurrent projection of the hidden state.
     * :math:`\\tilde{c_t}`: The candidate hidden state, whose \
           computation is based on the current input and previous hidden state.
-    * :math:`\odot`: The element-wise product of the vectors. 
+    * :math:`\odot`: The element-wise product of the vectors.
     * :math:`act_g` and :math:`act_h`: The cell input and cell output \
-          activation functions and `tanh` is usually used for them. 
+          activation functions and `tanh` is usually used for them.
     * :math:`\overline{act_h}`: The activation function for the projection \
           output, usually using `identity` or same as :math:`act_h`.
 
     Set `use_peepholes` to `False` to disable peephole connection. The formula
     is omitted here, please refer to the paper
     http://www.bioinf.jku.at/publications/older/2604.pdf for details.
-    
+
     Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}`
     operations on the input :math:`x_{t}` are NOT included in this operator.
     Users can choose to use fully-connected layer before LSTMP layer.
@@ -479,8 +480,8 @@ def dynamic_lstmp(input,
 
                                - Hidden-hidden weight = {:math:`W_{ch}, W_{ih}, \
                                                 W_{fh}, W_{oh}`}.
-                               - The shape of hidden-hidden weight is (P x 4D), 
-                                 where P is the projection size and D the hidden 
+                               - The shape of hidden-hidden weight is (P x 4D),
+                                 where P is the projection size and D the hidden
                                  size.
                                - Projection weight = {:math:`W_{rh}`}.
                                - The shape of projection weight is (D x P).
@@ -525,9 +526,9 @@ def dynamic_lstmp(input,
             hidden_dim, proj_dim = 512, 256
             fc_out = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
                                      act=None, bias_attr=None)
-            proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out, 
-                                                     size=hidden_dim * 4, 
-                                                     proj_size=proj_dim, 
+            proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out,
+                                                     size=hidden_dim * 4,
+                                                     proj_size=proj_dim,
                                                      use_peepholes=False,
                                                      is_reverse=True,
                                                      cell_activation="tanh",
@@ -641,8 +642,8 @@ def dynamic_gru(input,
             Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh".
 
     Returns:
-        Variable: The hidden state of GRU. The shape is (T \\times D), and lod \
-            is the same with the input.
+        Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \
+            and lod is the same with the input.
 
     Examples:
         .. code-block:: python
@@ -990,7 +991,7 @@ def square_error_cost(input, label, **kwargs):
        label(Variable): Label tensor, has target labels.
 
     Returns:
-        Variable: The tensor variable storing the element-wise squared error
+        Variable: The tensor variable storing the element-wise squared error \
                   difference of input and label.
 
     Examples:
@@ -1214,7 +1215,7 @@ def conv2d(input,
        act(str): Activation type. Default: None
 
     Returns:
-        Variable: The tensor variable storing the convolution and
+        Variable: The tensor variable storing the convolution and \
                   non-linearity activation result.
 
     Raises:
@@ -1565,6 +1566,102 @@ def batch_norm(input,
     return helper.append_activation(batch_norm_out)
 
 
+def layer_norm(input,
+               scale=True,
+               shift=True,
+               begin_norm_axis=1,
+               epsilon=1e-05,
+               param_attr=None,
+               bias_attr=None,
+               act=None,
+               name=None):
+    """
+    **Layer Normalization**
+
+    Assume feature vectors exist on dimensions 
+    :attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics
+    along these dimensions for each feature vector :math:`a` with size
+    :math:`H`, then normalize each feature vector using the corresponding
+    statistics. After that, apply learnable gain and bias on the normalized
+    tensor to scale and shift if :attr:`scale` and :attr:`shift` are set.
+
+    Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
+
+    The formula is as follows:
+
+    .. math::
+
+        \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i
+
+        \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}(a_i - \\mu)^2}
+
+        h & = f(\\frac{g}{\\sigma}(a - \\mu) + b)
+
+    Args:
+        input(Variable): The input tensor variable.
+        scale(bool): Whether to learn the adaptive gain :math:`g` after 
+            normalization.
+        shift(bool): Whether to learn the adaptive bias :math:`b` after 
+            normalization.
+        begin_norm_axis(bool): The normalization will be performed along 
+            dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
+        epsilon(float): The small value added to the variance to prevent 
+            division by zero.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+            gain :math:`g`.
+        bias_attr(ParamAttr|None): The parameter attribute for the learnable
+            bias :math:`b`.
+        act(str): Activation to be applied to the output of layer normalizaiton.
+
+    Returns:
+        Variable: A tensor variable with the same shape as the input.
+
+    Examples:
+        .. code-block:: python
+
+            data = fluid.layers.data(
+              name='data', shape=[3, 32, 32], dtype='float32')
+            x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
+    """
+    helper = LayerHelper('layer_norm', **locals())
+    dtype = helper.input_dtype()
+
+    # create intput and parameters
+    inputs = {'X': input}
+    input_shape = input.shape
+    param_shape = [reduce(lambda x, y: x * y, input_shape[begin_norm_axis:])]
+    if scale:
+        scale = helper.create_parameter(
+            attr=helper.param_attr,
+            shape=param_shape,
+            dtype=dtype,
+            default_initializer=Constant(1.0))
+        inputs['Scale'] = scale
+    if shift:
+        assert bias_attr is not False
+        bias = helper.create_parameter(
+            attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
+        inputs['Bias'] = bias
+
+    # create output
+    mean_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
+    variance_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
+    layer_norm_out = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type="layer_norm",
+        inputs=inputs,
+        outputs={
+            "Y": layer_norm_out,
+            "Mean": mean_out,
+            "Variance": variance_out,
+        },
+        attrs={"epsilon": epsilon,
+               "begin_norm_axis": begin_norm_axis})
+
+    return helper.append_activation(layer_norm_out)
+
+
 def beam_search_decode(ids, scores, name=None):
     helper = LayerHelper('beam_search_decode', **locals())
     sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
@@ -2525,7 +2622,8 @@ def ctc_greedy_decoder(input, blank, name=None):
                     interval [0, num_classes + 1).
 
     Returns:
-        Variable: CTC greedy decode result.
+        Variable: CTC greedy decode result. If all the sequences in result were
+        empty, the result LoDTensor will be [-1] with LoD [[0]] and dims [1, 1].
 
     Examples:
         .. code-block:: python
diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py
index c701e79ad2..bb3f71abbb 100644
--- a/python/paddle/v2/fluid/layers/ops.py
+++ b/python/paddle/v2/fluid/layers/ops.py
@@ -61,6 +61,12 @@ __all__ = [
     'clip_by_norm',
     'softmax',
     'sequence_softmax',
+    'logical_and',
+    'logical_or',
+    'logical_xor',
+    'logical_not',
+    'uniform_random',
+    'cumsum',
 ] + __activations__
 
 for _OP in set(__all__):
diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py
index 8460af2a08..2d4e0ab0cc 100644
--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
@@ -16,7 +16,7 @@ from ..layer_helper import LayerHelper
 from ..param_attr import ParamAttr
 from ..framework import convert_np_dtype_to_dtype_
 from ..framework import Variable
-from ..initializer import Constant
+from ..initializer import Constant, force_init_on_cpu
 from ..core import DataType
 import numpy
 
@@ -35,13 +35,15 @@ __all__ = [
 ]
 
 
-def create_tensor(dtype, name=None):
+def create_tensor(dtype, name=None, persistable=False):
     helper = LayerHelper("create_tensor", **locals())
-    return helper.create_variable(name=helper.name, dtype=dtype)
+    return helper.create_variable(
+        name=helper.name, dtype=dtype, persistable=persistable)
 
 
 def create_parameter(shape,
                      dtype,
+                     name=None,
                      attr=None,
                      is_bias=False,
                      default_initializer=None):
@@ -62,17 +64,35 @@ def create_parameter(shape,
     """
     helper = LayerHelper("create_parameter", **locals())
     if attr is None:
-        attr = ParamAttr()
+        attr = ParamAttr(name=name)
     return helper.create_parameter(attr, shape, dtype, is_bias,
                                    default_initializer)
 
 
-def create_global_var(shape, value, dtype, persistable=False, name=None):
+def create_global_var(shape,
+                      value,
+                      dtype,
+                      persistable=False,
+                      force_cpu=False,
+                      name=None):
+    """
+    Create a global variable. such as global_step
+    Args:
+        shape(list[int]): shape of the variable
+        value(float): the value of the variable
+        dtype(string): element type of the parameter
+        persistable(bool): if this variable is persistable
+        force_cpu(bool): force this variable to be on CPU
+
+    Returns:
+        Variable: the created Variable
+    """
     helper = LayerHelper("global_var", **locals())
     var = helper.create_global_variable(
         dtype=dtype, shape=shape, persistable=persistable, name=name)
     helper.set_variable_initializer(
-        var, initializer=Constant(value=float(value)))
+        var, initializer=Constant(
+            value=float(value), force_cpu=force_cpu))
     return var
 
 
@@ -219,6 +239,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
         dtype(np.dtype|core.DataType|str): Data type of the output tensor.
         value(float): The constant value used to initialize the output tensor.
         out(Variable): The output tensor.
+        force_cpu(True|False): data should be on CPU if set true.
 
     Returns:
         Variable: The tensor variable storing the output.
@@ -240,7 +261,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
             'shape': shape,
             'dtype': out.dtype,
             'value': float(value),
-            'force_cpu': force_cpu
+            'force_cpu': force_cpu or force_init_on_cpu()
         })
     out.stop_gradient = True
     return out
diff --git a/python/paddle/v2/fluid/learning_rate_decay.py b/python/paddle/v2/fluid/learning_rate_decay.py
index 96b3e9a0d7..2a2a29fd9c 100644
--- a/python/paddle/v2/fluid/learning_rate_decay.py
+++ b/python/paddle/v2/fluid/learning_rate_decay.py
@@ -14,8 +14,12 @@
 
 import layers
 from framework import Variable
+from initializer import init_on_cpu
 
-__all__ = ['exponential_decay', 'natural_exp_decay', 'inverse_time_decay']
+__all__ = [
+    'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
+    'polynomial_decay', 'piecewise_decay'
+]
 """
 When training a model, it's often useful to decay the
 learning rate during training process, this is called
@@ -51,11 +55,14 @@ def exponential_decay(learning_rate,
     if not isinstance(global_step, Variable):
         raise ValueError("global_step is required for exponential_decay.")
 
-    # update learning_rate
-    div_res = global_step / decay_steps
-    if staircase:
-        div_res = layers.floor(x=div_res)
-    return learning_rate * (decay_rate**div_res)
+    with init_on_cpu():
+        # update learning_rate
+        div_res = global_step / decay_steps
+        if staircase:
+            div_res = layers.floor(x=div_res)
+        decayed_lr = learning_rate * (decay_rate**div_res)
+
+    return decayed_lr
 
 
 def natural_exp_decay(learning_rate,
@@ -85,10 +92,13 @@ def natural_exp_decay(learning_rate,
     if not isinstance(global_step, Variable):
         raise ValueError("global_step is required for natural_exp_decay.")
 
-    div_res = global_step / decay_steps
-    if staircase:
-        div_res = layers.floor(x=div_res)
-    return learning_rate * layers.exp(x=(-1 * decay_rate * div_res))
+    with init_on_cpu():
+        div_res = global_step / decay_steps
+        if staircase:
+            div_res = layers.floor(x=div_res)
+        decayed_lr = learning_rate * layers.exp(x=(-1 * decay_rate * div_res))
+
+    return decayed_lr
 
 
 def inverse_time_decay(learning_rate,
@@ -101,7 +111,7 @@ def inverse_time_decay(learning_rate,
     ```python
     if staircase:
       decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
-    else
+    else:
       decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
     ```
     Args:
@@ -118,8 +128,114 @@ def inverse_time_decay(learning_rate,
     if not isinstance(global_step, Variable):
         raise ValueError("global_step is required for inverse_time_decay.")
 
-    div_res = global_step / decay_steps
-    if staircase:
-        div_res = layers.floor(x=div_res)
+    with init_on_cpu():
+        div_res = global_step / decay_steps
+        if staircase:
+            div_res = layers.floor(x=div_res)
+
+        decayed_lr = learning_rate / (1 + decay_rate * div_res)
+
+    return decayed_lr
+
+
+def polynomial_decay(learning_rate,
+                     global_step,
+                     decay_steps,
+                     end_learning_rate=0.0001,
+                     power=1.0,
+                     cycle=False):
+    """Applies polynomial decay to the initial learning rate.
+
+    ```python
+    if cycle:
+        decay_steps = decay_steps * ceil(global_step / decay_steps)
+    else:
+        global_step = min(global_step, decay_steps)
+    decayed_learning_rate = (learning_rate - end_learning_rate) *
+                      (1 - global_step / decay_steps) ^ power +
+                      end_learning_rate
+    ```
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+          will be the initial learning rate during training
+        global_step: A Variable that record the training step.
+        decay_steps: A Python `int32` number.
+        end_learning_rate: A Python `float` number.
+        power: A Python `float` number
+        cycle: Boolean. If set true, decay the learning rate every decay_steps.
 
-    return learning_rate / (1 + decay_rate * div_res)
+    Returns:
+        The decayed learning rate
+    """
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for inverse_time_decay.")
+
+    with init_on_cpu():
+        if cycle:
+            div_res = layers.ceil(x=(global_step / decay_steps))
+            zero_var = layers.fill_constant(
+                shape=[1], dtype='float32', value=0.0)
+            one_var = layers.fill_constant(
+                shape=[1], dtype='float32', value=1.0)
+
+            with layers.Switch() as switch:
+                with switch.case(layers.equal(x=global_step, y=zero_var)):
+                    layers.assign(input=one_var, output=div_res)
+            decay_steps = decay_steps * div_res
+        else:
+            decay_steps_var = layers.fill_constant(
+                shape=[1], dtype='float32', value=float(decay_steps))
+            global_step = layers.elementwise_min(
+                x=global_step, y=decay_steps_var)
+
+        decayed_lr = (learning_rate - end_learning_rate) * \
+                     ((1 - global_step / decay_steps) ** power) + end_learning_rate
+    return decayed_lr
+
+
+def piecewise_decay(global_step, boundaries, values):
+    """Applies piecewise decay to the initial learning rate.
+
+    ```python
+    boundaries = [10000, 20000]
+    values = [1.0, 0.5, 0.1]
+
+    if step < 10000:
+        learning_rate = 1.0
+    elif step >= 10000 and step < 20000:
+        learning_rate = 0.5
+    else:
+        learning_rate = 0.1
+    ```
+    """
+
+    if len(values) - len(boundaries) != 1:
+        raise ValueError("len(values) - len(boundaries) should be 1")
+
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for piecewise_decay.")
+
+    with init_on_cpu():
+        lr = layers.create_global_var(
+            shape=[1],
+            value=0.0,
+            dtype='float32',
+            persistable=True,
+            name="learning_rate")
+
+        with layers.Switch() as switch:
+            for i in range(len(boundaries)):
+                boundary_val = layers.fill_constant(
+                    shape=[1], dtype='float32', value=float(boundaries[i]))
+                value_var = layers.fill_constant(
+                    shape=[1], dtype='float32', value=float(values[i]))
+                with switch.case(layers.less_than(global_step, boundary_val)):
+                    layers.assign(value_var, lr)
+            last_value_var = layers.fill_constant(
+                shape=[1],
+                dtype='float32',
+                value=float(values[len(values) - 1]))
+            with switch.default():
+                layers.assign(last_value_var, lr)
+
+    return lr
diff --git a/python/paddle/v2/fluid/memory_optimization_transpiler.py b/python/paddle/v2/fluid/memory_optimization_transpiler.py
index 2b00923f5e..53e0991ee8 100644
--- a/python/paddle/v2/fluid/memory_optimization_transpiler.py
+++ b/python/paddle/v2/fluid/memory_optimization_transpiler.py
@@ -92,14 +92,13 @@ class ControlFlowGraph(object):
         live_in = defaultdict(set)
         live_out = defaultdict(set)
         while True:
-            for i in range(self.op_size):
+            for i in range(self.op_size, 0, -1):
                 live_in[i] = set(self._live_in[i])
                 live_out[i] = set(self._live_out[i])
-                self._live_in[i] = self._uses[i] | (
-                    self._live_out[i] - self._defs[i])
                 for s in self._successors[i]:
                     self._live_out[i] |= self._live_in[s]
-
+                self._live_in[i] = self._uses[i] | (
+                    self._live_out[i] - self._defs[i])
             if self._reach_fixed_point(live_in, live_out):
                 break
 
@@ -145,7 +144,6 @@ class ControlFlowGraph(object):
             if op.type() == "while" or op.type() == "while_grad":
                 continue
             block_desc = op.block()
-            self.current_block_desc = block_desc
             is_forward = i < self._forward_num
             if self.pool:
                 defs_can_optimize = filter(
@@ -156,6 +154,9 @@ class ControlFlowGraph(object):
                     for x in defs_can_optimize
                 ]
                 for x, x_shape in out_pair:
+                    # If x is both in uses and defs, it can not be optimized!
+                    if x in self._uses[i]:
+                        continue
                     for index, cache_pair in enumerate(self.pool):
                         cache_var = cache_pair[0]
                         cache_shape = cache_pair[1]
@@ -208,17 +209,17 @@ def get_cfgs(input_program):
 
     while_sub_block_ids = []
     while_grad_sub_block_ids = []
-    while_op_output = set()
     while_block_id_pair = []
+    while_op_dict = {}
 
     for i in range(op_size):
         op = block_desc.op(i)
         if op.type() == "while":
             while_sub_block_ids.append(op.attr("sub_block").id)
-            while_op_output.update(op.output_arg_names())
+            while_op_dict[op.attr("sub_block").id] = op
         elif op.type() == "while_grad":
             while_grad_sub_block_ids.append(op.attr("sub_block").id)
-            while_op_output.update(op.output_arg_names())
+            while_op_dict[op.attr("sub_block").id] = op
 
     # Find while/while_grad block pair
     for grad_id in while_grad_sub_block_ids:
@@ -240,6 +241,10 @@ def get_cfgs(input_program):
         for i in range(while_grad_block_op_size):
             while_block_ops.append(while_grad_block.op(i))
 
+        while_op_output = set()
+        while_op_output.update(while_op_dict[parent_id].output_arg_names())
+        while_op_output.update(while_op_dict[grad_id].output_arg_names())
+
         ops_list.append((while_block_ops, while_block_op_size, while_op_output))
 
     # Process rest while block ops
@@ -250,9 +255,15 @@ def get_cfgs(input_program):
         for i in range(while_block_op_size):
             while_block_ops.append(while_block.op(i))
 
-        ops_list.append((while_block_ops, while_block_op_size))
+        while_op_output = set()
+        while_op_output.update(while_op_dict[parent_id].output_arg_names())
+
+        ops_list.append((while_block_ops, while_block_op_size, while_op_output))
 
-    cfgs = [ControlFlowGraph(input_program, i, j, k) for i, j, k in ops_list]
+    cfgs = [
+        ControlFlowGraph(input_program, ops, forward_num, skip_opt)
+        for ops, forward_num, skip_opt in ops_list
+    ]
     return cfgs
 
 
diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py
index cb63d43709..be7878f869 100644
--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
@@ -194,7 +194,7 @@ def scaled_dot_product_attention(queries,
 
     Returns:
 
-        Variable: A 3-D Tensor computed by multi-head scaled dot product
+        Variable: A 3-D Tensor computed by multi-head scaled dot product \
                   attention.
 
     Raises:
@@ -333,6 +333,7 @@ def scaled_dot_product_attention(queries,
             x=product, shape=[-1, product.shape[-1]], act="softmax"),
         shape=product.shape)
     if dropout_rate:
-        weights = layers.dropout(x, dropout_prob=dropout_rate, is_test=False)
+        weights = layers.dropout(
+            weights, dropout_prob=dropout_rate, is_test=False)
     ctx_multiheads = layers.matmul(weights, v)
     return __combine_heads(ctx_multiheads)
diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py
index 7844a4e2df..f8a00e3a5f 100644
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -190,6 +190,8 @@ class Optimizer(object):
         # Create any accumulators
         program = loss.block.program
         with program_guard(program, startup_program):
+            global_block = framework.default_main_program().global_block()
+            start = len(global_block.ops)
             self.helper = LayerHelper(self.__class__.__name__)
             self._create_accumulators(loss.block,
                                       [p[0] for p in parameters_and_grads])
@@ -203,19 +205,14 @@ class Optimizer(object):
                                                            param_and_grad)
                     optimize_ops.append(optimize_op)
 
-            # Returned list of ops can include more ops in addition
-            # to optimization ops
-            return_ops = optimize_ops
-
             # Get custom finish ops for subclasses
             # FIXME: Need to fix this once we figure out how to handle dependencies
-            finish_ops = self._finish_update(loss.block)
-            if finish_ops is not None:
-                return_ops += finish_ops
+            self._finish_update(loss.block)
 
             if self._global_step is not None:
-                return_ops.append(self._increment_global_step(loss.block))
-            return return_ops
+                self._increment_global_step(loss.block)
+            end = len(global_block.ops)
+            return global_block.slice_ops(start, end)
 
     def minimize(self,
                  loss,
diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py
index d4a2cd7eea..d33a4c52a8 100644
--- a/python/paddle/v2/fluid/profiler.py
+++ b/python/paddle/v2/fluid/profiler.py
@@ -103,10 +103,10 @@ def profiler(state, sorted_key=None):
     core.enable_profiler(prof_state)
     yield
 
-    if sorted_key not in ['calls', 'total', 'max', 'min', 'ave']:
-        raise ValueError("The state must be in 'calls', 'total', "
-                         "'max', 'min', 'ave'")
     sorted_key = 'default' if sorted_key is None else sorted_key
+    if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
+        raise ValueError("The sorted_key must be None or in 'calls', 'total', "
+                         "'max', 'min' and 'ave'")
     key_map = {
         'default': core.EventSortingKey.kDefault,
         'calls': core.EventSortingKey.kCalls,
diff --git a/python/paddle/v2/fluid/tests/book/.gitignore b/python/paddle/v2/fluid/tests/book/.gitignore
index f0b574b939..dd28d354f4 100644
--- a/python/paddle/v2/fluid/tests/book/.gitignore
+++ b/python/paddle/v2/fluid/tests/book/.gitignore
@@ -1 +1 @@
-recognize_digits_*.inference.model
+*.inference.model
diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
index 27f34b1733..b3332b4810 100644
--- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
@@ -15,13 +15,13 @@
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 import contextlib
+import numpy
 import unittest
+import math
+import sys
 
 
-def main(use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-
+def train(use_cuda, save_dirname):
     x = fluid.layers.data(name='x', shape=[13], dtype='float32')
 
     y_predict = fluid.layers.fc(input=x, size=1, act=None)
@@ -49,19 +49,59 @@ def main(use_cuda):
 
     PASS_NUM = 100
     for pass_id in range(PASS_NUM):
-        fluid.io.save_persistables(exe, "./fit_a_line.model/")
-        fluid.io.load_persistables(exe, "./fit_a_line.model/")
         for data in train_reader():
             avg_loss_value, = exe.run(fluid.default_main_program(),
                                       feed=feeder.feed(data),
                                       fetch_list=[avg_cost])
             print(avg_loss_value)
             if avg_loss_value[0] < 10.0:
+                if save_dirname is not None:
+                    fluid.io.save_inference_model(save_dirname, ['x'],
+                                                  [y_predict], exe)
                 return
+            if math.isnan(float(avg_loss_value)):
+                sys.exit("got NaN loss, training failed.")
     raise AssertionError("Fit a line cost is too large, {0:2.2}".format(
         avg_loss_value[0]))
 
 
+def infer(use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    # Use fluid.io.load_inference_model to obtain the inference program desc,
+    # the feed_target_names (the names of variables that will be feeded 
+    # data using feed operators), and the fetch_targets (variables that 
+    # we want to obtain data from using fetch operators).
+    [inference_program, feed_target_names,
+     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+    # The input's dimension should be 2-D and the second dim is 13
+    # The input data should be >= 0
+    batch_size = 10
+    tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
+    assert feed_target_names[0] == 'x'
+    results = exe.run(inference_program,
+                      feed={feed_target_names[0]: tensor_x},
+                      fetch_list=fetch_targets)
+    print("infer shape: ", results[0].shape)
+    print("infer results: ", results[0])
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    # Directory for saving the trained model
+    save_dirname = "fit_a_line.inference.model"
+
+    train(use_cuda, save_dirname)
+    infer(use_cuda, save_dirname)
+
+
 class TestFitALine(unittest.TestCase):
     def test_cpu(self):
         with self.program_scope_guard():
diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification.py
similarity index 64%
rename from python/paddle/v2/fluid/tests/book/test_image_classification_train.py
rename to python/paddle/v2/fluid/tests/book/test_image_classification.py
index a4168d16db..ffbe5bdbd6 100644
--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification.py
@@ -16,8 +16,11 @@ from __future__ import print_function
 
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
-import unittest
 import contextlib
+import math
+import sys
+import numpy
+import unittest
 
 
 def resnet_cifar10(input, depth=32):
@@ -89,10 +92,7 @@ def vgg16_bn_drop(input):
     return fc2
 
 
-def main(net_type, use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-
+def train(net_type, use_cuda, save_dirname):
     classdim = 10
     data_shape = [3, 32, 32]
 
@@ -111,12 +111,14 @@ def main(net_type, use_cuda):
     predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
     cost = fluid.layers.cross_entropy(input=predict, label=label)
     avg_cost = fluid.layers.mean(x=cost)
+    acc = fluid.layers.accuracy(input=predict, label=label)
+
+    # Test program 
+    test_program = fluid.default_main_program().clone()
 
     optimizer = fluid.optimizer.Adam(learning_rate=0.001)
     optimizer.minimize(avg_cost)
 
-    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
-
     BATCH_SIZE = 128
     PASS_NUM = 1
 
@@ -125,6 +127,9 @@ def main(net_type, use_cuda):
             paddle.dataset.cifar.train10(), buf_size=128 * 10),
         batch_size=BATCH_SIZE)
 
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = fluid.Executor(place)
     feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
@@ -132,18 +137,70 @@ def main(net_type, use_cuda):
 
     loss = 0.0
     for pass_id in range(PASS_NUM):
-        accuracy.reset(exe)
-        for data in train_reader():
-            loss, acc = exe.run(fluid.default_main_program(),
-                                feed=feeder.feed(data),
-                                fetch_list=[avg_cost] + accuracy.metrics)
-            pass_acc = accuracy.eval(exe)
-            print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
-                pass_acc))
-            return
-
-    raise AssertionError(
-        "Image classification loss is too large, {0:2.2}".format(loss))
+        for batch_id, data in enumerate(train_reader()):
+            exe.run(feed=feeder.feed(data))
+
+            if (batch_id % 10) == 0:
+                acc_list = []
+                avg_loss_list = []
+                for tid, test_data in enumerate(test_reader()):
+                    loss_t, acc_t = exe.run(program=test_program,
+                                            feed=feeder.feed(test_data),
+                                            fetch_list=[avg_cost, acc])
+                    if math.isnan(float(loss_t)):
+                        sys.exit("got NaN loss, training failed.")
+                    acc_list.append(float(acc_t))
+                    avg_loss_list.append(float(loss_t))
+                    break  # Use 1 segment for speeding up CI
+
+                acc_value = numpy.array(acc_list).mean()
+                avg_loss_value = numpy.array(avg_loss_list).mean()
+
+                print(
+                    'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
+                    format(pass_id, batch_id + 1,
+                           float(avg_loss_value), float(acc_value)))
+
+                if acc_value > 0.01:  # Low threshold for speeding up CI
+                    fluid.io.save_inference_model(save_dirname, ["pixel"],
+                                                  [predict], exe)
+                    return
+
+
+def infer(use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    # Use fluid.io.load_inference_model to obtain the inference program desc,
+    # the feed_target_names (the names of variables that will be feeded 
+    # data using feed operators), and the fetch_targets (variables that 
+    # we want to obtain data from using fetch operators).
+    [inference_program, feed_target_names,
+     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+    # The input's dimension of conv should be 4-D or 5-D.
+    tensor_img = numpy.random.rand(1, 3, 32, 32).astype("float32")
+
+    # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+    # and results will contain a list of data corresponding to fetch_targets.
+    results = exe.run(inference_program,
+                      feed={feed_target_names[0]: tensor_img},
+                      fetch_list=fetch_targets)
+    print("infer results: ", results[0])
+
+
+def main(net_type, use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    # Directory for saving the trained model
+    save_dirname = "image_classification_" + net_type + ".inference.model"
+
+    train(net_type, use_cuda, save_dirname)
+    infer(use_cuda, save_dirname)
 
 
 class TestImageClassification(unittest.TestCase):
diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
index f85768de99..f33e81186b 100644
--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
@@ -18,7 +18,10 @@ import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.dataset.conll05 as conll05
 import paddle.v2.fluid as fluid
+from paddle.v2.fluid.initializer import init_on_cpu
+import contextlib
 import time
+import unittest
 
 word_dict, verb_dict, label_dict = conll05.get_dict()
 word_dict_len = len(word_dict)
@@ -127,7 +130,15 @@ def to_lodtensor(data, place):
     return res
 
 
-def main():
+def create_random_lodtensor(lod, place, low, high):
+    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
+    res = fluid.LoDTensor()
+    res.set(data, place)
+    res.set_lod([lod])
+    return res
+
+
+def train(use_cuda, save_dirname=None):
     # define network topology
     word = fluid.layers.data(
         name='word_data', shape=[1], dtype='int64', lod_level=1)
@@ -157,7 +168,16 @@ def main():
 
     # TODO(qiao)
     # check other optimizers and check why out will be NAN
-    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
+    global_step = fluid.layers.create_global_var(
+        shape=[1], value=0, dtype='float32', force_cpu=True, persistable=True)
+    sgd_optimizer = fluid.optimizer.SGD(
+        learning_rate=fluid.learning_rate_decay.exponential_decay(
+            learning_rate=0.0001,
+            global_step=global_step,
+            decay_steps=100000,
+            decay_rate=0.5,
+            staircase=True),
+        global_step=global_step)
     sgd_optimizer.minimize(avg_cost)
 
     # TODO(qiao)
@@ -175,8 +195,8 @@ def main():
         paddle.reader.shuffle(
             paddle.dataset.conll05.test(), buf_size=8192),
         batch_size=BATCH_SIZE)
-    # place = fluid.CPUPlace()
-    place = fluid.CUDAPlace(0)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     feeder = fluid.DataFeeder(
         feed_list=[
             word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
@@ -211,12 +231,102 @@ def main():
                 if batch_id != 0:
                     print("second per batch: " + str((time.time() - start_time)
                                                      / batch_id))
-
-            # exit early for CI
-            exit(0)
+                # Set the threshold low to speed up the CI test
+                if float(pass_precision) > 0.05:
+                    if save_dirname is not None:
+                        fluid.io.save_inference_model(save_dirname, [
+                            'word_data', 'verb_data', 'ctx_n2_data',
+                            'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data',
+                            'ctx_p2_data', 'mark_data'
+                        ], [feature_out], exe)
+                    return
 
             batch_id = batch_id + 1
 
 
+def infer(use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    # Use fluid.io.load_inference_model to obtain the inference program desc,
+    # the feed_target_names (the names of variables that will be feeded 
+    # data using feed operators), and the fetch_targets (variables that 
+    # we want to obtain data from using fetch operators).
+    [inference_program, feed_target_names,
+     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+    lod = [0, 4, 10]
+    ts_word = create_random_lodtensor(lod, place, low=0, high=1)
+    ts_pred = create_random_lodtensor(lod, place, low=0, high=1)
+    ts_ctx_n2 = create_random_lodtensor(lod, place, low=0, high=1)
+    ts_ctx_n1 = create_random_lodtensor(lod, place, low=0, high=1)
+    ts_ctx_0 = create_random_lodtensor(lod, place, low=0, high=1)
+    ts_ctx_p1 = create_random_lodtensor(lod, place, low=0, high=1)
+    ts_ctx_p2 = create_random_lodtensor(lod, place, low=0, high=1)
+    ts_mark = create_random_lodtensor(lod, place, low=0, high=1)
+
+    # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+    # and results will contain a list of data corresponding to fetch_targets.
+    assert feed_target_names[0] == 'word_data'
+    assert feed_target_names[1] == 'verb_data'
+    assert feed_target_names[2] == 'ctx_n2_data'
+    assert feed_target_names[3] == 'ctx_n1_data'
+    assert feed_target_names[4] == 'ctx_0_data'
+    assert feed_target_names[5] == 'ctx_p1_data'
+    assert feed_target_names[6] == 'ctx_p2_data'
+    assert feed_target_names[7] == 'mark_data'
+
+    results = exe.run(inference_program,
+                      feed={
+                          feed_target_names[0]: ts_word,
+                          feed_target_names[1]: ts_pred,
+                          feed_target_names[2]: ts_ctx_n2,
+                          feed_target_names[3]: ts_ctx_n1,
+                          feed_target_names[4]: ts_ctx_0,
+                          feed_target_names[5]: ts_ctx_p1,
+                          feed_target_names[6]: ts_ctx_p2,
+                          feed_target_names[7]: ts_mark
+                      },
+                      fetch_list=fetch_targets,
+                      return_numpy=False)
+    print(results[0].lod())
+    np_data = np.array(results[0])
+    print("Inference Shape: ", np_data.shape)
+    print("Inference results: ", np_data)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    # Directory for saving the trained model
+    save_dirname = "label_semantic_roles.inference.model"
+
+    train(use_cuda, save_dirname)
+    infer(use_cuda, save_dirname)
+
+
+class TestLabelSemanticRoles(unittest.TestCase):
+    def test_cuda(self):
+        with self.scope_prog_guard():
+            main(use_cuda=True)
+
+    def test_cpu(self):
+        with self.scope_prog_guard():
+            main(use_cuda=False)
+
+    @contextlib.contextmanager
+    def scope_prog_guard(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+
 if __name__ == '__main__':
-    main()
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
index b8f55c813b..244c1749cd 100644
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
@@ -18,6 +18,8 @@ import paddle.v2 as paddle
 import sys
 import numpy
 import unittest
+import math
+import sys
 
 
 def parse_arg():
@@ -65,6 +67,7 @@ def conv_net(img, label):
         pool_size=2,
         pool_stride=2,
         act="relu")
+    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
     conv_pool_2 = fluid.nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
@@ -75,7 +78,7 @@ def conv_net(img, label):
     return loss_net(conv_pool_2, label)
 
 
-def train(nn_type, use_cuda, parallel, save_dirname):
+def train(nn_type, use_cuda, parallel, save_dirname, save_param_filename):
     if use_cuda and not fluid.core.is_compiled_with_cuda():
         return
     img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
@@ -140,18 +143,22 @@ def train(nn_type, use_cuda, parallel, save_dirname):
                 avg_loss_val = numpy.array(avg_loss_set).mean()
                 if float(acc_val) > 0.85:  # test acc > 85%
                     if save_dirname is not None:
-                        fluid.io.save_inference_model(save_dirname, ["img"],
-                                                      [prediction], exe)
+                        fluid.io.save_inference_model(
+                            save_dirname, ["img"], [prediction],
+                            exe,
+                            save_file_name=save_param_filename)
                     return
                 else:
                     print(
                         'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
                         format(pass_id, batch_id + 1,
                                float(avg_loss_val), float(acc_val)))
+                    if math.isnan(float(avg_loss_val)):
+                        sys.exit("got NaN loss, training failed.")
     raise AssertionError("Loss of recognize digits is too large")
 
 
-def infer(use_cuda, save_dirname=None):
+def infer(use_cuda, save_dirname=None, param_filename=None):
     if save_dirname is None:
         return
 
@@ -162,11 +169,14 @@ def infer(use_cuda, save_dirname=None):
     # the feed_target_names (the names of variables that will be feeded 
     # data using feed operators), and the fetch_targets (variables that 
     # we want to obtain data from using fetch operators).
-    [inference_program, feed_target_names,
-     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+    [inference_program, feed_target_names, fetch_targets
+     ] = fluid.io.load_inference_model(save_dirname, exe, param_filename)
 
     # The input's dimension of conv should be 4-D or 5-D.
-    tensor_img = numpy.random.rand(1, 1, 28, 28).astype("float32")
+    # Use normilized image pixels as input data, which should be in the range [-1.0, 1.0].
+    batch_size = 1
+    tensor_img = numpy.random.uniform(-1.0, 1.0,
+                                      [batch_size, 1, 28, 28]).astype("float32")
 
     # Construct feed as a dictionary of {feed_target_name: feed_target_data}
     # and results will contain a list of data corresponding to fetch_targets.
@@ -176,36 +186,45 @@ def infer(use_cuda, save_dirname=None):
     print("infer results: ", results[0])
 
 
-def main(use_cuda, parallel, nn_type):
+def main(use_cuda, parallel, nn_type, combine):
     if not use_cuda and not parallel:
         save_dirname = "recognize_digits_" + nn_type + ".inference.model"
+        save_filename = None
+        if combine == True:
+            save_filename = "__params_combined__"
     else:
         save_dirname = None
+        save_filename = None
 
     train(
         nn_type=nn_type,
         use_cuda=use_cuda,
         parallel=parallel,
-        save_dirname=save_dirname)
-    infer(use_cuda=use_cuda, save_dirname=save_dirname)
+        save_dirname=save_dirname,
+        save_param_filename=save_filename)
+    infer(
+        use_cuda=use_cuda,
+        save_dirname=save_dirname,
+        param_filename=save_filename)
 
 
 class TestRecognizeDigits(unittest.TestCase):
     pass
 
 
-def inject_test_method(use_cuda, parallel, nn_type):
+def inject_test_method(use_cuda, parallel, nn_type, combine):
     def __impl__(self):
         prog = fluid.Program()
         startup_prog = fluid.Program()
         scope = fluid.core.Scope()
         with fluid.scope_guard(scope):
             with fluid.program_guard(prog, startup_prog):
-                main(use_cuda, parallel, nn_type)
+                main(use_cuda, parallel, nn_type, combine)
 
-    fn = 'test_{0}_{1}_{2}'.format(nn_type, 'cuda'
-                                   if use_cuda else 'cpu', 'parallel'
-                                   if parallel else 'normal')
+    fn = 'test_{0}_{1}_{2}_{3}'.format(nn_type, 'cuda'
+                                       if use_cuda else 'cpu', 'parallel'
+                                       if parallel else 'normal', 'combine'
+                                       if combine else 'separate')
 
     setattr(TestRecognizeDigits, fn, __impl__)
 
@@ -214,7 +233,10 @@ def inject_all_tests():
     for use_cuda in (False, True):
         for parallel in (False, True):
             for nn_type in ('mlp', 'conv'):
-                inject_test_method(use_cuda, parallel, nn_type)
+                inject_test_method(use_cuda, parallel, nn_type, True)
+
+    # One unit-test for saving parameters as separate files
+    inject_test_method(False, False, 'mlp', False)
 
 
 inject_all_tests()
diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
index d4a694e572..612d51e08e 100644
--- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
@@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import math
+import sys
 import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.fluid.core as core
+import paddle.v2.fluid as fluid
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.nets as nets
@@ -102,7 +104,8 @@ def get_mov_combined_features():
 
     CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
 
-    category_id = layers.data(name='category_id', shape=[1], dtype='int64')
+    category_id = layers.data(
+        name='category_id', shape=[1], dtype='int64', lod_level=1)
 
     mov_categories_emb = layers.embedding(
         input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
@@ -112,7 +115,8 @@ def get_mov_combined_features():
 
     MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
 
-    mov_title_id = layers.data(name='movie_title', shape=[1], dtype='int64')
+    mov_title_id = layers.data(
+        name='movie_title', shape=[1], dtype='int64', lod_level=1)
 
     mov_title_emb = layers.embedding(
         input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
@@ -142,23 +146,22 @@ def model():
     scale_infer = layers.scale(x=inference, scale=5.0)
 
     label = layers.data(name='score', shape=[1], dtype='float32')
-
     square_cost = layers.square_error_cost(input=scale_infer, label=label)
-
     avg_cost = layers.mean(x=square_cost)
 
-    return avg_cost
+    return scale_infer, avg_cost
+
 
+def train(use_cuda, save_dirname):
+    scale_infer, avg_cost = model()
+
+    # test program
+    test_program = fluid.default_main_program().clone()
 
-def main():
-    cost = model()
     sgd_optimizer = SGDOptimizer(learning_rate=0.2)
-    opts = sgd_optimizer.minimize(cost)
+    opts = sgd_optimizer.minimize(avg_cost)
 
-    if USE_GPU:
-        place = core.CUDAPlace(0)
-    else:
-        place = core.CPUPlace()
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
     exe = Executor(place)
     exe.run(framework.default_startup_program())
@@ -167,6 +170,8 @@ def main():
         paddle.reader.shuffle(
             paddle.dataset.movielens.train(), buf_size=8192),
         batch_size=BATCH_SIZE)
+    test_reader = paddle.batch(
+        paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
 
     feeding = {
         'user_id': 0,
@@ -182,7 +187,7 @@ def main():
     def func_feed(feeding, data):
         feed_tensors = {}
         for (key, idx) in feeding.iteritems():
-            tensor = core.LoDTensor()
+            tensor = fluid.LoDTensor()
             if key != "category_id" and key != "movie_title":
                 if key == "score":
                     numpy_data = np.array(map(lambda x: x[idx], data)).astype(
@@ -209,14 +214,117 @@ def main():
 
     PASS_NUM = 100
     for pass_id in range(PASS_NUM):
-        for data in train_reader():
-            outs = exe.run(framework.default_main_program(),
+        for batch_id, data in enumerate(train_reader()):
+            # train a mini-batch
+            outs = exe.run(program=fluid.default_main_program(),
                            feed=func_feed(feeding, data),
-                           fetch_list=[cost])
+                           fetch_list=[avg_cost])
             out = np.array(outs[0])
-            if out[0] < 6.0:
-                # if avg cost less than 6.0, we think our code is good.
-                exit(0)
-
-
-main()
+            if (batch_id + 1) % 10 == 0:
+                avg_cost_set = []
+                for test_data in test_reader():
+                    avg_cost_np = exe.run(program=test_program,
+                                          feed=func_feed(feeding, test_data),
+                                          fetch_list=[avg_cost])
+                    avg_cost_set.append(avg_cost_np[0])
+                    break  # test only 1 segment for speeding up CI
+
+                # get test avg_cost
+                test_avg_cost = np.array(avg_cost_set).mean()
+                if test_avg_cost < 6.0:
+                    # if avg_cost less than 6.0, we think our code is good.
+                    if save_dirname is not None:
+                        fluid.io.save_inference_model(save_dirname, [
+                            "user_id", "gender_id", "age_id", "job_id",
+                            "movie_id", "category_id", "movie_title"
+                        ], [scale_infer], exe)
+                    return
+
+            if math.isnan(float(out[0])):
+                sys.exit("got NaN loss, training failed.")
+
+
+def infer(use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    # Use fluid.io.load_inference_model to obtain the inference program desc,
+    # the feed_target_names (the names of variables that will be feeded
+    # data using feed operators), and the fetch_targets (variables that
+    # we want to obtain data from using fetch operators).
+    [inference_program, feed_target_names,
+     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+    def create_lod_tensor(data, lod=None):
+        tensor = fluid.LoDTensor()
+        if lod is None:
+            # Tensor, the shape is [batch_size, 1]
+            index = 0
+            lod_0 = [index]
+            for l in range(len(data)):
+                index += 1
+                lod_0.append(index)
+            lod = [lod_0]
+        tensor.set_lod(lod)
+
+        flattened_data = np.concatenate(data, axis=0).astype("int64")
+        flattened_data = flattened_data.reshape([len(flattened_data), 1])
+        tensor.set(flattened_data, place)
+        return tensor
+
+    # Use the first data from paddle.dataset.movielens.test() as input
+    assert feed_target_names[0] == "user_id"
+    user_id = create_lod_tensor([[1]])
+
+    assert feed_target_names[1] == "gender_id"
+    gender_id = create_lod_tensor([[1]])
+
+    assert feed_target_names[2] == "age_id"
+    age_id = create_lod_tensor([[0]])
+
+    assert feed_target_names[3] == "job_id"
+    job_id = create_lod_tensor([[10]])
+
+    assert feed_target_names[4] == "movie_id"
+    movie_id = create_lod_tensor([[783]])
+
+    assert feed_target_names[5] == "category_id"
+    category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]])
+
+    assert feed_target_names[6] == "movie_title"
+    movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]],
+                                    [[0, 5]])
+
+    # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+    # and results will contain a list of data corresponding to fetch_targets.
+    results = exe.run(inference_program,
+                      feed={
+                          feed_target_names[0]: user_id,
+                          feed_target_names[1]: gender_id,
+                          feed_target_names[2]: age_id,
+                          feed_target_names[3]: job_id,
+                          feed_target_names[4]: movie_id,
+                          feed_target_names[5]: category_id,
+                          feed_target_names[6]: movie_title
+                      },
+                      fetch_list=fetch_targets,
+                      return_numpy=False)
+    print("inferred score: ", np.array(results[0]))
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    # Directory for saving the inference model
+    save_dirname = "recommender_system.inference.model"
+
+    train(use_cuda, save_dirname)
+    infer(use_cuda, save_dirname)
+
+
+if __name__ == '__main__':
+    main(USE_GPU)
diff --git a/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py
index fdc6086176..7fe43c680c 100644
--- a/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py
+++ b/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py
@@ -18,6 +18,10 @@ import paddle.v2.fluid as fluid
 import paddle.v2.fluid.core as core
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.layers as layers
+import contextlib
+import math
+import sys
+import unittest
 from paddle.v2.fluid.executor import Executor
 
 dict_size = 30000
@@ -145,7 +149,7 @@ def seq_to_seq_net():
     cost = fluid.layers.cross_entropy(input=prediction, label=label)
     avg_cost = fluid.layers.mean(x=cost)
 
-    return avg_cost
+    return avg_cost, prediction
 
 
 def to_lodtensor(data, place):
@@ -163,8 +167,16 @@ def to_lodtensor(data, place):
     return res
 
 
-def main():
-    avg_cost = seq_to_seq_net()
+def create_random_lodtensor(lod, place, low, high):
+    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
+    res = fluid.LoDTensor()
+    res.set(data, place)
+    res.set_lod([lod])
+    return res
+
+
+def train(use_cuda, save_dirname=None):
+    [avg_cost, prediction] = seq_to_seq_net()
 
     optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
     optimizer.minimize(avg_cost)
@@ -174,7 +186,7 @@ def main():
             paddle.dataset.wmt14.train(dict_size), buf_size=1000),
         batch_size=batch_size)
 
-    place = core.CPUPlace()
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = Executor(place)
 
     exe.run(framework.default_startup_program())
@@ -185,6 +197,7 @@ def main():
             word_data = to_lodtensor(map(lambda x: x[0], data), place)
             trg_word = to_lodtensor(map(lambda x: x[1], data), place)
             trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
+
             outs = exe.run(framework.default_main_program(),
                            feed={
                                'source_sequence': word_data,
@@ -192,13 +205,86 @@ def main():
                                'label_sequence': trg_word_next
                            },
                            fetch_list=[avg_cost])
+
             avg_cost_val = np.array(outs[0])
             print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
                   " avg_cost=" + str(avg_cost_val))
+            if math.isnan(float(avg_cost_val[0])):
+                sys.exit("got NaN loss, training failed.")
             if batch_id > 3:
-                exit(0)
+                if save_dirname is not None:
+                    fluid.io.save_inference_model(
+                        save_dirname, ['source_sequence',
+                                       'target_sequence'], [prediction], exe)
+                return
+
             batch_id += 1
 
 
+def infer(use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    # Use fluid.io.load_inference_model to obtain the inference program desc,
+    # the feed_target_names (the names of variables that will be feeded 
+    # data using feed operators), and the fetch_targets (variables that 
+    # we want to obtain data from using fetch operators).
+    [inference_program, feed_target_names,
+     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+    lod = [0, 4, 10]
+    word_data = create_random_lodtensor(lod, place, low=0, high=1)
+    trg_word = create_random_lodtensor(lod, place, low=0, high=1)
+
+    # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+    # and results will contain a list of data corresponding to fetch_targets.
+    assert feed_target_names[0] == 'source_sequence'
+    assert feed_target_names[1] == 'target_sequence'
+    results = exe.run(inference_program,
+                      feed={
+                          feed_target_names[0]: word_data,
+                          feed_target_names[1]: trg_word,
+                      },
+                      fetch_list=fetch_targets,
+                      return_numpy=False)
+    print(results[0].lod())
+    np_data = np.array(results[0])
+    print("Inference shape: ", np_data.shape)
+    print("Inference results: ", np_data)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    # Directory for saving the trained model
+    save_dirname = "rnn_encoder_decoder.inference.model"
+
+    train(use_cuda, save_dirname)
+    infer(use_cuda, save_dirname)
+
+
+class TestRnnEncoderDecoder(unittest.TestCase):
+    def test_cuda(self):
+        with self.scope_prog_guard():
+            main(use_cuda=True)
+
+    def test_cpu(self):
+        with self.scope_prog_guard():
+            main(use_cuda=False)
+
+    @contextlib.contextmanager
+    def scope_prog_guard(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+
 if __name__ == '__main__':
-    main()
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py
index 2ba9077a26..6e0206d41d 100644
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py
@@ -16,6 +16,9 @@ import unittest
 import paddle.v2.fluid as fluid
 import paddle.v2 as paddle
 import contextlib
+import math
+import numpy as np
+import sys
 
 
 def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
@@ -41,7 +44,7 @@ def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
     adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
     adam_optimizer.minimize(avg_cost)
     accuracy = fluid.layers.accuracy(input=prediction, label=label)
-    return avg_cost, accuracy
+    return avg_cost, accuracy, prediction
 
 
 def stacked_lstm_net(data,
@@ -79,13 +82,18 @@ def stacked_lstm_net(data,
     adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
     adam_optimizer.minimize(avg_cost)
     accuracy = fluid.layers.accuracy(input=prediction, label=label)
-    return avg_cost, accuracy
+    return avg_cost, accuracy, prediction
 
 
-def main(word_dict, net_method, use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
+def create_random_lodtensor(lod, place, low, high):
+    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
+    res = fluid.LoDTensor()
+    res.set(data, place)
+    res.set_lod([lod])
+    return res
 
+
+def train(word_dict, net_method, use_cuda, save_dirname=None):
     BATCH_SIZE = 128
     PASS_NUM = 5
     dict_dim = len(word_dict)
@@ -94,7 +102,7 @@ def main(word_dict, net_method, use_cuda):
     data = fluid.layers.data(
         name="words", shape=[1], dtype="int64", lod_level=1)
     label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    cost, acc_out = net_method(
+    cost, acc_out, prediction = net_method(
         data, label, input_dim=dict_dim, class_dim=class_dim)
 
     train_data = paddle.batch(
@@ -114,11 +122,59 @@ def main(word_dict, net_method, use_cuda):
                                         fetch_list=[cost, acc_out])
             print("cost=" + str(cost_val) + " acc=" + str(acc_val))
             if cost_val < 0.4 and acc_val > 0.8:
+                if save_dirname is not None:
+                    fluid.io.save_inference_model(save_dirname, ["words"],
+                                                  prediction, exe)
                 return
+            if math.isnan(float(cost_val)):
+                sys.exit("got NaN loss, training failed.")
     raise AssertionError("Cost is too large for {0}".format(
         net_method.__name__))
 
 
+def infer(use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    # Use fluid.io.load_inference_model to obtain the inference program desc,
+    # the feed_target_names (the names of variables that will be feeded 
+    # data using feed operators), and the fetch_targets (variables that 
+    # we want to obtain data from using fetch operators).
+    [inference_program, feed_target_names,
+     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+    lod = [0, 4, 10]
+    word_dict = paddle.dataset.imdb.word_dict()
+    tensor_words = create_random_lodtensor(
+        lod, place, low=0, high=len(word_dict) - 1)
+
+    # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+    # and results will contain a list of data corresponding to fetch_targets.
+    assert feed_target_names[0] == "words"
+    results = exe.run(inference_program,
+                      feed={feed_target_names[0]: tensor_words},
+                      fetch_list=fetch_targets,
+                      return_numpy=False)
+    print(results[0].lod())
+    np_data = np.array(results[0])
+    print("Inference Shape: ", np_data.shape)
+    print("Inference results: ", np_data)
+
+
+def main(word_dict, net_method, use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    # Directory for saving the trained model
+    save_dirname = "understand_sentiment.inference.model"
+
+    train(word_dict, net_method, use_cuda, save_dirname)
+    infer(use_cuda, save_dirname)
+
+
 class TestUnderstandSentiment(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py
index 766ba9681d..69bfbcee69 100644
--- a/python/paddle/v2/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
@@ -1,6 +1,5 @@
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
+# # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
@@ -16,12 +15,67 @@ import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 import unittest
 import os
+import numpy as np
+import math
+import sys
 
 
-def main(use_cuda, is_sparse, parallel):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
+def create_random_lodtensor(lod, place, low, high):
+    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
+    res = fluid.LoDTensor()
+    res.set(data, place)
+    res.set_lod([lod])
+    return res
+
+
+def infer(use_cuda, save_dirname=None):
+    if save_dirname is None:
         return
 
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    # Use fluid.io.load_inference_model to obtain the inference program desc,
+    # the feed_target_names (the names of variables that will be feeded 
+    # data using feed operators), and the fetch_targets (variables that 
+    # we want to obtain data from using fetch operators).
+    [inference_program, feed_target_names,
+     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+    word_dict = paddle.dataset.imikolov.build_dict()
+    dict_size = len(word_dict) - 1
+
+    # Setup input, by creating 4 words, and setting up lod required for 
+    # lookup_table_op
+    lod = [0, 1]
+    first_word = create_random_lodtensor(lod, place, low=0, high=dict_size)
+    second_word = create_random_lodtensor(lod, place, low=0, high=dict_size)
+    third_word = create_random_lodtensor(lod, place, low=0, high=dict_size)
+    fourth_word = create_random_lodtensor(lod, place, low=0, high=dict_size)
+
+    assert feed_target_names[0] == 'firstw'
+    assert feed_target_names[1] == 'secondw'
+    assert feed_target_names[2] == 'thirdw'
+    assert feed_target_names[3] == 'forthw'
+
+    # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+    # and results will contain a list of data corresponding to fetch_targets.
+    results = exe.run(inference_program,
+                      feed={
+                          feed_target_names[0]: first_word,
+                          feed_target_names[1]: second_word,
+                          feed_target_names[2]: third_word,
+                          feed_target_names[3]: fourth_word
+                      },
+                      fetch_list=fetch_targets,
+                      return_numpy=False)
+    print(results[0].lod())
+    np_data = np.array(results[0])
+    print("Inference Shape: ", np_data.shape)
+    print("Inference results: ", np_data)
+
+
+def train(use_cuda, is_sparse, parallel, save_dirname):
     PASS_NUM = 100
     EMBED_SIZE = 32
     HIDDEN_SIZE = 256
@@ -65,7 +119,7 @@ def main(use_cuda, is_sparse, parallel):
                                        act='softmax')
         cost = fluid.layers.cross_entropy(input=predict_word, label=words[4])
         avg_cost = fluid.layers.mean(x=cost)
-        return avg_cost
+        return avg_cost, predict_word
 
     word_dict = paddle.dataset.imikolov.build_dict()
     dict_size = len(word_dict)
@@ -77,13 +131,13 @@ def main(use_cuda, is_sparse, parallel):
     next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
 
     if not parallel:
-        avg_cost = __network__(
+        avg_cost, predict_word = __network__(
             [first_word, second_word, third_word, forth_word, next_word])
     else:
         places = fluid.layers.get_places()
         pd = fluid.layers.ParallelDo(places)
         with pd.do():
-            avg_cost = __network__(
+            avg_cost, predict_word = __network__(
                 map(pd.read_input, [
                     first_word, second_word, third_word, forth_word, next_word
                 ]))
@@ -111,10 +165,25 @@ def main(use_cuda, is_sparse, parallel):
                                   feed=feeder.feed(data),
                                   fetch_list=[avg_cost])
             if avg_cost_np[0] < 5.0:
+                if save_dirname is not None:
+                    fluid.io.save_inference_model(save_dirname, [
+                        'firstw', 'secondw', 'thirdw', 'forthw'
+                    ], [predict_word], exe)
                 return
+            if math.isnan(float(avg_cost_np[0])):
+                sys.exit("got NaN loss, training failed.")
+
     raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0]))
 
 
+def main(use_cuda, is_sparse, parallel):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    save_dirname = "word2vec.inference.model"
+    train(use_cuda, is_sparse, parallel, save_dirname)
+    infer(use_cuda, save_dirname)
+
+
 FULL_TEST = os.getenv('FULL_TEST',
                       '0').lower() in ['true', '1', 't', 'y', 'yes', 'on']
 SKIP_REASON = "Only run minimum number of tests in CI server, to make CI faster"
@@ -137,7 +206,8 @@ def inject_test_method(use_cuda, is_sparse, parallel):
             with fluid.program_guard(prog, startup_prog):
                 main(use_cuda=use_cuda, is_sparse=is_sparse, parallel=parallel)
 
-    if use_cuda and is_sparse and parallel:
+    # run only 2 cases: use_cuda is either True or False
+    if is_sparse == False and parallel == False:
         fn = __impl__
     else:
         # skip the other test when on CI server
diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
index 7ad5e2c594..045db8390c 100644
--- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -15,6 +15,8 @@
 import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+import math
+import sys
 
 # need to fix random seed and training data to compare the loss
 # value accurately calculated by the default and the memory optimization
@@ -63,4 +65,6 @@ for pass_id in range(PASS_NUM):
 
         if avg_loss_value[0] < 10.0:
             exit(0)  # if avg cost less than 10.0, we think our code is good.
+        if math.isnan(float(avg_loss_value)):
+            sys.exit("got NaN loss, training failed.")
 exit(1)
diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
index 26673afd83..9fbb36d363 100644
--- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
@@ -18,6 +18,8 @@ import sys
 
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+import math
+import sys
 
 # need to fix random seed and training data to compare the loss
 # value accurately calculated by the default and the memory optimization
@@ -152,7 +154,10 @@ for pass_id in range(PASS_NUM):
         print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
             pass_acc))
         # this model is slow, so if we can train two mini batch, we think it works properly.
+
         if i > 2:
             exit(0)
+        if math.isnan(float(loss)):
+            sys.exit("got NaN loss, training failed.")
         i += 1
 exit(1)
diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
index ffd53e7a78..48abaa8d87 100644
--- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
@@ -19,6 +19,8 @@ import paddle.v2.fluid.core as core
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
+import math
+import sys
 
 dict_size = 30000
 source_dict_dim = target_dict_dim = dict_size
@@ -137,6 +139,8 @@ def main():
                   " avg_cost=" + str(avg_cost_val))
             if batch_id > 2:
                 exit(0)
+            if math.isnan(float(avg_cost_val)):
+                sys.exit("got NaN loss, training failed.")
             batch_id += 1
 
 
diff --git a/python/paddle/v2/fluid/tests/notest_csp.py b/python/paddle/v2/fluid/tests/notest_csp.py
new file mode 100644
index 0000000000..7fe234a20b
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/notest_csp.py
@@ -0,0 +1,37 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.v2.fluid as fluid
+
+
+class TestCSPFramework(unittest.TestCase):
+    def daisy_chain(self):
+        n = 10000
+        leftmost = fluid.make_channel(dtype=int)
+        right = leftmost
+        left = leftmost
+        with fluid.While(steps=n):
+            right = fluid.make_channel(dtype=int)
+            with fluid.go():
+                fluid.send(left, 1 + fluid.recv(right))
+            left = right
+
+        with fluid.go():
+            fluid.send(right, 1)
+        fluid.Print(fluid.recv(leftmost))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py
index 3f6d7070c2..f8475813c0 100644
--- a/python/paddle/v2/fluid/tests/op_test.py
+++ b/python/paddle/v2/fluid/tests/op_test.py
@@ -326,7 +326,8 @@ class OpTest(unittest.TestCase):
                 self.assertTrue(
                     np.allclose(
                         actual_t, expect_t, atol=atol),
-                    "Output (" + out_name + ") has diff at " + str(place))
+                    "Output (" + out_name + ") has diff at " + str(place) +
+                    str(actual_t) + str(expect_t))
                 if isinstance(expect, tuple):
                     self.assertListEqual(actual.lod(), expect[1],
                                          "Output (" + out_name +
diff --git a/python/paddle/v2/fluid/tests/test_cpp_reader.py b/python/paddle/v2/fluid/tests/test_cpp_reader.py
new file mode 100644
index 0000000000..66d6c28ef7
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_cpp_reader.py
@@ -0,0 +1,72 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import numpy as np
+
+prog = fluid.framework.Program()
+block = prog.current_block()
+
+random_reader = block.create_var(
+    type=fluid.core.VarDesc.VarType.READER, name="RandomDataGenerator")
+random_reader.desc.set_dtypes(
+    [fluid.core.DataType.FP32, fluid.core.DataType.FP32])
+
+create_random_data_generator_op = block.append_op(
+    type="create_random_data_generator",
+    outputs={"Out": random_reader},
+    attrs={
+        "shape_concat": [1, 2, 1, 1],
+        "ranks": [2, 2],
+        "min": 0.0,
+        "max": 1.0,
+        'lod_levels': [0, 0]
+    })
+shuffle_reader = block.create_var(
+    type=fluid.core.VarDesc.VarType.READER, name="ShuffleReader")
+
+create_shuffle_reader_op = block.append_op(
+    type="create_shuffle_reader",
+    inputs={"UnderlyingReader": random_reader},
+    outputs={"Out": shuffle_reader},
+    attrs={"buffer_size": 7})
+
+batch_reader = block.create_var(
+    type=fluid.core.VarDesc.VarType.READER, name="BatchReader")
+
+create_batch_reader_op = block.append_op(
+    type="create_batch_reader",
+    inputs={"UnderlyingReader": shuffle_reader},
+    outputs={"Out": batch_reader},
+    attrs={"batch_size": 10})
+
+out1 = block.create_var(type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out1")
+out2 = block.create_var(type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out2")
+
+read_op = block.append_op(
+    type="read", inputs={"Reader": batch_reader},
+    outputs={"Out": [out1, out2]})
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+
+[res1, res2] = exe.run(prog, fetch_list=[out1, out2])
+
+test_pass = res1.shape == (10, 2) and res2.shape == (10, 1)
+
+if not test_pass:
+    exit(1)
+
+exit(0)
diff --git a/python/paddle/v2/fluid/tests/test_ctc_align.py b/python/paddle/v2/fluid/tests/test_ctc_align.py
index 773c69d1ad..cc815d8e9e 100644
--- a/python/paddle/v2/fluid/tests/test_ctc_align.py
+++ b/python/paddle/v2/fluid/tests/test_ctc_align.py
@@ -31,6 +31,8 @@ def CTCAlign(input, lod, blank, merge_repeated):
                 result.append(token)
             prev_token = token
     result = np.array(result).reshape([len(result), 1]).astype("int32")
+    if len(result) == 0:
+        result = np.array([-1])
     return result
 
 
@@ -72,5 +74,14 @@ class TestCTCAlignOpCase1(TestCTCAlignOp):
                 [19, 1]).astype("int32")
 
 
+class TestCTCAlignOpCase2(TestCTCAlignOp):
+    def config(self):
+        self.op_type = "ctc_align"
+        self.input_lod = [[0, 4]]
+        self.blank = 0
+        self.merge_repeated = True
+        self.input = np.array([0, 0, 0, 0]).reshape([4, 1]).astype("int32")
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_cumsum_op.py b/python/paddle/v2/fluid/tests/test_cumsum_op.py
new file mode 100644
index 0000000000..e45ef45730
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_cumsum_op.py
@@ -0,0 +1,127 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestSumOp1(OpTest):
+    def setUp(self):
+        self.op_type = "cumsum"
+        self.attrs = {'axis': 2}
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.outputs = {'Out': self.inputs['X'].cumsum(axis=2)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestSumOp2(OpTest):
+    def setUp(self):
+        self.op_type = "cumsum"
+        self.attrs = {'axis': -1, 'reverse': True}
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.outputs = {
+            'Out': np.flip(
+                np.flip(
+                    self.inputs['X'], axis=2).cumsum(axis=2), axis=2)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestSumOp3(OpTest):
+    def setUp(self):
+        self.op_type = "cumsum"
+        self.attrs = {'axis': 1}
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.outputs = {'Out': self.inputs['X'].cumsum(axis=1)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestSumOp4(OpTest):
+    def setUp(self):
+        self.op_type = "cumsum"
+        self.attrs = {'axis': 0}
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.outputs = {'Out': self.inputs['X'].cumsum(axis=0)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestSumOp5(OpTest):
+    def setUp(self):
+        self.op_type = "cumsum"
+        self.inputs = {'X': np.random.random((5, 6)).astype("float64")}
+        self.outputs = {'Out': self.inputs['X'].cumsum(axis=1)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestSumOp7(OpTest):
+    def setUp(self):
+        self.op_type = "cumsum"
+        self.inputs = {'X': np.random.random((6)).astype("float64")}
+        self.outputs = {'Out': self.inputs['X'].cumsum(axis=0)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestSumOp8(OpTest):
+    def setUp(self):
+        self.op_type = "cumsum"
+        self.attrs = {'axis': 2, "exclusive": True}
+        a = np.random.random((5, 6, 3)).astype("float64")
+        self.inputs = {'X': a}
+        self.outputs = {
+            'Out': np.concatenate(
+                (np.zeros(
+                    (5, 6, 1), dtype=np.float64), a[:, :, :-1].cumsum(axis=2)),
+                axis=2)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_fetch_var.py b/python/paddle/v2/fluid/tests/test_fetch_var.py
new file mode 100644
index 0000000000..ed75a350b0
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_fetch_var.py
@@ -0,0 +1,37 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.layers as layers
+import op_test
+import numpy
+import unittest
+
+
+class TestFetchVar(op_test.OpTest):
+    def test_fetch_var(self):
+        val = numpy.array([1, 3, 5]).astype(numpy.int32)
+        x = layers.create_tensor(dtype="int32", persistable=True, name="x")
+        layers.assign(input=val, output=x)
+        exe = fluid.Executor(fluid.CPUPlace())
+        exe.run(fluid.default_main_program(), feed={}, fetch_list=[])
+        fetched_x = fluid.fetch_var("x")
+        self.assertTrue(
+            numpy.array_equal(fetched_x, val),
+            "fetch_x=%s val=%s" % (fetched_x, val))
+        self.assertEqual(fetched_x.dtype, val.dtype)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py
index 68cf8673cd..4460ffaf9c 100644
--- a/python/paddle/v2/fluid/tests/test_layer_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py
@@ -20,6 +20,8 @@ import paddle.v2.fluid.core as core
 from paddle.v2.fluid.op import Operator
 from paddle.v2.fluid.framework import grad_var_name
 
+np.random.random(123)
+
 
 def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
     x_shape = x.shape
@@ -62,9 +64,9 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
 
     grad_x = dx_end + d_mean + d_std
 
-    grad_y.shape = x_shape
-    x.shape = x_shape
+    grad_x.shape, x.shape, grad_y.shape = x_shape, x_shape, x_shape
     scale.shape = scale_shape
+    var.shape, mean.shape = [N, ], [N, ]
     return grad_x, d_scale, d_bias
 
 
@@ -112,10 +114,7 @@ def set_output_grad(scope, outputs, place, feed_dict=None):
 
 class TestLayerNormdOp(OpTest):
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
-        self.assertTrue(
-            np.allclose(
-                np.array(tensor).reshape(np_array.shape), np_array, atol=atol),
-            msg)
+        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
 
     def __assert_grad_close(self,
                             tensor,
@@ -123,7 +122,7 @@ class TestLayerNormdOp(OpTest):
                             name,
                             place,
                             max_relative_error=0.02):
-        a = np.array(tensor).reshape(np_array.shape)
+        a = np.array(tensor)
         b = np_array
         abs_a = np.abs(a)
         abs_a[abs_a < 1e-5] = 1
@@ -151,7 +150,7 @@ class TestLayerNormdOp(OpTest):
             x_shape = shape
             D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
             scale_shape = [D]
-            np.random.random(123)
+
             x_val = np.random.random_sample(x_shape).astype(np.float32)
             scale_val = np.random.random_sample(scale_shape).astype(np.float32)
             bias_val = np.random.random_sample(scale_shape).astype(np.float32)
diff --git a/python/paddle/v2/fluid/tests/test_learning_rate_decay.py b/python/paddle/v2/fluid/tests/test_learning_rate_decay.py
index dc348cf2d2..1d6bab3d6c 100644
--- a/python/paddle/v2/fluid/tests/test_learning_rate_decay.py
+++ b/python/paddle/v2/fluid/tests/test_learning_rate_decay.py
@@ -15,6 +15,8 @@
 import unittest
 
 import math
+import copy
+
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid as fluid
 import paddle.v2.fluid.layers as layers
@@ -54,21 +56,37 @@ def inverse_time_decay(learning_rate,
     return learning_rate / (1 + decay_rate * temp)
 
 
-class TestLearningRateDecay(unittest.TestCase):
-    def check_decay(self, python_decay_fn, fluid_decay_fn, staircase):
-        init_lr = 1.0
-        decay_steps = 5
-        decay_rate = 0.5
+def polynomial_decay(learning_rate,
+                     global_step,
+                     decay_steps,
+                     end_learning_rate=0.0001,
+                     power=1.0,
+                     cycle=False):
+    if cycle:
+        div = math.ceil(global_step / float(decay_steps))
+        if div == 0:
+            div = 1
+        decay_steps = decay_steps * div
+    else:
+        global_step = min(global_step, decay_steps)
+    return (learning_rate - end_learning_rate) * \
+           ((1 - float(global_step) / float(decay_steps)) ** power) + end_learning_rate
+
+
+def piecewise_decay(global_step, boundaries, values):
+    assert len(boundaries) + 1 == len(values)
+    for i in range(len(boundaries)):
+        if global_step < boundaries[i]:
+            return values[i]
+    return values[len(values) - 1]
 
+
+class TestLearningRateDecay(unittest.TestCase):
+    def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs):
         global_step = layers.create_global_var(
             shape=[1], value=0.0, dtype='float32', persistable=True)
 
-        decayed_lr = fluid_decay_fn(
-            learning_rate=init_lr,
-            global_step=global_step,
-            decay_steps=decay_steps,
-            decay_rate=decay_rate,
-            staircase=staircase)
+        decayed_lr = fluid_decay_fn(global_step=global_step, **kwargs)
         layers.increment(global_step, 1.0)
 
         place = fluid.CPUPlace()
@@ -79,31 +97,52 @@ class TestLearningRateDecay(unittest.TestCase):
             step_val, lr_val = exe.run(fluid.default_main_program(),
                                        feed=[],
                                        fetch_list=[global_step, decayed_lr])
-            python_decayed_lr = python_decay_fn(
-                learning_rate=init_lr,
-                global_step=step,
-                decay_steps=decay_steps,
-                decay_rate=decay_rate,
-                staircase=staircase)
+            python_decayed_lr = python_decay_fn(global_step=step, **kwargs)
             self.assertAlmostEqual(python_decayed_lr, lr_val[0])
 
     def test_decay(self):
+        common_kwargs_true = {
+            "learning_rate": 1.0,
+            "decay_steps": 5,
+            "decay_rate": 0.5,
+            "staircase": True
+        }
+        common_kwargs_false = copy.deepcopy(common_kwargs_true)
+        common_kwargs_false["staircase"] = False
+
         decay_fns = [
-            (exponential_decay, lr_decay.exponential_decay, True),
-            (exponential_decay, lr_decay.exponential_decay, False),
-            (natural_exp_decay, lr_decay.natural_exp_decay, True),
-            (natural_exp_decay, lr_decay.natural_exp_decay, False),
-            (inverse_time_decay, lr_decay.inverse_time_decay, True),
-            (inverse_time_decay, lr_decay.inverse_time_decay, False),
+            (exponential_decay, lr_decay.exponential_decay, common_kwargs_true),
+            (exponential_decay, lr_decay.exponential_decay,
+             common_kwargs_false),
+            (natural_exp_decay, lr_decay.natural_exp_decay, common_kwargs_true),
+            (natural_exp_decay, lr_decay.natural_exp_decay,
+             common_kwargs_false),
+            (inverse_time_decay, lr_decay.inverse_time_decay,
+             common_kwargs_true),
+            (inverse_time_decay, lr_decay.inverse_time_decay,
+             common_kwargs_false),
+            (polynomial_decay, lr_decay.polynomial_decay, {
+                "learning_rate": 1.0,
+                "decay_steps": 5,
+                "cycle": True
+            }),
+            (polynomial_decay, lr_decay.polynomial_decay, {
+                "learning_rate": 1.0,
+                "decay_steps": 5,
+                "cycle": False
+            }),
+            (piecewise_decay, lr_decay.piecewise_decay, {
+                "boundaries": [3, 6, 9],
+                "values": [0.1, 0.2, 0.3, 0.4]
+            }),
         ]
 
-        for py_decay_fn, fluid_decay_fn, staircase in decay_fns:
-            print("decay_fn=" + str(py_decay_fn) + " staircase=" + str(
-                staircase))
+        for py_decay_fn, fluid_decay_fn, kwargs in decay_fns:
+            print("decay_fn=" + py_decay_fn.__name__ + " kwargs=" + str(kwargs))
             main_program = framework.Program()
             startup_program = framework.Program()
             with framework.program_guard(main_program, startup_program):
-                self.check_decay(py_decay_fn, fluid_decay_fn, staircase)
+                self.check_decay(py_decay_fn, fluid_decay_fn, kwargs)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_optimizer.py b/python/paddle/v2/fluid/tests/test_optimizer.py
index 480ee70915..dc6b84dcdc 100644
--- a/python/paddle/v2/fluid/tests/test_optimizer.py
+++ b/python/paddle/v2/fluid/tests/test_optimizer.py
@@ -42,9 +42,9 @@ class TestOptimizer(unittest.TestCase):
             type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
         sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
         opts, _ = sgd_optimizer.minimize(mean_out, init_program)
-        self.assertEqual(len(opts), 1)
-        sgd_op = opts[0]
-        self.assertEqual(sgd_op.type, "sgd")
+        self.assertEqual(len(opts), 3)
+        self.assertEqual([op.type for op in opts],
+                         ["fill_constant", "elementwise_mul", "sgd"])
 
     def test_sgd_optimizer_with_global_step(self):
         init_program = framework.Program()
@@ -72,11 +72,10 @@ class TestOptimizer(unittest.TestCase):
         sgd_optimizer = optimizer.SGDOptimizer(
             learning_rate=learning_rate, global_step=global_step)
         opts, _ = sgd_optimizer.minimize(mean_out, init_program)
-        self.assertEqual(len(opts), 2)
-        sgd_op = opts[0]
-        self.assertEqual(sgd_op.type, "sgd")
-        increment_op = opts[1]
-        self.assertEqual(increment_op.type, "increment")
+        self.assertEqual(len(opts), 4)
+        self.assertEqual(
+            [op.type for op in opts],
+            ["fill_constant", "elementwise_mul", "sgd", "increment"])
 
         # Check init_program
         init_ops = init_program.global_block().ops
@@ -121,9 +120,10 @@ class TestMomentumOptimizer(unittest.TestCase):
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
         opts = momentum_optimizer.create_optimization_pass(
             params_grads, mul_out, init_program)
-        self.assertEqual(len(opts), 1)
-        sgd_op = opts[0]
-        self.assertEqual(sgd_op.type, "momentum")
+        self.assertEqual(len(opts), 3)
+        sgd_op = opts[-1]
+        self.assertEqual([op.type for op in opts],
+                         ["fill_constant", "elementwise_mul", "momentum"])
         self.assertFalse(sgd_op.attr('use_nesterov'))
 
         # Check accumulators
@@ -170,9 +170,10 @@ class TestMomentumOptimizer(unittest.TestCase):
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
         opts = momentum_optimizer.create_optimization_pass(
             params_grads, mul_out, init_program)
-        self.assertEqual(len(opts), 1)
-        sgd_op = opts[0]
-        self.assertEqual(sgd_op.type, "momentum")
+        self.assertEqual(len(opts), 3)
+        sgd_op = opts[-1]
+        self.assertEqual([op.type for op in opts],
+                         ["fill_constant", "elementwise_mul", "momentum"])
         self.assertTrue(sgd_op.attr('use_nesterov'))
 
         # Check accumulators
@@ -228,9 +229,9 @@ class TestAdagradOptimizer(unittest.TestCase):
         self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
         opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
                                                           init_program)
-        self.assertEqual(len(opts), 1)
-        adagrad_op = opts[0]
-        self.assertEqual(adagrad_op.type, "adagrad")
+        self.assertEqual(len(opts), 3)
+        self.assertEqual([op.type for op in opts],
+                         ["fill_constant", "elementwise_mul", "adagrad"])
 
         # Check accumulators
         accumulators = adagrad_optimizer.get_accumulators()
@@ -288,9 +289,10 @@ class TestAdamOptimizer(unittest.TestCase):
         self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
         opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
                                                        init_program)
-        self.assertEqual(len(opts), 3)
-        adam_op = opts[0]
-        self.assertEqual(adam_op.type, "adam")
+        self.assertEqual(len(opts), 5)
+        self.assertEqual(
+            [op.type for op in opts],
+            ["fill_constant", "elementwise_mul", "adam", "scale", "scale"])
 
         # Check accumulators
         accumulators = adam_optimizer.get_accumulators()
@@ -350,9 +352,10 @@ class TestAdamaxOptimizer(unittest.TestCase):
         self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
         opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
                                                          init_program)
-        self.assertEqual(len(opts), 2)
-        adam_op = opts[0]
-        self.assertEqual(adam_op.type, "adamax")
+        self.assertEqual(len(opts), 4)
+        self.assertEqual(
+            [op.type for op in opts],
+            ["fill_constant", "elementwise_mul", "adamax", "scale"])
 
         # Check accumulators
         accumulators = adamax_optimizer.get_accumulators()
@@ -409,9 +412,10 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
         self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
         opts = decayed_adagrad_optimizer.create_optimization_pass(
             params_grads, mul_out, init_program)
-        self.assertEqual(len(opts), 1)
-        decayed_adagrad_op = opts[0]
-        self.assertEqual(decayed_adagrad_op.type, "decayed_adagrad")
+        self.assertEqual(len(opts), 3)
+        self.assertEqual(
+            [op.type for op in opts],
+            ["fill_constant", "elementwise_mul", "decayed_adagrad"])
 
         # Check accumulators
         accumulators = decayed_adagrad_optimizer.get_accumulators()
diff --git a/python/paddle/v2/fluid/tests/test_parallel_op.py b/python/paddle/v2/fluid/tests/test_parallel_op.py
index 367cc8b1aa..f1fd09a7fd 100644
--- a/python/paddle/v2/fluid/tests/test_parallel_op.py
+++ b/python/paddle/v2/fluid/tests/test_parallel_op.py
@@ -197,5 +197,5 @@ class ParallelOpTestMultipleInput(BaseParallelForTest):
             fetch=['fc1.w@GRAD', 'fc2.w@GRAD', 'fc3.w@GRAD'])
 
 
-if __name__ == '__main__':
-    unittest.main()
+#if __name__ == '__main__':
+#    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_protobuf_descs.py b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
index 9034b2f4ef..c590bf1c65 100644
--- a/python/paddle/v2/fluid/tests/test_protobuf_descs.py
+++ b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
@@ -115,6 +115,17 @@ class TestVarDesc(unittest.TestCase):
         self.assertEqual(src_shape, res_shape)
         self.assertEqual(core.VarDesc.VarType.SELECTED_ROWS, var.type())
 
+    def test_multiple_shape(self):
+        program_desc = core.ProgramDesc()
+        block = program_desc.block(0)
+        var = block.var('my_reader')
+        var.set_type(core.VarDesc.VarType.READER)
+        src_shapes = [[2, 3, 3], [4, 5], [6, 7, 8, 9]]
+        var.set_shapes(src_shapes)
+        res_shapes = var.shapes()
+        self.assertEqual(src_shapes, res_shapes)
+        self.assertEqual(core.VarDesc.VarType.READER, var.type())
+
     def test_dtype(self):
         program_desc = core.ProgramDesc()
         block = program_desc.block(0)
@@ -124,6 +135,28 @@ class TestVarDesc(unittest.TestCase):
         self.assertEqual(core.DataType.INT32, var.dtype())
         self.assertEqual(core.VarDesc.VarType.LOD_TENSOR, var.type())
 
+    def test_multiple_dtype(self):
+        program_desc = core.ProgramDesc()
+        block = program_desc.block(0)
+        var = block.var('my_reader')
+        var.set_type(core.VarDesc.VarType.READER)
+        src_types = [
+            core.DataType.INT32, core.DataType.FP64, core.DataType.FP32
+        ]
+        var.set_dtypes(src_types)
+        self.assertEqual(src_types, var.dtypes())
+        self.assertEqual(core.VarDesc.VarType.READER, var.type())
+
+    def test_multiple_lod_level(self):
+        program_desc = core.ProgramDesc()
+        block = program_desc.block(0)
+        var = block.var('my_reader')
+        var.set_type(core.VarDesc.VarType.READER)
+        src_types = [3, 1, 2]
+        var.set_lod_levels(src_types)
+        self.assertEqual(src_types, var.lod_levels())
+        self.assertEqual(core.VarDesc.VarType.READER, var.type())
+
 
 class TestBlockDesc(unittest.TestCase):
     def test_add_var(self):
diff --git a/python/paddle/v2/fluid/tests/test_switch.py b/python/paddle/v2/fluid/tests/test_switch.py
new file mode 100644
index 0000000000..52ebf773ec
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_switch.py
@@ -0,0 +1,64 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.framework as framework
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.framework import default_startup_program
+
+
+class TestSwitch(unittest.TestCase):
+    def check_switch(self, value):
+        x = layers.fill_constant(shape=[1], dtype='float32', value=value)
+
+        zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
+        one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
+        two_var = layers.fill_constant(shape=[1], dtype='float32', value=2.0)
+        three_var = layers.fill_constant(shape=[1], dtype='float32', value=3.0)
+
+        result = layers.create_global_var(
+            shape=[1], value=-1.0, dtype='float32', persistable=True)
+
+        with layers.Switch() as switch:
+            with switch.case(layers.less_than(x, zero_var)):
+                layers.assign(zero_var, result)
+            with switch.case(layers.less_than(x, one_var)):
+                layers.assign(one_var, result)
+            with switch.case(layers.less_than(x, two_var)):
+                layers.assign(two_var, result)
+            with switch.default():
+                layers.assign(three_var, result)
+
+        cpu = core.CPUPlace()
+        exe = Executor(cpu)
+        exe.run(default_startup_program())
+
+        out = exe.run(feed={}, fetch_list=[result])[0][0]
+        return out
+
+    def test_switch(self):
+        test_data = {(-0.1, 0), (0.1, 1), (1.1, 2), (2.1, 3)}
+        for x, expected_result in test_data:
+            main_program = framework.Program()
+            startup_program = framework.Program()
+            with framework.program_guard(main_program, startup_program):
+                result = self.check_switch(x)
+                self.assertEqual(result, expected_result)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_target_assign_op.py b/python/paddle/v2/fluid/tests/test_target_assign_op.py
new file mode 100755
index 0000000000..8a1155c621
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_target_assign_op.py
@@ -0,0 +1,122 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import random
+from op_test import OpTest
+
+
+def gen_match_and_neg_indices(num_prior, gt_lod, neg_lod):
+    if len(gt_lod) != len(neg_lod):
+        raise AssertionError("The input arguments are illegal.")
+
+    batch_size = len(gt_lod) - 1
+
+    match_indices = -1 * np.ones((batch_size, num_prior)).astype('int32')
+    neg_indices = np.zeros((neg_lod[-1], 1)).astype('int32')
+
+    for n in range(batch_size):
+        gt_num = gt_lod[n + 1] - gt_lod[n]
+        ids = random.sample([i for i in range(num_prior)], gt_num)
+        match_indices[n, ids] = [i for i in range(gt_num)]
+
+        ret_ids = set([i for i in range(num_prior)]) - set(ids)
+        s = neg_lod[n]
+        e = neg_lod[n + 1]
+        l = e - s
+        neg_ids = random.sample(ret_ids, l)
+        neg_indices[s:e, :] = np.array(neg_ids).astype('int32').reshape(l, 1)
+
+    return match_indices, neg_indices
+
+
+def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod,
+                  neg_lod, background_label):
+    batch_size, num_prior = match_indices.shape
+
+    # init target bbox
+    trg_box = np.zeros((batch_size, num_prior, 4)).astype('float32')
+    # init weight for target bbox
+    trg_box_wt = np.zeros((batch_size, num_prior, 1)).astype('float32')
+    # init target label
+    trg_label = np.ones((batch_size, num_prior, 1)).astype('int32')
+    trg_label = trg_label * background_label
+    # init weight for target label
+    trg_label_wt = np.zeros((batch_size, num_prior, 1)).astype('float32')
+
+    for i in range(batch_size):
+        cur_indices = match_indices[i]
+        col_ids = np.where(cur_indices > -1)
+        col_val = cur_indices[col_ids]
+
+        gt_start = gt_lod[i]
+        # target bbox
+        for v, c in zip(col_val + gt_start, col_ids[0].tolist()):
+            trg_box[i][c][:] = encoded_box[v][c][:]
+
+        # weight for target bbox
+        trg_box_wt[i][col_ids] = 1.0
+
+        trg_label[i][col_ids] = gt_label[col_val + gt_start]
+
+        trg_label_wt[i][col_ids] = 1.0
+        # set target label weight to 1.0 for the negative samples
+        neg_ids = neg_indices[neg_lod[i]:neg_lod[i + 1]]
+        trg_label_wt[i][neg_ids] = 1.0
+
+    return trg_box, trg_box_wt, trg_label, trg_label_wt
+
+
+class TestTargetAssginOp(OpTest):
+    def setUp(self):
+        self.op_type = "target_assign"
+
+        num_prior = 120
+        num_class = 21
+        gt_lod = [0, 5, 11, 23]
+        neg_lod = [0, 4, 7, 13]
+        batch_size = len(gt_lod) - 1
+        num_gt = gt_lod[-1]
+        background_label = 0
+
+        encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32')
+        gt_label = np.random.randint(
+            num_class, size=(num_gt, 1)).astype('int32')
+        match_indices, neg_indices = gen_match_and_neg_indices(num_prior,
+                                                               gt_lod, neg_lod)
+        trg_box, trg_box_wt, trg_label, trg_label_wt = target_assign(
+            encoded_box, gt_label, match_indices, neg_indices, gt_lod, neg_lod,
+            background_label)
+
+        self.inputs = {
+            'EncodedGTBBox': (encoded_box, [gt_lod]),
+            'GTScoreLabel': (gt_label, [gt_lod]),
+            'MatchIndices': (match_indices),
+            'NegIndices': (neg_indices, [neg_lod]),
+        }
+        self.attrs = {'background_label': background_label}
+        self.outputs = {
+            'PredBBoxLabel': (trg_box),
+            'PredBBoxWeight': (trg_box_wt),
+            'PredScoreLabel': (trg_label),
+            'PredScoreWeight': (trg_label_wt),
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/setup.py.in b/python/setup.py.in
index 65ec58ecf9..5a0d999954 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -109,7 +109,7 @@ setup(name='${PACKAGE_NAME}',
           '': '${CMAKE_CURRENT_SOURCE_DIR}',
           # The paddle.v2.fluid.proto will be generated while compiling.
           # So that package points to other directory.
-          'paddle.v2.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/framework',
+          'paddle.v2.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
           'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
       },
       scripts=paddle_bins,