shanyi15-patch-2
ranqiu 7 years ago
commit 231d3a88f8

@ -1,11 +1,11 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -138,13 +138,14 @@ def main():
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
batch_size = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size)
# inference program
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
test_target = accuracy.metrics + accuracy.states
inference_program = fluid.io.get_inference_program(test_target)
inference_program = fluid.io.get_inference_program(batch_acc)
# Optimization
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
@ -157,27 +158,30 @@ def main():
# test
def test(exe):
accuracy.reset(exe)
test_pass_acc = fluid.average.WeightedAverage()
for batch_id, data in enumerate(test_reader()):
img_data = np.array(map(lambda x: x[0].reshape(data_shape),
data)).astype("float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([-1, 1])
exe.run(inference_program,
feed={"pixel": img_data,
"label": y_data})
outs = exe.run(inference_program,
feed={"pixel": img_data,
"label": y_data},
fetch_list=[batch_acc, batch_size])
test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
return accuracy.eval(exe)
return test_pass_acc.eval()
def train_loop(exe, trainer_prog):
iters = 0
ts = time.time()
train_pass_acc = fluid.average.WeightedAverage()
for pass_id in range(args.num_passes):
# train
start_time = time.time()
num_samples = 0
accuracy.reset(exe)
train_pass_acc.reset()
with profiler.profiler("CPU", 'total') as prof:
for batch_id, data in enumerate(train_reader()):
ts = time.time()
@ -187,13 +191,14 @@ def main():
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([-1, 1])
loss, acc = exe.run(
loss, acc, b_size = exe.run(
trainer_prog,
feed={"pixel": img_data,
"label": y_data},
fetch_list=[avg_cost] + accuracy.metrics)
fetch_list=[avg_cost, batch_acc, batch_size])
iters += 1
num_samples += len(data)
train_pass_acc.add(value=acc, weight=b_size)
print(
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
% (pass_id, iters, loss, acc,
@ -201,7 +206,7 @@ def main():
) # The accuracy is the accumulation of batches, but not the current batch.
pass_elapsed = time.time() - start_time
pass_train_acc = accuracy.eval(exe)
pass_train_acc = train_pass_acc.eval()
pass_test_acc = test(exe)
print(
"Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"

@ -77,7 +77,8 @@ IF(NOT ${CBLAS_FOUND})
INSTALL_DIR ${CBLAS_INSTALL_DIR}
BUILD_IN_SOURCE 1
BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS}
INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR>
INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR>
&& rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
)
@ -100,11 +101,6 @@ IF(NOT ${CBLAS_FOUND})
\"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\"
)"
)
INSTALL(CODE "execute_process(
COMMAND rm -r ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/cmake
${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/pkgconfig
)"
)
ENDIF()
ENDIF(NOT ${CBLAS_FOUND})

@ -39,6 +39,7 @@ ExternalProject_Add(
-DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DSNAPPY_BUILD_TESTS:BOOL=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR}

@ -186,7 +186,9 @@ function(cc_library TARGET_NAME)
add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
else()
add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
find_fluid_modules(${TARGET_NAME})
endif()
if(cc_library_DEPS)
# Don't need link libwarpctc.so
if("${cc_library_DEPS};" MATCHES "warpctc;")
@ -263,7 +265,8 @@ function(nv_library TARGET_NAME)
if (nv_library_SHARED OR nv_library_shared) # build *.so
cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
else()
cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
find_fluid_modules(${TARGET_NAME})
endif()
if (nv_library_DEPS)
add_dependencies(${TARGET_NAME} ${nv_library_DEPS})

@ -1,9 +1,22 @@
set_property(GLOBAL PROPERTY FLUID_MODULES "")
# find all fluid modules is used for paddle fluid static library
function(find_fluid_modules TARGET_NAME)
get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
string(FIND "${__target_path}" "fluid" pos)
if(pos GREATER 1)
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
set(fluid_modules ${fluid_modules} ${TARGET_NAME})
set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
endif()
endfunction(find_fluid_modules)
# make package for paddle fluid shared and static library
function(copy TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DSTS DEPS)
cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE)
list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
@ -42,13 +55,21 @@ copy(glog_lib
DSTS ${dst_dir} ${dst_dir}/lib
)
IF(NOT PROTOBUF_FOUND)
if(NOT PROTOBUF_FOUND)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
copy(protobuf_lib
SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LITE_LIBRARY}
SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
DSTS ${dst_dir} ${dst_dir}/lib
)
ENDIF(NOT PROTOBUF_FOUND)
endif()
if(NOT CBLAS_FOUND)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/openblas")
copy(openblas_lib
SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
DSTS ${dst_dir} ${dst_dir}
)
endif()
# paddle fluid module
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
@ -66,8 +87,8 @@ copy(memory_lib
)
set(module "inference")
copy(inference_lib DEPENDS paddle_fluid_shared
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.so
copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
DSTS ${dst_dir}/${module} ${dst_dir}/${module}
)
@ -83,6 +104,4 @@ copy(string_lib
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
)
add_custom_target(inference_lib_dist DEPENDS
inference_lib framework_lib memory_lib platform_lib string_lib
gflags_lib glog_lib protobuf_lib eigen3_lib)
add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep})

Binary image file not shown (69 KiB).

@ -0,0 +1,27 @@
## How to use the timeline tool to profile
1. Add `with profiler.profiler(...)` to the main training loop. After running, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when using the profiler to record timeline information, as the profile record grows with the number of batches (see the sketch after this list for one way to limit it).
```python
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
    for pass_id in range(pass_num):
        for batch_id, data in enumerate(train_reader()):
            exe.run(fluid.default_main_program(),
                    feed=feeder.feed(data),
                    fetch_list=[],
                    use_program_cache=True)
            ...
```
1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`; it will generate another
file, `/tmp/timeline`, by default. You can change the path via a command-line parameter; please take a look at
[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
1. Open Chrome and visit <chrome://tracing/>, then use the `load` button to load the generated `timeline` file.
![chrome tracing](./tracing.jpeg)
1. The resulting timeline should look like the following:
![chrome timeline](./timeline.jpeg)
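As noted in step 1, the profile record grows with the number of batches. A minimal way to keep `/tmp/profile` small is to profile only a slice of the loop; the sketch below reuses the same assumed variables (`train_reader`, `exe`, `feeder`) as the example above:
```python
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler

# Profile only the first few batches, then leave the profiled region.
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
    for batch_id, data in enumerate(train_reader()):
        exe.run(fluid.default_main_program(),
                feed=feeder.feed(data),
                fetch_list=[])
        if batch_id >= 10:   # a handful of batches is usually enough for a timeline
            break
```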

Binary image file not shown (30 KiB).

@ -39,7 +39,7 @@ PaddlePaddle can be installed using common Python package management tools
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz>`_"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"

@ -42,7 +42,7 @@ If the links below show up the login form, just click "Log in as guest" to star
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz>`_"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"

@ -1,11 +1,37 @@
Advanced Usage
==============
PaddlePaddle allows users to flexibly set various command-line parameters to control the model training and inference workflow. For usage instructions, please refer to:
.. toctree::
  :maxdepth: 1

  cmd_parameter/index_cn.rst
PaddlePaddle supports running distributed training jobs on Fabric clusters, MPI clusters, and Kubernetes clusters. For environment configuration and usage instructions, please refer to:
.. toctree::
  :maxdepth: 1

  cluster/index_cn.rst
PaddlePaddle provides a C-API for inference. For the usage of the C-API, we provide the following guide:
.. toctree::
  :maxdepth: 1

  capi/index_cn.rst
PaddlePaddle supports a variety of flexible and efficient recurrent neural networks. For configuration and usage details, please refer to:
.. toctree::
  :maxdepth: 1

  rnn/index_cn.rst
For how to use the built-in timing tools, nvprof, or nvvp to run performance analysis and tuning, please refer to:
.. toctree::
  :maxdepth: 1

  optimization/gpu_profiling_cn.rst

@ -5,14 +5,14 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)
if (WITH_GPU)
if(WITH_GPU)
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
endif ()
endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
if (WITH_GPU)
if(WITH_GPU)
nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor)
else()
cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
@ -39,8 +39,13 @@ cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
nv_test(data_device_transform_test SRCS data_device_transform_test.cu
DEPS operator op_registry init math_function)
cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
if(WITH_GPU)
nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
else()
cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
endif()
cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function)
cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform)

@ -28,24 +28,19 @@ class Channel {
virtual bool Send(T*) = 0;
virtual bool Receive(T*) = 0;
virtual size_t Cap() = 0;
virtual void Lock() = 0;
virtual void Unlock() = 0;
virtual void Close() = 0;
virtual ~Channel() {}
};
// Forward declaration of channel implementations.
namespace details {
template <typename T>
class Buffered;
template <typename T>
class UnBuffered;
} // namespace details
class ChannelImpl;
template <typename T>
Channel<T>* MakeChannel(size_t buffer_size) {
if (buffer_size > 0) {
return new details::Buffered<T>(buffer_size);
}
return new details::UnBuffered<T>();
return new ChannelImpl<T>(buffer_size);
}
template <typename T>
@ -89,6 +84,19 @@ class ChannelHolder {
if (IsInitialized()) holder_->Close();
}
size_t Cap() {
if (IsInitialized()) return holder_->Cap();
return -1;
}
void Lock() {
if (IsInitialized()) holder_->Lock();
}
void Unlock() {
if (IsInitialized()) holder_->Unlock();
}
inline bool IsInitialized() const { return holder_ != nullptr; }
inline const std::type_index Type() {
@ -106,6 +114,9 @@ class ChannelHolder {
virtual const std::type_index Type() const = 0;
virtual void* Ptr() const = 0;
virtual void Close() = 0;
virtual void Lock() = 0;
virtual void Unlock() = 0;
virtual size_t Cap() = 0;
};
template <typename T>
@ -115,11 +126,28 @@ class ChannelHolder {
}
virtual const std::type_index Type() const { return type_; }
virtual void* Ptr() const { return static_cast<void*>(channel_.get()); }
virtual void Close() {
if (channel_) channel_->Close();
}
virtual size_t Cap() {
if (channel_)
return channel_->Cap();
else
return -1;
}
virtual void Lock() {
if (channel_) channel_->Lock();
}
virtual void Unlock() {
if (channel_) channel_->Unlock();
}
std::unique_ptr<Channel<T>> channel_;
const std::type_index type_;
};
@ -131,5 +159,4 @@ class ChannelHolder {
} // namespace framework
} // namespace paddle
#include "paddle/fluid/framework/details/buffered_channel.h"
#include "paddle/fluid/framework/details/unbuffered_channel.h"
#include "paddle/fluid/framework/channel_impl.h"

@ -0,0 +1,229 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stddef.h> // for size_t
#include <atomic>
#include <condition_variable>
#include <deque>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
template <typename T>
class ChannelImpl : public paddle::framework::Channel<T> {
friend Channel<T> *paddle::framework::MakeChannel<T>(size_t);
friend void paddle::framework::CloseChannel<T>(Channel<T> *);
public:
virtual bool Send(T *);
virtual bool Receive(T *);
virtual size_t Cap() { return cap_; }
virtual void Lock();
virtual void Unlock();
virtual void Close();
ChannelImpl(size_t);
virtual ~ChannelImpl();
private:
struct QueueMessage {
T *data;
std::condition_variable_any cond;
bool chan_closed = false;
bool completed = false;
QueueMessage(T *item) : data(item) {}
void Wait(std::unique_lock<std::recursive_mutex> &lock) {
cond.wait(lock, [this]() { return completed; });
}
void Notify() {
completed = true;
cond.notify_all();
}
};
bool send_return(bool value) {
send_ctr--;
destructor_cond_.notify_all();
return value;
}
bool recv_return(bool value) {
recv_ctr--;
destructor_cond_.notify_all();
return value;
}
size_t cap_;
std::recursive_mutex mu_;
bool closed_;
std::deque<T> buf_;
std::deque<std::shared_ptr<QueueMessage>> recvq;
std::deque<std::shared_ptr<QueueMessage>> sendq;
std::atomic<unsigned> send_ctr{0};
std::atomic<unsigned> recv_ctr{0};
std::condition_variable_any destructor_cond_;
};
template <typename T>
ChannelImpl<T>::ChannelImpl(size_t capacity)
: cap_(capacity), closed_(false), send_ctr(0), recv_ctr(0) {
PADDLE_ENFORCE_GE(capacity, 0);
}
template <typename T>
bool ChannelImpl<T>::Send(T *item) {
send_ctr++;
std::unique_lock<std::recursive_mutex> lock{mu_};
// If channel is closed, do nothing
if (closed_) {
lock.unlock();
// TODO(abhinavarora) Should panic on closed channel
return send_return(false);
}
// If there is a receiver, directly pass the value we want
// to send to the receiver, bypassing the channel buffer if any
if (!recvq.empty()) {
std::shared_ptr<QueueMessage> m = recvq.front();
recvq.pop_front();
// Do the data transfer
*(m->data) = std::move(*item);
// Wake up the blocked process and unlock
m->Notify();
lock.unlock();
return send_return(true);
}
// Unbuffered channel will always bypass this
// If buffered channel has space in buffer,
// write the element to the buffer.
if (buf_.size() < cap_) {
// Copy to buffer
buf_.push_back(std::move(*item));
// Release lock and return true
lock.unlock();
return send_return(true);
}
// Block on channel, because some receiver will complete
// the operation for us
auto m = std::make_shared<QueueMessage>(item);
sendq.push_back(m);
m->Wait(lock);
// TODO(abhinavarora) Should panic on closed channel
return send_return(!m->chan_closed);
}
template <typename T>
bool ChannelImpl<T>::Receive(T *item) {
recv_ctr++;
std::unique_lock<std::recursive_mutex> lock{mu_};
// If channel is closed and buffer is empty or
// channel is unbuffered
if (closed_ && buf_.empty()) {
lock.unlock();
return recv_return(false);
}
// If there is a sender, directly receive the value we want
// from the sender, bypassing the channel buffer if any
if (!sendq.empty()) {
std::shared_ptr<QueueMessage> m = sendq.front();
sendq.pop_front();
// Do the data transfer
*item = std::move(*(m->data));
// Wake up the blocked process and unlock
m->Notify();
lock.unlock();
return recv_return(true);
}
// If this is a buffered channel and there are items in buffer
if (buf_.size() > 0) {
// Directly read from buffer
*item = std::move(buf_.front());
buf_.pop_front();
// Release lock and return true
lock.unlock();
return recv_return(true);
}
// No sender available, block on this channel
// Some sender will complete the operation for us
auto m = std::make_shared<QueueMessage>(item);
recvq.push_back(m);
m->Wait(lock);
return recv_return(!m->chan_closed);
}
template <typename T>
void ChannelImpl<T>::Lock() {
mu_.lock();
}
template <typename T>
void ChannelImpl<T>::Unlock() {
mu_.unlock();
}
template <typename T>
void ChannelImpl<T>::Close() {
std::unique_lock<std::recursive_mutex> lock{mu_};
if (closed_) {
// TODO(abhinavarora): closing an already closed channel should panic
lock.unlock();
return;
}
closed_ = true;
// Empty the readers
while (!recvq.empty()) {
std::shared_ptr<QueueMessage> m = recvq.front();
recvq.pop_front();
m->chan_closed = true;
m->Notify();
}
// Empty the senders
while (!sendq.empty()) {
std::shared_ptr<QueueMessage> m = sendq.front();
sendq.pop_front();
m->chan_closed = true;
m->Notify();
}
}
template <typename T>
ChannelImpl<T>::~ChannelImpl() {
Close();
// The destructor must wait for all readers and writers to complete their task
// The channel has been closed, so we will not accept new readers and writers
std::unique_lock<std::recursive_mutex> lock{mu_};
destructor_cond_.wait(lock,
[this]() { return send_ctr == 0 && recv_ctr == 0; });
}
} // namespace framework
} // namespace paddle
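The new `ChannelImpl` replaces the separate `Buffered`/`UnBuffered` classes with one Go-style channel: `Send` hands its value directly to a waiting receiver if one exists, otherwise buffers it while capacity remains, otherwise blocks; `Close` wakes every blocked party, which then returns false once the buffer is drained. A simplified, buffered-only model of these semantics in Python (illustration only, not PaddlePaddle's API):
```python
import threading
from collections import deque

class BufferedChannel(object):
    """Send blocks while full, Receive blocks while empty, both fail after Close."""
    def __init__(self, cap):
        assert cap > 0
        self.cap = cap
        self.buf = deque()
        self.closed = False
        self.cond = threading.Condition()

    def send(self, item):
        with self.cond:
            # Wait for space in the buffer or for the channel to be closed.
            self.cond.wait_for(lambda: self.closed or len(self.buf) < self.cap)
            if self.closed:
                return False              # send on a closed channel fails
            self.buf.append(item)
            self.cond.notify_all()        # wake blocked receivers
            return True

    def receive(self):
        with self.cond:
            # Wait for data or for the channel to be closed.
            self.cond.wait_for(lambda: self.closed or len(self.buf) > 0)
            if self.buf:
                item = self.buf.popleft()
                self.cond.notify_all()    # wake blocked senders
                return True, item
            return False, None            # closed and fully drained

    def close(self):
        with self.cond:
            self.closed = True
            self.cond.notify_all()        # unblock everyone

received = []
ch = BufferedChannel(cap=2)

def consumer():
    while True:
        ok, v = ch.receive()
        if not ok:
            break
        received.append(v)

t = threading.Thread(target=consumer)
t.start()
for i in range(5):
    ch.send(i)
ch.close()
t.join()
print(received)   # [0, 1, 2, 3, 4]
```
The real implementation additionally covers the unbuffered (rendezvous) case and the direct sender-to-receiver hand-off, which this sketch omits.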

File diff suppressed because it is too large.

@ -42,6 +42,7 @@ void DataTransform(const OpKernelType& expected_kernel_type,
PassTensorData(&out, &in);
}
// do data type transform
if (expected_kernel_type.data_type_ != kernel_type_for_var.data_type_) {
TransDataType(kernel_type_for_var, expected_kernel_type, in, &out);
transformed = true;

@ -16,13 +16,16 @@ limitations under the License. */
#include <typeindex>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace framework {
inline proto::VarType::Type ToDataType(std::type_index type) {
using namespace paddle::framework::proto;
if (typeid(float).hash_code() == type.hash_code()) {
if (typeid(platform::float16).hash_code() == type.hash_code()) {
return proto::VarType::FP16;
} else if (typeid(float).hash_code() == type.hash_code()) {
return proto::VarType::FP32;
} else if (typeid(double).hash_code() == type.hash_code()) {
return proto::VarType::FP64;
@ -40,6 +43,8 @@ inline proto::VarType::Type ToDataType(std::type_index type) {
inline std::type_index ToTypeIndex(proto::VarType::Type type) {
using namespace paddle::framework::proto;
switch (type) {
case proto::VarType::FP16:
return typeid(platform::float16);
case proto::VarType::FP32:
return typeid(float);
case proto::VarType::FP64:
@ -59,6 +64,9 @@ template <typename Visitor>
inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
using namespace paddle::framework::proto;
switch (type) {
case proto::VarType::FP16:
visitor.template operator()<platform::float16>();
break;
case proto::VarType::FP32:
visitor.template operator()<float>();
break;
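This hunk teaches `ToDataType`, `ToTypeIndex`, and `VisitDataType` about `proto::VarType::FP16`, so a runtime dtype tag can be mapped to `platform::float16` and dispatched to the templated visitor. A rough Python analogue of that tag-to-type dispatch (hypothetical names, illustration only):
```python
import numpy as np

# Map a runtime dtype tag to a concrete element type, then call the same
# generic visitor with it, roughly what VisitDataType's switch does.
DTYPE_TABLE = {
    "FP16": np.float16,
    "FP32": np.float32,
    "FP64": np.float64,
    "INT32": np.int32,
    "INT64": np.int64,
    "BOOL": np.bool_,
}

def visit_data_type(dtype_tag, visitor):
    try:
        elem_type = DTYPE_TABLE[dtype_tag]
    except KeyError:
        raise ValueError("unsupported data type: %s" % dtype_tag)
    return visitor(elem_type)

# usage: materialize a small buffer with the tagged element type
print(visit_data_type("FP16", lambda t: np.arange(3, dtype=t).dtype))  # float16
```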

@ -47,9 +47,16 @@ struct CastDataType {
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>());
#ifdef __NVCC__
} else if (platform::is_gpu_place(in_.place())) {
platform::Transform<platform::CUDADeviceContext> trans;
auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>());
context->Wait();
#endif
} else {
// TODO(dzhwinter): enhance Copy CPU<->GPU with different data type?
PADDLE_THROW("Unsupport CPU <-> GPU!");
PADDLE_THROW("Unsupported place!");
}
}
};
@ -65,6 +72,10 @@ void TransDataType(const OpKernelType& kernel_type_for_var,
auto ctx = pool.Get(in.place());
switch (src_type) {
case proto::VarType::FP16:
framework::VisitDataType(dst_type,
CastDataType<platform::float16>(in, out, ctx));
break;
case proto::VarType::FP32:
framework::VisitDataType(dst_type, CastDataType<float>(in, out, ctx));
break;
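With the `__NVCC__` branch and the new FP16 case, `TransDataType` can cast a tensor elementwise on either CPU or GPU, including to and from half precision. Each case boils down to an elementwise cast, as in this NumPy sketch (illustration only, unrelated to the fluid kernels):
```python
import numpy as np

src = np.arange(6, dtype=np.float32).reshape(2, 3)

fp16 = src.astype(np.float16)      # FP32 -> FP16
fp64 = fp16.astype(np.float64)     # FP16 -> FP64
ints = fp16.astype(np.int32)       # FP16 -> INT32
back = fp16.astype(np.float32)     # FP16 -> FP32 round trip

print(fp16.dtype, ints.tolist())   # float16 [[0, 1, 2], [3, 4, 5]]
print(np.array_equal(src, back))   # True for these small integer-valued entries
```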

@ -22,32 +22,145 @@ TEST(DataTypeTransform, CPUTransform) {
auto place = CPUPlace();
Tensor in;
Tensor out;
float* ptr = in.mutable_data<float>(make_ddim({2, 3}), place);
int data_number = 2 * 3;
for (int i = 0; i < data_number; ++i) {
ptr[i] = i / 3;
}
auto kernel_fp16 = OpKernelType(proto::VarType::FP16, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_int32 = OpKernelType(proto::VarType::INT32, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_int64 = OpKernelType(proto::VarType::INT64, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_bool = OpKernelType(proto::VarType::BOOL, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
TransDataType(kernel_fp32, kernel_fp64, in, &out);
double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(out_data_double[i], static_cast<double>(i / 3));
// data type transform from float32
{
Tensor in;
Tensor out;
float* ptr = in.mutable_data<float>(make_ddim({2, 3}), place);
int data_number = 2 * 3;
for (int i = 0; i < data_number; ++i) {
ptr[i] = i / 3;
}
TransDataType(kernel_fp32, kernel_fp64, in, &out);
double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_double[i], static_cast<double>(i / 3));
}
TransDataType(kernel_fp32, kernel_int32, in, &out);
int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int[i], static_cast<int>(i / 3));
}
}
TransDataType(kernel_fp32, kernel_int32, in, &out);
int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(out_data_int[i], static_cast<int>(i / 3));
// data type transform from/to float16
{
Tensor in;
Tensor out;
float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), place);
int data_number = 2 * 3;
for (int i = 0; i < data_number; ++i) {
ptr[i] = i;
}
// transform from float16 to other data types
TransDataType(kernel_fp16, kernel_fp32, in, &out);
float* out_data_float = out.data<float>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_fp64, in, &out);
double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_int32, in, &out);
int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_int64, in, &out);
int64_t* out_data_int64 = out.data<int64_t>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_bool, in, &out);
bool* out_data_bool = out.data<bool>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
}
// transform float to float16
float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_float[i] = i;
}
TransDataType(kernel_fp32, kernel_fp16, in, &out);
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
}
// transform double to float16
double* in_data_double = in.mutable_data<double>(make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_double[i] = i;
}
TransDataType(kernel_fp64, kernel_fp16, in, &out);
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
}
// transform int to float16
int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_int[i] = i;
}
TransDataType(kernel_int32, kernel_fp16, in, &out);
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
}
// transform int64 to float16
int64_t* in_data_int64 = in.mutable_data<int64_t>(make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_int64[i] = i;
}
TransDataType(kernel_int64, kernel_fp16, in, &out);
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
}
// transform bool to float16
bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_bool[i] = i;
}
TransDataType(kernel_bool, kernel_fp16, in, &out);
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
}
}
}

@ -0,0 +1,222 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "gtest/gtest.h"
TEST(DataTypeTransform, GPUTransform) {
using namespace paddle::framework;
using namespace paddle::platform;
auto cpu_place = CPUPlace();
auto gpu_place = CUDAPlace(0);
CUDADeviceContext context(gpu_place);
auto kernel_fp16 = OpKernelType(proto::VarType::FP16, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_fp32 = OpKernelType(proto::VarType::FP32, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_fp64 = OpKernelType(proto::VarType::FP64, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_int32 = OpKernelType(proto::VarType::INT32, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_int64 = OpKernelType(proto::VarType::INT64, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_bool = OpKernelType(proto::VarType::BOOL, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
// data type transform from float32
{
Tensor in;
Tensor in_gpu;
Tensor out_gpu;
Tensor out;
float* in_ptr = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5};
int data_number = sizeof(arr) / sizeof(arr[0]);
memcpy(in_ptr, arr, sizeof(arr));
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_fp32, kernel_fp64, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_double[i], static_cast<double>(arr[i]));
}
TransDataType(kernel_fp32, kernel_int32, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int[i], static_cast<int>(arr[i]));
}
}
// data type transform from/to float16
{
Tensor in;
Tensor in_gpu;
Tensor out_gpu;
Tensor out;
float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), cpu_place);
float16 arr[6] = {float16(0), float16(1), float16(2),
float16(3), float16(4), float16(5)};
int data_number = sizeof(arr) / sizeof(arr[0]);
memcpy(ptr, arr, sizeof(arr));
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
// transform from float16 to other data types
TransDataType(kernel_fp16, kernel_fp32, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
float* out_data_float = out.data<float>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_int32, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
int64_t* out_data_int64 = out.data<int64_t>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_bool, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
bool* out_data_bool = out.data<bool>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
}
// transform float to float16
float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_float[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
}
// transform double to float16
double* in_data_double =
in.mutable_data<double>(make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_double[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_fp64, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
}
// transform int to float16
int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_int[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_int32, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
}
// transform int64 to float16
int64_t* in_data_int64 =
in.mutable_data<int64_t>(make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_int64[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_int64, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
}
// transform bool to float16
bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_bool[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_bool, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
}
}
}

@ -1,142 +0,0 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <atomic>
#include <condition_variable>
#include <deque>
#include <mutex>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace details {
// Four of the properties of Buffered Channel:
// - A send to a full channel blocks temporarily until a receive from the
// channel or the channel is closed.
// - A receive from an empty channel blocks temporarily until a send to the
// channel or the channel is closed.
// - A send to a closed channel returns false immediately.
// - A receive from a closed channel returns false immediately.
template <typename T>
class Buffered : public paddle::framework::Channel<T> {
friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
friend void paddle::framework::CloseChannel<T>(Channel<T>*);
public:
virtual bool Send(T*);
virtual bool Receive(T*);
virtual size_t Cap() { return cap_; }
virtual void Close();
virtual ~Buffered();
private:
size_t cap_;
std::mutex mu_;
std::condition_variable empty_cond_var_;
std::condition_variable full_cond_var_;
std::condition_variable destructor_cond_var_;
std::deque<T> channel_;
std::atomic<bool> closed_{false};
std::atomic<unsigned> send_ctr{0};
std::atomic<unsigned> recv_ctr{0};
Buffered(size_t cap) : cap_(cap), closed_(false) {
PADDLE_ENFORCE_GT(cap, 0);
}
void NotifyAllParticipants(std::unique_lock<std::mutex>*);
};
template <typename T>
bool Buffered<T>::Send(T* item) {
bool ret = false;
if (closed_) {
return ret;
}
send_ctr++;
std::unique_lock<std::mutex> lock(mu_);
full_cond_var_.wait(lock,
[this]() { return channel_.size() < cap_ || closed_; });
if (!closed_) {
channel_.push_back(std::move(*item));
lock.unlock();
empty_cond_var_.notify_one();
ret = true;
}
send_ctr--;
destructor_cond_var_.notify_one();
return ret;
}
template <typename T>
bool Buffered<T>::Receive(T* item) {
bool ret = false;
// Once the channel has been closed and all data has been consumed,
// just return false. Don't even try acquiring the mutex.
if (closed_ && channel_.empty()) {
return false;
}
recv_ctr++;
std::unique_lock<std::mutex> lock(mu_);
empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
if (!channel_.empty()) {
*item = std::move(channel_.front());
channel_.pop_front();
full_cond_var_.notify_one();
ret = true;
}
recv_ctr--;
destructor_cond_var_.notify_one();
return ret;
}
template <typename T>
void Buffered<T>::Close() {
if (closed_) {
return;
}
std::unique_lock<std::mutex> lock(mu_);
closed_ = true;
NotifyAllParticipants(&lock);
}
template <typename T>
Buffered<T>::~Buffered() {
std::unique_lock<std::mutex> lock(mu_);
closed_ = true;
channel_.clear();
NotifyAllParticipants(&lock);
// The destructor must wait for all readers and writers to complete their task
// The channel has been closed, so we will not accept new readers and writers
lock.lock();
destructor_cond_var_.wait(
lock, [this]() { return send_ctr == 0 && recv_ctr == 0; });
}
template <typename T>
void Buffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
lock->unlock();
full_cond_var_.notify_all();
empty_cond_var_.notify_all();
}
} // namespace details
} // namespace framework
} // namespace paddle

@ -1,174 +0,0 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <atomic>
#include <condition_variable>
#include <mutex>
#include "paddle/fluid/framework/channel.h"
namespace paddle {
namespace framework {
namespace details {
// Four of the properties of UnBuffered Channel:
// - A send to a channel blocks temporarily until a receive from the
// channel or the channel is closed.
// - A receive from a channel blocks temporarily until a send to the
// channel or the channel is closed.
// - A send to a closed channel returns false immediately.
// - A receive from a closed channel returns false immediately.
template <typename T>
class UnBuffered : public paddle::framework::Channel<T> {
friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
friend void paddle::framework::CloseChannel<T>(Channel<T>*);
public:
virtual bool Send(T*);
virtual bool Receive(T*);
virtual size_t Cap() { return 0; }
virtual void Close();
virtual ~UnBuffered();
private:
std::mutex mu_ch_;
// Mutex for readers and writers who are waiting for other reader
// and writer to complete execution
std::recursive_mutex mu_read_, mu_write_;
// reader_found_ is set true when a reader is ready to accept data
// writer_found_ is set true when a writer is ready to send data
// A transaction occurs only when both are true
std::atomic<bool> reader_found_{false}, writer_found_{false};
std::condition_variable cv_channel_;
std::condition_variable_any cv_reader_, cv_writer_, cv_destructor_;
T* item{nullptr};
std::atomic<bool> closed_{false};
std::atomic<unsigned> send_ctr{0};
std::atomic<unsigned> recv_ctr{0};
UnBuffered() : closed_(false) {}
void NotifyAllParticipants(std::unique_lock<std::mutex>*);
};
// This function implements the concept of how data should
// be sent from a writer to a reader.
template <typename T>
bool UnBuffered<T>::Send(T* data) {
bool ret = false;
if (closed_) {
return ret;
}
send_ctr++;
// Prevent other writers from entering
std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
writer_found_ = true;
std::unique_lock<std::recursive_mutex> cv_lock(mu_write_);
// If writer comes first, it should wait till a reader arrives
cv_writer_.wait(cv_lock,
[this]() { return reader_found_ == true || closed_; });
cv_reader_.notify_one();
if (!closed_) {
std::unique_lock<std::mutex> channel_lock(mu_ch_);
item = data;
channel_lock.unlock();
cv_channel_.notify_one();
channel_lock.lock();
cv_channel_.wait(channel_lock,
[this]() { return item == nullptr || closed_; });
ret = true;
}
writer_found_ = false;
send_ctr--;
cv_destructor_.notify_one();
return ret;
}
// This function implements the concept of how
// data that was sent by a writer is read from a reader.
template <typename T>
bool UnBuffered<T>::Receive(T* data) {
bool ret = false;
// If channel is closed, we don't even want any reader to enter.
// Unlike a buffered channel, an unbuffered channel does not allow
// readers to read after closing because there is no buffer to be consumed.
if (closed_) return ret;
recv_ctr++;
// Prevent other readers from entering
std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
reader_found_ = true;
std::unique_lock<std::recursive_mutex> cv_lock{mu_read_};
// If reader comes first, it should wait till a writer arrives
cv_reader_.wait(cv_lock,
[this]() { return writer_found_ == true || closed_; });
cv_writer_.notify_one();
if (!closed_) {
std::unique_lock<std::mutex> lock_ch{mu_ch_};
// Reader should wait for the writer to first write its data
cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; });
if (!closed_) {
*data = std::move(*item);
item = nullptr;
lock_ch.unlock();
ret = true;
}
cv_channel_.notify_one();
}
reader_found_ = false;
recv_ctr--;
cv_destructor_.notify_one();
return ret;
}
// This function implements the sequence of events
// that take place once the channel is closed.
template <typename T>
void UnBuffered<T>::Close() {
if (closed_) {
return;
}
std::unique_lock<std::mutex> lock(mu_ch_);
item = nullptr;
closed_ = true;
NotifyAllParticipants(&lock);
}
// This function implements the sequence of events
// that are executed once the object of an UnBuffered
// channel is destroyed.
template <typename T>
UnBuffered<T>::~UnBuffered() {
std::unique_lock<std::mutex> lock(mu_ch_);
item = nullptr;
closed_ = true;
NotifyAllParticipants(&lock);
lock.lock();
cv_destructor_.wait(lock,
[this]() { return send_ctr == 0 && recv_ctr == 0; });
}
// This function notifies all the readers, writers and
// the channel condition variables.
template <typename T>
void UnBuffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
lock->unlock();
cv_writer_.notify_all();
cv_channel_.notify_all();
cv_reader_.notify_all();
}
} // namespace details
} // namespace framework
} // namespace paddle

@ -25,92 +25,5 @@ DDim ReaderBase::shape(size_t idx) const {
return shapes_[idx];
}
void ShuffleReader::ReadNext(std::vector<LoDTensor>* out) {
if (iteration_pos_ >= buffer_.size()) {
// Reload buffer with new data
buffer_.clear();
buffer_.reserve(buffer_size_);
for (int i = 0; i < buffer_size_; ++i) {
if (reader_->HasNext()) {
buffer_.push_back(std::vector<LoDTensor>());
reader_->ReadNext(&buffer_.back());
} else {
break;
}
}
// TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be
// optimize.
std::random_shuffle(buffer_.begin(), buffer_.end());
iteration_pos_ = 0;
}
out->clear();
if (!buffer_.empty()) {
std::swap(*out, buffer_[iteration_pos_++]);
}
// if buffer_ is empty, the 'out' will return as an empty vector.
}
void BatchReader::ReadNext(std::vector<LoDTensor>* out) {
buffer_.clear();
buffer_.reserve(batch_size_);
for (int i = 0; i < batch_size_; ++i) {
if (reader_->HasNext()) {
buffer_.push_back(std::vector<LoDTensor>());
reader_->ReadNext(&buffer_.back());
} else {
break;
}
}
// Concat instances
out->clear();
if (buffer_.empty()) {
// if buffer_ is empty, the 'out' will return as an empty vector.
return;
}
int out_num = buffer_[0].size();
out->reserve(out_num);
for (int j = 0; j < out_num; ++j) {
// Merge shape and check data type
std::type_index batch_type = buffer_[0][j].type();
DDim batch_shape = buffer_[0][j].dims();
for (size_t i = 1; i < buffer_.size(); ++i) {
std::type_index ins_type = buffer_[i][j].type();
DDim ins_shape = buffer_[i][j].dims();
PADDLE_ENFORCE_EQ(batch_type, ins_type);
PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
slice_ddim(ins_shape, 1, ins_shape.size()));
PADDLE_ENFORCE_GT(ins_shape[0], 0);
batch_shape[0] += ins_shape[0];
}
LoDTensor out_tensor;
out_tensor.Resize(batch_shape);
out_tensor.mutable_data(platform::CPUPlace(), batch_type);
int64_t dst_offset = 0;
// Merge lod and data
LoD batch_lod;
for (size_t i = 0; i < buffer_.size(); ++i) {
DDim ins_shape = buffer_[i][j].dims();
LoD ins_lod = buffer_[i][j].lod();
if (i == 0) {
batch_lod = ins_lod;
} else {
PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size());
for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) {
auto& lod_level = batch_lod[level_idx];
for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) {
lod_level.push_back(ins_lod[level_idx][k] + lod_level.back());
}
}
}
Tensor dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
TensorCopy(buffer_[i][j], platform::CPUPlace(), &dst);
dst_offset += ins_shape[0];
}
out_tensor.set_lod(batch_lod);
out->push_back(out_tensor);
}
}
} // namespace framework
} // namespace paddle
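The `ShuffleReader::ReadNext` and `BatchReader::ReadNext` bodies removed here implement buffered shuffling (fill a buffer, shuffle it, hand items out, refill) and fixed-size batching. The same two ideas as small Python generators (illustration only, unrelated to the C++ reader classes):
```python
import random

def buffered_shuffle(reader, buffer_size):
    """Fill a buffer, shuffle it, yield its items, repeat until the source ends."""
    buf = []
    for item in reader():
        buf.append(item)
        if len(buf) >= buffer_size:
            random.shuffle(buf)
            for b in buf:
                yield b
            buf = []
    random.shuffle(buf)            # flush whatever is left at the end
    for b in buf:
        yield b

def batch(reader, batch_size):
    """Group consecutive instances into batches of at most batch_size."""
    out = []
    for item in reader():
        out.append(item)
        if len(out) == batch_size:
            yield out
            out = []
    if out:                        # final, possibly smaller batch
        yield out

# usage: shuffle a toy reader with a buffer of 4, then batch it by 3
shuffled = lambda: buffered_shuffle(lambda: iter(range(10)), 4)
print(list(batch(shuffled, 3)))    # four batches: 3 + 3 + 3 + 1 items
```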

@ -60,83 +60,8 @@ class DecoratedReader : public ReaderBase {
ReaderBase* reader_;
};
// file readers
template <typename T>
class RandomDataGenerator : public FileReader {
public:
RandomDataGenerator(const std::vector<DDim>& shapes, float min, float max)
: FileReader(shapes), min_(min), max_(max) {
PADDLE_ENFORCE_LE(
min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max);
unsigned int seed = std::random_device()();
engine_.seed(seed);
dist_ = std::uniform_real_distribution<float>(min_, max_);
}
void ReadNext(std::vector<LoDTensor>* out) override {
out->clear();
out->reserve(shapes_.size());
for (const DDim& shape : shapes_) {
PADDLE_ENFORCE_GE(
shape.size(), 2,
"The rank of reader's output data should be 2 at least.(Now it's %d)",
shape.size());
LoDTensor out_tensor;
out_tensor.Resize(shape);
T* data = out_tensor.mutable_data<T>(platform::CPUPlace());
int64_t numel = product(shape);
for (int64_t i = 0; i < numel; ++i) {
data[i] = dist_(engine_);
}
out->push_back(out_tensor);
}
}
bool HasNext() const override { return true; }
void ReInit() override { return; }
private:
float min_;
float max_;
std::minstd_rand engine_;
std::uniform_real_distribution<float> dist_;
};
// decorated readers
class ShuffleReader : public DecoratedReader {
public:
ShuffleReader(ReaderBase* reader, int buffer_size)
: DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) {
buffer_.reserve(buffer_size);
}
void ReadNext(std::vector<LoDTensor>* out) override;
private:
int buffer_size_;
std::vector<std::vector<LoDTensor>> buffer_;
size_t iteration_pos_;
};
class BatchReader : public DecoratedReader {
public:
BatchReader(ReaderBase* reader, int batch_size)
: DecoratedReader(reader), batch_size_(batch_size) {
buffer_.reserve(batch_size_);
}
void ReadNext(std::vector<LoDTensor>* out) override;
private:
int batch_size_;
std::vector<std::vector<LoDTensor>> buffer_;
};
// The ReaderHolder is used as readers' unified wrapper,
// making it easier to access different type readers in Variables.
// The ReaderHolder is used as a reader's unified wrapper,
// making it easier to access different type readers in Variables.
class ReaderHolder {
public:
void Reset(ReaderBase* reader) { reader_.reset(reader); }

Some files were not shown because too many files have changed in this diff.
