Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/recordio_file_reader

7 years ago · b536799af0
parent db46778bdd 47ca1814f3
commit b536799af0
59 changed files with 1892 additions and 736 deletions
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@ -138,13 +138,14 @@ def main():
    avg_cost = fluid.layers.mean(x=cost)
    # Evaluator
-    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+    batch_size = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size)
    # inference program
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
-        test_target = accuracy.metrics + accuracy.states
+        inference_program = fluid.io.get_inference_program(batch_acc)
        inference_program = fluid.io.get_inference_program(test_target)
    # Optimization
    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
@ -157,27 +158,30 @@ def main():
    # test
    def test(exe):
-        accuracy.reset(exe)
+        test_pass_acc = fluid.average.WeightedAverage()
        for batch_id, data in enumerate(test_reader()):
            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
                                    data)).astype("float32")
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([-1, 1])
-            exe.run(inference_program,
+            outs = exe.run(inference_program,
                           feed={"pixel": img_data,
-                          "label": y_data})
+                                 "label": y_data},
                           fetch_list=[batch_acc, batch_size])
            test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
-        return accuracy.eval(exe)
+        return test_pass_acc.eval()
    def train_loop(exe, trainer_prog):
        iters = 0
        ts = time.time()
        train_pass_acc = fluid.average.WeightedAverage()
        for pass_id in range(args.num_passes):
            # train
            start_time = time.time()
            num_samples = 0
-            accuracy.reset(exe)
+            train_pass_acc.reset()
            with profiler.profiler("CPU", 'total') as prof:
                for batch_id, data in enumerate(train_reader()):
                    ts = time.time()
@ -187,13 +191,14 @@ def main():
                    y_data = np.array(map(lambda x: x[1], data)).astype("int64")
                    y_data = y_data.reshape([-1, 1])
-                    loss, acc = exe.run(
+                    loss, acc, b_size = exe.run(
                        trainer_prog,
                        feed={"pixel": img_data,
                              "label": y_data},
-                        fetch_list=[avg_cost] + accuracy.metrics)
+                        fetch_list=[avg_cost, batch_acc, batch_size])
                    iters += 1
                    num_samples += len(data)
                    train_pass_acc.add(value=acc, weight=b_size)
                    print(
                        "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
                        % (pass_id, iters, loss, acc,
@ -201,7 +206,7 @@ def main():
                    )  # The accuracy is the accumulation of batches, but not the current batch.
            pass_elapsed = time.time() - start_time
-            pass_train_acc = accuracy.eval(exe)
+            pass_train_acc = train_pass_acc.eval()
            pass_test_acc = test(exe)
            print(
                "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@ -78,6 +78,7 @@ IF(NOT ${CBLAS_FOUND})
        BUILD_IN_SOURCE     1
        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS}
        INSTALL_COMMAND     ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR> 
                            && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig
        UPDATE_COMMAND      ""
        CONFIGURE_COMMAND   ""
    )
@ -100,11 +101,6 @@ IF(NOT ${CBLAS_FOUND})
                \"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\"
            )"
        )
        INSTALL(CODE "execute_process(
            COMMAND rm -r ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/cmake
                    ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/pkgconfig
            )"
        )
    ENDIF()
 ENDIF(NOT ${CBLAS_FOUND})
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@ -186,7 +186,9 @@ function(cc_library TARGET_NAME)
      add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
    else()
      add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
      find_fluid_modules(${TARGET_NAME})
    endif()
    if(cc_library_DEPS)
      # Don't need link libwarpctc.so
      if("${cc_library_DEPS};" MATCHES "warpctc;")
@ -264,6 +266,7 @@ function(nv_library TARGET_NAME)
        cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
      else()
        cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
        find_fluid_modules(${TARGET_NAME})
      endif()
      if (nv_library_DEPS)
        add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@ -1,9 +1,22 @@
 set_property(GLOBAL PROPERTY FLUID_MODULES "")
 # Collect the targets that belong to paddle fluid. The collected names are
 # accumulated in the global FLUID_MODULES property, which is used for the
 # paddle fluid static library.
 #
 # TARGET_NAME: name of the target to check and, if it matches, record.
 function(find_fluid_modules TARGET_NAME)
  # Turn the target name into an absolute path (CMake resolves a relative
  # name against the current source directory), so the directory the target
  # is declared in becomes part of the string that is searched below.
  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
  # pos receives the index of the first "fluid" substring, or -1 if absent.
  string(FIND "${__target_path}" "fluid" pos)
  if(pos GREATER 1)
    # Read-modify-write the global list: append this target to FLUID_MODULES.
    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
  endif()
 endfunction(find_fluid_modules)
 # make package for paddle fluid shared and static library
 function(copy TARGET)
    set(options "")
    set(oneValueArgs "")
    set(multiValueArgs SRCS DSTS DEPS)
    cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE)
    list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
    list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
@ -42,13 +55,21 @@ copy(glog_lib
  DSTS ${dst_dir} ${dst_dir}/lib
 )
-IF(NOT PROTOBUF_FOUND)
+if(NOT PROTOBUF_FOUND)
    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
    copy(protobuf_lib
-      SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LITE_LIBRARY}
+      SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
      DSTS ${dst_dir} ${dst_dir}/lib
    )
-ENDIF(NOT PROTOBUF_FOUND)
+endif()
 if(NOT CBLAS_FOUND)
    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/openblas")
    copy(openblas_lib
      SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
      DSTS ${dst_dir} ${dst_dir}
    )
 endif()
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
@ -66,8 +87,8 @@ copy(memory_lib
 )
 set(module "inference")
-copy(inference_lib DEPENDS paddle_fluid_shared
+copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
-  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.so
+  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
  DSTS ${dst_dir}/${module} ${dst_dir}/${module}
 )
@ -83,6 +104,4 @@ copy(string_lib
  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
 )
-add_custom_target(inference_lib_dist DEPENDS 
+add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) 
  inference_lib framework_lib memory_lib platform_lib string_lib
  gflags_lib glog_lib protobuf_lib eigen3_lib)
--- a/doc/fluid/howto/optimization/timeline.jpeg
+++ b/doc/fluid/howto/optimization/timeline.jpeg
--- a/doc/fluid/howto/optimization/timeline.md
+++ b/doc/fluid/howto/optimization/timeline.md
@ -0,0 +1,27 @@
 ## How to use the timeline tool for profiling
 1. Add `with profiler.profiler(...)` to the main training loop. After the run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when using the profiler to record timeline information, because the profile record grows with the number of batches.
 	```python
 	with profiler.profiler('All', 'total', '/tmp/profile') as prof:
 	    for pass_id in range(pass_num):
 	        for batch_id, data in enumerate(train_reader()):
 	            exe.run(fluid.default_main_program(),
 	                    feed=feeder.feed(data),
 	                    fetch_list=[],
 	                    use_program_cache=True)
 	            ...
 	```
 1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`. By default, it generates another
 file, `/tmp/timeline`. You can change the path with a command-line parameter; please take a look at
 [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
 1. Open Chrome and visit <chrome://tracing/>, then use the `load` button to load the generated `timeline` file.
 	![chrome tracing](./tracing.jpeg)
 1. The resulting timeline should look like this:
 	![chrome timeline](./timeline.jpeg)
--- a/doc/fluid/howto/optimization/tracing.jpeg
+++ b/doc/fluid/howto/optimization/tracing.jpeg
--- a/doc/v2/build_and_install/pip_install_cn.rst
+++ b/doc/v2/build_and_install/pip_install_cn.rst
@ -39,7 +39,7 @@ PaddlePaddle可以使用常用的Python包管理工具
    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
-    "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
+    "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz>`_"
    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
--- a/doc/v2/build_and_install/pip_install_en.rst
+++ b/doc/v2/build_and_install/pip_install_en.rst
@ -42,7 +42,7 @@ If the links below shows up the login form, just click "Log in as guest" to star
    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
-    "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
+    "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz>`_"
    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
--- a/doc/v2/howto/index_cn.rst
+++ b/doc/v2/howto/index_cn.rst
@ -1,11 +1,37 @@
 进阶使用
 ========
 PaddlePaddle支持用户灵活地设置各种命令行参数，以实现对模型训练或预测流程的控制。使用方式请参考：
 ..  toctree::
  :maxdepth: 1
  cmd_parameter/index_cn.rst
 PaddlePaddle支持在fabric集群、MPI集群、kubernetes集群上分布式训练任务，具体环境配置和使用说明请参考：
 ..  toctree::
  :maxdepth: 1
  cluster/index_cn.rst
 PaddlePaddle提供了用于预测的C-API，关于C-API的使用，我们提供了如下指南:
 ..  toctree::
  :maxdepth: 1
  capi/index_cn.rst
 PaddlePaddle支持多种灵活和高效的循环神经网络，具体配置使用方式请参考：
 ..  toctree::
  :maxdepth: 1
  rnn/index_cn.rst
 关于如何使用内置的定时工具、nvprof 或 nvvp 来运行性能分析和调优，请参考：
 ..  toctree::
  :maxdepth: 1
  optimization/gpu_profiling_cn.rst
--- a/paddle/fluid/framework/data_type_transform.cc
+++ b/paddle/fluid/framework/data_type_transform.cc
@ -53,6 +53,7 @@ struct CastDataType {
      auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
      trans(*context, in_begin, in_end, out_begin,
            CastDataTypeFunctor<InType, OutType>());
      context->Wait();
 #endif
    } else {
      PADDLE_THROW("Unsupported place!");
--- a/paddle/fluid/framework/data_type_transform_test.cc
+++ b/paddle/fluid/framework/data_type_transform_test.cc
@ -50,13 +50,13 @@ TEST(DataTypeTransform, CPUTransform) {
    TransDataType(kernel_fp32, kernel_fp64, in, &out);
    double* out_data_double = out.data<double>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(out_data_double[i], static_cast<double>(i / 3));
+      EXPECT_EQ(out_data_double[i], static_cast<double>(i / 3));
    }
    TransDataType(kernel_fp32, kernel_int32, in, &out);
    int* out_data_int = out.data<int>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(out_data_int[i], static_cast<int>(i / 3));
+      EXPECT_EQ(out_data_int[i], static_cast<int>(i / 3));
    }
  }
@ -76,31 +76,31 @@ TEST(DataTypeTransform, CPUTransform) {
    TransDataType(kernel_fp16, kernel_fp32, in, &out);
    float* out_data_float = out.data<float>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
+      EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
    }
    TransDataType(kernel_fp16, kernel_fp64, in, &out);
    double* out_data_double = out.data<double>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
+      EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
    }
    TransDataType(kernel_fp16, kernel_int32, in, &out);
    int* out_data_int = out.data<int>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
+      EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
    }
    TransDataType(kernel_fp16, kernel_int64, in, &out);
    int64_t* out_data_int64 = out.data<int64_t>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
+      EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
    }
    TransDataType(kernel_fp16, kernel_bool, in, &out);
    bool* out_data_bool = out.data<bool>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
+      EXPECT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
    }
    // transform float to float16
@ -112,7 +112,7 @@ TEST(DataTypeTransform, CPUTransform) {
    TransDataType(kernel_fp32, kernel_fp16, in, &out);
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
+      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
    }
    // transform double to float16
@ -124,7 +124,7 @@ TEST(DataTypeTransform, CPUTransform) {
    TransDataType(kernel_fp64, kernel_fp16, in, &out);
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
+      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
    }
    // transform int to float16
@ -136,7 +136,7 @@ TEST(DataTypeTransform, CPUTransform) {
    TransDataType(kernel_int32, kernel_fp16, in, &out);
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
+      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
    }
    // transform int64 to float16
@ -148,7 +148,7 @@ TEST(DataTypeTransform, CPUTransform) {
    TransDataType(kernel_int64, kernel_fp16, in, &out);
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
+      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
    }
    // transform bool to float16
@ -160,7 +160,7 @@ TEST(DataTypeTransform, CPUTransform) {
    TransDataType(kernel_bool, kernel_fp16, in, &out);
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
+      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
    }
  }
 }
--- a/paddle/fluid/framework/data_type_transform_test.cu
+++ b/paddle/fluid/framework/data_type_transform_test.cu
@ -49,15 +49,16 @@ TEST(DataTypeTransform, GPUTransform) {
    float arr[6] = {0, 1, 2, 3, 4, 5};
    int data_number = sizeof(arr) / sizeof(arr[0]);
    memcpy(in_ptr, arr, sizeof(arr));
    TensorCopy(in, gpu_place, context, &in_gpu);
    TensorCopy(in, gpu_place, context, &in_gpu);
    context.Wait();
    TransDataType(kernel_fp32, kernel_fp64, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    double* out_data_double = out.data<double>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(out_data_double[i], static_cast<double>(arr[i]));
+      EXPECT_EQ(out_data_double[i], static_cast<double>(arr[i]));
    }
    TransDataType(kernel_fp32, kernel_int32, in_gpu, &out_gpu);
@ -66,7 +67,7 @@ TEST(DataTypeTransform, GPUTransform) {
    int* out_data_int = out.data<int>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(out_data_int[i], static_cast<int>(arr[i]));
+      EXPECT_EQ(out_data_int[i], static_cast<int>(arr[i]));
    }
  }
@ -83,6 +84,7 @@ TEST(DataTypeTransform, GPUTransform) {
    int data_number = sizeof(arr) / sizeof(arr[0]);
    memcpy(ptr, arr, sizeof(arr));
    TensorCopy(in, gpu_place, context, &in_gpu);
    context.Wait();
    // transform from float16 to other data types
    TransDataType(kernel_fp16, kernel_fp32, in_gpu, &out_gpu);
@ -91,7 +93,7 @@ TEST(DataTypeTransform, GPUTransform) {
    float* out_data_float = out.data<float>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
+      EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
    }
    TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu);
@ -100,7 +102,7 @@ TEST(DataTypeTransform, GPUTransform) {
    double* out_data_double = out.data<double>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
+      EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
    }
    TransDataType(kernel_fp16, kernel_int32, in_gpu, &out_gpu);
@ -109,7 +111,7 @@ TEST(DataTypeTransform, GPUTransform) {
    int* out_data_int = out.data<int>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
+      EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
    }
    TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu);
@ -118,7 +120,7 @@ TEST(DataTypeTransform, GPUTransform) {
    int64_t* out_data_int64 = out.data<int64_t>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
+      EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
    }
    TransDataType(kernel_fp16, kernel_bool, in_gpu, &out_gpu);
@ -127,7 +129,7 @@ TEST(DataTypeTransform, GPUTransform) {
    bool* out_data_bool = out.data<bool>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
+      EXPECT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
    }
    // transform float to float16
@ -137,13 +139,14 @@ TEST(DataTypeTransform, GPUTransform) {
    }
    TensorCopy(in, gpu_place, context, &in_gpu);
    context.Wait();
    TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
+      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
    }
    // transform double to float16
@ -154,13 +157,14 @@ TEST(DataTypeTransform, GPUTransform) {
    }
    TensorCopy(in, gpu_place, context, &in_gpu);
    context.Wait();
    TransDataType(kernel_fp64, kernel_fp16, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
+      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
    }
    // transform int to float16
@ -170,13 +174,14 @@ TEST(DataTypeTransform, GPUTransform) {
    }
    TensorCopy(in, gpu_place, context, &in_gpu);
    context.Wait();
    TransDataType(kernel_int32, kernel_fp16, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
+      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
    }
    // transform int64 to float16
@ -187,13 +192,14 @@ TEST(DataTypeTransform, GPUTransform) {
    }
    TensorCopy(in, gpu_place, context, &in_gpu);
    context.Wait();
    TransDataType(kernel_int64, kernel_fp16, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
+      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
    }
    // transform bool to float16
@ -203,13 +209,14 @@ TEST(DataTypeTransform, GPUTransform) {
    }
    TensorCopy(in, gpu_place, context, &in_gpu);
    context.Wait();
    TransDataType(kernel_bool, kernel_fp16, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i) {
-      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
+      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
    }
  }
 }
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@ -5,7 +5,8 @@ cc_library(paddle_fluid_api
    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 # Create static library
-cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
 cc_library(paddle_fluid DEPS ${fluid_modules})
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@ -22,14 +22,14 @@ namespace paddle {
 namespace inference {
 void ReadBinaryFile(const std::string& filename, std::string& contents) {
-  VLOG(3) << "loading model from " << filename;
+  std::ifstream fin(filename, std::ios::in | std::ios::binary);
-  std::ifstream inputfs(filename, std::ios::in | std::ios::binary);
+  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
-  inputfs.seekg(0, std::ios::end);
+  fin.seekg(0, std::ios::end);
  contents.clear();
-  contents.resize(inputfs.tellg());
+  contents.resize(fin.tellg());
-  inputfs.seekg(0, std::ios::beg);
+  fin.seekg(0, std::ios::beg);
-  inputfs.read(&contents[0], contents.size());
+  fin.read(&contents[0], contents.size());
-  inputfs.close();
+  fin.close();
 }
 bool IsPersistable(const framework::VarDesc* var) {
@ -97,6 +97,7 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
                                             const std::string& dirname) {
  std::string model_filename = dirname + "/__model__";
  std::string program_desc_str;
  VLOG(3) << "loading model from " << model_filename;
  ReadBinaryFile(model_filename, program_desc_str);
  std::unique_ptr<framework::ProgramDesc> main_program(
--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
@ -17,10 +17,13 @@ limitations under the License. */
 #include "paddle/fluid/inference/tests/test_helper.h"
 DEFINE_string(dirname, "", "Directory of the inference model.");
 DEFINE_int32(batch_size, 1, "Batch size of input data");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times");
 TEST(inference, image_classification) {
-  if (FLAGS_dirname.empty()) {
+  if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model "
                  "--batch_size=1 --repeat=1";
  }
  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
@ -29,13 +32,11 @@ TEST(inference, image_classification) {
  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
  int64_t batch_size = 1;
  paddle::framework::LoDTensor input;
  // Use normalized image pixels as input data,
  // which should be in the range [0.0, 1.0].
  SetupTensor<float>(input,
-                     {batch_size, 3, 32, 32},
+                     {FLAGS_batch_size, 3, 32, 32},
                     static_cast<float>(0),
                     static_cast<float>(1));
  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
@ -46,7 +47,9 @@ TEST(inference, image_classification) {
  cpu_fetchs1.push_back(&output1);
  // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << "--- CPU Runs: ---";
  TestInference<paddle::platform::CPUPlace>(
      dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
  LOG(INFO) << output1.dims();
 #ifdef PADDLE_WITH_CUDA
@ -55,7 +58,9 @@ TEST(inference, image_classification) {
  cpu_fetchs2.push_back(&output2);
  // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << "--- GPU Runs: ---";
  TestInference<paddle::platform::CUDAPlace>(
      dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
  LOG(INFO) << output2.dims();
  CheckError<float>(output1, output2);
--- a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
@ -17,10 +17,13 @@ limitations under the License. */
 #include "paddle/fluid/inference/tests/test_helper.h"
 DEFINE_string(dirname, "", "Directory of the inference model.");
 DEFINE_int32(batch_size, 1, "Batch size of input data");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times");
 TEST(inference, recognize_digits) {
-  if (FLAGS_dirname.empty()) {
+  if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model "
                  "--batch_size=1 --repeat=1";
  }
  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
@ -29,24 +32,25 @@ TEST(inference, recognize_digits) {
  // 0. Call `paddle::framework::InitDevices()` to initialize all the devices
  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
  int64_t batch_size = 1;
  paddle::framework::LoDTensor input;
  // Use normalized image pixels as input data,
  // which should be in the range [-1.0, 1.0].
  SetupTensor<float>(input,
-                     {batch_size, 1, 28, 28},
+                     {FLAGS_batch_size, 1, 28, 28},
                     static_cast<float>(-1),
                     static_cast<float>(1));
  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
  cpu_feeds.push_back(&input);
  for (auto is_combined : {false, true}) {
    paddle::framework::LoDTensor output1;
    std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
    cpu_fetchs1.push_back(&output1);
    // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+    LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
    TestInference<paddle::platform::CPUPlace>(
        dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
    LOG(INFO) << output1.dims();
 #ifdef PADDLE_WITH_CUDA
@ -55,51 +59,12 @@ TEST(inference, recognize_digits) {
    cpu_fetchs2.push_back(&output2);
    // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+    LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
    TestInference<paddle::platform::CUDAPlace>(
        dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
    LOG(INFO) << output2.dims();
    CheckError<float>(output1, output2);
 #endif
 }
 // Inference test for the recognize_digits model stored in combined form.
 // Builds one 1x1x28x28 float input, runs it through TestInference on CPU and,
 // when compiled with PADDLE_WITH_CUDA, on GPU too, then checks the two outputs
 // against each other with CheckError.
 TEST(inference, recognize_digits_combine) {
  // The model directory is mandatory; LOG(FATAL) prints the usage hint.
  if (FLAGS_dirname.empty()) {
    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
  }
  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
  std::string dirname = FLAGS_dirname;
  // 0. Call `paddle::framework::InitDevices()` to initialize all the devices.
  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
  paddle::framework::LoDTensor input;
  // Use normalized image pixels as input data,
  // which should be in the range [-1.0, 1.0].
  SetupTensor<float>(
      input, {1, 1, 28, 28}, static_cast<float>(-1), static_cast<float>(1));
  // TestInference takes its feeds and fetches as vectors of tensor pointers.
  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
  cpu_feeds.push_back(&input);
  paddle::framework::LoDTensor output1;
  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
  cpu_fetchs1.push_back(&output1);
  // Run inference on CPU. The second template argument (`true`) is the
  // IsCombined flag of TestInference.
  TestInference<paddle::platform::CPUPlace, true>(
      dirname, cpu_feeds, cpu_fetchs1);
  LOG(INFO) << output1.dims();
 #ifdef PADDLE_WITH_CUDA
  // The GPU run reuses the same CPU-side feeds and gets its own fetch tensor.
  paddle::framework::LoDTensor output2;
  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
  cpu_fetchs2.push_back(&output2);
  // Run inference on CUDA GPU
  TestInference<paddle::platform::CUDAPlace, true>(
      dirname, cpu_feeds, cpu_fetchs2);
  LOG(INFO) << output2.dims();
  // Compare the CPU result with the GPU result.
  CheckError<float>(output1, output2);
 #endif
 }
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@ -15,6 +15,7 @@ limitations under the License. */
 #include <time.h>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/profiler.h"
 template <typename T>
 void SetupTensor(paddle::framework::LoDTensor& input,
@ -87,32 +88,61 @@ void CheckError(paddle::framework::LoDTensor& output1,
  EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
 }
-template <typename Place, bool IsCombined = false>
+template <typename Place>
 void TestInference(const std::string& dirname,
                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
-                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
+                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
                   const int repeat = 1,
                   const bool is_combined = false) {
  // 1. Define place, executor, scope
  auto place = Place();
  auto executor = paddle::framework::Executor(place);
  auto* scope = new paddle::framework::Scope();
  // Profile the performance
  paddle::platform::ProfilerState state;
  if (paddle::platform::is_cpu_place(place)) {
    state = paddle::platform::ProfilerState::kCPU;
  } else {
 #ifdef PADDLE_WITH_CUDA
    state = paddle::platform::ProfilerState::kCUDA;
    // The default device_id of paddle::platform::CUDAPlace is 0.
    // Users can get the device_id using:
    //   int device_id = place.GetDeviceId();
    paddle::platform::SetDeviceId(0);
 #else
    PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
  }
  // Enable the profiler
  paddle::platform::EnableProfiler(state);
  // 2. Initialize the inference_program and load parameters
  std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
-  if (IsCombined) {
+  {
    paddle::platform::RecordEvent record_event(
        "init_program",
        paddle::platform::DeviceContextPool::Instance().Get(place));
    if (is_combined) {
      // All parameters are saved in a single file.
      // Hard-coding the file names of program and parameters in unittest.
      // The file names should be consistent with that used in Python API
      //  `fluid.io.save_inference_model`.
      std::string prog_filename = "__model_combined__";
      std::string param_filename = "__params_combined__";
-    inference_program = paddle::inference::Load(executor,
+      inference_program =
          paddle::inference::Load(executor,
                                  *scope,
                                  dirname + "/" + prog_filename,
                                  dirname + "/" + param_filename);
    } else {
-    // Parameters are saved in separate files sited in the specified `dirname`.
+      // Parameters are saved in separate files sited in the specified
      // `dirname`.
      inference_program = paddle::inference::Load(executor, *scope, dirname);
    }
  }
  // 3. Get the feed_target_names and fetch_target_names
  const std::vector<std::string>& feed_target_names =
@ -134,7 +164,21 @@ void TestInference(const std::string& dirname,
  }
  // 6. Run the inference program
  {
    // Run repeat times to profile the performance
    for (int i = 0; i < repeat; ++i) {
      paddle::platform::RecordEvent record_event(
          "run_inference",
          paddle::platform::DeviceContextPool::Instance().Get(place));
      executor.Run(*inference_program, scope, feed_targets, fetch_targets);
    }
  }
  // Disable the profiler and print the timing information
  paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
                                    "profiler.txt");
  paddle::platform::ResetProfiler();
  delete scope;
 }
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@ -1,5 +1,7 @@
 file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
 string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}")
 string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
 list(REMOVE_DUPLICATES GENERAL_OPS)
 set(DEPS_OPS "")
 set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h)
 file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt.  DO NOT EDIT!\n\n")
@ -13,6 +15,8 @@ function(op_library TARGET)
    set(cu_cc_srcs)
    set(cudnn_cu_cc_srcs)
    set(CUDNN_FILE)
    set(mkldnn_cc_srcs)
    set(MKLDNN_FILE)
    set(op_common_deps operator op_registry math_function)
    set(options "")
    set(oneValueArgs "")
@ -36,12 +40,20 @@ function(op_library TARGET)
        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
            list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
        endif()
        if(WITH_MKLDNN)
            string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
                list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
            endif()
        endif()
    else()
        foreach(src ${op_library_SRCS})
            if (${src} MATCHES ".*\\.cu$")
                list(APPEND cu_srcs ${src})
            elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
                list(APPEND cudnn_cu_cc_srcs ${src})
            elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
                list(APPEND mkldnn_cc_srcs ${src})
            elseif(${src} MATCHES ".*\\.cu.cc$")
                list(APPEND cu_cc_srcs ${src})
            elseif(${src} MATCHES ".*\\.cc$")
@ -62,10 +74,10 @@ function(op_library TARGET)
        set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
    endif()
    if (WITH_GPU)
-        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
+        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
                ${op_common_deps})
    else()
-        cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
+        cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
            ${op_common_deps})
    endif()
@ -101,7 +113,8 @@ function(op_library TARGET)
    # pybind USE_CPU_ONLY_OP
    list(LENGTH cu_srcs cu_srcs_len)
    list(LENGTH cu_cc_srcs cu_cc_srcs_len)
-    if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
+    list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
    if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
        file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
        set(pybind_flag 1)
    endif()
@ -112,6 +125,11 @@ function(op_library TARGET)
        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
    endif()
    # pybind USE_OP_DEVICE_KERNEL for MKLDNN
    if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
    endif()
    # pybind USE_OP
    if (${pybind_flag} EQUAL 0)
        file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
@ -172,17 +190,18 @@ op_library(cos_sim_op DEPS cos_sim_functor)
 op_library(parallel_do_op DEPS executor)
 if (WITH_GPU)
-    op_library(conv_op DEPS vol2col depthwise_conv)
+    op_library(conv_op DEPS vol2col depthwise_conv im2col)
 else()
-    op_library(conv_op DEPS vol2col)
+    op_library(conv_op DEPS vol2col im2col)
 endif()
-op_library(conv_transpose_op DEPS vol2col)
+op_library(conv_transpose_op DEPS vol2col im2col)
 # FIXME(typhoonzero): save/load depends lodtensor serialization functions
 op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
 op_library(save_combine_op DEPS lod_tensor)
 op_library(load_combine_op DEPS lod_tensor)
 op_library(concat_op DEPS concat)
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@ -100,7 +100,8 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad,
               ops::ConcatOpGrad, false)
-REGISTER_OP_CPU_KERNEL(concat,
+REGISTER_OP_CPU_KERNEL(
-                       ops::ConcatKernel<paddle::platform::CPUPlace, float>)
+    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>)
-REGISTER_OP_CPU_KERNEL(concat_grad,
+REGISTER_OP_CPU_KERNEL(
-                       ops::ConcatGradKernel<paddle::platform::CPUPlace, float>)
+    concat_grad,
    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>)
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@ -17,6 +17,7 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/concat.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 namespace paddle {
@ -27,54 +28,30 @@ class ConcatKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto ins = ctx.MultiInput<framework::Tensor>("X");
-    auto* out = ctx.Output<framework::Tensor>("Out");
+    framework::Tensor* out = ctx.Output<framework::Tensor>("Out");
    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
    auto place = ctx.GetPlace();
    out->mutable_data<T>(place);
-    auto out_stride = framework::stride_numel(out->dims());
+    // Sometimes direct copies are faster; this may need deeper analysis.
-
+    if (axis == 0 && ins.size() < 10) {
      size_t output_offset = 0;
    // If axis >= 1, copying straight into out would take many separate
    // CUDA memcpy calls. Instead, copy the inputs to the CPU, do the strided
    // copy there, then copy the result to the GPU output.
    if (platform::is_gpu_place(place) && axis >= 1) {
      platform::CPUPlace copy_place;
      auto& cpu_ctx = *platform::DeviceContextPool::Instance().Get(copy_place);
      framework::Tensor cpu_out;
      cpu_out.Resize(out->dims());
      cpu_out.mutable_data<T>(copy_place);
      auto& dev_ctx = ctx.device_context();
      std::vector<std::unique_ptr<framework::Tensor>> cpu_ins;
      for (auto* in : ins) {
        std::unique_ptr<framework::Tensor> cpu_in(new framework::Tensor);
        framework::TensorCopy(*in, copy_place, dev_ctx, cpu_in.get());
        cpu_ins.emplace_back(std::move(cpu_in));
      }
      // TODO(dzhwinter): overlap copy and compute stream
      // https://devblogs.nvidia.com/how-overlap-data-transfers-cuda-cc/
      dev_ctx.Wait();
      for (auto& in : cpu_ins) {
        auto& cpu_in = *in.get();
        auto in_stride = framework::stride_numel(cpu_in.dims());
        StridedNumelCopyWithAxis<T>(
            cpu_ctx, axis, cpu_out.data<T>() + output_offset, out_stride,
            cpu_in.data<T>(), in_stride, in_stride[axis]);
        output_offset += in_stride[axis];
      }
      framework::TensorCopy(cpu_out, place, dev_ctx, out);
    } else {
      for (auto* in : ins) {
        auto in_stride = framework::stride_numel(in->dims());
        auto out_stride = framework::stride_numel(out->dims());
        StridedNumelCopyWithAxis<T>(ctx.device_context(), axis,
                                    out->data<T>() + output_offset, out_stride,
                                    in->data<T>(), in_stride, in_stride[axis]);
        output_offset += in_stride[axis];
      }
    } else {
      std::vector<framework::Tensor> inputs(ins.size());
      for (size_t j = 0; j < ins.size(); ++j) {
        inputs[j] = *ins[j];
      }
      auto& dev_ctx = ctx.template device_context<DeviceContext>();
      paddle::operators::math::ConcatFunctor<DeviceContext, T> concat_functor;
      concat_functor(dev_ctx, inputs, static_cast<int>(axis), out);
    }
  }
 };
@ -86,6 +63,9 @@ class ConcatGradKernel : public framework::OpKernel<T> {
    auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
    // Sometimes direct copies are faster; this may need deeper analysis.
    if (axis == 0 && outs.size() < 10) {
      size_t input_offset = 0;
      auto in_stride = framework::stride_numel(in->dims());
@ -97,6 +77,18 @@ class ConcatGradKernel : public framework::OpKernel<T> {
                                    in_stride, out_stride[axis]);
        input_offset += out_stride[axis];
      }
    } else {
      std::vector<framework::Tensor> outputs(outs.size());
      for (size_t j = 0; j < outs.size(); ++j) {
        outs[j]->mutable_data<T>(ctx.GetPlace());
        outputs[j] = *outs[j];
      }
      auto& dev_ctx = ctx.template device_context<DeviceContext>();
      paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
          concat_grad_functor;
      concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), outputs);
    }
  }
 };
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@ -13,6 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/conv_op.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cudnn_helper.h"
 #endif
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
 namespace paddle {
 namespace operators {
@ -64,22 +70,21 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
-  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  framework::LibraryType library_{framework::LibraryType::kPlain};
  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
 #ifdef PADDLE_WITH_CUDA
-  if (platform::is_gpu_place(ctx.GetPlace())) {
+  if (platform::CanCUDNNBeUsed(ctx)) {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    library_ = framework::LibraryType::kCUDNN;
    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
  }
 #endif
-  framework::LibraryType library_;
+#ifdef PADDLE_WITH_MKLDNN
-  if (use_cudnn) {
+  if (library_ == framework::LibraryType::kPlain &&
-    library_ = framework::LibraryType::kCUDNN;
+      platform::CanMKLDNNBeUsed(ctx)) {
-  } else {
+    library_ = framework::LibraryType::kMKLDNN;
    library_ = framework::LibraryType::kPlain;
  }
 #endif
  std::string data_format = ctx.Attr<std::string>("data_format");
  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
@ -131,6 +136,9 @@ Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      "use_cudnn",
      "(bool, default false) Only used in cudnn kernel, need install cudnn")
      .SetDefault(false);
  AddAttr<bool>("use_mkldnn",
                "(bool, default false) Only used in mkldnn kernel")
      .SetDefault(false);
  AddAttr<std::string>(
      "data_format",
      "(string, default NCHW) Only used in "
@ -224,6 +232,9 @@ Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      "use_cudnn",
      "(bool, default false) Only used in cudnn kernel, need install cudnn")
      .SetDefault(false);
  AddAttr<bool>("use_mkldnn",
                "(bool, default false) Only used in mkldnn kernel")
      .SetDefault(false);
  AddAttr<std::string>(
      "data_format",
      "(string, default NCHW) Only used in "
@ -284,23 +295,21 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
-  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  framework::LibraryType library_{framework::LibraryType::kPlain};
  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
 #ifdef PADDLE_WITH_CUDA
-  if (platform::is_gpu_place(ctx.GetPlace())) {
+  if (platform::CanCUDNNBeUsed(ctx)) {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    library_ = framework::LibraryType::kCUDNN;
    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
  }
 #endif
-
+#ifdef PADDLE_WITH_MKLDNN
-  framework::LibraryType library_;
+  if (library_ == framework::LibraryType::kPlain &&
-  if (use_cudnn) {
+      platform::CanMKLDNNBeUsed(ctx)) {
-    library_ = framework::LibraryType::kCUDNN;
+    library_ = framework::LibraryType::kMKLDNN;
  } else {
    library_ = framework::LibraryType::kPlain;
  }
 #endif
  std::string data_format = ctx.Attr<std::string>("data_format");
  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
--- a/paddle/fluid/operators/detection_map_op.cc
+++ b/paddle/fluid/operators/detection_map_op.cc
@ -71,7 +71,7 @@ class DetectionMAPOp : public framework::OperatorWithKernel {
    return framework::OpKernelType(
        framework::ToDataType(
            ctx.Input<framework::Tensor>("DetectRes")->type()),
-        ctx.device_context());
+        platform::CPUPlace());
  }
 };
--- a/paddle/fluid/operators/elementwise_div_op.h
+++ b/paddle/fluid/operators/elementwise_div_op.h
@ -41,77 +41,14 @@ class ElementwiseDivKernel : public framework::OpKernel<T> {
 };
 template <typename T>
-struct ElementwiseDivGradFunctor {
+struct DivGradDX {
-  template <typename Device, typename X, typename Y, typename Z, typename dX,
+  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; }
            typename dY, typename dZ>
  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
    auto y_e = framework::EigenVector<T>::Flatten(*y);
    auto z_e = framework::EigenVector<T>::Flatten(*z);
    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
    if (dx) {
      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
      dx_e.device(d) = dz_e / y_e;
    }
    if (dy) {
      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
      dy_e.device(d) = -1.0 * dz_e * z_e / y_e;
    }
  }
 };
 template <typename T>
 struct ElementwiseDivBroadCastGradFunctor {
  template <typename Device, typename X, typename Y, typename Z, typename dX,
            typename dY, typename dZ, typename Pre, typename N>
  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
    auto x_e = framework::EigenVector<T>::Flatten(*x);
    auto y_e = framework::EigenVector<T>::Flatten(*y);
    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))
                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
    if (dx) {
      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
      dx_e.device(d) = dz_e / y_e_bcast;
    }
    if (dy) {
      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
      dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast))
                           .reshape(Eigen::DSizes<int, 2>(pre, n))
                           .sum(Eigen::array<int, 1>{{0}});
    }
  }
 };
 template <typename T>
-struct ElementwiseDivBroadCast2GradFunctor {
+struct DivGradDY {
-  template <typename Device, typename X, typename Y, typename Z, typename dX,
+  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
-            typename dY, typename dZ, typename Pre, typename N, typename Post>
+    return -dout * x / (y * y);
  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
                  Post post) {
    auto x_e = framework::EigenVector<T>::Flatten(*x);
    auto y_e = framework::EigenVector<T>::Flatten(*y);
    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
    if (dx) {
      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
      dx_e.device(d) = dz_e / y_e_bcast;
    }
    if (dy) {
      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
      dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast))
                           .reshape(Eigen::DSizes<int, 3>(pre, n, post))
                           .sum(Eigen::array<int, 2>{{0, 2}});
    }
  }
 };
@ -128,10 +65,8 @@ class ElementwiseDivGradKernel : public framework::OpKernel<T> {
    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
    int axis = ctx.Attr<int>("axis");
-    ElementwiseGradCompute<DeviceContext, T, ElementwiseDivGradFunctor<T>,
+    ElemwiseGradCompute<DeviceContext, T, DivGradDX<T>, DivGradDY<T>>(
-                           ElementwiseDivBroadCastGradFunctor<T>,
+        ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX<T>(), DivGradDY<T>());
                           ElementwiseDivBroadCast2GradFunctor<T>>(
        ctx, x, y, out, dout, axis, dx, dy);
  }
 };
--- a/Show More
+++ b/Show More