Merge branch 'develop' into multi-thread2

7 years ago · 459d4cc811
parent f507e5c1f2 907696709f
commit 459d4cc811
68 changed files with 1900 additions and 1104 deletions
--- a/2
+++ b/2
@ -53,7 +53,7 @@ RUN curl -s -q https://glide.sh/get | sh
 #    and its size is only one-third of the official one.
 # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
 #    See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
-RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
+RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
    tar -xz -C /usr/local && \
    cp -rf /usr/local/TensorRT/include /usr && \
    cp -rf /usr/local/TensorRT/lib /usr
--- a/README.md
+++ b/README.md
@ -76,33 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85

 ## Installation

-It is recommended to check out the
-[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/docker_install_en.html)
-before looking into the
-[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/build_from_source_en.html).
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/install/install_doc.html) on our website.

 ## Documentation

-We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) and
-[Chinese](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) documentation.
+We provide [English](http://paddlepaddle.org/documentation/docs/en/0.14.0/getstarted/index_en.html) and
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/index.html) documentation.

- [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)
+- [Deep Learning 101](https://github.com/PaddlePaddle/book)

  You might want to start from this online interactive book that can run in a Jupyter Notebook.

- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/index_en.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/user_guides/howto/training/cluster_howto.html)

  You can run distributed training jobs on MPI clusters.

- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/multi_cluster/k8s_en.html)
-
-   You can also run distributed training jobs on Kubernetes clusters.
-
- [Python API](http://www.paddlepaddle.org/docs/develop/api/en/overview.html)
+- [Python API](http://paddlepaddle.org/documentation/api/zh/0.14.0/fluid.html)

   Our new API enables much shorter programs.

- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/advanced_usage/development/contribute_to_paddle.html)

   We appreciate your contributions!

--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@ -169,14 +169,19 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)

 # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
 # So, don't set these flags here.
+if (NOT WIN32) # windows msvc2015 support c++11 natively. 
+# -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake.
 list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
-list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
 list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
+endif(NOT WIN32)
+
+list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
 # in cuda9, suppress cuda warning on eigen 
 list(APPEND CUDA_NVCC_FLAGS "-w")
 # Set :expt-relaxed-constexpr to suppress Eigen warnings
 list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")

+if (NOT WIN32)
 if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
 elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
@ -187,6 +192,13 @@ elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
    # nvcc 9 does not support -Os. Use Release flags instead
    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
 endif()
+else(NOT WIN32)
+if(CMAKE_BUILD_TYPE STREQUAL "Release")
+  list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
+else()
+  message(FATAL "Windows only support Release build now. Please set visual studio build type to Release, x64 build.")
+endif()
+endif(NOT WIN32)

 mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
 mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@ -44,7 +44,7 @@ ExternalProject_Add(
    # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
    #    checkout and clean other dirs under third_party
    # 4. remove .git, and package the directory.
-    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
+    URL "http://paddlepaddledeps.cdn.bcebos.com/grpc-v1.10.x.tar.gz"
    URL_MD5  "1f268a2aff6759839dccd256adcc91cf"
    PREFIX          ${GRPC_SOURCES_DIR}
    UPDATE_COMMAND  ""
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@ -128,16 +128,13 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
 set(module "framework")
 if (NOT WIN32)
-copy(framework_lib DEPS framework_py_proto 
-  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
-)
-else()
-copy(framework_lib
+set(framework_lib_deps framework_py_proto)
+endif(NOT WIN32)
+copy(framework_lib DEPS ${framework_lib_deps}
  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
+       ${src_dir}/${module}/ir/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir
 )
-endif(NOT WIN32)

 set(module "memory")
 copy(memory_lib
@ -161,7 +158,8 @@ set(module "inference")
 copy(inference_lib DEPS ${inference_deps}
  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
       ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
+       ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
 )

 set(module "platform")
--- a/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst
+++ b/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst
@ -38,7 +38,6 @@ PaddlePaddle Fluid支持两种传入数据的方式:
   :maxdepth: 2

   feeding_data
-   use_recordio_reader

 Python Reader
 #############
--- a/doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst
+++ b/doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst
@ -1,167 +0,0 @@
-.. _user_guide_use_recordio_as_train_data:
-
-############################
-使用RecordIO文件作为训练数据
-############################
-
-相比于 :ref:`user_guide_use_numpy_array_as_train_data`，
-:ref:`user_guide_use_recordio_as_train_data` 的性能更好；
-但是用户需要先将训练数据集转换成RecordIO文件格式，再使用
-:code:`fluid.layers.open_files()` 层在神经网络配置中导入 RecordIO 文件。
-用户还可以使用 :code:`fluid.layers.double_buffer()` 加速数据从内存到显存的拷贝，
-使用 :code:`fluid.layers.Preprocessor` 工具进行数据增强。
-
-将训练数据转换成RecordIO文件格式
-################################
-
-:code:`fluid.recordio_writer` 中，每个记录都是一个
-:code:`vector<LoDTensor>`, 即一个支持序列信息的Tensor数组。这个数组包括训练所需
-的所有特征。例如对于图像分类来说，这个数组可以包含图片和分类标签。
-
-用户可以使用 :code:`fluid.recordio_writer.convert_reader_to_recordio_file()` 可以将
-:ref:`user_guide_reader` 转换成一个RecordIO文件。或者可以使用
-:code:`fluid.recordio_writer.convert_reader_to_recordio_files()` 将一个
-:ref:`user_guide_reader` 转换成多个RecordIO文件。
-
-具体使用方法为:
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-   import numpy
-
-   def reader_creator():
-       def __impl__():
-           for i in range(1000):
-               yield [
-                        numpy.random.random(size=[3,224,224], dtype="float32"),
-                        numpy.random.random(size=[1], dtype="int64")
-                     ]
-       return __impl__
-
-   img = fluid.layers.data(name="image", shape=[3, 224, 224])
-   label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-   feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace())
-
-   BATCH_SIZE = 32
-   reader = paddle.batch(reader_creator(), batch_size=BATCH_SIZE)
-   fluid.recordio_writer.convert_reader_to_recordio_file(
-      "train.recordio", feeder=feeder, reader_creator=reader)
-
-其中 :code:`reader_creator` 创建了一个 :code:`Reader`。
-:ref:`_api_fluid_data_feeder_DataFeeder`
-是将 :code:`Reader` 转换成 :code:`LoDTensor` 的工具。详细请参考
-:ref:`user_guide_reader` 。
-
-上述程序将 :code:`reader_creator` 的数据转换成了 :code:`train.recordio` 文件，
-其中每一个record 含有 32 条样本。如果batch size会在训练过程中调整，
-用户可以将每一个Record的样本数设置成1。并参考
-:ref:`user_guide_use_recordio_as_train_data_use_op_create_batch`。
-
-
-配置神经网络, 打开RecordIO文件
-##############################
-
-RecordIO文件转换好之后，用户可以使用 :code:`fluid.layers.open_files()`
-打开文件，并使用 :code:`fluid.layers.read_file` 读取文件内容。
-简单使用方法如下:
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-
-   file_obj = fluid.layers.open_files(
-     filenames=["train.recordio"],
-     shape=[[3, 224, 224], [1]],
-     lod_levels=[0, 0],
-     dtypes=["float32", "int64"],
-     pass_num=100
-   )
-
-   image, label = fluid.layers.read_file(file_obj)
-
-其中如果设置了 :code:`pass_num` ，那么当所有数据读完后，会重新读取数据，
-直到读取了 :code:`pass_num` 遍。
-
-
-
-进阶使用
-########
-
-
-使用 :code:`fluid.layers.double_buffer()`
------------------------------------------
-
-:code:`Double buffer` 使用双缓冲技术，将训练数据从内存中复制到显存中。配置双缓冲
-需要使用 :code:`fluid.layers.double_buffer()` 修饰文件对象。 例如:
-
-.. code-block:: python
-
-   import paddle.fliud as fluid
-   file_obj = fluid.layers.open_files(...)
-   file_obj = fluid.layers.double_buffer(file_obj)
-
-   image, label = fluid.layers.read_file(file_obj)
-
-双缓冲技术可以参考
-`Multiple buffering <https://en.wikipedia.org/wiki/Multiple_buffering>`_ 。
-
-配置数据增强
------------
-
-使用 :code:`fluid.layers.Preprocessor` 可以配置文件的数据增强方法。例如
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-   file_obj = fluid.layers.open_files(...)
-   preprocessor = fluid.layers.Preprocessor(reader=data_file)
-   with preprocessor.block():
-       image, label = preprocessor.inputs()
-       image = image / 2
-       label = label + 1
-       preprocessor.outputs(image, label)
-
-如上代码所示，使用 :code:`Preprocessor` 定义了一个数据增强模块，并在
-:code:`with preprocessor.block()` 中定义了数据增强的具体操作。 用户通过配置
-:code:`preprocessor.inputs()` 获得数据文件中的各个字段。 并用
-:code:`preprocessor.outputs()` 标记预处理后的输出。
-
-.. _user_guide_use_recordio_as_train_data_use_op_create_batch:
-
-使用Op组batch
-------------
-
-使用 :code:`fluid.layers.batch()` 可以在训练的过程中动态的组batch。例如
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-   file_obj = fluid.layers.open_files(...)
-   file_obj = fluid.layers.batch(file_obj, batch_size=32)
-
-   img, label = fluid.layers.read_file(file_obj)
-
-需要注意的是，如果数据集中的最后几个样本不能组成 :code:`batch_size` 大小的批量数据，
-那么这几个样本直接组成一个批量数据进行训练。
-
-读入数据的shuffle
-----------------
-
-使用 :code:`fluid.layers.shuffle()` 可以在训练过程中动态重排训练数据。例如
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-   file_obj = fluid.layers.open_files(...)
-   file_obj = fliud.layers.shuffle(file_obj, buffer_size=8192)
-
-   img, label = fliud.layers.read_file(file_obj)
-
-需要注意的是:
-
-1. :code:`shuffle` 实现方法是:
-先读入 :code:`buffer_size` 条样本，再随机的选出样本进行训练。
-
-2. :code:`shuffle` 中 :code:`buffer_size` 会占用训练内存，需要确定训练过程中内存
-足够支持缓存 :code:`buffer_size` 条数据。
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -172,6 +172,7 @@ paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'],
 paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
 paddle.fluid.layers.pad2d ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None))
 paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None))
+paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
--- a/paddle/fluid/framework/.gitignore
+++ b/paddle/fluid/framework/.gitignore
@ -0,0 +1,2 @@
+.tensor_util.cu
+.data_type_transform.cu
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -1,3 +1,22 @@
+# windows treat symbolic file as a real file, which is different with unix
+# We create a hidden file and compile it instead of origin source file.
+function(windows_symbolic TARGET)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  foreach(src ${windows_symbolic_SRCS})
+  get_filename_component(src ${src} NAME_WE)
+  if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu)
+      message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.")
+  endif()
+  add_custom_command(OUTPUT .${src}.cu 
+          COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu
+          COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu"
+          COMMENT "create hidden file of ${src}.cu")
+  add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)  
+  endforeach()
+endfunction()
+
 add_subdirectory(ir)
 if (NOT WIN32)
 add_subdirectory(details)
@ -11,7 +30,13 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
 cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
 if(WITH_GPU)
-  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
+  if (WIN32)
+    windows_symbolic(tensor_util SRCS tensor_util.cu)
+    nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
+    add_dependencies(tensor tensor_util)
+  else()
+    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
+  endif(WIN32)
 else()
  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
 endif()
@ -55,7 +80,13 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
        DEPS operator op_registry device_context math_function)

 if(WITH_GPU)
-  nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
+  if (WIN32)
+      windows_symbolic(hidden_file SRCS data_type_transform.cu)
+      nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
+      add_dependencies(data_type_transform hidden_file)
+  else()
+      nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
+  endif(WIN32)
  nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
 else()
  cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@ -1,20 +1,35 @@
+set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
+file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt.  DO NOT EDIT!\n\n")
+file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n")
+function(pass_library TARGET)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass)
+    file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
+    set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE)
+endfunction()
+
 cc_library(node SRCS node.cc DEPS proto_desc)
 cc_library(graph SRCS graph.cc DEPS node)
 cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
 cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
-cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
-cc_library(graph_to_program_pass SRCS graph_to_program_pass.cc DEPS graph pass graph_helper)
 cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
 cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)
-cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detector)
-cc_library(attention_lstm_fuse_pass SRCS attention_lstm_fuse_pass.cc DEPS graph graph_pattern_detector)
-cc_library(infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass)
-cc_library(fc_lstm_fuse_pass SRCS fc_lstm_fuse_pass.cc DEPS graph graph_pattern_detector)
-cc_library(seq_concat_fc_fuse_pass SRCS seq_concat_fc_fuse_pass.cc DEPS graph graph_pattern_detector)
+
+pass_library(graph_to_program_pass)
+pass_library(graph_viz_pass)
+pass_library(fc_fuse_pass)
+pass_library(attention_lstm_fuse_pass)
+pass_library(infer_clean_graph_pass)
+pass_library(fc_lstm_fuse_pass)
+pass_library(seq_concat_fc_fuse_pass)
+set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")

 cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
 cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
-cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detector graph pass graph_traits framework_proto)
+cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@ -96,17 +96,13 @@ void FindWhileOp(Graph* graph) {
  auto* cell_init = graph->RetriveNode(6);
  auto* hidden_init = graph->RetriveNode(8);

-#define LINK_TO(node0, node1)      \
-  node0->outputs.push_back(node1); \
-  node1->inputs.push_back(node0);
-
  auto* lstm_op = graph->CreateOpNode(&op_desc);
  PrepareParameters(graph, param);

-  LINK_TO(X, lstm_op);
-  LINK_TO(cell_init, lstm_op);
-  LINK_TO(hidden_init, lstm_op);
-  LINK_TO(lstm_op, LSTMOUT);
+  IR_NODE_LINK_TO(X, lstm_op);
+  IR_NODE_LINK_TO(cell_init, lstm_op);
+  IR_NODE_LINK_TO(hidden_init, lstm_op);
+  IR_NODE_LINK_TO(lstm_op, LSTMOUT);

  GraphSafeRemoveNodes(graph, marked_nodes);
 }
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
@ -21,74 +21,26 @@ namespace paddle {
 namespace framework {
 namespace ir {

-bool VarOutLinksToOp(Node* node, const std::string& op_type) {
-  for (auto* out : node->outputs) {
-    if (out->IsOp() && out->Op()->Type() == op_type) {
-      return true;
-    }
-  }
-  return false;
-}
-
-void BuildFCPattern(PDPattern* pattern) {
-  // Create Operators
-  auto* mul_op = pattern->NewNode("mul")->assert_is_op("mul");
-  auto* elementwise_add_op =
-      pattern->NewNode("elementwise_add")->assert_is_op("elementwise_add");
-  // Create variables
-  // w
-  auto* mul_weight_var = pattern->NewNode("mul_weight")
-                             ->AsInput()
-                             ->assert_is_op_nth_input("mul", "Y", 0);
-  // x
-  auto* mul_tmp_var = pattern->NewNode("mul_tmp_var")
-                          ->AsInput()
-                          ->assert_is_op_nth_input("mul", "X", 0);
-  // intermediate variable, will be removed in the IR after fuse.
-  auto* mul_out_var = pattern->NewNode("mul_out")
-                          ->AsIntermediate()
-                          ->assert_is_only_output_of_op("mul")
-                          ->assert_is_op_input("elementwise_add");
-  // bias
-  auto* elementwise_add_tmp_var = pattern->NewNode("elementwise_add_tmpvar")
-                                      ->assert_is_op_input("elementwise_add")
-                                      ->AsInput();
-  // output
-  auto* elementwise_add_out_var = pattern->NewNode("elementwise_add_out")
-                                      ->AsOutput()
-                                      ->assert_is_op_output("elementwise_add");
-
-  mul_op->LinksFrom({mul_weight_var, mul_tmp_var}).LinksTo({mul_out_var});
-  elementwise_add_op->LinksFrom({mul_out_var, elementwise_add_tmp_var})
-      .LinksTo({elementwise_add_out_var});
-}
-
-// Replace the node `from` in the links to `to`
-bool LinksReplace(std::vector<Node*>* links, Node* from, Node* to) {
-  for (auto*& n : *links) {
-    if (n == from) {
-      n = to;
-      return true;
-    }
-  }
-  return false;
-}
-
 std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  PADDLE_ENFORCE(graph.get());
-  FusePassBase::Init("fc", graph.get());
+  FusePassBase::Init("fc_fuse", graph.get());

  std::unordered_set<Node*> nodes2delete;

  GraphPatternDetector gpd;
-  BuildFCPattern(gpd.mutable_pattern());
-
-#define GET_NODE(id)                                              \
-  PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode(#id)), \
-                 "pattern has no Node called %s", #id);           \
-  auto* id = subgraph.at(gpd.pattern().RetrieveNode(#id));        \
-  PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
+  // BuildFCPattern(gpd.mutable_pattern());
+  auto* x = gpd.mutable_pattern()
+                ->NewNode("fc_fuse/x")
+                ->AsInput()
+                ->assert_is_op_input("mul", "X");
+  patterns::FC(gpd.mutable_pattern(), "fc_fuse", x, true /*with bias*/);
+
+#define GET_NODE(id)                                                         \
+  PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode("fc_fuse/" #id)), \
+                 "pattern has no Node called %s", #id);                      \
+  auto* id = subgraph.at(gpd.pattern().RetrieveNode("fc_fuse/" #id));        \
+  PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", "fc_fuse/" #id);

  int found_fc_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
@ -98,43 +50,33 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
    // scenerio.
    // FC's fusion is simple, just op fuse, no need to process the
    // parameters.
-    GET_NODE(mul_tmp_var);             // x
-    GET_NODE(mul_weight);              // Y
-    GET_NODE(elementwise_add_tmpvar);  // bias
-    GET_NODE(elementwise_add_out);     // Out
-    GET_NODE(mul);                     // MUL op
-    GET_NODE(elementwise_add);         // ELEMENT_ADD op
-    GET_NODE(mul_out);                 // tmp
+    GET_NODE(x);                // x
+    GET_NODE(w);                // Y
+    GET_NODE(fc_bias);          // bias
+    GET_NODE(fc_out);           // Out
+    GET_NODE(mul);              // MUL op
+    GET_NODE(elementwise_add);  // ELEMENT_ADD op
+    GET_NODE(mul_out);          // tmp
 #undef GET_NODE

    // Create an FC Node.
    OpDesc desc;
-    std::string fc_x_in = mul_tmp_var->Name();
-    std::string fc_Y_in = mul_weight->Name();
-    std::string fc_bias_in = elementwise_add_tmpvar->Name();
-    std::string fc_out = elementwise_add_out->Name();
+    std::string fc_x_in = x->Name();
+    std::string fc_Y_in = w->Name();
+    std::string fc_bias_in = fc_bias->Name();
+    std::string fc_out_out = fc_out->Name();
    desc.SetInput("Input", std::vector<std::string>({fc_x_in}));
    desc.SetInput("W", std::vector<std::string>({fc_Y_in}));
    desc.SetInput("Bias", std::vector<std::string>({fc_bias_in}));
-    desc.SetOutput("Out", std::vector<std::string>({fc_out}));
+    desc.SetOutput("Out", std::vector<std::string>({fc_out_out}));
    desc.SetType("fc");
    auto fc_node = g->CreateOpNode(&desc);  // OpDesc will be copied.
-    fc_node->inputs =
-        std::vector<Node*>({mul_tmp_var, mul_weight, elementwise_add_tmpvar});
-    fc_node->outputs.push_back(elementwise_add_out);
-
-    // Update link relatons
-    PADDLE_ENFORCE(LinksReplace(&mul_tmp_var->outputs, mul, fc_node));
-    PADDLE_ENFORCE(LinksReplace(&mul_weight->outputs, mul, fc_node));
-    PADDLE_ENFORCE(LinksReplace(&elementwise_add_tmpvar->outputs,
-                                elementwise_add, fc_node));
-    PADDLE_ENFORCE(
-        LinksReplace(&elementwise_add_out->inputs, elementwise_add, fc_node));
+    GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out});

-    // Drop old nodes
-    graph->RemoveNode(mul);
-    graph->RemoveNode(elementwise_add);
-    graph->RemoveNode(mul_out);  // tmp variable
+    IR_NODE_LINK_TO(x, fc_node);
+    IR_NODE_LINK_TO(w, fc_node);
+    IR_NODE_LINK_TO(fc_bias, fc_node);
+    IR_NODE_LINK_TO(fc_node, fc_out);

    found_fc_count++;
  };
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
+#include <string>
 #include "paddle/fluid/framework/lod_tensor.h"

 namespace paddle {
@ -94,21 +95,37 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
    op_desc.SetOutput("Hidden", {hidden_n->Name()});
    op_desc.SetOutput("Cell", {cell_n->Name()});
    op_desc.SetOutput("XX", {xx_n->Name()});
-    op_desc.SetOutput("BatchedGate", {"blstm_0.tmp_2"});
-    op_desc.SetOutput("BatchCellPreAct", {"blstm_1.tmp_2"});
+    op_desc.SetOutput("BatchedInput", {"blstm_0.tmp_2"});
    op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse"));
    op_desc.SetAttr("use_peepholes", lstm_n->Op()->GetAttr("use_peepholes"));
-    auto* op = graph->CreateOpNode(&op_desc);
+    // TODO(TJ): get from attr
+    op_desc.SetAttr("use_seq", true);
+
+#define TMP_NAME(x) "at.new.tmp." #x
+#define OP_SET_OUT(x) op_desc.SetOutput(#x, {TMP_NAME(x)})
+    OP_SET_OUT(BatchedCell);
+    OP_SET_OUT(BatchedHidden);
+    OP_SET_OUT(ReorderedH0);
+    OP_SET_OUT(ReorderedC0);
+#undef OP_SET_OUT

-#define LINK_TO(a, b)      \
-  a->outputs.push_back(b); \
-  b->inputs.push_back(a);
-    LINK_TO(input_n, op);
-    LINK_TO(weight_x_n, op);
-    LINK_TO(weight_h_n, op);
-    LINK_TO(bias_n, op);
-    LINK_TO(op, hidden_n);
-#undef LINK_TO
+    auto* op = graph->CreateOpNode(&op_desc);
+    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
+    auto* scope = graph->Get<Scope*>(kParamScopeAttr);
+
+#define TMP_NEW(x) scope->Var(TMP_NAME(x))->GetMutable<LoDTensor>()
+    TMP_NEW(BatchedCell);
+    TMP_NEW(BatchedHidden);
+    TMP_NEW(ReorderedH0);
+    TMP_NEW(ReorderedC0);
+#undef TMP_NEW
+#undef TMP_NAME
+
+    IR_NODE_LINK_TO(input_n, op);
+    IR_NODE_LINK_TO(weight_x_n, op);
+    IR_NODE_LINK_TO(weight_h_n, op);
+    IR_NODE_LINK_TO(bias_n, op);
+    IR_NODE_LINK_TO(op, hidden_n);
    return op;
  };

@ -116,7 +133,6 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,

  auto fc_no_bias_handler = [&](
      const GraphPatternDetector::subgraph_t& subgraph, Graph* g) {
-
 #define GET_NODE(name__)                                \
  std::string name__##key = name_scope + "/" + #name__; \
  auto* name__##n = pattern->RetrieveNode(name__##key); \
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@ -111,6 +111,11 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
      return false;
    }
  }
+  for (auto& item : pdnodes2nodes_) {
+    for (auto& n : item.second) {
+      GetMarkedNodes(const_cast<Graph*>(&graph)).insert(n);
+    }
+  }
  VLOG(3) << pdnodes2nodes_.size() << " nodes marked";

  return !pdnodes2nodes_.empty();
@ -278,7 +283,7 @@ void GraphPatternDetector::RemoveOverlappedMatch(
  for (const auto& subgraph : *subgraphs) {
    bool valid = true;
    for (auto& item : subgraph) {
-      if (node_set.count(item.second)) {
+      if (item.first->IsIntermediate() && node_set.count(item.second)) {
        valid = false;
        break;
      }
@ -334,22 +339,22 @@ PDNode& PDNode::LinksFrom(const std::vector<PDNode*>& others) {
 }

 PDNode* PDNode::assert_is_op() {
-  asserts_.emplace_back([this](Node* x) { return x && x->IsOp(); });
+  asserts_.emplace_back([](Node* x) { return x && x->IsOp(); });
  return this;
 }
 PDNode* PDNode::assert_is_op(const std::string& op_type) {
-  asserts_.emplace_back([this, op_type](Node* x) {
+  asserts_.emplace_back([op_type](Node* x) {
    return x && x->IsOp() && x->Op()->Type() == op_type;
  });
  return this;
 }
 PDNode* PDNode::assert_is_var() {
-  asserts_.emplace_back([this](Node* x) { return x && x->IsVar(); });
+  asserts_.emplace_back([](Node* x) { return x && x->IsVar(); });
  return this;
 }
 PDNode* PDNode::assert_var_not_persistable() {
  assert_is_var();
-  asserts_.emplace_back([this](Node* x) { return !x->Var()->Persistable(); });
+  asserts_.emplace_back([](Node* x) { return !x->Var()->Persistable(); });
  return this;
 }
 PDNode* PDNode::assert_is_persistable_var() {
@ -491,14 +496,16 @@ void GraphSafeRemoveNodes(Graph* graph,
    for (auto it = node->inputs.begin(); it != node->inputs.end();) {
      if (nodes.count(*it)) {
        it = const_cast<Node*>(node)->inputs.erase(it);
-      } else
+      } else {
        it++;
+      }
    }
    for (auto it = node->outputs.begin(); it != node->outputs.end();) {
      if (nodes.count(*it)) {
        it = const_cast<Node*>(node)->outputs.erase(it);
-      } else
+      } else {
        it++;
+      }
    }
  }
 }
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@ -245,6 +245,8 @@ class GraphPatternDetector {
  void UniquePatterns(std::vector<subgraph_t>* subgraphs);

  // Remove overlapped match subgraphs, when overlapped, keep the previous one.
+  // The intermediate PDNodes will be removed, so can't shared by multiple
+  // patterns.
  void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs);

  // Validate whether the intermediate nodes are linked by external nodes.
@ -295,6 +297,10 @@ PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x);

 }  // namespace patterns

+#define IR_NODE_LINK_TO(a, b) \
+  a->outputs.push_back(b);    \
+  b->inputs.push_back(a);
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
@ -140,8 +140,9 @@ TEST(GraphPatternDetecter, MultiSubgraph) {
        return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3");
      },
      "OP0");
-  auto* any_var = x.mutable_pattern()->NewNode(
-      [](Node* node) { return node->IsVar(); }, "VAR");
+  auto* any_var = x.mutable_pattern()
+                      ->NewNode([](Node* node) { return node->IsVar(); }, "VAR")
+                      ->AsIntermediate();
  auto* any_op1 = x.mutable_pattern()->NewNode(
      [](Node* node) { return node->IsOp(); }, "OP1");

--- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
+++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
@ -13,42 +13,41 @@
 // limitations under the License.

 #include <algorithm>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"

 namespace paddle {
 namespace framework {
 namespace ir {

-class InferCleanGraphPass : public Pass {
+class InferCleanGraphPass : public FusePassBase {
 public:
  virtual ~InferCleanGraphPass() {}

 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const {
+    FusePassBase::Init("original_graph", graph.get());
    PADDLE_ENFORCE(graph.get());

    auto is_valid_node = [](Node* x) {
      return x && IsControlDepVar(*x) && x->IsVar() && !x->Var();
    };

-    std::unordered_set<Node*> invalid_nodes;
+    std::unordered_set<const Node*> invalid_nodes;
+    int valid_op = 0;
    for (auto* node : graph->Nodes()) {
      if (is_valid_node(node)) {
        invalid_nodes.insert(node);
+      } else if (node->IsOp()) {
+        // Collect all the operators to help tracking number of operators.
+        ++valid_op;
      }
    }

-    // remove nodes from the graph.
-    for (auto* node : invalid_nodes) {
-      graph->RemoveNode(node);
-    }
+    GraphSafeRemoveNodes(graph.get(), invalid_nodes);

-    // clean edges.
-    for (auto* node : graph->Nodes()) {
-      CleanEdges(&node->inputs, invalid_nodes);
-      CleanEdges(&node->outputs, invalid_nodes);
-    }
+    AddStatis(valid_op);

    return graph;
  }
--- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
@ -219,16 +219,13 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
    op_desc.SetAttr("fc_activation", act->Op()->Type());

    auto* op_node = graph->CreateOpNode(&op_desc);
-// Add links
-#define NODE_LINKS(a, b)   \
-  a->outputs.push_back(b); \
-  b->inputs.push_back(a);
-    NODE_LINKS(fc_w, op_node);
-    NODE_LINKS(fc_bias, op_node);
-    NODE_LINKS(concat_in0, op_node);
-    NODE_LINKS(sequence_expand0_in, op_node);
-    NODE_LINKS(sequence_expand1_in, op_node);
-    NODE_LINKS(op_node, fc_out);
+    // Add links
+    IR_NODE_LINK_TO(fc_w, op_node);
+    IR_NODE_LINK_TO(fc_bias, op_node);
+    IR_NODE_LINK_TO(concat_in0, op_node);
+    IR_NODE_LINK_TO(sequence_expand0_in, op_node);
+    IR_NODE_LINK_TO(sequence_expand1_in, op_node);
+    IR_NODE_LINK_TO(op_node, fc_out);

    // Clean nodes.
    std::unordered_set<const Node*> marked_nodes;
@ -241,7 +238,6 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
    marked_nodes.erase(sequence_expand0_in);
    marked_nodes.erase(sequence_expand1_in);
    marked_nodes.erase(fc_out);
-
    GraphSafeRemoveNodes(graph, marked_nodes);
  });

--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@ -10,7 +10,7 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
 # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
 cc_library(paddle_fluid_api
    SRCS io.cc
-    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} graph_to_program_pass)
+    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) 

 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)

@ -22,7 +22,7 @@ cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
 #endif()

 # Create static library
-cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api)
+cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api analysis_predictor)
 if(NOT APPLE)
  # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
  set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
@ -32,6 +32,7 @@ endif()
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
    SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
    DEPS ${fluid_modules} paddle_fluid_api)

 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@ -33,7 +33,7 @@ function (inference_analysis_test TARGET)
        endif()
        cc_test(${TARGET}
                SRCS "${analysis_test_SRCS}"
-                DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detector pass ${analysis_test_EXTRA_DEPS}
+                DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS})
        set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
    endif(WITH_TESTING)
@ -56,25 +56,13 @@ if (NOT EXISTS ${DITU_INSTALL_DIR} AND WITH_TESTING)
 endif()

 inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
-    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis
-    analysis_predictor
-		# ir
-		fc_fuse_pass
-		fc_lstm_fuse_pass
-    seq_concat_fc_fuse_pass
-		graph_viz_pass
-		infer_clean_graph_pass
-		graph_pattern_detector
-    infer_clean_graph_pass
-    attention_lstm_fuse_pass
-    paddle_inference_api
-		pass
+    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
    ARGS --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
        --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt)

 inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
-inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc EXTRA_DEPS paddle_inference_api)
-inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc EXTRA_DEPS paddle_fluid)
+inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
+inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc)
 inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
 inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
 inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@ -23,6 +23,7 @@
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/utils/singleton.h"

 DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
@ -329,9 +330,20 @@ void TestDituRNNPrediction(bool use_analysis_and_activate_ir = false,
      LOG(INFO) << "fused " << item.first << " " << item.second;
    }

-    ASSERT_TRUE(fuse_statis.count("fc"));
-    EXPECT_EQ(fuse_statis.at("fc"), 1);
-    EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 1);
+    int num_ops = 0;
+    for (auto &node :
+         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
+      if (node->IsFunction()) {
+        ++num_ops;
+      }
+    }
+    LOG(INFO) << "has num ops: " << num_ops;
+
+    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+    EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
+    EXPECT_EQ(num_ops,
+              13);  // After graph optimization, only 13 operators exists.
  }
 }

@ -348,10 +360,3 @@ TEST(Analyzer, DituRNN_multi_thread) {
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
-
-USE_PASS(fc_fuse_pass);
-USE_PASS(seq_concat_fc_fuse_pass);
-USE_PASS(fc_lstm_fuse_pass);
-USE_PASS(graph_viz_pass);
-USE_PASS(infer_clean_graph_pass);
-USE_PASS(attention_lstm_fuse_pass);
--- a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
@ -16,6 +16,7 @@

 #include <gtest/gtest.h>
 #include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"

 namespace paddle {
 namespace inference {
@ -33,10 +34,3 @@ TEST(FluidToIrPass, Test) {
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
-
-USE_PASS(graph_viz_pass);
-USE_PASS(infer_clean_graph_pass);
-USE_PASS(attention_lstm_fuse_pass);
-USE_PASS(fc_lstm_fuse_pass);
-USE_PASS(seq_concat_fc_fuse_pass);
-USE_PASS(fc_fuse_pass);
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@ -18,10 +18,7 @@ if(APPLE)
 endif(APPLE)


-set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager
-  graph_viz_pass fc_fuse_pass
-  infer_clean_graph_pass
-  )
+set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB})

 if(WITH_GPU AND TENSORRT_FOUND)
    set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@ -20,6 +20,7 @@
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/utils/singleton.h"

 namespace paddle {
@ -132,7 +133,3 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
 }

 }  // namespace paddle
-
-USE_PASS(fc_fuse_pass);
-USE_PASS(graph_viz_pass);
-USE_PASS(infer_clean_graph_pass);
--- a/Show More
+++ b/Show More