Merge branch 'develop' of github.com:PaddlePaddle/Paddle into dist_pass_barrier

8 years ago · c1ab215e26
parent 1366832a41 66c91911cf
commit c1ab215e26
47 changed files with 561 additions and 257 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -61,8 +61,10 @@ option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 option(WITH_FAST_BUNDLE_TEST    "Bundle tests that can be run in a single process together to reduce launch overhead"   OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
+option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
+option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)

 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@ -131,6 +133,10 @@ if (NOT DEFINED WITH_MKLDNN)
        set(WITH_MKLDNN OFF)
    endif()
 endif()
+
+if (REPLACE_ENFORCE_GLOG)
+  add_definitions("-DREPLACE_ENFORCE_GLOG")
+endif()
 ########################################################################################

 include(external/mklml)     # download mklml package
@ -153,12 +159,24 @@ include(external/cares)
 if(WITH_DISTRIBUTE)
    if(WITH_GRPC)
        include(external/grpc)
+        message(STATUS "Use grpc framework.")
    else()
+        message(STATUS "Use brpc framework.")
        include(external/leveldb)
        include(external/brpc)
    endif()
 endif()

+if(WITH_BRPC_RDMA)
+    message(STATUS "Use brpc with rdma.")
+    if(WITH_GRPC)
+        message(FATAL_ERROR "Can't use grpc with brpc rdma.")
+    endif()
+    if(NOT WITH_DISTRIBUTE)
+        message(FATAL_ERROR "Can't use brpc rdma in no distribute env.")
+    endif()
+endif()
+
 include(external/snappy)    # download snappy
 include(external/snappystream)
 include(external/threadpool)
--- a/README.md
+++ b/README.md
@ -4,7 +4,6 @@
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
 [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
 [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
-[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@ -174,3 +174,7 @@ endif(WITH_GOLANG)
 if(WITH_GRPC)
    add_definitions(-DPADDLE_WITH_GRPC)
 endif(WITH_GRPC)
+
+if(WITH_BRPC_RDMA)
+    add_definitions(-DPADDLE_WITH_BRPC_RDMA)
+endif(WITH_BRPC_RDMA)
--- a/cmake/external/brpc.cmake
+++ b/cmake/external/brpc.cmake
@ -14,6 +14,15 @@

 INCLUDE(ExternalProject)

+find_library(SSL_LIBRARY NAMES ssl)
+ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${SSL_LIBRARY})
+
+find_library(CRYPTO_LIBRARY NAMES crypto)
+ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${CRYPTO_LIBRARY})
+
+
 SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc)
 SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc)
 SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE)
@ -22,14 +31,14 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr
 INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})

 # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
-set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf")
+set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib")

 # If minimal .a is need, you can set  WITH_DEBUG_SYMBOLS=OFF
 ExternalProject_Add(
    extern_brpc
    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/brpc/brpc"
-    GIT_TAG         "6d153dd7ff00f960ae6895c9c5fff0ce9f07aff2"
+    GIT_REPOSITORY  "https://github.com/gongweibao/brpc"
+    GIT_TAG         "7dc04defad1fd4173aae170c3fcbde131b65155a"
    PREFIX          ${BRPC_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@ -42,6 +51,8 @@ ExternalProject_Add(
                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                    -DCMAKE_PREFIX_PATH=${prefix_path}
                    -DBRPC_WITH_GLOG=ON
+                    -DIOBUF_WITH_HUGE_BLOCK=ON
+                    -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA}
                    ${EXTERNAL_OPTIONAL_ARGS}
    LIST_SEPARATOR |
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR}
@ -49,7 +60,7 @@ ExternalProject_Add(
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
-ADD_DEPENDENCIES(extern_brpc protobuf leveldb gflags glog gtest snappy)
+ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy)
 ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
 ADD_DEPENDENCIES(brpc extern_brpc)
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@ -96,6 +96,20 @@ if(NOT APPLE AND NOT ANDROID)
    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
 endif(NOT APPLE AND NOT ANDROID)

+set_property(GLOBAL PROPERTY FLUID_MODULES "")
+# find_fluid_modules collects every fluid module; the resulting list is used
+# to build the paddle fluid static library for the inference libs
+function(find_fluid_modules TARGET_NAME)
+  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
+  string(FIND "${__target_path}" "fluid" pos)
+  if(pos GREATER 1)
+    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
+    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
+    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
+  endif()
+endfunction(find_fluid_modules)
+
 function(merge_static_libs TARGET_NAME)
  set(libs ${ARGN})
  list(REMOVE_DUPLICATES libs)
@ -250,6 +264,7 @@ function(cc_test TARGET_NAME)
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
    if (${cc_test_SERIAL})
        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
    endif()
  endif()
 endfunction(cc_test)
@ -314,6 +329,7 @@ function(nv_test TARGET_NAME)
    add_test(${TARGET_NAME} ${TARGET_NAME})
    if (nv_test_SERIAL)
        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
    endif()
  endif()
 endfunction(nv_test)
@ -561,7 +577,7 @@ function(py_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS ARGS ENVS)
    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
+             COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
  endif()
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@ -12,19 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-set_property(GLOBAL PROPERTY FLUID_MODULES "")
-# find all fluid modules is used for paddle fluid static library
-function(find_fluid_modules TARGET_NAME)
-  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
-  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
-  string(FIND "${__target_path}" "fluid" pos)
-  if(pos GREATER 1)
-    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
-    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
-  endif()
-endfunction(find_fluid_modules)
-
 # make package for paddle fluid shared and static library
 function(copy TARGET)
    set(options "")
@ -154,7 +141,7 @@ set(inference_deps paddle_fluid_shared paddle_fluid)
 if(WITH_CONTRIB)
    message(STATUS "installing contrib")
    set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
-    if (WITH_ANAKIN)
+    if (WITH_ANAKIN AND WITH_GPU)
        copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
            SRCS
            ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api
@ -163,9 +150,9 @@ if(WITH_CONTRIB)
        list(APPEND inference_deps contrib_anakin_inference_lib)
   endif()

-  copy(contrib_inference_lib DEPS paddle_inference_api
+  copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared
        SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
-        ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.*
+        ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api*
        DSTS ${contrib_dst_dir} ${contrib_dst_dir})
  list(APPEND inference_deps contrib_inference_lib)
 endif()
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@ -46,6 +46,10 @@ cc_library(paddle_inference_api
    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})

+cc_library(paddle_inference_api_shared SHARED
+    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
+    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+
 cc_test(test_paddle_inference_api
        SRCS test_paddle_inference_api.cc
        DEPS paddle_inference_api)
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@ -147,9 +147,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
                 "Input tensor type is not supported: ", in.type().name());
  memory::data_type out_type = in_type;

-  auto in_format = MKLDNNFormatForSize(in_tz.size(), in.format());
+  auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
  auto out_format =
-      MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
+      platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));

  void* in_data = GetDataFromTensor(in, in_type);

--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@ -62,12 +62,6 @@ inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
  return MKLDNNDataType::data_undef;
 }

-inline MKLDNNFormat MKLDNNFormatForSize(size_t dims_size,
-                                        MKLDNNFormat default_format) {
-  return (dims_size == 1
-              ? mkldnn::memory::format::x
-              : dims_size == 2 ? mkldnn::memory::format::nc : default_format);
-}
 #endif

 void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@ -18,6 +18,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/framework/data_type_transform.h"

+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 namespace paddle {
 namespace framework {

@ -48,8 +52,8 @@ void TransformData(const OpKernelType &expected_kernel_type,
        // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
        // Just set layout/format. No real transform occur

-        auto out_format =
-            MKLDNNFormatForSize(in.dims().size(), ToMKLDNNFormat(lin));
+        auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
+                                                        ToMKLDNNFormat(lin));

        out.ShareDataWith(input_tensor);
        out.set_layout(DataLayout::kMKLDNN);
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@ -20,9 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
-#ifdef PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/operators/distributed/grpc_client.h"
-#endif
+#include "paddle/fluid/operators/detail/macros.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"

--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@ -68,7 +68,7 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
  // only print first ten elements
  int64_t size = t.numel() < 10 ? t.numel() : 10;
  for (int64_t i = 0; i < size; ++i) {
-    if (t.type().hash_code() == typeid(float).hash_code()) {
+    if (t.type().hash_code() == typeid(float).hash_code()) {  // NOLINT
      os << t.data<float>()[i] << " ";
    } else if (t.type().hash_code() == typeid(int64_t).hash_code()) {
      os << t.data<int64_t>()[i] << " ";
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@ -748,10 +748,6 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
          t = &var->Get<LoDTensor>();
        } else if (var->IsType<SelectedRows>()) {
          t = &(var->Get<SelectedRows>().value());
-        } else if (var->IsType<LoDTensorArray>()) {
-          const LoDTensorArray& arr = var->Get<LoDTensorArray>();
-          PADDLE_ENFORCE(arr.size() > 0);
-          t = &(arr[0]);
        }
        if (t != nullptr) {
          int tmp = static_cast<int>(ToDataType(t->type()));
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@ -253,9 +253,6 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
      t->set_lod(lod_tensors[j].lod());
    }
  }
-  for (auto &p : member_->places_) {
-    platform::DeviceContextPool::Instance().Get(p)->Wait();
-  }
 }

 ParallelExecutor::~ParallelExecutor() {
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@ -23,9 +23,9 @@ namespace framework {
 template <typename T>
 inline const T* Tensor::data() const {
  check_memory_size();
-  PADDLE_ENFORCE(std::is_same<T, void>::value ||
-                     holder_->type() == std::type_index(typeid(T)),
-                 "Tensor holds the wrong type, it holds %s",
+  bool valid = std::is_same<T, void>::value ||
+               holder_->type() == std::type_index(typeid(T));
+  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
                 this->holder_->type().name());

  return reinterpret_cast<const T*>(
@ -37,9 +37,9 @@ inline bool Tensor::IsInitialized() const { return holder_ != nullptr; }
 template <typename T>
 inline T* Tensor::data() {
  check_memory_size();
-  PADDLE_ENFORCE(std::is_same<T, void>::value ||
-                     holder_->type() == std::type_index(typeid(T)),
-                 "Tensor holds the wrong type, it holds %s",
+  bool valid = std::is_same<T, void>::value ||
+               holder_->type() == std::type_index(typeid(T));
+  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
                 this->holder_->type().name());
  return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                              offset_);
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@ -69,7 +69,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
    auto stream =
        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
-    memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
+    if (platform::is_same_place(src_place, dst_place)) {
+      memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+                   stream);
+    } else {
+      if (platform::is_same_place(ctx_place, src_place)) {
+        memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+                     stream);
+        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
+      } else if (platform::is_same_place(ctx_place, dst_place)) {
+        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
+        memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+                     stream);
+      } else {
+        PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place.");
+      }
+    }
  }
 #endif
 }
@ -78,10 +93,10 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                Tensor* dst) {
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  const platform::DeviceContext* dev_ctx;
-  if (platform::is_gpu_place(src.place())) {
-    dev_ctx = pool.Get(src.place());
-  } else {
+  if (platform::is_gpu_place(dst_place)) {
    dev_ctx = pool.Get(dst_place);
+  } else {
+    dev_ctx = pool.Get(src.place());
  }
  TensorCopy(src, dst_place, *dev_ctx, dst);
 }
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@ -23,10 +23,25 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

+// NOTE(zcd): TensorCopy is an async operation. When src_place and dst_place
+// are two different GPUs, TensorCopy waits on src_ctx to ensure that the
+// copy is carried out correctly.
+// If ctx_place and src_place are the same, src_ctx.Wait() is called
+// after memory::Copy; if ctx_place and dst_place are the same,
+// src_ctx.Wait() is called before memory::Copy.
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                const platform::DeviceContext& ctx, Tensor* dst);
+
+// NOTE(zcd): If src.place() and dst_place are two different GPUs,
+// the copy operation is carried out on dst_place's stream. This is
+// very important because TensorCopy is an async operation: in most
+// cases, once the copy returns, dst is used on dst_place's stream.
+// If the copy were instead carried out on src_place's stream, it
+// might not have completed by the time dst is used on dst_place's
+// stream.
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                Tensor* dst);
+
 void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
                    Tensor* dst);

--- a/paddle/fluid/inference/analysis/README.md
+++ b/paddle/fluid/inference/analysis/README.md
@ -0,0 +1,57 @@
+# Inference Analysis
+
+The `inference/analysis` module is used to analyze and optimize the inference program.
+It borrows some of its philosophy from `LLVM/analysis`
+and makes the various optimization features pluggable so that they can co-exist in a pipeline.
+
+We borrowed some concepts from LLVM, such as
+
+- [Pass](./pass.h)es, which implement optimizations that traverse the inference program,
+- [DataFlowGraph](./data_flow_graph.h) to represent the data flow graph built from a program,
+- [PassManager](./pass_manager.h) to manage a sequence of `Pass`es over a graph.
+
+There are some other basic concepts here
+
+- [Node](./node.h), the node in a `DataFlowGraph`,
+  - `Function`, the Operator in Fluid,
+  - `Value`, the Variable in Fluid;
+- [Argument](./argument.h), the argument that is treated as the input and output of all `Pass`es in the pipeline.
+
+## How it works
+
+The `inference/analysis` module arranges all the passes in a pipeline and works as follows:
+
+1. Build a `DataFlowGraph` from a Fluid inference ProgramDesc,
+2. Call the middle passes one by one, the same `DataFlowGraph` is passed across all the passes,
+3. Transform a new ProgramDesc from the modified `DataFlowGraph`.
+
+New optimization features can be added as independent `Pass`es and controlled by gflags;
+each pass generates unified debug information or visualization for easier debugging.
+
+## Supported Passes
+
+### `FluidToDataFlowGraphPass`
+Transforms the fluid `ProgramDesc` into a `DataFlowGraph`, giving an abstract representation to all the middle passes;
+this should be the first pass of the pipeline.
+
+### `DataFlowGraphToFluidPass`
+Generates the final `ProgramDesc` from a data flow graph; this should be the last pass of the pipeline.
+
+### `TensorRTSubgraphNodeMarkPass`
+Marks the `Node`s that are supported by TensorRT;
+this pass generates a visualization file which can be used for debugging.
+
+### `TensorRTSubGraphPass`
+Splits out the sub-graphs that can be accelerated by TensorRT.
+
+### `DFG_GraphvizDrawPass`
+This pass is just for debugging; it visualizes the `DataFlowGraph` using the [graphviz](http://www.graphviz.org) tool.
+
+It can be used as a helper class that draws the modified graph after each pass.
+
+## Utilities
+
+There are some helper functions/classes for analysis.
+
+- [dot.h](./dot.h) gives an easy-to-use interface for generating `DOT` code,
+- [graph_traits.h](./graph_traits.h) contains the graph traversal algorithms; it uses `iterator`s to make the algorithms easy to share across different passes.
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/inference/analysis/analyzer.h"
+#include <string>
 #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
 #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
@ -79,4 +80,4 @@ void Analyzer::Run(Argument* argument) {

 }  // namespace analysis
 }  // namespace inference
-}  // namespace paddle
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#pragma once
+
 /*
 * This file contains Analyzer, an class that exposed as a library that analyze
 * and optimize
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@ -138,7 +138,7 @@ struct GraphTraits<DataFlowGraph> {
 // sub-graph is the inputs nodes and output nodes that doesn't inside the
 // sub-graph.
 static std::pair<std::vector<Node *>, std::vector<Node *>>
-ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
  std::unordered_set<Node *> inputs;
  std::unordered_set<Node *> outputs;
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
+#include <vector>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/proto_desc.h"
@ -150,13 +151,14 @@ namespace {
 class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 public:
  using Config = DFG_GraphvizDrawPass::Config;
-  DFG_DebuggerPass(const Config& config) : DFG_GraphvizDrawPass(config) {}
+  explicit DFG_DebuggerPass(const Config& config)
+      : DFG_GraphvizDrawPass(config) {}

  std::string repr() const override { return "dfg-to-fluid-debuger-pass"; }

  bool Finalize() override { return true; }
 };
-}
+}  // namespace

 Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
  return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
@ -19,6 +19,7 @@

 #pragma once

+#include <string>
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 #include "paddle/fluid/inference/analysis/pass.h"
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
@ -46,7 +46,7 @@ class DFG_GraphvizDrawPass : public DataFlowGraphPass {
    const bool display_deleted_node;
  };

-  DFG_GraphvizDrawPass(const Config &config) : config_(config) {}
+  explicit DFG_GraphvizDrawPass(const Config &config) : config_(config) {}

  bool Initialize(Argument *argument) override { return true; }
  void Run(DataFlowGraph *graph) override;
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@ -15,7 +15,7 @@ limitations under the License. */
 #include <string>
 #include <vector>

-#include "analyzer.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"

@ -88,7 +88,8 @@ namespace {
 class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 public:
  using Config = DFG_GraphvizDrawPass::Config;
-  DFG_DebuggerPass(const Config &config) : DFG_GraphvizDrawPass(config) {}
+  explicit DFG_DebuggerPass(const Config &config)
+      : DFG_GraphvizDrawPass(config) {}
  std::string repr() const override { return "fluid-to-dfg-debuger-pass"; }
  bool Finalize() override { return true; }
 };
--- a/Show More
+++ b/Show More