commit 5c537941c2
@@ -0,0 +1,35 @@
if(NOT WITH_GPU)
  return()
endif()

include(ExternalProject)

set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub)
set(CUB_INCLUDE_DIR ${CUB_SOURCE_DIR}/src/extern_cub)

include_directories(${CUB_INCLUDE_DIR})

ExternalProject_Add(
  extern_cub
  ${EXTERNAL_PROJECT_LOG_ARGS}
  GIT_REPOSITORY "https://github.com/NVlabs/cub.git"
  GIT_TAG        "v1.8.0"
  PREFIX         ${CUB_SOURCE_DIR}
  UPDATE_COMMAND    ""
  CONFIGURE_COMMAND ""
  BUILD_COMMAND     ""
  INSTALL_COMMAND   ""
  TEST_COMMAND      ""
)
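
# Note: `add_dependencies()` cannot be applied to an INTERFACE library before
# CMake 3.3, hence the dummy static-library fallback below.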
if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cub_dummy.c)
  file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
  add_library(cub STATIC ${dummyfile})
else()
  add_library(cub INTERFACE)
endif()

add_dependencies(cub extern_cub)

list(APPEND external_project_dependencies cub)

@@ -0,0 +1,58 @@
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set(WITH_XBYAK ON)
if(WIN32 OR APPLE)
  set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK on Windows and macOS" FORCE)
  return()
endif()

include(ExternalProject)

set(XBYAK_PROJECT      extern_xbyak)
set(XBYAK_PREFIX_DIR   ${THIRD_PARTY_PATH}/xbyak)
set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak)
set(XBYAK_INC_DIR      ${XBYAK_INSTALL_ROOT}/include)

include_directories(${XBYAK_INC_DIR})
include_directories(${XBYAK_INC_DIR}/xbyak)

add_definitions(-DPADDLE_WITH_XBYAK)

# xbyak options
add_definitions(-DXBYAK64)
add_definitions(-DXBYAK_NO_OP_NAMES)
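# XBYAK64 selects the 64-bit code generator; XBYAK_NO_OP_NAMES makes xbyak
# expose and_()/or_()/xor_() instead of and()/or()/xor(), which would otherwise
# require compiling with -fno-operator-names.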

ExternalProject_Add(
  ${XBYAK_PROJECT}
  ${EXTERNAL_PROJECT_LOG_ARGS}
  DEPENDS          ""
  GIT_REPOSITORY   "https://github.com/herumi/xbyak.git"
  GIT_TAG          "v5.661"  # Jul 26th
  PREFIX           ${XBYAK_PREFIX_DIR}
  UPDATE_COMMAND   ""
  CMAKE_ARGS       -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
  CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT}
)

if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/xbyak_dummy.c)
  file(WRITE ${dummyfile} "const char *dummy_xbyak = \"${dummyfile}\";")
  add_library(xbyak STATIC ${dummyfile})
else()
  add_library(xbyak INTERFACE)
endif()

add_dependencies(xbyak ${XBYAK_PROJECT})
list(APPEND external_project_dependencies xbyak)

@@ -1,89 +0,0 @@
## Motivation

There is a ```gap``` between the ```Program``` defined by the
user and the ```Executable``` that can be scheduled
efficiently on heterogeneous hardware, either locally
or in a distributed manner.

Usually, the ```gap``` is bridged by

* A series of transformations with a defined order.

* These transformations usually involve
```insert, delete, clustering, split, dependency analysis```.

* A simple way to verify and debug each transformation.

* The flexibility to add, remove, or customize transformations to fit
the requirements of various algorithms (models) and hardware scenarios.

Some other trends also push us toward a better unified pattern.

* The deep learning framework is built around the concept of graphs.
To leverage tools such as compilation (e.g. TVM and nGraph) or
cross-framework conversion (e.g. ONNX), we also need an intermediate
representation that can be connected to the rest of the ecosystem.


We need a unified pattern to naturally support the requirements
described above. The pattern should fit training, inference,
and other offline serialized model transformations.
Learning from LLVM and other deep learning frameworks, we draft the
design below.


## Design

### Major Concepts

#### Node

```Node``` represents an operation that performs some computation or
a variable that is an input or output of an operation.

```Node```s are connected to other ```Node```s via inputs and outputs.

Other properties (such as device placement information) can be added
to ```Node``` in the future if they are common requirements of many
other ```Pass```es. Otherwise, such a property should live
in a ```Node``` wrapper class that is private to some ```Pass```, or be
a local member of a ```Pass```.

#### Graph

```Graph``` contains a list of ```Node```s, which are connected to
each other via inputs and outputs.

TODO: Better definitions for the graph.

```Graph``` can also contain ```Attribute```s. ```Attribute```s
can be anything. For example, an ```Attribute``` can be a list of "wrapper"
nodes. The ```wrapper``` nodes compose ```Node```s and provide
helper methods for execution or transformation. ```Attribute```s
can also contain other things that describe some properties of
the ```Graph``` or ```Graph``` nodes. ```Attribute```s can be passed
across ```Pass```es. However, they should be used with care.

#### Pass

```Pass``` represents a transformation of a ```Graph```. Its input
is a ```Graph``` and its output is also a ```Graph```. For example,
a ```Pass``` can simply print out the ```Graph```. A ```Pass```
can also fuse some of the ```Graph```'s ```Node```s.

#### Optimize

```Optimize``` contains a series of ```Pass```es with a defined order.
```Optimize``` transforms a ```Graph``` that only contains raw
modeling logic into a ```Graph``` that can be run efficiently while
maintaining the original modeling logic.


### Optimize Process

* Program is first converted to Graph.
* Graph goes through a series of Passes.
* Graph is transformed from raw model logic to a form that is efficient to execute.

Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor
@@ -0,0 +1,185 @@
## Motivation

There is a `gap` between the `Program` defined by the
user and the `Executable` that can be scheduled
efficiently on heterogeneous hardware, either locally
or in a distributed manner.

Usually, the `gap` is bridged by

* A series of transformations with a defined order.

* These transformations usually involve
`insert, delete, clustering, split, dependency analysis`.

* A simple way to verify and debug each transformation.

* The flexibility to add, remove, or customize transformations to fit
the requirements of various algorithms (models) and hardware scenarios.

Some other trends also push us toward a better unified pattern.

* The deep learning framework is built around the concept of graphs.
To leverage tools such as compilation (e.g. TVM and nGraph) or
cross-framework conversion (e.g. ONNX), we also need an intermediate
representation that can be connected to the rest of the ecosystem.


We need a unified pattern to naturally support the requirements
described above. The pattern should fit training, inference,
and other offline serialized model transformations.
Learning from LLVM and other deep learning frameworks, we draft the
design below.


## Design

### Major Concepts

#### Node

`Node` represents an operation that performs some computation or
a variable that is an input or output of an operation.

`Node`s are connected to other `Node`s via inputs and outputs.

Other properties (such as device placement information) can be added
to `Node` in the future if they are common requirements of many
other `Pass`es. Otherwise, such a property should live
in a `Node` wrapper class that is private to some `Pass`, or be
a local member of a `Pass`.
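
To make the idea concrete, here is a rough sketch of what a `Node` interface
could look like. The member names below are illustrative only; the actual
`ir::Node` class may differ.

```cpp
class Node {
 public:
  enum class Type { kOperation, kVariable };

  // A node wraps either an operator description or a variable description.
  explicit Node(OpDesc *op_desc) : type_(Type::kOperation), op_desc_(op_desc) {}
  explicit Node(VarDesc *var_desc) : type_(Type::kVariable), var_desc_(var_desc) {}

  Type NodeType() const { return type_; }

  // Edges of the graph: the nodes feeding into and consuming this node.
  std::vector<Node *> inputs;
  std::vector<Node *> outputs;

 private:
  Type type_;
  OpDesc *op_desc_{nullptr};
  VarDesc *var_desc_{nullptr};
};
```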

#### Graph

`Graph` contains a list of `Node`s, which are connected to
each other via inputs and outputs.

TODO: Better definitions for the graph.

`Graph` can also contain `Attribute`s. `Attribute`s
can be anything. For example, an `Attribute` can be a list of "wrapper"
nodes. The `wrapper` nodes compose `Node`s and provide
helper methods for execution or transformation. `Attribute`s
can also contain other things that describe some properties of
the `Graph` or `Graph` nodes. `Attribute`s can be passed
across `Pass`es. However, they should be used with care.

```cpp
class Graph {
 public:
  explicit Graph(const ProgramDesc &program);

  bool Has(const std::string &attr_name) const;

  template <typename AttrType>
  AttrType &Get(const std::string &attr_name) const;

  template <typename AttrType>
  void Set(const std::string &attr_name, AttrType *attr);

  const std::unordered_set<ir::Node *> &Nodes() const;

  // Create a normal variable with a non-null VarDesc.
  ir::Node *CreateVarNode(VarDesc *var_desc);

  // Create a normal runnable operator with an OpDesc.
  ir::Node *CreateOpNode(OpDesc *op_desc);

  // Create a control-dependency var that connects 2 operations. The
  // var doesn't hold any data. Other than that, it's no different from
  // other vars as far as dependency analysis is concerned.
  ir::Node *CreateControlDepVar();

  // A more free-style way of creating a graph node. Mostly used for tests
  // or to "copy" from another node. Avoid using it if possible.
  ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type);

  // Clear all node information of the graph and return ownership of the
  // nodes.
  std::vector<std::unique_ptr<ir::Node>> ReleaseNodes();
};
```

#### Pass

`Pass` represents a transformation of a `Graph`. Its input
is a `Graph` and its output is also a `Graph`. For example,
a `Pass` can simply print out the `Graph`. A `Pass`
can also fuse some of the `Graph`'s `Node`s.

```cpp
class Pass {
 public:
  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const {
    // Some correctness checks.
    auto new_graph = ApplyImpl(std::move(graph));
    // Some correctness checks.
    return new_graph;
  }

  // Get a reference to an attribute previously set.
  template <typename AttrType>
  AttrType &Get(const std::string &attr_name) const;

  // Set a pointer to the attribute. Pass takes ownership of the attribute.
  template <typename AttrType>
  void Set(const std::string &attr_name, AttrType *attr);

  // Set a pointer to the attribute. Pass doesn't take ownership. The caller
  // should delete the attribute.
  template <typename AttrType>
  void SetNotOwned(const std::string &attr_name, AttrType *attr);

 protected:
  virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const = 0;
};

// In my_pass.cc
class MyPass : public Pass {
 protected:
  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override {
    // do something.
    return graph;
  }
};
REGISTER_PASS(my_pass, MyPass)
    .RequirePassAttr("places")
    .RequireGraphAttr("dep_vars");


// To use the pass.
auto my_pass = ir::PassRegistry::Instance().Get("my_pass");
graph = my_pass->Apply(std::move(graph));
// Note: to force linking of my_pass.cc, add in the code:
USE_PASS(my_pass);
```

#### Optimize

`Optimize` contains a series of `Pass`es with a defined order.
`Optimize` transforms a `Graph` that only contains raw
modeling logic into a `Graph` that can be run efficiently while
maintaining the original modeling logic.


### Optimize Process

* Program is first converted to Graph.
* Graph goes through a series of Passes.
* Graph is transformed from raw model logic to a form that is efficient to execute.

```
// Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor
auto graph = Graph(program);
graph = PassRegistry::Instance().Get("op_fuse_pass")->Apply(std::move(graph));
// For a more complex Pass, the Optimize Process can provide Pass attributes.
auto mem_opt_pass = PassRegistry::Instance().Get("memory_optimization_pass");
mem_opt_pass->SetNotOwned<int>("optimize_level", 1);
graph = mem_opt_pass->Apply(std::move(graph));
graph = PassRegistry::Instance().Get("multi_devices_pass")->Apply(std::move(graph));
graph = PassRegistry::Instance().Get("multi_devices_check_pass")->Apply(std::move(graph));
Executor exe;
exe.Run(graph);
```

@@ -0,0 +1,20 @@
# Operator fusion
Fusing multiple operators together is an important method for optimizing program execution, particularly on GPUs and other specialized accelerators. An obvious benefit is avoiding the overhead of writing intermediate results back to global memory.

There are generally two ways to fuse operators: fusing directly connected operators and fusing operators that are not directly connected. The first method is mainly used by the [NNVM Compiler](https://github.com/dmlc/tvm/) and [XLA](https://www.tensorflow.org/performance/xla/). The second method is mainly used by DyNet and TensorFlow Fold to do auto-batching. The principle of operator fusion is to combine multiple operations into one according to some rules; for example, `Y = X * W` and `Z = Y + B` can be fused into `Z = X * W + B`, and `Y1 = X1 * W` and `Y2 = X2 * W` can be fused into `[Y1;Y2] = [X1;X2] * W`. In order to get a short-term profit, we decided to try to specify these rules manually.

## Challenge
The challenges of fusing operators are:
- how to make the rules.
- how to implement these rules efficiently.

### How to make the rules?

The problem of determining the best single location for a fusion operator is an NP-hard combinatorial problem. After analyzing the operators of typical DL models, we found that there are two groups of operators that can be fused explicitly: one is simple and adjacent operations, for example, `tmp = x + y` and `z = Relu(tmp)`; the other is operators that have the same function, for example, a series of `SGD` or `Momentum` operators. Both usually appear in a model in large numbers, so we should first think about how to fuse each group separately.

### How to implement these rules efficiently?
#### How to fuse adjacent operations efficiently?
Here we use a template function to represent the fused operations. The pros of using a template function are that it is simple and efficient; the cons are that it is not easy to extend and it can only express some simple operations. Taking our current needs into account, the template function approach is the more appropriate one, as sketched below.
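
As an illustration only (the names and signatures here are hypothetical, not
Paddle's actual kernel code), a fused `tmp = x + y; z = Relu(tmp)` could be
expressed with a template functor, so the intermediate result never has to be
written back to memory:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Hypothetical fused functor for z = Relu(x + y).
template <typename T>
struct AddReluFunctor {
  T operator()(T x, T y) const { return std::max(x + y, static_cast<T>(0)); }
};

// One loop applies the fused functor elementwise; the intermediate
// `tmp = x + y` stays in a register instead of being stored to global memory.
template <typename T, typename Functor>
void FusedElementwise(const std::vector<T> &x, const std::vector<T> &y,
                      std::vector<T> *z, Functor func) {
  z->resize(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    (*z)[i] = func(x[i], y[i]);
  }
}
```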

#### How to fuse operators that have the same function efficiently?
Take the SGD operator as an example: a training model may have hundreds of parameters and, correspondingly, the same number of SGD operators. The expression (`w = w - lr*w_g`) of those operators is the same, so during training the executor executes this expression hundreds of times on the CPU or another specialized accelerator. If we can fuse these operators and make the addresses of all `w` and all `w_g` contiguous, respectively, we only need to execute the expression once. For some accelerators the kernel-launch time is not negligible, so hundreds of launch-and-execute cycles may cost much more than launching and executing only once. There are usually many operators similar to `SGD` in a DL model, such as `AllReduce` and `FC`. A sketch of the idea follows.
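
A hypothetical sketch (not Paddle's actual operator code) of the idea: if all
parameters and all gradients are packed into one contiguous buffer each, the
hundreds of per-parameter updates collapse into a single loop, i.e. a single
kernel launch on an accelerator:

```cpp
#include <cstddef>
#include <vector>

// Hypothetical fused SGD update over packed parameters: w = w - lr * w_g.
void FusedSGD(std::vector<float> *packed_w,
              const std::vector<float> &packed_w_g, float lr) {
  for (std::size_t i = 0; i < packed_w->size(); ++i) {
    (*packed_w)[i] -= lr * packed_w_g[i];
  }
}
```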

@@ -0,0 +1,40 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/data_type.h"

#include <string>
#include <typeindex>

#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"

TEST(DataType, float16) {
  using paddle::framework::Tensor;
  using paddle::platform::CPUPlace;
  using paddle::platform::float16;
  namespace f = paddle::framework;
  f::proto::VarType::Type dtype = f::proto::VarType::FP16;

  Tensor tensor;
  CPUPlace cpu;
  tensor.mutable_data(cpu, f::ToTypeIndex(dtype));

  // test fp16 tensor
  EXPECT_EQ(tensor.type(), std::type_index(typeid(float16)));

  // test fp16 size
  EXPECT_EQ(f::SizeOfType(f::ToTypeIndex(dtype)), 2u);

  // test debug info
  std::string type = "float16";
  EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str());
}
Some files were not shown because too many files have changed in this diff.