[Feature] Lite subgraph (#22114)

release/1.7
石晓伟 5 years ago committed by GitHub
parent 7d10edc5ee
commit ad0dfb17c1

@ -86,6 +86,7 @@ option(WITH_HIGH_LEVEL_API_TEST "Test fluid python high-level api interface"
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE})
option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF)
option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF)
# PY_VERSION
if(NOT PY_VERSION)

@ -0,0 +1,87 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LINUX OR NOT WITH_MKL)
message("Paddle-lite will not build because the required Linux and MKL do not exist.")
set(WITH_LITE OFF)
return()
endif()
if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
include(ExternalProject)
set(LITE_PROJECT extern_lite)
set(LITE_SOURCES_DIR ${THIRD_PARTY_PATH}/lite)
set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
# No quotes, so cmake can resolve it as a command with arguments.
set(LITE_BUILD_COMMAND $(MAKE) -j)
set(LITE_OPTIONAL_ARGS -DWITH_MKL=ON
-DLITE_WITH_CUDA=${WITH_GPU}
-DWITH_MKLDNN=OFF
-DLITE_WITH_X86=ON
-DLITE_WITH_PROFILE=OFF
-DWITH_LITE=OFF
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF
-DWITH_PYTHON=OFF
-DWITH_TESTING=ON
-DLITE_BUILD_EXTRA=ON
-DCUDNN_ROOT=${CUDNN_ROOT}
-DLITE_WITH_ARM=OFF)
ExternalProject_Add(
${LITE_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/PaddlePaddle/Paddle-Lite.git"
GIT_TAG 947cda26637d46dc23f4e39d2b52e7d9a1fa6eef
PREFIX ${LITE_SOURCES_DIR}
UPDATE_COMMAND ""
BUILD_COMMAND ${LITE_BUILD_COMMAND}
INSTALL_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${LITE_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
${LITE_OPTIONAL_ARGS}
)
ExternalProject_Get_property(${LITE_PROJECT} BINARY_DIR)
ExternalProject_Get_property(${LITE_PROJECT} SOURCE_DIR)
set(LITE_BINARY_DIR ${BINARY_DIR})
set(LITE_SOURCE_DIR ${SOURCE_DIR})
endif()
message(STATUS "Paddle-lite BINARY_DIR: ${LITE_BINARY_DIR}")
message(STATUS "Paddle-lite SOURCE_DIR: ${LITE_SOURCE_DIR}")
include_directories(${LITE_SOURCE_DIR})
include_directories(${LITE_BINARY_DIR})
function(external_lite_static_libs alias path)
add_library(${alias} STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET ${alias} PROPERTY IMPORTED_LOCATION
${path})
if (LITE_PROJECT)
add_dependencies(${alias} ${LITE_PROJECT})
endif()
endfunction()
external_lite_static_libs(lite_full_static ${LITE_BINARY_DIR}/lite/api/libapi_full_static.a)
add_definitions(-DPADDLE_WITH_LITE)

@ -207,11 +207,6 @@ if(LINUX)
${GPU_COMMON_FLAGS})
endif(LINUX)
if(UNIX AND NOT APPLE)
# Treat all UNIX systems except Apple as Linux.
set(LINUX TRUE)
endif(UNIX AND NOT APPLE)
foreach(flag ${COMMON_FLAGS})
safe_set_cflag(CMAKE_C_FLAGS ${flag})
safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})

@ -20,6 +20,11 @@
# for instance, protobuf libs path is <install_dir>/lib64
# on CentOS, but <install_dir>/lib on other systems.
if(UNIX AND NOT APPLE)
# Treat all UNIX systems except Apple as Linux.
set(LINUX TRUE)
endif(UNIX AND NOT APPLE)
IF(WIN32)
SET(HOST_SYSTEM "win32")
ELSE(WIN32)

@ -284,4 +284,8 @@ if(WITH_DGC)
list(APPEND third_party_deps extern_dgc)
endif()
if (WITH_LITE)
include(external/lite)
endif (WITH_LITE)
add_custom_target(third_party DEPENDS ${third_party_deps})

@ -21,6 +21,10 @@ if (ANAKIN_SUBGRAPH)
add_subdirectory(anakin)
endif()
if (WITH_LITE)
add_subdirectory(lite)
endif()
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
get_property(fluid_third_partys GLOBAL PROPERTY FLUID_THRID_PARTYS)

@ -197,6 +197,12 @@ struct Argument {
DECL_ARGUMENT_FIELD(anakin_ops_filter, AnakinOpsFilter,
std::vector<std::string>);
DECL_ARGUMENT_FIELD(lite_passes_filter, LitePassesFilter,
std::vector<std::string>);
DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector<std::string>);
DECL_ARGUMENT_FIELD(lite_precision_mode, LitePrecisionMode,
AnalysisConfig::Precision);
// Memory optimized related.
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);

@ -128,6 +128,17 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("program",
new framework::ProgramDesc *(&argument->main_program()));
}
if (pass_name == "lite_subgraph_pass") {
bool enable_int8 =
argument->lite_precision_mode() == AnalysisConfig::Precision::kInt8;
pass->Set("program",
new framework::ProgramDesc *(&argument->main_program()));
pass->Set("lite_ops_filter",
new std::vector<std::string>(argument->lite_ops_filter()));
pass->Set("predictor_id", new int(argument->predictor_id()));
pass->Set("enable_int8", new bool(enable_int8));
pass->Set("use_gpu", new bool(argument->use_gpu()));
}
if (pass_name == "anakin_subgraph_pass") {
pass->Set("program",
new framework::ProgramDesc *(&argument->main_program()));
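
The attribute names registered here are the contract between the analysis Argument and the pass. Below is a hypothetical sketch only -- the actual lite_subgraph_pass.cc is not part of this excerpt -- illustrating how the pass would read the same keys back through framework::ir::Pass::Get<T>():

#include "paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h"
namespace paddle {
namespace inference {
namespace analysis {
// Hypothetical sketch: only the attribute keys are taken from the Set() calls above.
void LiteSubgraphPass::ApplyImpl(framework::ir::Graph* graph) const {
  bool enable_int8 = Get<bool>("enable_int8");
  bool use_gpu = Get<bool>("use_gpu");
  int predictor_id = Get<int>("predictor_id");
  const auto& ops_filter = Get<std::vector<std::string>>("lite_ops_filter");
  // ... fuse supported subgraphs and configure the Lite engine with these ...
}
}  // namespace analysis
}  // namespace inference
}  // namespace paddle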

@ -23,3 +23,12 @@ if (ANAKIN_SUBGRAPH)
file(APPEND ${pass_file} "USE_PASS(anakin_subgraph_pass);\n")
set(INFER_IR_PASSES ${INFER_IR_PASSES} anakin_subgraph_pass CACHE INTERNAL "")
endif()
if (WITH_LITE)
cc_library(lite_subgraph_pass SRCS lite_subgraph_pass.cc DEPS ${analysis_deps} subgraph_util lite_op_teller)
set(analysis_deps ${analysis_deps} subgraph_util lite_subgraph_pass CACHE INTERNAL "")
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
file(APPEND ${pass_file} "USE_PASS(lite_subgraph_pass);\n")
set(INFER_IR_PASSES ${INFER_IR_PASSES} lite_subgraph_pass CACHE INTERNAL "")
cc_test(lite_subgraph_pass_tester SRCS lite_subgraph_pass_tester.cc DEPS lite_subgraph_pass gtest glog)
endif()

@ -0,0 +1,45 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <paddle/fluid/framework/ir/fuse_pass_base.h>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
namespace paddle {
namespace inference {
namespace analysis {
class LiteSubgraphPass : public framework::ir::FusePassBase {
public:
void ApplyImpl(framework::ir::Graph* graph) const override;
private:
void BuildOperator(framework::ir::Node* merged_node,
framework::ProgramDesc* global_program,
std::vector<std::string>* repetitive_params) const;
void SetUpEngine(framework::ProgramDesc* program,
const std::vector<std::string>& repetitive_params,
const std::string& unique_key,
bool dump_model = false) const;
};
} // namespace analysis
} // namespace inference
} // namespace paddle

@ -0,0 +1,59 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/inference/lite/op_teller.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace lite {
void StrToBinaryFile(const std::string& path, const std::string& str);
void ModifyHostSubgraphOps(framework::ProgramDesc* host_program,
framework::BlockDesc* host_sub_block,
const std::vector<framework::OpDesc*>& subgraph_ops);
void AppendLiteSubBlocks(const std::vector<framework::OpDesc*>& subgraph_ops,
framework::ProgramDesc* engine_program,
framework::ProgramDesc* host_program,
const int32_t host_sub_id);
}
TEST(LiteSubgraphPass, basic) {
framework::ProgramDesc host_program;
framework::ProgramDesc engine_program;
framework::BlockDesc* host_main_block = host_program.MutableBlock(0);
framework::BlockDesc* host_sub_block =
host_program.AppendBlock(*host_main_block);
framework::OpDesc* host_while_op = host_main_block->AppendOp();
host_main_block->Var("var_main");
host_sub_block->Var("var_sub");
host_while_op->SetType("while");
host_while_op->SetAttr("sub_block", host_sub_block);
framework::OpDesc* host_sub_block_op = host_sub_block->AppendOp();
host_sub_block_op->SetType("leaky_relu");
CHECK(inference::lite::OpTeller::Global().Tell("while", *host_while_op))
<< "Lite operator teller test failed.";
lite::AppendLiteSubBlocks({host_while_op}, &engine_program, &host_program,
host_sub_block->ID());
lite::ModifyHostSubgraphOps(&host_program, host_sub_block, {host_while_op});
lite::StrToBinaryFile("./", "test");
}
} // namespace analysis
} // namespace inference
} // namespace paddle

@ -26,7 +26,7 @@ namespace analysis {
using framework::ir::Node;
std::vector<std::string> ExtractParameters(
const std::unordered_set<Node *> &nodes) {
const std::unordered_set<Node *> &nodes, bool sorted) {
// We can judge whether a variable is a parameter by
// its persistable property, but sometimes the persistable
// of the feed op output is true, so we have to identify it.
@ -50,9 +50,59 @@ std::vector<std::string> ExtractParameters(
parameters.push_back(node->Name());
}
}
if (sorted) {
std::sort(parameters.begin(), parameters.end());
parameters.erase(std::unique(parameters.begin(), parameters.end()),
parameters.end());
}
return parameters;
}
std::unordered_set<Node *> GetRelatedIOVarNodes(
const std::vector<Node *> &nodes) {
std::unordered_set<Node *> io_nodes;
for (const auto &node : nodes) {
if (!node->IsOp()) continue;
for (const auto &in : node->inputs) {
io_nodes.insert(in);
}
for (const auto &out : node->outputs) {
io_nodes.insert(out);
}
}
return io_nodes;
}
void PrependFeedOps(framework::BlockDesc *global_block,
const std::vector<std::string> &feed_target_names,
std::string feed_holder_name) {
framework::VarDesc *feed_var = global_block->Var(feed_holder_name);
feed_var->SetType(paddle::framework::proto::VarType::FEED_MINIBATCH);
feed_var->SetPersistable(true);
for (size_t i = 0; i < feed_target_names.size(); i++) {
framework::OpDesc *feed_op = global_block->AppendOp();
feed_op->SetType("feed");
feed_op->SetInput("X", {feed_holder_name});
feed_op->SetOutput("Out", {feed_target_names[i]});
feed_op->SetAttr("col", static_cast<int>(i));
}
}
void PrependFetchOps(framework::BlockDesc *global_block,
const std::vector<std::string> &fetch_target_names,
std::string fetch_holder_name) {
framework::VarDesc *fetch_var = global_block->Var(fetch_holder_name);
fetch_var->SetType(paddle::framework::proto::VarType::FETCH_LIST);
fetch_var->SetPersistable(true);
for (size_t i = 0; i < fetch_target_names.size(); i++) {
framework::OpDesc *fetch_op = global_block->AppendOp();
fetch_op->SetType("fetch");
fetch_op->SetInput("X", {fetch_target_names[i]});
fetch_op->SetOutput("Out", {fetch_holder_name});
fetch_op->SetAttr("col", static_cast<int>(i));
}
}
void RenameAndGetOutputs(
const std::vector<framework::ir::Node *> &subgraph_nodes,
framework::BlockDesc *block_desc,
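
As a usage sketch, PrependFeedOps and PrependFetchOps wrap an existing block with the feed/fetch plumbing the executor expects; the default holder names come from the header below. The wrapper function and variable names are illustrative:

#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"

void BuildFeedFetchExample() {
  paddle::framework::ProgramDesc program;
  paddle::framework::BlockDesc* block = program.MutableBlock(0);
  block->Var("x");    // hypothetical input variable
  block->Var("out");  // hypothetical output variable
  // Adds a persistable "feed" holder var plus one feed op per target.
  paddle::inference::analysis::PrependFeedOps(block, {"x"});
  // Adds a persistable "fetch" holder var plus one fetch op per target.
  paddle::inference::analysis::PrependFetchOps(block, {"out"});
}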

@ -30,10 +30,21 @@ limitations under the License. */
namespace paddle {
namespace inference {
namespace analysis {
using framework::ir::Node;
std::vector<std::string> ExtractParameters(
const std::unordered_set<Node *> &nodes);
const std::unordered_set<framework::ir::Node *> &nodes,
bool sorted = false);
std::unordered_set<framework::ir::Node *> GetRelatedIOVarNodes(
const std::vector<framework::ir::Node *> &nodes);
void PrependFeedOps(framework::BlockDesc *global_block,
const std::vector<std::string> &feed_target_names,
std::string feed_holder_name = "feed");
void PrependFetchOps(framework::BlockDesc *global_block,
const std::vector<std::string> &fetch_target_names,
std::string fetch_holder_name = "fetch");
void RenameAndGetOutputs(
const std::vector<framework::ir::Node *> &subgraph_nodes,

@ -23,6 +23,7 @@
namespace paddle {
extern const std::vector<std::string> kTRTSubgraphPasses;
extern const std::vector<std::string> kAnakinSubgraphPasses;
extern const std::vector<std::string> kLiteSubgraphPasses;
PassStrategy *AnalysisConfig::pass_builder() const {
if (!pass_builder_.get()) {
@ -128,6 +129,11 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(anakin_passes_filter_);
CP_MEMBER(anakin_ops_filter_);
CP_MEMBER(use_lite_);
CP_MEMBER(lite_precision_mode_);
CP_MEMBER(lite_passes_filter_);
CP_MEMBER(lite_ops_filter_);
// profile related.
CP_MEMBER(with_profile_);
@ -351,6 +357,20 @@ void AnalysisConfig::Update() {
}
}
if (use_lite_) {
#ifndef PADDLE_WITH_LITE
LOG(WARNING) << "You tried to enable the Lite subgraph engine, "
"but Paddle was not compiled with -DWITH_LITE=ON.";
#endif
pass_builder()->ClearPasses();
for (const auto &pass : kLiteSubgraphPasses) {
if (std::find(lite_passes_filter_.begin(), lite_passes_filter_.end(),
pass) == lite_passes_filter_.end()) {
pass_builder()->AppendPass(pass);
}
}
}
if (ir_debug_) {
pass_builder()->TurnOnDebug();
}
@ -395,6 +415,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << cpu_math_library_num_threads_;
ss << use_anakin_;
ss << anakin_min_subgraph_size_;
ss << use_lite_;
return ss.str();
}
@ -484,6 +506,17 @@ void AnalysisConfig::EnableAnakinEngine(
Update();
}
void AnalysisConfig::EnableLiteEngine(
AnalysisConfig::Precision precision_mode,
const std::vector<std::string> &passes_filter,
const std::vector<std::string> &ops_filter) {
use_lite_ = true;
lite_precision_mode_ = precision_mode;
lite_passes_filter_ = passes_filter;
lite_ops_filter_ = ops_filter;
Update();
}
void AnalysisConfig::PartiallyRelease() {
prog_file_.clear();
prog_file_.shrink_to_fit();

@ -428,6 +428,13 @@ void AnalysisPredictor::PrepareArgument() {
LOG(INFO) << "Anakin subgraph engine is enabled";
}
if (config_.lite_engine_enabled()) {
argument_.SetLitePrecisionMode(config_.lite_precision_mode_);
argument_.SetLitePassesFilter(config_.lite_passes_filter_);
argument_.SetLiteOpsFilter(config_.lite_ops_filter_);
LOG(INFO) << "Lite subgraph engine is enabled";
}
if (config_.use_mkldnn_) {
LOG(INFO) << "MKLDNN is enabled";
argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);

@ -171,10 +171,19 @@ struct AnalysisConfig {
std::vector<std::string> passes_filter = {},
std::vector<std::string> ops_filter = {});
void EnableLiteEngine(
AnalysisConfig::Precision precision_mode = Precision::kFloat32,
const std::vector<std::string>& passes_filter = {},
const std::vector<std::string>& ops_filter = {});
/** A boolean state indicating whether the Anakin sub-graph engine is used.
*/
bool anakin_engine_enabled() const { return use_anakin_; }
/** A boolean state indicating whether the Lite sub-graph engine is used.
*/
bool lite_engine_enabled() const { return use_lite_; }
/** \brief Control whether to debug IR graph analysis phase.
*
* This will generate DOT files for visualizing the computation graph after
@ -350,6 +359,11 @@ struct AnalysisConfig {
std::vector<std::string> anakin_passes_filter_;
std::vector<std::string> anakin_ops_filter_;
bool use_lite_{false};
std::vector<std::string> lite_passes_filter_;
std::vector<std::string> lite_ops_filter_;
Precision lite_precision_mode_;
// mkldnn related.
int mkldnn_cache_capacity_{0};
bool use_mkldnn_quantizer_{false};
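
A minimal usage sketch for the new public API; the model directory and the ops_filter entry are hypothetical, while SetModel and CreatePaddlePredictor are the pre-existing inference API:

#include "paddle/fluid/inference/api/paddle_inference_api.h"

void EnableLiteExample() {
  paddle::AnalysisConfig config;
  config.SetModel("./mobilenet_v1");  // hypothetical model directory
  // ops_filter is assumed to list ops kept out of the Lite subgraph.
  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32,
                          /*passes_filter=*/{}, /*ops_filter=*/{"while"});
  auto predictor = paddle::CreatePaddlePredictor(config);
}

Note that when Paddle is built without WITH_LITE, EnableLiteEngine still sets the flags, but Update() only emits the warning added above because kLiteSubgraphPasses is empty in that build.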

@ -105,6 +105,12 @@ const std::vector<std::string> kAnakinSubgraphPasses({
"fc_gru_fuse_pass", //
});
const std::vector<std::string> kLiteSubgraphPasses({
#ifdef PADDLE_WITH_LITE
"lite_subgraph_pass",
#endif
});
GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
passes_.assign({
// "identity_scale_op_clean_pass", //
@ -123,7 +129,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
"conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
#endif //
"transpose_flatten_concat_fuse_pass",
"transpose_flatten_concat_fuse_pass", //
// following pass should be located in the last, since it will
// work on all fused ops.
"runtime_context_cache_pass"

@ -163,5 +163,6 @@ class GpuPassStrategy : public PassStrategy {
extern const std::vector<std::string> kTRTSubgraphPasses;
extern const std::vector<std::string> kAnakinSubgraphPasses;
extern const std::vector<std::string> kLiteSubgraphPasses;
} // namespace paddle

@ -0,0 +1,5 @@
cc_library(lite_op_teller SRCS op_teller.cc DEPS lite_full_static framework_proto device_context boost xxhash)
cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto)
cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost)
cc_test(test_lite_engine SRCS test_engine.cc DEPS lite_engine protobuf framework_proto glog gtest analysis)
cc_test(test_lite_tensor_utils SRCS test_tensor_utils.cc DEPS lite_engine lite_tensor_utils)

@ -0,0 +1,64 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_CUDA
#define LITE_WITH_CUDA 1
#endif
#include "paddle/fluid/inference/lite/engine.h"
#include "lite/core/context.h"
#include "lite/core/device_info.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
namespace paddle {
namespace inference {
namespace lite {
bool EngineManager::Empty() const { return engines_.size() == 0; }
bool EngineManager::Has(const std::string& name) const {
if (engines_.count(name) == 0) {
return false;
}
return engines_.at(name).get() != nullptr;
}
paddle::lite::Predictor* EngineManager::Get(const std::string& name) const {
return engines_.at(name).get();
}
paddle::lite::Predictor* EngineManager::Create(const std::string& name,
const EngineConfig& cfg) {
auto* p = new paddle::lite::Predictor();
#ifdef PADDLE_WITH_CUDA
paddle::lite::Env<TARGET(kCUDA)>::Init();
#endif
p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes,
cfg.model_type, cfg.model_from_memory);
engines_[name].reset(p);
return p;
}
void EngineManager::DeleteAll() {
for (auto& item : engines_) {
item.second.reset(nullptr);
}
}
} // namespace lite
} // namespace inference
} // namespace paddle

@ -0,0 +1,55 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/api/cxx_api.h"
namespace paddle {
namespace inference {
namespace lite {
struct EngineConfig {
std::string model;
std::string param;
paddle::lite::Place prefer_place;
std::vector<paddle::lite::Place> valid_places;
std::vector<std::string> neglected_passes;
lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf};
bool model_from_memory{true};
};
class EngineManager {
public:
bool Empty() const;
bool Has(const std::string& name) const;
paddle::lite::Predictor* Get(const std::string& name) const;
paddle::lite::Predictor* Create(const std::string& name,
const EngineConfig& cfg);
void DeleteAll();
private:
std::unordered_map<std::string, std::unique_ptr<paddle::lite::Predictor>>
engines_;
};
} // namespace lite
} // namespace inference
} // namespace paddle
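
A hedged usage sketch of EngineManager; the engine key and the in-memory model/param buffers are hypothetical, and the Place construction assumes the TARGET/PRECISION macros from Paddle-Lite's public headers (pulled in via cxx_api.h above):

#include "paddle/fluid/inference/lite/engine.h"

void EngineManagerExample(const std::string& model_proto,
                          const std::string& params_blob) {
  paddle::inference::lite::EngineConfig cfg;
  cfg.model = model_proto;  // serialized model, in memory (model_from_memory)
  cfg.param = params_blob;  // serialized parameters
  cfg.valid_places = {paddle::lite::Place(TARGET(kX86), PRECISION(kFloat))};
  paddle::inference::lite::EngineManager manager;
  paddle::lite::Predictor* predictor = manager.Create("engine_0", cfg);
  if (manager.Has("engine_0")) {
    predictor = manager.Get("engine_0");  // reuse the cached predictor
  }
  manager.DeleteAll();
  (void)predictor;
}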

@ -0,0 +1,92 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <map>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/lite/op_teller.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace inference {
namespace lite {
// Just tell by the op_types.
struct SimpleOpTeller : public Teller {
SimpleOpTeller() {
const std::map<std::string, std::string>& op2path =
OpKernelInfoCollector::Global().GetOp2PathDict();
auto is_non_inst = [](const std::string& op) -> bool {
const std::vector<std::string> ops = {"feed", "fetch", "while"};
return std::find(ops.begin(), ops.end(), op) != ops.end();
};
for (const auto& op : op2path) {
if (!is_non_inst(op.first)) {
ops_.insert(op.first);
}
}
}
bool operator()(const std::string& op_type,
const framework::OpDesc& op_desc) override {
return ops_.count(op_type);
}
private:
std::unordered_set<std::string> ops_{};
};
struct SingleBlockOpTeller : public Teller {
SingleBlockOpTeller() { ops_.insert("while"); }
bool operator()(const std::string& op_type,
const framework::OpDesc& op_desc) override {
if (ops_.count(op_type)) {
SimpleOpTeller supported;
const int id = op_desc.GetBlockAttrId("sub_block");
const framework::BlockDesc& block_desc =
op_desc.Block()->Program()->Block(id);
const std::vector<framework::OpDesc*>& ops_sub_block =
block_desc.AllOps();
for (auto* op : ops_sub_block) {
if (!supported(op->Type(), *op) && !this->operator()(op->Type(), *op)) {
return false;
}
}
return true;
}
return false;
}
private:
std::unordered_set<std::string> ops_;
};
bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
for (auto& teller : tellers_) {
if ((*teller)(op_type, desc)) return true;
}
return false;
}
OpTeller::OpTeller() {
tellers_.emplace_back(new SimpleOpTeller);
tellers_.emplace_back(new SingleBlockOpTeller);
}
} // namespace lite
} // namespace inference
} // namespace paddle

@ -0,0 +1,70 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
namespace paddle {
namespace inference {
namespace lite {
/*
* Single Op teller definition.
* One can override this and define a more complex tell logic, considering more
* issues such as op_desc.
*/
struct Teller {
virtual bool operator()(const std::string& op_type,
const framework::OpDesc& desc) = 0;
virtual ~Teller() = default;
};
/*
* A real example:
*
* struct SomeTeller : public Teller {
* bool operator()(const std::string& op_type,
* const framework::OpDesc& desc) override {
* return op_type == "fc" && desc.Inputs().size() == 2;
* }
*};
*/
/*
* class OpTeller helps to tell whether a fluid
* operator is supported by the Lite engine.
*/
class OpTeller {
public:
static OpTeller& Global() {
static std::unique_ptr<OpTeller> x(new OpTeller);
return *x;
}
bool Tell(const std::string& op_type, const framework::OpDesc& desc);
private:
OpTeller();
private:
std::vector<std::unique_ptr<Teller>> tellers_;
};
} // namespace lite
} // namespace inference
} // namespace paddle
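
A short usage sketch mirroring the call made in lite_subgraph_pass_tester.cc earlier in this diff; the op setup is illustrative:

#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/lite/op_teller.h"

void OpTellerExample() {
  paddle::framework::ProgramDesc program;
  paddle::framework::OpDesc* op = program.MutableBlock(0)->AppendOp();
  op->SetType("leaky_relu");
  // Returns true if any registered Teller accepts the op.
  bool supported =
      paddle::inference::lite::OpTeller::Global().Tell(op->Type(), *op);
  (void)supported;
}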

@ -0,0 +1,181 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/lite/tensor_utils.h"
#include <map>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/inference/lite/engine.h"
namespace paddle {
namespace inference {
namespace lite {
namespace utils {
using paddle::lite_api::TargetType;
using paddle::lite_api::PrecisionType;
using paddle::lite_api::DataLayoutType;
template <typename DstLoD, typename SrcLoD>
void SetLoD(DstLoD* dst, const SrcLoD& src) {
dst->reserve(src.size());
dst->clear();
for (auto&& v : src) {
dst->emplace_back(v);
}
}
template void SetLoD<paddle::lite::LoD, framework::LoD>(
paddle::lite::LoD* dst, const framework::LoD& src);
template void SetLoD<framework::LoD, paddle::lite::LoD>(
framework::LoD* dst, const paddle::lite::LoD& src);
platform::Place GetNativePlace(const TargetType& type, int id = 0) {
switch (type) {
case TargetType::kHost:
case TargetType::kX86:
return platform::CPUPlace();
case TargetType::kCUDA:
return platform::CUDAPlace(id);
default:
LOG(FATAL) << "Error target type.";
return platform::Place();
}
}
TargetType GetLiteTargetType(const platform::Place& place) {
if (platform::is_cpu_place(place)) {
return TargetType::kHost;
}
return TargetType::kCUDA;
}
PrecisionType GetLitePrecisionType(framework::proto::VarType::Type type) {
switch (type) {
case framework::proto::VarType_Type_FP32:
return PrecisionType::kFloat;
case framework::proto::VarType_Type_INT8:
return PrecisionType::kInt8;
case framework::proto::VarType_Type_INT32:
return PrecisionType::kInt32;
case framework::proto::VarType_Type_INT64:
return PrecisionType::kInt64;
default:
LOG(FATAL) << "Error precision type.";
return PrecisionType::kUnk;
}
}
framework::proto::VarType::Type GetNativePrecisionType(
const PrecisionType& type) {
switch (type) {
case PrecisionType::kFloat:
return framework::proto::VarType_Type_FP32;
case PrecisionType::kInt8:
return framework::proto::VarType_Type_INT8;
case PrecisionType::kInt32:
return framework::proto::VarType_Type_INT32;
case PrecisionType::kInt64:
return framework::proto::VarType_Type_INT64;
default:
LOG(FATAL) << "Error precision type.";
return static_cast<framework::proto::VarType::Type>(-1);
}
}
framework::DataLayout GetNativeLayoutType(const DataLayoutType& type) {
switch (type) {
case DataLayoutType::kNCHW:
return framework::DataLayout::kNCHW;
default:
LOG(FATAL) << "Error layout type.";
return static_cast<framework::DataLayout>(-1);
}
}
void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data,
const platform::Place& src_place, const void* src_data,
const size_t size, const platform::DeviceContext& ctx) {
const platform::CPUPlace cpu_place;
if (platform::is_cpu_place(dst_place) && platform::is_cpu_place(src_place)) {
memory::Copy(cpu_place, dst_data, cpu_place, src_data, size);
} else {
#ifdef PADDLE_WITH_CUDA
if (platform::is_cpu_place(dst_place) &&
platform::is_gpu_place(src_place)) {
LOG(FATAL) << "lite::MemoryCopy GPU->CPU is not yet implemented.";
} else if (platform::is_gpu_place(dst_place) &&
platform::is_cpu_place(src_place)) {
LOG(FATAL) << "lite::MemoryCopy CPU->GPU is not yet implemented.";
} else if (platform::is_gpu_place(dst_place) &&
platform::is_gpu_place(src_place)) {
auto gpu_place = boost::get<platform::CUDAPlace>(src_place);
memory::Copy(
gpu_place, dst_data, gpu_place, src_data, size,
static_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
#else
LOG(FATAL) << "You must define PADDLE_WITH_CUDA for using CUDAPlace.";
#endif
}
}
void InitDstTensor(paddle::lite::Tensor* dst, const framework::LoDTensor& src) {
// Currently, Lite requires the target type of the input tensor to be
// specified explicitly.
constexpr int empty_size = 0;
dst->mutable_data(GetLiteTargetType(src.place()), empty_size);
dst->set_precision(GetLitePrecisionType(src.type()));
SetLoD(dst->mutable_lod(), src.lod());
}
void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) {
constexpr framework::proto::VarType::Type dtype =
framework::proto::VarType_Type_FP32;
dst->mutable_data(inference::lite::utils::GetNativePlace(src.target()),
dtype);
SetLoD(dst->mutable_lod(), src.lod());
}
template <>
void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src,
const platform::DeviceContext& ctx) {
InitDstTensor(dst, src);
const platform::Place& src_place = src.place();
const platform::Place& dst_place = GetNativePlace(dst->target());
const size_t bytes =
static_cast<size_t>(src.numel()) * framework::SizeOfType(src.type());
dst->Resize(framework::vectorize(src.dims()));
const void* src_data = src.data<void>();
void* dst_data = dst->mutable_data(bytes);
MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx);
}
template <>
void TensorCopyAsync(framework::LoDTensor* dst, const paddle::lite::Tensor& src,
const platform::DeviceContext& ctx) {
InitDstTensor(dst, src);
const platform::Place& src_place = GetNativePlace(src.target());
const platform::Place& dst_place = dst->place();
dst->Resize(paddle::framework::make_ddim(src.dims().Vectorize()));
const size_t bytes =
static_cast<size_t>(src.numel()) * framework::SizeOfType(dst->type());
const void* src_data = src.raw_data();
// Once Lite exposes the source tensor's precision, it should be used here
// instead of the destination type.
void* dst_data = dst->mutable_data(dst_place, dst->type());
MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx);
}
} // namespace utils
} // namespace lite
} // namespace inference
} // namespace paddle
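
A hedged usage sketch of the copy helpers; it assumes tensor_utils.h declares the TensorCopyAsync primary template that the two specializations above implement, and the tensor shape is illustrative:

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/lite/tensor_utils.h"
#include "paddle/fluid/platform/device_context.h"

void TensorCopyExample() {
  paddle::framework::LoDTensor fluid_tensor;
  fluid_tensor.Resize(paddle::framework::make_ddim({2, 3}));
  fluid_tensor.mutable_data<float>(paddle::platform::CPUPlace());
  paddle::lite::Tensor lite_tensor;
  paddle::platform::CPUDeviceContext ctx;
  // Copies dims, LoD, precision metadata and the underlying buffer (CPU->CPU here).
  paddle::inference::lite::utils::TensorCopyAsync(&lite_tensor, fluid_tensor, ctx);
}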

Some files were not shown because too many files have changed in this diff.