parent 379b471ee2
commit c3e1fb5a3e
@@ -0,0 +1,184 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

/*
 * This file contains a simple demo showing how to load a model and run
 * inference with it.
 */
#include <cassert>
#include <cctype>
#include <cstdlib>  // system()

#include <algorithm>
#include <iostream>
#include <iterator>
#include <memory>
#include <sstream>
#include <string>
#include <thread>  // NOLINT
#include <vector>

#include "paddle/fluid/inference/paddle_inference_api.h"

std::string DIRNAME = "";  /* Directory of the inference model. */  // NOLINT
bool USE_GPU = false;      /* Whether to use the GPU. */

auto message_err = []() {
  std::cout << "Copyright (c) 2018 PaddlePaddle Authors." << std::endl;
  std::cout << "Demo case for Windows inference. "
            << "\n"
            << "Usage: enter the model path and use_gpu as prompted, "
            << "then the demo will run inference and print the result."
            << std::endl;
  std::cout << std::endl;
};

void ParseArgs() {
  message_err();
  std::cout << "DIRNAME:[D:/Paddle/xxx/path_to_model_dir]" << std::endl;
  std::cin >> DIRNAME;
  std::cout << "USE_GPU:[yes|no]";
  std::string value;
  std::cin >> value;
  std::transform(value.begin(), value.end(), value.begin(), ::toupper);
  USE_GPU = (value == "YES");
}

namespace paddle {
namespace demo {
std::string ToString(const NativeConfig& config) {
  std::stringstream ss;
  ss << "Use GPU : " << (config.use_gpu ? "True" : "False") << "\n"
     << "Device : " << config.device << "\n"
     << "fraction_of_gpu_memory : " << config.fraction_of_gpu_memory << "\n"
     << "specify_input_name : "
     << (config.specify_input_name ? "True" : "False") << "\n"
     << "Program File : " << config.prog_file << "\n"
     << "Param File : " << config.param_file;
  return ss.str();
}

void Main(bool use_gpu) {
  //# 1. Create PaddlePredictor with a config.
  NativeConfig config;
  config.model_dir = DIRNAME;
  config.use_gpu = use_gpu;  // Honor the argument rather than the global flag.
  config.fraction_of_gpu_memory = 0.15;
  config.device = 0;
  std::cout << ToString(config) << std::endl;
  auto predictor =
      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
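  // PaddleEngineKind::kNative selects the plain (native Fluid) execution
  // engine, without engine-specific graph optimization.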

  for (int batch_id = 0; batch_id < 3; batch_id++) {
    //# 2. Prepare input.
    int64_t data[4] = {1, 2, 3, 4};

    PaddleTensor tensor;
    tensor.shape = std::vector<int>({4, 1});
    tensor.data = PaddleBuf(data, sizeof(data));
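    // Note: this PaddleBuf constructor wraps the existing `data` buffer rather
    // than copying it, so the buffer must stay valid until Run() returns.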
    tensor.dtype = PaddleDType::INT64;

    // For simplicity, we set all the slots with the same data.
    std::vector<PaddleTensor> slots(4, tensor);

    //# 3. Run
    std::vector<PaddleTensor> outputs;
    assert(predictor->Run(slots, &outputs) == true &&
           "predictor->Run should succeed");

    //# 4. Get output.
    assert(outputs.size() == 1UL);
    // Check the output buffer size and the first few results.
    assert(outputs.front().data.length() == 33168UL);
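    // The expected length (33168 bytes) and the reference values below are
    // specific to the demo model in DIRNAME; other models will not pass these
    // checks.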
    float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
                       0.000932706};
    const size_t num_elements = outputs.front().data.length() / sizeof(float);
    // The outputs' buffers are in CPU memory.
    for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements);
         i++) {
      assert(static_cast<float*>(outputs.front().data.data())[i] == result[i]);
      std::cout << "expected output "
                << static_cast<float*>(outputs.front().data.data())[i]
                << std::endl;
    }
  }
}

void MainThreads(int num_threads, bool use_gpu) {
  // Multi-threading is only supported on CPU.
  // 0. Create PaddlePredictor with a config.
  NativeConfig config;
  config.model_dir = DIRNAME;
  config.use_gpu = use_gpu;
  config.fraction_of_gpu_memory = 0.15;
  config.device = 0;
  std::cout << ToString(config) << std::endl;
  auto main_predictor =
      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);

  std::vector<std::thread> threads;
  for (int tid = 0; tid < num_threads; ++tid) {
    threads.emplace_back([&, tid]() {
      // 1. clone a predictor which shares the same parameters
      auto predictor = main_predictor->Clone();
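      // Each thread works on its own clone: the clone shares the loaded
      // parameters with main_predictor, so the threads do not need to share
      // (and lock around) a single predictor instance.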
      constexpr int num_batches = 3;
      for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
        // 2. Dummy Input Data
        int64_t data[4] = {1, 2, 3, 4};
        PaddleTensor tensor;
        tensor.shape = std::vector<int>({4, 1});
        tensor.data = PaddleBuf(data, sizeof(data));
        tensor.dtype = PaddleDType::INT64;

        std::vector<PaddleTensor> inputs(4, tensor);
        std::vector<PaddleTensor> outputs;
        // 3. Run
        assert(predictor->Run(inputs, &outputs) == true);

        // 4. Get output.
        assert(outputs.size() == 1UL);
        // Check the output buffer size and result of each tid.
        assert(outputs.front().data.length() == 33168UL);
        float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
                           0.000932706};
        const size_t num_elements =
            outputs.front().data.length() / sizeof(float);
        // The outputs' buffers are in CPU memory.
        for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements);
             i++) {
          assert(static_cast<float*>(outputs.front().data.data())[i] ==
                 result[i]);
        }
      }
    });
  }
  for (int i = 0; i < num_threads; ++i) {
    threads[i].join();
  }
}

}  // namespace demo
}  // namespace paddle

int main(int argc, char** argv) {
  // ParseArgs();
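  // The demo uses hard-coded settings instead of the interactive ParseArgs();
  // DIRNAME must point at an inference model directory (here the relative
  // path "./icnet", resolved against the working directory).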
  DIRNAME = "./icnet";
  USE_GPU = true;
  paddle::demo::Main(false /* use_gpu */);
  paddle::demo::MainThreads(1, false /* use_gpu */);
  paddle::demo::MainThreads(4, false /* use_gpu */);
  if (USE_GPU) {
    paddle::demo::Main(true /* use_gpu */);
    paddle::demo::MainThreads(1, true /* use_gpu */);
    paddle::demo::MainThreads(4, true /* use_gpu */);
  }
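  // system("pause") keeps the console window open on Windows so the output
  // can be read before the window closes.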
  system("pause");
  return 0;
}