Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_python36and37_to_paddle_build

test=develop
7 years ago · d68b9ede44
parent 255cc1eb65 ae7d22862b
commit d68b9ede44
65 changed files with 1870 additions and 729 deletions
--- a/AUTHORS.md
+++ b/AUTHORS.md
@ -25,6 +25,7 @@
 | kexinzhao | Ke-Xin Zhao |
 | kuke | Yi-Bing Liu |
 | lcy-seso | Ying Cao |
+| cjld | Dun Liang |
 | lipeng-unisound | Peng Li |
 | liuyuan | Yuan Liu |
 | livc | Zhao Li |
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -130,6 +130,21 @@ if (APPLE OR WIN32)
        "Disable MKL for building on mac and windows" FORCE)
 endif()

+if (WIN32)
+    set(WITH_AVX OFF CACHE STRING
+            "Disable AVX when compiling for Windows" FORCE)
+    set(WITH_DSO OFF CACHE STRING
+            "Disable DSO when compiling for Windows" FORCE)
+    set(WITH_MKL OFF CACHE STRING
+            "Disable MKL when compiling for Windows" FORCE)
+    set(WITH_DISTRIBUTE OFF CACHE STRING
+            "Disable DISTRIBUTE when compiling for Windows" FORCE)
+    set(WITH_C_API OFF CACHE STRING
+            "Disable C_API when compiling for Windows" FORCE)
+    set(WITH_FLUID_ONLY ON CACHE STRING
+            "Enable FLUID_ONLY when compiling for Windows" FORCE)
+endif()
+
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
  "A path setting third party libraries download & build directories.")

@ -190,11 +205,11 @@ include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/cub)
 include(external/xxhash)    # download xxhash
-
-if (NOT WIN32)
-# there is no official support of snappystream, warpctc, nccl, cupti in windows
 include(external/snappy)    # download snappy
 include(external/snappystream) # download snappystream
+
+if (NOT WIN32)
+# there is no official support of warpctc, nccl, cupti in windows
 include(external/warpctc)   # download, build, install warpctc
 include(cupti)
 endif (NOT WIN32)
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@ -50,7 +50,11 @@ IF(WITH_TESTING)
        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                        -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                        -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                        -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                        -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                        -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                        -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
                        -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                        -DBUILD_GMOCK=ON
--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
@ -24,7 +24,11 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
 set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
 set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)

-set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+if (WIN32)
+    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib")
+else(WIN32)
+    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+endif (WIN32)

 ExternalProject_Add(
    extern_snappy
@ -34,8 +38,12 @@ ExternalProject_Add(
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                    -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
                    -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
@ -18,36 +18,45 @@ ENDIF()

 include (ExternalProject)

-# NOTE: snappy is needed when linking with recordio
-
 set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
 set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
 set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE)

-set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
-
-ExternalProject_Add(
-        extern_snappystream
-        GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
-        GIT_TAG "0.2.8"
-        PREFIX          ${SNAPPYSTREAM_SOURCES_DIR}
-        UPDATE_COMMAND  ""
-        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                        -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-                        -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                        -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-                        -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                        -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
-                        ${EXTERNAL_OPTIONAL_ARGS}
-                        CMAKE_CACHE_ARGS
-                        -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
-                        -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
-                        -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-        DEPENDS snappy
-)
+if(WIN32)
+    # Fix me, VS2015 come without VLA support
+    set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/snappystream.lib")
+    MESSAGE(WARNING, "In windows, snappystream has no compile support for windows,
+    please build it manually and put it at " ${SNAPPYSTREAM_INSTALL_DIR})
+else(WIN32)
+    set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
+
+    ExternalProject_Add(
+            extern_snappystream
+            GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
+            GIT_TAG "0.2.8"
+            PREFIX          ${SNAPPYSTREAM_SOURCES_DIR}
+            UPDATE_COMMAND  ""
+            CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+                            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                            -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                            -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                            -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+                            -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                            -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                            -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+                            -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
+                            -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
+                            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                            -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                            -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
+                            ${EXTERNAL_OPTIONAL_ARGS}
+                            CMAKE_CACHE_ARGS
+                            -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
+                            -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
+                            -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+            DEPENDS snappy
+    )
+endif(WIN32)

 add_library(snappystream STATIC IMPORTED GLOBAL)
 set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES})
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@ -351,6 +351,9 @@ function(cc_test TARGET_NAME)
    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    if(WIN32)
+      target_link_libraries(${TARGET_NAME} shlwapi)
+    endif(WIN32)
    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
    add_test(NAME ${TARGET_NAME}
             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@ -84,9 +84,7 @@ function(op_library TARGET)
    endif()
    if (WIN32)
    # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
-    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
-     "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
-      "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
+    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op")
        if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
          return()
        endif()
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@ -57,43 +57,46 @@ int main()
    return 0;
 }" SSE3_FOUND)

-# Check AVX
-set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
-set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
-    __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
-    __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
-    __m256 result = _mm256_add_ps (a, b);
-    return 0;
-}" AVX_FOUND)
+# disable AVX by default on windows
+if(NOT WIN32)
+    # Check AVX
+    set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
+    set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+    CHECK_CXX_SOURCE_RUNS("
+    #include <immintrin.h>
+    int main()
+    {
+        __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
+        __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
+        __m256 result = _mm256_add_ps (a, b);
+        return 0;
+    }" AVX_FOUND)

-# Check AVX 2
-set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
-set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
-    __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
-    __m256i result = _mm256_abs_epi32 (a);
-    return 0;
-}" AVX2_FOUND)
+    # Check AVX 2
+    set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
+    set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+    CHECK_CXX_SOURCE_RUNS("
+    #include <immintrin.h>
+    int main()
+    {
+        __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
+        __m256i result = _mm256_abs_epi32 (a);
+        return 0;
+    }" AVX2_FOUND)

-# Check AVX512F
-set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
-set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
-    __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
-                                  13, -5, 6, -7, 9, 2, -6, 3);
-    __m512i result = _mm512_abs_epi32 (a);
-    return 0;
-}" AVX512F_FOUND)
+    # Check AVX512F
+    set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
+    set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+    CHECK_CXX_SOURCE_RUNS("
+    #include <immintrin.h>
+    int main()
+    {
+        __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
+                                      13, -5, 6, -7, 9, 2, -6, 3);
+        __m512i result = _mm512_abs_epi32 (a);
+        return 0;
+    }" AVX512F_FOUND)
+endif(NOT WIN32)

 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
 mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -103,6 +103,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
+paddle.fluid.layers.group_norm ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None))
 paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, False, False))
 paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@ -3,13 +3,9 @@ add_subdirectory(platform)
 add_subdirectory(framework)
 add_subdirectory(operators)
 add_subdirectory(string)
-
-add_subdirectory(pybind)
-if (NOT WIN32)
 add_subdirectory(recordio)
-endif(NOT WIN32)
+add_subdirectory(pybind)

 # NOTE: please add subdirectory inference at last.
 add_subdirectory(inference)
-
 add_subdirectory(train)
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -31,9 +31,7 @@ function(windows_symbolic TARGET)
 endfunction()

 add_subdirectory(ir)
-if (NOT WIN32)
 add_subdirectory(details)
-endif (NOT WIN32)
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)

@ -68,11 +66,7 @@ if(WITH_GPU)
 else()
  cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
 endif()
-if (NOT WIN32)
-  cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
-else()
-  cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
-endif (NOT WIN32)
+cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)

 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
@ -122,13 +116,8 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)

-if (NOT WIN32)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
    shape_inference data_transform lod_tensor profiler)
-else()
-cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
-    shape_inference data_transform lod_tensor)
-endif(NOT WIN32)

 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)

@ -183,12 +172,10 @@ else()
  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()

-if (NOT WIN32)
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
        graph build_strategy
        fast_threaded_ssa_graph_executor)
-endif() # NOT WIN32

 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@ -13,9 +13,9 @@
 // limitations under the License.

 #pragma once
+#include <ThreadPool.h>
 #include <string>
 #include <vector>
-#include "ThreadPool.h"
 #include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
--- a/paddle/fluid/framework/eigen.h
+++ b/paddle/fluid/framework/eigen.h
@ -13,11 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-// logging.h and windows.h conflict
-#define GLOG_NO_ABBREVIATED_SEVERITIES
-// solve static linking error in windows
-// https://github.com/google/glog/issues/301
-#define GOOGLE_GLOG_DLL_DECL

 #include "paddle/fluid/framework/tensor.h"
 #include "unsupported/Eigen/CXX11/Tensor"
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@ -23,11 +23,6 @@ limitations under the License. */
 #include <unordered_map>
 #include <unordered_set>

-#if defined(_WIN32)
-#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
-#define GOOGLE_GLOG_DLL_DECL
-#endif
-
 #include "glog/logging.h"  // For VLOG()
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/details/op_registry.h"
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define GLOG_NO_ABBREVIATED_SEVERITIES
-#define GOOGLE_GLOG_DLL_DECL

 #include <gflags/gflags.h>
 #include <glog/logging.h>
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@ -20,8 +20,6 @@ limitations under the License. */
 #include <tuple>
 #include <unordered_map>
 #include <vector>
-#define GLOG_NO_ABBREVIATED_SEVERITIES
-#define GOOGLE_GLOG_DLL_DECL

 #include "glog/logging.h"  // For VLOG
 #include "paddle/fluid/framework/attribute.h"
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@ -35,4 +35,4 @@ function(inference_analysis_test TARGET)
  endif()
 endfunction(inference_analysis_test)

-inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api)
+inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS reset_tensor_array paddle_inference_api)
--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@ -14,12 +14,6 @@ limitations under the License. */

 #pragma once

-// logging.h and windows.h conflict
-#define GLOG_NO_ABBREVIATED_SEVERITIES
-// solve static linking error in windows
-// https://github.com/google/glog/issues/301
-#define GOOGLE_GLOG_DLL_DECL
-
 #include <glog/logging.h>
 #include <map>
 #include <memory>
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@ -18,7 +18,7 @@ nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
 nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine conv_op conv_transpose_op SERIAL)
 nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
-        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op SERIAL)
+        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op tensorrt_plugin SERIAL)
 nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
             elementwise_add_op elementwise_mul_op SERIAL)
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@ -13,25 +13,57 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h"

 namespace paddle {
 namespace inference {
 namespace tensorrt {

+void DealCeilMode(const nvinfer1::Dims &input_shape, std::vector<int> ksize,
+                  std::vector<int> strides, std::vector<int> paddings,
+                  nvinfer1::DimsHW *pre_pad, nvinfer1::DimsHW *post_pad,
+                  int input_dims) {
+  int input_height = input_shape.d[input_dims - 2];
+  int input_width = input_shape.d[input_dims - 1];
+  int floor_h_output_size =
+      (input_height - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
+  int ceil_h_output_size =
+      (input_height - ksize[0] + 2 * paddings[0] + strides[0] - 1) /
+          strides[0] +
+      1;
+
+  int floor_w_output_size =
+      (input_width - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
+  int ceil_w_output_size =
+      (input_width - ksize[1] + 2 * paddings[1] + strides[1] - 1) / strides[1] +
+      1;
+  if (floor_h_output_size != ceil_h_output_size) {
+    post_pad->h() = strides[0] - 1;
+  }
+
+  if (floor_w_output_size != ceil_w_output_size) {
+    post_pad->w() = strides[1] - 1;
+  }
+}
+
 /*
 * Pool2dOp, IPoolingLayer in TRT. This Layer doesn't has weights.
 */
 class Pool2dOpConverter : public OpConverter {
 public:
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(3)
+  void operator()(const framework::proto::OpDesc &op,
+                  const framework::Scope &scope, bool test_mode) override {
+    VLOG(40)
        << "convert a fluid pool2d op to tensorrt pool2d layer without bias";
    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs
    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
+    auto *input1 = engine_->GetITensor(op_desc.Input("X")[0]);
+    nvinfer1::Dims input_shape = input1->getDimensions();
+    int input_dims = input_shape.nbDims;
+
+    PADDLE_ENFORCE_EQ(input_dims, 3UL);

    bool global_pooling = boost::get<bool>(op_desc.GetAttr("global_pooling"));
    std::string pool_type =
@ -44,23 +76,6 @@ class Pool2dOpConverter : public OpConverter {
        boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
    bool ceil_mode = boost::get<bool>(op_desc.GetAttr("ceil_mode"));

-    nvinfer1::Dims input_shape = input1->getDimensions();
-    int nbDims = input_shape.nbDims;
-    nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]);
-    nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
-    nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
-
-    if (global_pooling == true) {
-      nv_ksize.d[0] = input_shape.d[nbDims - 2];
-      nv_ksize.d[1] = input_shape.d[nbDims - 1];
-      nv_strides.h() = 1;
-      nv_strides.w() = 1;
-      nv_paddings.h() = 0;
-      nv_paddings.w() = 0;
-    }
-
-    PADDLE_ENFORCE_EQ(input1->getDimensions().nbDims, 3UL);
-
    nvinfer1::PoolingType nv_pool_type = nvinfer1::PoolingType::kMAX;
    if (pool_type == "max") {
      nv_pool_type = nvinfer1::PoolingType::kMAX;
@ -70,42 +85,63 @@ class Pool2dOpConverter : public OpConverter {
      PADDLE_THROW("TensorRT unsupported pooling type!");
    }

-    if (ceil_mode) {
-      nvinfer1::DimsHW pre_pad(0, 0);
-      nvinfer1::DimsHW post_pad(0, 0);
-      int input_height = input_shape.d[nbDims - 2];
-      int input_width = input_shape.d[nbDims - 1];
-      int floor_h_output_size =
-          (input_height - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
-      int ceil_h_output_size =
-          (input_height - ksize[0] + 2 * paddings[0] + strides[0] - 1) /
-              strides[0] +
-          1;
-
-      int floor_w_output_size =
-          (input_width - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
-      int ceil_w_output_size =
-          (input_width - ksize[1] + 2 * paddings[1] + strides[1] - 1) /
-              strides[1] +
-          1;
-      if (floor_h_output_size != ceil_h_output_size) {
-        post_pad.h() = strides[0] - 1;
+    nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]);
+    nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
+    nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
+
+    nvinfer1::ILayer *layer = nullptr;
+
+    if (global_pooling == true) {
+      nv_ksize.d[0] = input_shape.d[input_dims - 2];
+      nv_ksize.d[1] = input_shape.d[input_dims - 1];
+      auto *layer = TRT_ENGINE_ADD_LAYER(
+          engine_, Pooling, *const_cast<nvinfer1::ITensor *>(input1),
+          nv_pool_type, nv_ksize);
+      PADDLE_ENFORCE_NOT_NULL(layer, "pool layer could not be created.");
+      auto output_name = op_desc.Output("Out")[0];
+      layer->setName(("pool2d (Output: " + output_name + ")").c_str());
+      layer->getOutput(0)->setName(output_name.c_str());
+      engine_->SetITensor(output_name, layer->getOutput(0));
+      if (test_mode) {
+        engine_->DeclareOutput(output_name);
      }
+      return;
+    }

-      if (floor_w_output_size != ceil_w_output_size) {
-        post_pad.w() = strides[1] - 1;
+    if (pool_type == "max") {
+      nvinfer1::DimsHW pre_pad(paddings[0], paddings[1]);
+      nvinfer1::DimsHW post_pad(paddings[0], paddings[1]);
+      if (ceil_mode) {
+        // If ceil mode is true, we will pad the appropriate size to the input.
+        DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad,
+                     input_dims);
+        auto *pad_layer = TRT_ENGINE_ADD_LAYER(
+            engine_, Padding, *const_cast<nvinfer1::ITensor *>(input1), pre_pad,
+            post_pad);
+        PADDLE_ENFORCE_NOT_NULL(
+            pad_layer, "pad layer in poolOp converter could not be created.");
+        input1 = pad_layer->getOutput(0);
+      }
+      auto *pool_layer = TRT_ENGINE_ADD_LAYER(
+          engine_, Pooling, *const_cast<nvinfer1::ITensor *>(input1),
+          nv_pool_type, nv_ksize);
+      PADDLE_ENFORCE_NOT_NULL(pool_layer, "pool layer could not be created.");
+      pool_layer->setStride(nv_strides);
+      pool_layer->setPadding(nv_paddings);
+      layer = pool_layer;
+    } else {
+      // Average pooling needs to exclude the padding pixels from the average
+      // mean.
+      // It is not supported well by TRT, we use a plugin here.
+      std::vector<int> input_shape_v;
+      for (int i = 0; i < input_dims; i++) {
+        input_shape_v.push_back(input_shape.d[i]);
      }
-      auto* layer = TRT_ENGINE_ADD_LAYER(
-          engine_, Padding, *const_cast<nvinfer1::ITensor*>(input1), pre_pad,
-          post_pad);
-      input1 = layer->getOutput(0);
+      plugin::AvgPoolPlugin *plugin = new plugin::AvgPoolPlugin(
+          ceil_mode, ksize, strides, paddings, input_shape_v);
+      auto *avg_pool_layer = engine_->AddPlugin(&input1, 1, plugin);
+      layer = avg_pool_layer;
    }
-    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling,
-                                       *const_cast<nvinfer1::ITensor*>(input1),
-                                       nv_pool_type, nv_ksize);
-    PADDLE_ENFORCE_NOT_NULL(layer, "pool layer could not be created.");
-    layer->setStride(nv_strides);
-    layer->setPadding(nv_paddings);

    auto output_name = op_desc.Output("Out")[0];
    layer->setName(("pool2d (Output: " + output_name + ")").c_str());
--- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
@ -20,20 +20,21 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {

-void test_pool2d(bool global_pooling, bool ceil_mode) {
+void test_pool2d(bool global_pooling, bool ceil_mode,
+                 std::string pool_type = "max") {
  framework::Scope scope;
  std::unordered_set<std::string> parameters;
  TRTConvertValidation validator(5, parameters, scope, 1 << 15);

  // The ITensor's Dims should not contain the batch size.
  // So, the ITensor's Dims of input and output should be C * H * W.
-  validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 13, 14));
+  validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 6, 7));
  if (global_pooling)
    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 1, 1));
  else if (ceil_mode)
-    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 6, 7));
+    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 3, 4));
  else
-    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 6, 6));
+    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 3, 3));

  // Prepare Op description
  framework::OpDesc desc;
@ -41,10 +42,10 @@ void test_pool2d(bool global_pooling, bool ceil_mode) {
  desc.SetInput("X", {"pool2d-X"});
  desc.SetOutput("Out", {"pool2d-Out"});

-  std::vector<int> ksize({3, 3});
+  std::vector<int> ksize({2, 2});
  std::vector<int> strides({2, 2});
  std::vector<int> paddings({0, 0});
-  std::string pooling_t = "max";
+  std::string pooling_t = pool_type;

  desc.SetAttr("pooling_type", pooling_t);
  desc.SetAttr("ksize", ksize);
@ -63,7 +64,8 @@ void test_pool2d(bool global_pooling, bool ceil_mode) {
 TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); }
 TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); }

-TEST(Pool2dOpConverter, test_ceil_mode) { test_pool2d(false, true); }
+TEST(Pool2dOpConverter, max_ceil_test) { test_pool2d(false, true); }
+TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); }

 }  // namespace tensorrt
 }  // namespace inference
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@ -1,3 +1,4 @@
 nv_library(tensorrt_plugin
           SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu
+           avg_pool_op_plugin.cu
           DEPS enforce tensorrt_engine)
--- a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
@ -0,0 +1,64 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h"
+#include "paddle/fluid/operators/math/pooling.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+nvinfer1::Dims AvgPoolPlugin::getOutputDimensions(
+    int index, const nvinfer1::Dims* inputDims, int nbInputs) {
+  assert(nbInputs == 1);
+  assert(index == 0);
+  assert(inputDims[0].nbDims == 3);
+  nvinfer1::Dims const& input_dims = inputDims[0];
+
+  nvinfer1::Dims output_dims = input_dims;
+
+  output_dims.d[1] = output_shape_[1];
+  output_dims.d[2] = output_shape_[2];
+  return output_dims;
+}
+
+int AvgPoolPlugin::enqueue(int batchSize, const void* const* inputs,
+                           void** outputs, void* workspace,
+                           cudaStream_t stream) {
+  auto const& input_dims = this->getInputDims(0);
+  int input_size = 0;
+  float const* idata = reinterpret_cast<float const*>(inputs[0]);
+  float** odatas = reinterpret_cast<float**>(outputs);
+
+  paddle::operators::math::AvgPool<float> pool_process;
+  paddle::operators::math::Pool2dDirectCUDAFunctor<
+      paddle::operators::math::AvgPool<float>, float>
+      pool2d_forward;
+
+  std::vector<int> input_shape = input_shape_;
+  std::vector<int> output_shape = output_shape_;
+  input_shape.insert(input_shape.begin(), batchSize);
+  output_shape.insert(output_shape.begin(), batchSize);
+
+  pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_,
+                 pool_process, true, odatas[0], stream);
+
+  return cudaGetLastError() != cudaSuccess;
+}
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h
@ -0,0 +1,111 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <cassert>
+#include <vector>
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+class AvgPoolPlugin : public PluginTensorRT {
+ private:
+  bool ceil_mode_;
+  std::vector<int> ksize_;
+  std::vector<int> strides_;
+  std::vector<int> paddings_;
+  std::vector<int> input_shape_;
+  std::vector<int> output_shape_;
+
+ protected:
+  size_t getSerializationSize() override {
+    return SerializedSize(ceil_mode_) + SerializedSize(ksize_) +
+           SerializedSize(strides_) + SerializedSize(paddings_) +
+           SerializedSize(input_shape_) + getBaseSerializationSize();
+  }
+
+  // TRT will call this func when we need to serialize the configuration of
+  // tensorrt.
+  // It should not be called by users.
+  void serialize(void *buffer) override {
+    serializeBase(buffer);
+    SerializeValue(&buffer, ceil_mode_);
+    SerializeValue(&buffer, ksize_);
+    SerializeValue(&buffer, strides_);
+    SerializeValue(&buffer, paddings_);
+    SerializeValue(&buffer, input_shape_);
+  }
+
+ public:
+  AvgPoolPlugin(bool ceil_mode, std::vector<int> ksize,
+                std::vector<int> strides, std::vector<int> paddings,
+                std::vector<int> input_shape)
+      : ceil_mode_(ceil_mode),
+        ksize_(ksize),
+        strides_(strides),
+        paddings_(paddings),
+        input_shape_(input_shape) {
+    int output_h, output_w;
+    output_shape_ = input_shape_;
+    if (!ceil_mode_) {
+      output_h =
+          (input_shape[1] - ksize_[0] + 2 * paddings_[0]) / strides_[0] + 1;
+      output_w =
+          (input_shape[2] - ksize_[1] + 2 * paddings_[1]) / strides_[1] + 1;
+    } else {
+      output_h =
+          (input_shape[1] - ksize_[0] + 2 * paddings_[0] + strides_[0] - 1) /
+              strides_[0] +
+          1;
+      output_w =
+          (input_shape[2] - ksize_[1] + 2 * paddings_[1] + strides_[1] - 1) /
+              strides_[1] +
+          1;
+    }
+    output_shape_[1] = output_h;
+    output_shape_[2] = output_w;
+  }
+
+  // It was used for tensorrt deserialization.
+  // It should not be called by users.
+  AvgPoolPlugin(void const *serialData, size_t serialLength) {
+    deserializeBase(serialData, serialLength);
+    DeserializeValue(&serialData, &serialLength, &ceil_mode_);
+    DeserializeValue(&serialData, &serialLength, &ksize_);
+    DeserializeValue(&serialData, &serialLength, &strides_);
+    DeserializeValue(&serialData, &serialLength, &paddings_);
+    DeserializeValue(&serialData, &serialLength, &input_shape_);
+  }
+
+  AvgPoolPlugin *clone() const override {
+    return new AvgPoolPlugin(ceil_mode_, ksize_, strides_, paddings_,
+                             input_shape_);
+  }
+
+  const char *getPluginType() const override { return "avg_pool"; }
+  int getNbOutputs() const override { return 1; }
+  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
+                                     int nbInputDims) override;
+  int initialize() override { return 0; }
+  int enqueue(int batchSize, const void *const *inputs, void **outputs,
+              void *workspace, cudaStream_t stream) override;
+};
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc
@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
+#include <random>
 #include <thread>  // NOLINT
 #include <vector>
 #include "gtest/gtest.h"
--- a/Show More
+++ b/Show More