Merge remote-tracking branch 'origin/develop' into enhance/memory

7 years ago · 3932cd6714
parent 94dd50c33f bec68fa0b3
commit 3932cd6714
31 changed files with 253 additions and 155 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -25,12 +25,18 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
 message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
        "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
 if(WIN32)
+    set(CMAKE_SUPPRESS_REGENERATION ON)
    set(CMAKE_STATIC_LIBRARY_PREFIX lib)
    add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+    add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
+    set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
+    set(CMAKE_STATIC_LINKER_FLAGS  "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
+    set(CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
 endif(WIN32)

 find_package(CUDA QUIET)
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@ -152,7 +152,12 @@ endif()

 if (WITH_MKLML AND MKLML_IOMP_LIB)
    message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
-    set(OPENMP_FLAGS "-fopenmp")
+    if(WIN32)
+        # openmp not support well for now on windows
+        set(OPENMP_FLAGS "")
+    else(WIN32)
+        set(OPENMP_FLAGS "-fopenmp")
+    endif(WIN32)
    set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
    set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@ -203,25 +203,26 @@ list(APPEND CUDA_NVCC_FLAGS "-w")
 list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")

 if (NOT WIN32)
-if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
-    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
-    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
-    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
-    # nvcc 9 does not support -Os. Use Release flags instead
-    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
-endif()
+  if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
+      list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
+  elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
+      list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
+  elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
+      list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
+  elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
+      # nvcc 9 does not support -Os. Use Release flags instead
+      list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
+  endif()
 else(NOT WIN32)
-list(APPEND CUDA_NVCC_FLAGS  "--compiler-options;/bigobj")
-if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
-  list(APPEND CUDA_NVCC_FLAGS  "-g -G")
-  # match the cl's _ITERATOR_DEBUG_LEVEL
-  list(APPEND CUDA_NVCC_FLAGS  "-D_DEBUG")
-elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
-  list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
-else()
+  list(APPEND CUDA_NVCC_FLAGS  "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"")
+  list(APPEND CUDA_NVCC_FLAGS  "--compiler-options;/bigobj")
+  if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
+    list(APPEND CUDA_NVCC_FLAGS  "-g -G")
+    # match the cl's _ITERATOR_DEBUG_LEVEL
+    list(APPEND CUDA_NVCC_FLAGS  "-D_DEBUG")
+  elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
+    list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
+  else()
  message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.")
 endif()
 endif(NOT WIN32)
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@ -20,8 +20,10 @@ SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include dire

 IF(WIN32)
  SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE)
+  SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530")
 ELSE(WIN32)
  SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
+  SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
 ENDIF(WIN32)

 INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
@ -39,7 +41,7 @@ ExternalProject_Add(
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS}
                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@ -49,6 +49,8 @@ IF(NOT WIN32)
    SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
    SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
    SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
+ELSE()
+    SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc")
 ENDIF(NOT WIN32)

 ExternalProject_Add(
@ -61,7 +63,6 @@ ExternalProject_Add(
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
    CMAKE_ARGS          -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
    CMAKE_ARGS          -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
    CMAKE_ARGS          -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
    CMAKE_ARGS          -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
@ -20,6 +20,12 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
 set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
 set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)

+if(WIN32)
+    SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267")
+else()
+    SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+endif()
+
 ExternalProject_Add(
    extern_snappy
    GIT_REPOSITORY "https://github.com/google/snappy"
@ -31,7 +37,7 @@ ExternalProject_Add(
                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
                    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
                    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS}
                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                    -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@ -147,12 +147,6 @@ set(GPU_COMMON_FLAGS
    -Wno-error=unused-function  # Warnings in Numpy Header.
    -Wno-error=array-bounds # Warnings in Eigen::array
 )
-
-else(NOT WIN32)
-set(COMMON_FLAGS
-    "/w") #disable all warnings.
-set(GPU_COMMON_FLAGS
-    "/w") #disable all warnings
 endif(NOT WIN32)

 if (APPLE)
@ -193,8 +187,7 @@ safe_set_static_flag()
        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-      if(${flag_var} MATCHES "/W3")
-        string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}")
-      endif(${flag_var} MATCHES "/W3")
+        string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
+        set(flag_var "${flag_var} /w")
    endforeach(flag_var)
 endif(WIN32)
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@ -141,7 +141,8 @@ class Graph {
  ir::Node *CreateControlDepVar() {
    // TODO(panyx0718): control var name should be really unique.
    const std::string name = string::Sprintf(
-        "%s@%llu", ir::Node::kControlDepVarName, node_set_.size());
+        "%s@%llu", static_cast<const char *>(ir::Node::kControlDepVarName),
+        node_set_.size());
    auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable));
    x->SetId(num_node_created_++);
    return x;
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@ -1,5 +1,5 @@
 if(WITH_PYTHON)
-cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas)
-cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context)
+cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind)
+cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
 cc_library(engine SRCS engine.cc)
 endif()
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@ -58,12 +58,13 @@ if(WIN32)
  sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
              DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
                   analysis_config paddle_pass_builder)
-  target_link_libraries(paddle_fluid_shared shlwapi)
 else(WIN32)
  cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
             DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
                  analysis_config paddle_pass_builder)
 endif()
+get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+target_link_libraries(paddle_fluid_shared ${os_dependency_modules})

 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 if(NOT APPLE AND NOT WIN32)
--- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
@ -1,4 +1,7 @@
 cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
+if(WITH_TESTING)
+  add_dependencies(subgraph_detector gtest)
+endif()

 if (WITH_GPU AND TENSORRT_FOUND)
  cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)
--- a/paddle/fluid/operators/detection/box_coder_op.cc
+++ b/paddle/fluid/operators/detection/box_coder_op.cc
@ -38,20 +38,12 @@ class BoxCoderOp : public framework::OperatorWithKernel {
                        "The shape of PriorBox is [N, 4]");
      if (ctx->HasInput("PriorBoxVar")) {
        auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
-        PADDLE_ENFORCE(
-            prior_box_var_dims.size() == 1 || prior_box_var_dims.size() == 2,
-            "Input(PriorBoxVar) of BoxCoderOp should be 1 or 2.");
-        if (prior_box_var_dims.size() == 1) {
-          PADDLE_ENFORCE_EQ(
-              prior_box_var_dims[0], 4,
-              "The 1st dimension of Input(PriorBoxVar) should be 4"
-              "when the rank is 1.");
-        } else {
-          PADDLE_ENFORCE_EQ(
-              prior_box_dims, prior_box_var_dims,
-              "The dimension of Input(PriorBoxVar) should be equal to"
-              "the dimension of Input(PriorBox when the rank is 2.)");
-        }
+        PADDLE_ENFORCE(prior_box_var_dims.size() == 2,
+                       "Input(PriorBoxVar) of BoxCoderOp should be 2.");
+        PADDLE_ENFORCE_EQ(
+            prior_box_dims, prior_box_var_dims,
+            "The dimension of Input(PriorBoxVar) should be equal to"
+            "the dimension of Input(PriorBox) when the rank is 2.");
      }
    }

--- a/paddle/fluid/operators/detection/box_coder_op.cu
+++ b/paddle/fluid/operators/detection/box_coder_op.cu
@ -56,10 +56,7 @@ __global__ void EncodeCenterSizeKernel(
    output[idx * len + 2] = log(fabs(target_box_width / prior_box_width));
    output[idx * len + 3] = log(fabs(target_box_height / prior_box_height));
    if (prior_box_var_data) {
-      int prior_var_offset = 0;
-      if (prior_box_var_size == 2) {
-        prior_var_offset = col_idx * len;
-      }
+      int prior_var_offset = col_idx * len;
      output[idx * len] /= prior_box_var_data[prior_var_offset];
      output[idx * len + 1] /= prior_box_var_data[prior_var_offset + 1];
      output[idx * len + 2] /= prior_box_var_data[prior_var_offset + 2];
@ -99,10 +96,7 @@ __global__ void DecodeCenterSizeKernel(
    T box_var_x = T(1), box_var_y = T(1);
    T box_var_w = T(1), box_var_h = T(1);
    if (prior_box_var_data) {
-      int prior_var_offset = 0;
-      if (prior_box_var_size == 2) {
-        prior_var_offset = axis == 0 ? col_idx * len : row_idx * len;
-      }
+      int prior_var_offset = axis == 0 ? col_idx * len : row_idx * len;
      box_var_x = prior_box_var_data[prior_var_offset];
      box_var_y = prior_box_var_data[prior_var_offset + 1];
      box_var_w = prior_box_var_data[prior_var_offset + 2];
--- a/paddle/fluid/operators/detection/box_coder_op.h
+++ b/paddle/fluid/operators/detection/box_coder_op.h
@ -79,10 +79,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
        output[offset + 3] =
            std::log(std::fabs(target_box_height / prior_box_height));
        if (prior_box_var) {
-          int prior_var_offset = 0;
-          if (prior_box_var->dims().size() == 2) {
-            prior_var_offset = j * len;
-          }
+          int prior_var_offset = j * len;
          output[offset] /= prior_box_var_data[prior_var_offset];
          output[offset + 1] /= prior_box_var_data[prior_var_offset + 1];
          output[offset + 2] /= prior_box_var_data[prior_var_offset + 2];
@ -95,11 +92,12 @@ class BoxCoderKernel : public framework::OpKernel<T> {
      }
    }
  }
+  template <int axis, int var_size>
  void DecodeCenterSize(const framework::Tensor* target_box,
                        const framework::Tensor* prior_box,
                        const framework::Tensor* prior_box_var,
-                        const bool normalized, const int axis,
-                        const std::vector<float> variance, T* output) const {
+                        const bool normalized, std::vector<float> variance,
+                        T* output) const {
    int64_t row = target_box->dims()[0];
    int64_t col = target_box->dims()[1];
    int64_t len = target_box->dims()[2];
@ -107,19 +105,17 @@ class BoxCoderKernel : public framework::OpKernel<T> {
    auto* target_box_data = target_box->data<T>();
    auto* prior_box_data = prior_box->data<T>();
    const T* prior_box_var_data = nullptr;
-    if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
+    if (var_size == 2) prior_box_var_data = prior_box_var->data<T>();
    int prior_box_offset = 0;
+    T var_data[4] = {1., 1., 1., 1.};
+    T* var_ptr = var_data;
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for collapse(2)
 #endif
    for (int64_t i = 0; i < row; ++i) {
      for (int64_t j = 0; j < col; ++j) {
        size_t offset = i * col * len + j * len;
-        if (axis == 0) {
-          prior_box_offset = j * len;
-        } else if (axis == 1) {
-          prior_box_offset = i * len;
-        }
+        prior_box_offset = axis == 0 ? j * len : i * len;
        T prior_box_width = prior_box_data[prior_box_offset + 2] -
                            prior_box_data[prior_box_offset] +
                            (normalized == false);
@ -133,26 +129,18 @@ class BoxCoderKernel : public framework::OpKernel<T> {

        T target_box_center_x = 0, target_box_center_y = 0;
        T target_box_width = 0, target_box_height = 0;
-        T box_var_x = T(1), box_var_y = T(1);
-        T box_var_w = T(1), box_var_h = T(1);
-        if (prior_box_var) {
-          int prior_var_offset = 0;
-          if (prior_box_var->dims().size() == 2) {
-            if (axis == 0)
-              prior_var_offset = j * len;
-            else if (axis == 1)
-              prior_var_offset = i * len;
-          }
-          box_var_x = prior_box_var_data[prior_var_offset];
-          box_var_y = prior_box_var_data[prior_var_offset + 1];
-          box_var_w = prior_box_var_data[prior_var_offset + 2];
-          box_var_h = prior_box_var_data[prior_var_offset + 3];
-        } else if (!(variance.empty())) {
-          box_var_x = static_cast<T>(variance[0]);
-          box_var_y = static_cast<T>(variance[1]);
-          box_var_w = static_cast<T>(variance[2]);
-          box_var_h = static_cast<T>(variance[3]);
+        int prior_var_offset = axis == 0 ? j * len : i * len;
+        if (var_size == 2) {
+          std::memcpy(var_ptr, prior_box_var_data + prior_var_offset,
+                      4 * sizeof(T));
+        } else if (var_size == 1) {
+          var_ptr = reinterpret_cast<T*>(variance.data());
        }
+        T box_var_x = *var_ptr;
+        T box_var_y = *(var_ptr + 1);
+        T box_var_w = *(var_ptr + 2);
+        T box_var_h = *(var_ptr + 3);
+
        target_box_center_x =
            box_var_x * target_box_data[offset] * prior_box_width +
            prior_box_center_x;
@ -211,8 +199,31 @@ class BoxCoderKernel : public framework::OpKernel<T> {
      EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
                       variance, output);
    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
-      DecodeCenterSize(target_box, prior_box, prior_box_var, normalized, axis,
-                       variance, output);
+      if (prior_box_var) {
+        if (axis == 0) {
+          DecodeCenterSize<0, 2>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        } else {
+          DecodeCenterSize<1, 2>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        }
+      } else if (!(variance.empty())) {
+        if (axis == 0) {
+          DecodeCenterSize<0, 1>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        } else {
+          DecodeCenterSize<1, 1>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        }
+      } else {
+        if (axis == 0) {
+          DecodeCenterSize<0, 0>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        } else {
+          DecodeCenterSize<1, 0>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        }
+      }
    }
  }
 };
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@ -37,7 +37,7 @@ math_library(concat_and_split)
 math_library(context_project DEPS im2col math_function)
 math_library(cross_entropy)
 math_library(cos_sim_functor)
-math_library(depthwise_conv)
+math_library(depthwise_conv DEPS cub)
 math_library(im2col)
 math_library(sampler)

--- a/paddle/fluid/operators/ngraph/ngraph_bridge.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc
@ -31,6 +31,7 @@ std::map<std::string,
                            std::shared_ptr<std::unordered_map<
                                std::string, std::shared_ptr<ngraph::Node>>>)>>
    NgraphBridge::NG_NODE_MAP = {
+        {"accuracy", NG_OPS::BuildAccuracyNode},
        {"conv2d", NG_OPS::BuildConv2dNode},
        {"conv2d_grad", NG_OPS::BuildConv2dGradNode},
        {"elementwise_add", NG_OPS::BuildElementwiseAddNode},
--- a/paddle/fluid/operators/ngraph/ngraph_ops.h
+++ b/paddle/fluid/operators/ngraph/ngraph_ops.h
@ -21,7 +21,8 @@ limitations under the License. */

 #pragma once

-#include "ops/binary_unnary_op.h"
+#include "ops/accuracy_op.h"
+#include "ops/binary_unary_op.h"
 #include "ops/conv2d_op.h"
 #include "ops/elementwise_add_op.h"
 #include "ops/fill_constant_op.h"
--- a/paddle/fluid/operators/ngraph/ops/accuracy_op.h
+++ b/paddle/fluid/operators/ngraph/ops/accuracy_op.h
@ -0,0 +1,65 @@
+/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+void BuildAccuracyNode(
+    const std::shared_ptr<framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto indices = platform::GetInputNode(op, "Indices", ngb_node_map);
+  auto label = platform::GetInputNode(op, "Label", ngb_node_map);
+  auto inference = platform::GetInputNode(op, "Out", ngb_node_map);
+  auto inference_shape = inference->get_shape();
+  size_t num_samples = inference_shape.at(0);
+  size_t k = inference_shape.at(1);
+
+  std::shared_ptr<ngraph::Node> label_k = label;
+  if (k > 1) {
+    auto label_1d = std::make_shared<ngraph::op::Reshape>(
+        label, ngraph::AxisVector{0, 1}, ngraph::Shape{num_samples});
+    label_k = std::make_shared<ngraph::op::Broadcast>(label_1d, inference_shape,
+                                                      ngraph::AxisSet{1});
+  }
+
+  auto node_equal = std::make_shared<ngraph::op::Equal>(indices, label_k);
+  auto node_eq_int =
+      std::make_shared<ngraph::op::Convert>(node_equal, ngraph::element::i64);
+  auto num_correct_0d =
+      std::make_shared<ngraph::op::Sum>(node_eq_int, ngraph::AxisSet{0, 1});
+  std::shared_ptr<ngraph::Node> num_correct =
+      platform::NgReshaper(num_correct_0d, ngraph::Shape{1});
+  std::shared_ptr<ngraph::Node> n_samples = ngraph::op::Constant::create(
+      ngraph::element::i64, ngraph::Shape{1}, {num_samples});
+  std::shared_ptr<ngraph::Node> accuracy = std::make_shared<ngraph::op::Divide>(
+      std::make_shared<ngraph::op::Convert>(num_correct, ngraph::element::f32),
+      std::make_shared<ngraph::op::Convert>(n_samples, ngraph::element::f32));
+
+  platform::SetOutputNode(op, "Accuracy", accuracy, ngb_node_map);
+  platform::SetOutputNode(op, "Correct", num_correct, ngb_node_map);
+  platform::SetOutputNode(op, "Total", n_samples, ngb_node_map);
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h
+++ b/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h
--- a/paddle/fluid/operators/ngraph/ops/top_k_op.h
+++ b/paddle/fluid/operators/ngraph/ops/top_k_op.h
@ -36,11 +36,6 @@ void BuildTopKNode(
      std::make_shared<ngraph::op::GetOutputElement>(top_k, 0);
  std::shared_ptr<ngraph::Node> out =
      std::make_shared<ngraph::op::GetOutputElement>(top_k, 1);
-  auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map);
-  if (dummy_out && dummy_out->get_element_type() != out->get_element_type()) {
-    out = std::make_shared<ngraph::op::Convert>(out,
-                                                dummy_out->get_element_type());
-  }
  paddle::platform::SetOutputNode(op, "Indices", indices, ngb_node_map);
  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
 }
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@ -259,7 +259,7 @@ Example:
       W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
       $$

-  For exclusive = true:
+  For exclusive = false:
       $$
       hstart = i * strides[0] - paddings[0]
       hend = hstart + ksize[0]
@ -267,7 +267,7 @@ Example:
       wend = wstart + ksize[1]
       Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
       $$
-  For exclusive = false:
+  For exclusive = true:
       $$
       hstart = max(0, i * strides[0] - paddings[0])
       hend = min(H, hstart + ksize[0])
@ -403,7 +403,7 @@ Example:
       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\
       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
  $$
-  For exclusive = true:
+  For exclusive = false:
  $$
  dstart = i * strides[0] - paddings[0]
  dend = dstart + ksize[0]
@ -413,7 +413,7 @@ Example:
  wend = wstart + ksize[2]
  Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
  $$
-  For exclusive = false:
+  For exclusive = true:
  $$
  dstart = max(0, i * strides[0] - paddings[0])
  dend = min(D, dstart + ksize[0])
--- a/paddle/fluid/operators/reader/ctr_reader.cc
+++ b/paddle/fluid/operators/reader/ctr_reader.cc
@ -213,7 +213,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
      framework::LoD lod{lod_data};
      lod_tensor.set_lod(lod);
      int64_t* tensor_data = lod_tensor.mutable_data<int64_t>(
-          framework::make_ddim({1, static_cast<int64_t>(batch_feasign.size())}),
+          framework::make_ddim({static_cast<int64_t>(batch_feasign.size()), 1}),
          platform::CPUPlace());
      memcpy(tensor_data, batch_feasign.data(),
             batch_feasign.size() * sizeof(int64_t));
@ -223,7 +223,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
    // insert label tensor
    framework::LoDTensor label_tensor;
    auto* label_tensor_data = label_tensor.mutable_data<int64_t>(
-        framework::make_ddim({1, static_cast<int64_t>(batch_label.size())}),
+        framework::make_ddim({static_cast<int64_t>(batch_label.size()), 1}),
        platform::CPUPlace());
    memcpy(label_tensor_data, batch_label.data(),
           batch_label.size() * sizeof(int64_t));
--- a/paddle/fluid/operators/reader/ctr_reader_test.cc
+++ b/paddle/fluid/operators/reader/ctr_reader_test.cc
@ -123,7 +123,7 @@ TEST(CTR_READER, read_data) {
  std::vector<std::tuple<LoD, std::vector<int64_t>>> data_slot_6003{b1, b2, b3,
                                                                    b4};

-  std::vector<DDim> label_dims = {{1, 3}, {1, 3}, {1, 3}, {1, 1}};
+  std::vector<DDim> label_dims = {{3, 1}, {3, 1}, {3, 1}, {1, 1}};

  LoDTensorBlockingQueueHolder queue_holder;
  int capacity = 64;
--- a/paddle/fluid/operators/reduce_ops/CMakeLists.txt
+++ b/paddle/fluid/operators/reduce_ops/CMakeLists.txt
@ -1,5 +1,9 @@
 include(operators)
-register_operators()
+if(WITH_GPU)
+    register_operators(DEPS cub)
+else()
+    register_operators()
+endif()

 if(WITH_GPU)
    file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.part.cu")
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@ -1,4 +1,4 @@
-proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto)
+proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool)
 py_proto_compile(profiler_py_proto SRCS profiler.proto)

 add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
@ -36,7 +36,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)

 nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)

-cc_library(place SRCS place.cc DEPS enforce boost)
+cc_library(place SRCS place.cc DEPS enforce boost lib_any)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)

 add_subdirectory(dynload)
--- a/Show More
+++ b/Show More