Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_bug_for_lstmp

Branch: revert-15774-anakin_subgraph_engine
Author: xuezhong, 6 years ago
Commit: 2ba256df40

@@ -25,12 +25,18 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
 message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
         "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
 if(WIN32)
+    set(CMAKE_SUPPRESS_REGENERATION ON)
     set(CMAKE_STATIC_LIBRARY_PREFIX lib)
     add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
     set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
     set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+    add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
+    set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
+    set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
 endif(WIN32)

 find_package(CUDA QUIET)

@@ -152,7 +152,12 @@ endif()
 if (WITH_MKLML AND MKLML_IOMP_LIB)
     message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
+    if(WIN32)
+        # openmp not support well for now on windows
+        set(OPENMP_FLAGS "")
+    else(WIN32)
         set(OPENMP_FLAGS "-fopenmp")
+    endif(WIN32)
     set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
     set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")

@@ -203,25 +203,26 @@ list(APPEND CUDA_NVCC_FLAGS "-w")
 list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
 if (NOT WIN32)
     if(CMAKE_BUILD_TYPE STREQUAL "Debug")
         list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
     elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
         list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
     elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
         list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
     elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
         # nvcc 9 does not support -Os. Use Release flags instead
         list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
     endif()
 else(NOT WIN32)
+    list(APPEND CUDA_NVCC_FLAGS "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"")
     list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj")
     if(CMAKE_BUILD_TYPE STREQUAL "Debug")
         list(APPEND CUDA_NVCC_FLAGS "-g -G")
         # match the cl's _ITERATOR_DEBUG_LEVEL
         list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG")
     elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
         list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
     else()
         message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.")
     endif()
 endif(NOT WIN32)

@@ -20,8 +20,10 @@ SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include dire
 IF(WIN32)
   SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE)
+  SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530")
 ELSE(WIN32)
   SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
+  SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
 ENDIF(WIN32)

 INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
@@ -39,7 +41,7 @@ ExternalProject_Add(
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                     -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS}
                     -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
                     -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}

@@ -49,6 +49,8 @@ IF(NOT WIN32)
     SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
     SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
     SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
+ELSE()
+    SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc")
 ENDIF(NOT WIN32)

 ExternalProject_Add(
@@ -61,7 +63,6 @@ ExternalProject_Add(
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
     CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
     CMAKE_ARGS      -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
     CMAKE_ARGS      -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
     CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}

@@ -20,6 +20,12 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
 set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
 set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)

+if(WIN32)
+    SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267")
+else()
+    SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+endif()
+
 ExternalProject_Add(
     extern_snappy
     GIT_REPOSITORY "https://github.com/google/snappy"
@@ -31,7 +37,7 @@ ExternalProject_Add(
                     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
                     -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
                     -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS}
                     -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
                     -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                     -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}

@@ -147,12 +147,6 @@ set(GPU_COMMON_FLAGS
     -Wno-error=unused-function  # Warnings in Numpy Header.
     -Wno-error=array-bounds     # Warnings in Eigen::array
 )
-else(NOT WIN32)
-set(COMMON_FLAGS
-    "/w")    #disable all warnings.
-set(GPU_COMMON_FLAGS
-    "/w")    #disable all warnings
 endif(NOT WIN32)

 if (APPLE)
@@ -193,8 +187,7 @@ safe_set_static_flag()
         CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
         CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
         CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-      if(${flag_var} MATCHES "/W3")
-        string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}")
-      endif(${flag_var} MATCHES "/W3")
+      string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
+      set(flag_var "${flag_var} /w")
     endforeach(flag_var)
 endif(WIN32)

@@ -30,10 +30,25 @@ while ("${PADDLE_VERSION}" STREQUAL "")
         else()  # otherwise, get the previous git tag name.
             set(tmp_version "${GIT_TAG_NAME}~1")
         endif()
+    else()
+        execute_process(
+            COMMAND ${GIT_EXECUTABLE} describe --exact-match --tags ${tmp_version}
+            WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+            OUTPUT_VARIABLE GIT_EXACT_TAG_NAME
+            RESULT_VARIABLE GIT_EXACT_TAG_RESULT
+            ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+        if (NOT ${GIT_EXACT_TAG_NAME})
+            # Check if current branch is tag branch
+            if (${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
+                string(REPLACE "v" "" PADDLE_VERSION ${GIT_EXACT_TAG_NAME})
+            else()
+                set(PADDLE_VERSION "0.0.0")
+            endif()
         else()
             # otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest
             set(PADDLE_VERSION "0.0.0")
         endif()
+    endif()
     else()
         set(PADDLE_VERSION "0.0.0")
         message(WARNING "Cannot add paddle version from git tag")

@@ -403,18 +403,20 @@ void GraphView::Build(ir::Graph* g) {
   // 2. track the nodes which used by parameter server.
   // these node can not be inplaced, otherwise trainer
   // pserver can not find each other name.
-  for (auto& node : g->Nodes()) {
-    if (!node->IsOp()) continue;
-    if (node->Name() == "send") {
-      for (auto& in : node->inputs) {
-        dup_nodes_.emplace(in->Name());
-      }
-    }
-    if (node->Name() == "recv") {
-      for (auto& out : node->outputs) {
-        dup_nodes_.emplace(out->Name());
-      }
-    }
+  auto update_skip_set = [&](ir::Node* node) {
+    for (auto& in : node->inputs) {
+      if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
+    }
+    for (auto& out : node->outputs) {
+      if (out->IsVar() && out->Var() != nullptr)
+        dup_nodes_.emplace(out->Name());
+    }
+  };
+  for (auto& node : g->Nodes()) {
+    if (!node->IsOp()) continue;
+    if (node->Name() == "send") update_skip_set(node);
+    if (node->Name() == "recv") update_skip_set(node);
+    if (node->Name() == "prefetch") update_skip_set(node);
   }
 }

@@ -51,8 +51,7 @@ static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
 std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   auto nodes = graph->Nodes();
-  auto subblock_vars = GetSubBlockVars(nodes);
-  skip_set_.insert(subblock_vars.begin(), subblock_vars.end());
+  CollectSkipVarsSet(nodes);

   cfg_.reset(new details::ControlFlowGraph(*graph));
   cfg_->LiveVariableAnalysis();
@@ -224,20 +223,27 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
     }
   }

-std::unordered_set<std::string> MemoryOptimizePass::GetSubBlockVars(
+void MemoryOptimizePass::CollectSkipVarsSet(
     const std::unordered_set<ir::Node*>& nodes) const {
-  std::unordered_set<std::string> vars;
+  auto update_skip_set = [&](OpDesc* op_desc) {
+    auto inputs = op_desc->InputArgumentNames();
+    auto outputs = op_desc->OutputArgumentNames();
+    skip_set_.insert(inputs.begin(), inputs.end());
+    skip_set_.insert(outputs.begin(), outputs.end());
+  };
   for (auto& op : nodes) {
     if (!op->IsOp() || op->Op() == nullptr) continue;
     auto* op_desc = op->Op();
-    if (OpHasSubBlock(op_desc)) {
-      auto inputs = op_desc->InputArgumentNames();
-      auto outputs = op_desc->OutputArgumentNames();
-      vars.insert(inputs.begin(), inputs.end());
-      vars.insert(outputs.begin(), outputs.end());
-    }
+    // NOTE(dzhwinter):
+    // current block can not reuse next level block vars.
+    if (OpHasSubBlock(op_desc)) update_skip_set(op_desc);
+    // NOTE(dzhwinter):
+    // distributed ops input/output name need to
+    // keep same bettwen trainer/pserver
+    if (op_desc->Type() == "send") update_skip_set(op_desc);
+    if (op_desc->Type() == "recv") update_skip_set(op_desc);
+    if (op_desc->Type() == "prefetch") update_skip_set(op_desc);
   }
-  return vars;
 }

 void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,

@@ -55,9 +55,10 @@ class MemoryOptimizePass : public ir::Pass {
                             ir::Graph* graph) const;
   void SubGraphOptimize(OpDesc* op_desc) const;
-  // scan subblock and collect the output/input variables.
-  std::unordered_set<std::string> GetSubBlockVars(
-      const std::unordered_set<ir::Node*>&) const;
+  // 1. scan op with subblock and collect the output/input vars.
+  // while, while_grad, conditional_block
+  // 2. scan distributed ops and collect the output/input vars
+  void CollectSkipVarsSet(const std::unordered_set<ir::Node*>&) const;

  private:
   // Reuse Node Pool, Owned.
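
Both this pass and the GraphView change above follow the same pattern: a small capturing lambda folds an op's input and output names into a member skip set, and the loop over nodes only decides which op types feed it. A self-contained sketch of that pattern, using simplified stand-in types rather than Paddle's OpDesc or ir::Node:

    #include <iostream>
    #include <string>
    #include <unordered_set>
    #include <vector>

    // Simplified stand-in for an operator description.
    struct Op {
      std::string type;
      std::vector<std::string> inputs;
      std::vector<std::string> outputs;
    };

    std::unordered_set<std::string> CollectSkipVars(const std::vector<Op>& ops) {
      std::unordered_set<std::string> skip_set;
      // One lambda shared by every op category whose variables must keep their names.
      auto update_skip_set = [&](const Op& op) {
        skip_set.insert(op.inputs.begin(), op.inputs.end());
        skip_set.insert(op.outputs.begin(), op.outputs.end());
      };
      for (const auto& op : ops) {
        // Ops with sub-blocks cannot reuse variables of the nested block.
        if (op.type == "while" || op.type == "conditional_block") update_skip_set(op);
        // Distributed ops must keep the same names on trainer and pserver.
        if (op.type == "send" || op.type == "recv" || op.type == "prefetch")
          update_skip_set(op);
      }
      return skip_set;
    }

    int main() {
      std::vector<Op> ops = {{"send", {"w"}, {}}, {"relu", {"x"}, {"y"}}};
      std::cout << CollectSkipVars(ops).count("w") << "\n";  // prints 1
      return 0;
    }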

@@ -276,6 +276,7 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
   auto in_to_outs = infer_inplace(*op, op->Block());
   EXPECT_EQ(in_to_outs.size(), 3ul);
+
   std::unordered_map<std::string, std::string> expects = {
       {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},

@@ -141,7 +141,8 @@ class Graph {
   ir::Node *CreateControlDepVar() {
     // TODO(panyx0718): control var name should be really unique.
     const std::string name = string::Sprintf(
-        "%s@%llu", ir::Node::kControlDepVarName, node_set_.size());
+        "%s@%llu", static_cast<const char *>(ir::Node::kControlDepVarName),
+        node_set_.size());
     auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable));
     x->SetId(num_node_created_++);
     return x;

@@ -22,7 +22,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/string/printf.h"

-DECLARE_bool(benchmark);
+DEFINE_bool(benchmark, false,
+            "Doing memory benchmark. It will make deleting scope synchronized, "
+            "and add some memory usage logs."
+            "Default cuda is asynchronous device, set to True will"
+            "force op run in synchronous mode.");

 DEFINE_bool(
     eager_delete_scope, true,
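
With this change, scope.cc becomes the translation unit that owns the benchmark flag (DEFINE_bool) rather than one that merely references it, which is why a later hunk drops the now-unneeded DECLARE_bool(benchmark) from the allocator. A minimal gflags sketch of that ownership split; the file names are illustrative only:

    // flag_owner.cc: exactly one translation unit defines the flag.
    #include <gflags/gflags.h>

    DEFINE_bool(benchmark, false,
                "Synchronize scope deletion and emit memory usage logs.");

    // Any other translation unit that reads the flag only declares it
    // and links against the owner:
    //
    //   DECLARE_bool(benchmark);
    //   void MaybeSync() {
    //     if (FLAGS_benchmark) { /* extra bookkeeping */ }
    //   }

    int main(int argc, char* argv[]) {
      gflags::ParseCommandLineFlags(&argc, &argv, true);
      return FLAGS_benchmark ? 0 : 1;
    }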

@@ -1,5 +1,5 @@
 if(WITH_PYTHON)
-cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas)
-cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context)
+cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind)
+cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
 cc_library(engine SRCS engine.cc)
 endif()

@@ -58,12 +58,13 @@ if(WIN32)
     sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
             DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
             analysis_config paddle_pass_builder)
-    target_link_libraries(paddle_fluid_shared shlwapi)
 else(WIN32)
     cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
             DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
             analysis_config paddle_pass_builder)
 endif()
+get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+target_link_libraries(paddle_fluid_shared ${os_dependency_modules})

 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 if(NOT APPLE AND NOT WIN32)

@@ -1,4 +1,7 @@
 cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
+if(WITH_TESTING)
+  add_dependencies(subgraph_detector gtest)
+endif()

 if (WITH_GPU AND TENSORRT_FOUND)
   cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)

@@ -18,6 +18,7 @@
 #include <limits>
 #include <map>
 #include <string>
+#include <type_traits>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph_helper.h"
@@ -168,7 +169,11 @@ bool FindSuitableTensorToReuse(
     if (!cluster->count(candidate)) continue;

     size_t space = space_table.at(candidate);
-    size_t space_diff = std::abs<size_t>(space - space_required);
+    PADDLE_ENFORCE(
+        space <= std::numeric_limits<std::make_signed<size_t>::type>::max(),
+        "space overload");
+    size_t space_diff =
+        std::abs((std::make_signed<size_t>::type)space - space_required);
     if (space_diff < best_fit.second) {
       best_fit.first = candidate;
       best_fit.second = space_diff;
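
The replacement above fixes an unsigned-arithmetic bug: with size_t operands, space - space_required wraps around to a number near SIZE_MAX whenever space < space_required, so the computed difference is meaningless. The new code asserts the value fits in the signed counterpart of size_t and does the subtraction in signed arithmetic. A minimal sketch of the same fix with hypothetical sizes:

    #include <cstddef>
    #include <cstdlib>
    #include <iostream>
    #include <type_traits>

    int main() {
      size_t space = 100, space_required = 160;
      // Buggy: unsigned subtraction wraps instead of going negative.
      size_t wrapped = space - space_required;  // a value near SIZE_MAX
      // Fixed: convert both operands to the signed counterpart first.
      using ssize = std::make_signed<size_t>::type;
      ssize diff =
          std::abs(static_cast<ssize>(space) - static_cast<ssize>(space_required));
      std::cout << wrapped << " vs " << diff << "\n";  // huge value vs 60
      return 0;
    }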

@@ -35,7 +35,6 @@ DEFINE_bool(init_allocated_mem, false,
             "To find this error in time, we use init_allocated_mem to indicate "
             "that initializing the allocated memory with a small value "
             "during unit testing.");
-DECLARE_bool(benchmark);
 DECLARE_double(fraction_of_gpu_memory_to_use);

 namespace paddle {
@@ -188,21 +187,20 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
     platform::SetDeviceId(place.device);
     size_t avail, total;
     platform::GpuMemoryUsage(&avail, &total);
-    LOG(WARNING) << "Cannot allocate " << string::HumanReadableSize(size)
-                 << " in GPU " << place.device << ", available "
-                 << string::HumanReadableSize(avail);
-    LOG(WARNING) << "total " << total;
-    LOG(WARNING) << "GpuMinChunkSize "
-                 << string::HumanReadableSize(
-                        buddy_allocator->GetMinChunkSize());
-    LOG(WARNING) << "GpuMaxChunkSize "
-                 << string::HumanReadableSize(
-                        buddy_allocator->GetMaxChunkSize());
-    LOG(WARNING) << "GPU memory used: "
-                 << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
+    LOG(FATAL) << "Cannot allocate " << string::HumanReadableSize(size)
+               << " in GPU " << place.device << ", available "
+               << string::HumanReadableSize(avail) << "total " << total
+               << "GpuMinChunkSize "
+               << string::HumanReadableSize(buddy_allocator->GetMinChunkSize())
+               << "GpuMaxChunkSize "
+               << string::HumanReadableSize(buddy_allocator->GetMaxChunkSize())
+               << "GPU memory used: "
+               << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
     platform::SetDeviceId(cur_dev);
   } else {
-    if (FLAGS_benchmark) allocation::GPUMemMonitor.Add(place.device, size);
+    if (VLOG_IS_ON(3)) {
+      allocation::GPUMemMonitor.Add(place.device, size);
+    }
     if (FLAGS_init_allocated_mem) {
       cudaMemset(ptr, 0xEF, size);
     }
@@ -218,7 +216,9 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
                                size_t size) {
 #ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
-  if (FLAGS_benchmark) allocation::GPUMemMonitor.Minus(place.device, size);
+  if (VLOG_IS_ON(3)) {
+    allocation::GPUMemMonitor.Minus(place.device, size);
+  }
 #else
   PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
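
The memory monitor updates above are now gated on glog's verbose-logging level instead of the benchmark flag: VLOG_IS_ON(3) is true only when the process runs with verbosity 3 or higher (for example GLOG_v=3 in the environment or --v=3 on the command line). A standalone sketch of that gating, independent of Paddle's allocator:

    #include <glog/logging.h>

    // Hypothetical stand-in for the GPU memory monitor used above.
    static long long g_tracked_bytes = 0;

    void TrackAlloc(long long size) {
      // Pay the bookkeeping cost only when verbose logging is enabled.
      if (VLOG_IS_ON(3)) {
        g_tracked_bytes += size;
        VLOG(3) << "tracked bytes: " << g_tracked_bytes;
      }
    }

    int main(int argc, char* argv[]) {
      google::InitGoogleLogging(argv[0]);
      TrackAlloc(1024);  // logs only under GLOG_v=3 (or higher)
      return 0;
    }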

@@ -38,20 +38,12 @@ class BoxCoderOp : public framework::OperatorWithKernel {
                         "The shape of PriorBox is [N, 4]");
       if (ctx->HasInput("PriorBoxVar")) {
         auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
-        PADDLE_ENFORCE(
-            prior_box_var_dims.size() == 1 || prior_box_var_dims.size() == 2,
-            "Input(PriorBoxVar) of BoxCoderOp should be 1 or 2.");
-        if (prior_box_var_dims.size() == 1) {
-          PADDLE_ENFORCE_EQ(
-              prior_box_var_dims[0], 4,
-              "The 1st dimension of Input(PriorBoxVar) should be 4"
-              "when the rank is 1.");
-        } else {
-          PADDLE_ENFORCE_EQ(
-              prior_box_dims, prior_box_var_dims,
-              "The dimension of Input(PriorBoxVar) should be equal to"
-              "the dimension of Input(PriorBox when the rank is 2.)");
-        }
+        PADDLE_ENFORCE(prior_box_var_dims.size() == 2,
+                       "Input(PriorBoxVar) of BoxCoderOp should be 2.");
+        PADDLE_ENFORCE_EQ(
+            prior_box_dims, prior_box_var_dims,
+            "The dimension of Input(PriorBoxVar) should be equal to"
+            "the dimension of Input(PriorBox) when the rank is 2.");
       }
     }

@@ -56,10 +56,7 @@ __global__ void EncodeCenterSizeKernel(
     output[idx * len + 2] = log(fabs(target_box_width / prior_box_width));
     output[idx * len + 3] = log(fabs(target_box_height / prior_box_height));
     if (prior_box_var_data) {
-      int prior_var_offset = 0;
-      if (prior_box_var_size == 2) {
-        prior_var_offset = col_idx * len;
-      }
+      int prior_var_offset = col_idx * len;
       output[idx * len] /= prior_box_var_data[prior_var_offset];
       output[idx * len + 1] /= prior_box_var_data[prior_var_offset + 1];
       output[idx * len + 2] /= prior_box_var_data[prior_var_offset + 2];
@@ -99,10 +96,7 @@ __global__ void DecodeCenterSizeKernel(
     T box_var_x = T(1), box_var_y = T(1);
     T box_var_w = T(1), box_var_h = T(1);
     if (prior_box_var_data) {
-      int prior_var_offset = 0;
-      if (prior_box_var_size == 2) {
-        prior_var_offset = axis == 0 ? col_idx * len : row_idx * len;
-      }
+      int prior_var_offset = axis == 0 ? col_idx * len : row_idx * len;
       box_var_x = prior_box_var_data[prior_var_offset];
       box_var_y = prior_box_var_data[prior_var_offset + 1];
       box_var_w = prior_box_var_data[prior_var_offset + 2];

@@ -79,10 +79,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
         output[offset + 3] =
             std::log(std::fabs(target_box_height / prior_box_height));
         if (prior_box_var) {
-          int prior_var_offset = 0;
-          if (prior_box_var->dims().size() == 2) {
-            prior_var_offset = j * len;
-          }
+          int prior_var_offset = j * len;
           output[offset] /= prior_box_var_data[prior_var_offset];
           output[offset + 1] /= prior_box_var_data[prior_var_offset + 1];
           output[offset + 2] /= prior_box_var_data[prior_var_offset + 2];
@@ -95,11 +92,12 @@ class BoxCoderKernel : public framework::OpKernel<T> {
       }
     }
   }
+  template <int axis, int var_size>
   void DecodeCenterSize(const framework::Tensor* target_box,
                         const framework::Tensor* prior_box,
                         const framework::Tensor* prior_box_var,
-                        const bool normalized, const int axis,
-                        const std::vector<float> variance, T* output) const {
+                        const bool normalized, std::vector<float> variance,
+                        T* output) const {
     int64_t row = target_box->dims()[0];
     int64_t col = target_box->dims()[1];
     int64_t len = target_box->dims()[2];
@@ -107,19 +105,17 @@ class BoxCoderKernel : public framework::OpKernel<T> {
     auto* target_box_data = target_box->data<T>();
     auto* prior_box_data = prior_box->data<T>();
     const T* prior_box_var_data = nullptr;
-    if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
+    if (var_size == 2) prior_box_var_data = prior_box_var->data<T>();
     int prior_box_offset = 0;
+    T var_data[4] = {1., 1., 1., 1.};
+    T* var_ptr = var_data;
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for collapse(2)
 #endif
     for (int64_t i = 0; i < row; ++i) {
       for (int64_t j = 0; j < col; ++j) {
         size_t offset = i * col * len + j * len;
-        if (axis == 0) {
-          prior_box_offset = j * len;
-        } else if (axis == 1) {
-          prior_box_offset = i * len;
-        }
+        prior_box_offset = axis == 0 ? j * len : i * len;
         T prior_box_width = prior_box_data[prior_box_offset + 2] -
                             prior_box_data[prior_box_offset] +
                             (normalized == false);
@@ -133,26 +129,18 @@ class BoxCoderKernel : public framework::OpKernel<T> {
         T target_box_center_x = 0, target_box_center_y = 0;
         T target_box_width = 0, target_box_height = 0;
-        T box_var_x = T(1), box_var_y = T(1);
-        T box_var_w = T(1), box_var_h = T(1);
-        if (prior_box_var) {
-          int prior_var_offset = 0;
-          if (prior_box_var->dims().size() == 2) {
-            if (axis == 0)
-              prior_var_offset = j * len;
-            else if (axis == 1)
-              prior_var_offset = i * len;
-          }
-          box_var_x = prior_box_var_data[prior_var_offset];
-          box_var_y = prior_box_var_data[prior_var_offset + 1];
-          box_var_w = prior_box_var_data[prior_var_offset + 2];
-          box_var_h = prior_box_var_data[prior_var_offset + 3];
-        } else if (!(variance.empty())) {
-          box_var_x = static_cast<T>(variance[0]);
-          box_var_y = static_cast<T>(variance[1]);
-          box_var_w = static_cast<T>(variance[2]);
-          box_var_h = static_cast<T>(variance[3]);
-        }
+        int prior_var_offset = axis == 0 ? j * len : i * len;
+        if (var_size == 2) {
+          std::memcpy(var_ptr, prior_box_var_data + prior_var_offset,
+                      4 * sizeof(T));
+        } else if (var_size == 1) {
+          var_ptr = reinterpret_cast<T*>(variance.data());
+        }
+        T box_var_x = *var_ptr;
+        T box_var_y = *(var_ptr + 1);
+        T box_var_w = *(var_ptr + 2);
+        T box_var_h = *(var_ptr + 3);
         target_box_center_x =
             box_var_x * target_box_data[offset] * prior_box_width +
             prior_box_center_x;
@@ -211,8 +199,31 @@ class BoxCoderKernel : public framework::OpKernel<T> {
       EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
                        variance, output);
     } else if (code_type == BoxCodeType::kDecodeCenterSize) {
-      DecodeCenterSize(target_box, prior_box, prior_box_var, normalized, axis,
-                       variance, output);
+      if (prior_box_var) {
+        if (axis == 0) {
+          DecodeCenterSize<0, 2>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        } else {
+          DecodeCenterSize<1, 2>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        }
+      } else if (!(variance.empty())) {
+        if (axis == 0) {
+          DecodeCenterSize<0, 1>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        } else {
+          DecodeCenterSize<1, 1>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        }
+      } else {
+        if (axis == 0) {
+          DecodeCenterSize<0, 0>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        } else {
+          DecodeCenterSize<1, 0>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        }
+      }
     }
   }
 };
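
The call site above now selects one of six DecodeCenterSize instantiations, so the axis and the variance source become compile-time constants inside the per-box loop instead of per-iteration branches. A minimal sketch of that dispatch pattern with a simplified stand-in function (not the real kernel signature):

    #include <cstdio>
    #include <vector>

    // var_size mirrors the hunk above: 0 = no variance, 1 = attribute vector,
    // 2 = per-box variance tensor.
    template <int Axis, int VarSize>
    void Decode(const std::vector<float>& variance) {
      // Resolved at compile time, so each instantiation keeps only its own path.
      if (VarSize == 1) {
        std::printf("axis=%d, variance from attribute (%zu values)\n", Axis,
                    variance.size());
      } else {
        std::printf("axis=%d, var_size=%d\n", Axis, VarSize);
      }
    }

    // Runtime dispatch happens once, outside the hot loop.
    void DecodeDispatch(int axis, bool has_var_tensor,
                        const std::vector<float>& variance) {
      if (has_var_tensor) {
        if (axis == 0) Decode<0, 2>(variance); else Decode<1, 2>(variance);
      } else if (!variance.empty()) {
        if (axis == 0) Decode<0, 1>(variance); else Decode<1, 1>(variance);
      } else {
        if (axis == 0) Decode<0, 0>(variance); else Decode<1, 0>(variance);
      }
    }

    int main() {
      DecodeDispatch(0, false, {0.1f, 0.1f, 0.2f, 0.2f});  // uses Decode<0, 1>
      return 0;
    }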

@@ -37,7 +37,7 @@ math_library(concat_and_split)
 math_library(context_project DEPS im2col math_function)
 math_library(cross_entropy)
 math_library(cos_sim_functor)
-math_library(depthwise_conv)
+math_library(depthwise_conv DEPS cub)
 math_library(im2col)
 math_library(sampler)

@@ -31,6 +31,7 @@ std::map<std::string,
                           std::shared_ptr<std::unordered_map<
                               std::string, std::shared_ptr<ngraph::Node>>>)>>
     NgraphBridge::NG_NODE_MAP = {
+        {"accuracy", NG_OPS::BuildAccuracyNode},
         {"conv2d", NG_OPS::BuildConv2dNode},
         {"conv2d_grad", NG_OPS::BuildConv2dGradNode},
         {"elementwise_add", NG_OPS::BuildElementwiseAddNode},

@@ -21,7 +21,8 @@ limitations under the License. */
 #pragma once

-#include "ops/binary_unnary_op.h"
+#include "ops/accuracy_op.h"
+#include "ops/binary_unary_op.h"
 #include "ops/conv2d_op.h"
 #include "ops/elementwise_add_op.h"
 #include "ops/fill_constant_op.h"

Some files were not shown because too many files have changed in this diff.