Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_bug_for_lstmp

revert-15774-anakin_subgraph_engine
xuezhong 6 years ago
commit 2ba256df40

@ -25,12 +25,18 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
if(WIN32)
set(CMAKE_SUPPRESS_REGENERATION ON)
set(CMAKE_STATIC_LIBRARY_PREFIX lib)
add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
endif(WIN32)
find_package(CUDA QUIET)

@ -152,7 +152,12 @@ endif()
if (WITH_MKLML AND MKLML_IOMP_LIB)
message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
set(OPENMP_FLAGS "-fopenmp")
if(WIN32)
# openmp not support well for now on windows
set(OPENMP_FLAGS "")
else(WIN32)
set(OPENMP_FLAGS "-fopenmp")
endif(WIN32)
set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")

@ -203,25 +203,26 @@ list(APPEND CUDA_NVCC_FLAGS "-w")
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
if (NOT WIN32)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
# nvcc 9 does not support -Os. Use Release flags instead
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
endif()
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
# nvcc 9 does not support -Os. Use Release flags instead
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
endif()
else(NOT WIN32)
list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS "-g -G")
# match the cl's _ITERATOR_DEBUG_LEVEL
list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG")
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
else()
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"")
list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS "-g -G")
# match the cl's _ITERATOR_DEBUG_LEVEL
list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG")
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
else()
message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.")
endif()
endif(NOT WIN32)

@ -20,8 +20,10 @@ SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include dire
IF(WIN32)
SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE)
SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530")
ELSE(WIN32)
SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
ENDIF(WIN32)
INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
@ -39,7 +41,7 @@ ExternalProject_Add(
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}

@ -49,6 +49,8 @@ IF(NOT WIN32)
SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
ELSE()
SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc")
ENDIF(NOT WIN32)
ExternalProject_Add(
@ -61,7 +63,6 @@ ExternalProject_Add(
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
CMAKE_ARGS -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}

@ -20,6 +20,12 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
if(WIN32)
SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267")
else()
SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
endif()
ExternalProject_Add(
extern_snappy
GIT_REPOSITORY "https://github.com/google/snappy"
@ -31,7 +37,7 @@ ExternalProject_Add(
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}

@ -147,12 +147,6 @@ set(GPU_COMMON_FLAGS
-Wno-error=unused-function # Warnings in Numpy Header.
-Wno-error=array-bounds # Warnings in Eigen::array
)
else(NOT WIN32)
set(COMMON_FLAGS
"/w") #disable all warnings.
set(GPU_COMMON_FLAGS
"/w") #disable all warnings
endif(NOT WIN32)
if (APPLE)
@ -193,8 +187,7 @@ safe_set_static_flag()
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
if(${flag_var} MATCHES "/W3")
string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}")
endif(${flag_var} MATCHES "/W3")
string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
set(flag_var "${flag_var} /w")
endforeach(flag_var)
endif(WIN32)

@ -31,8 +31,23 @@ while ("${PADDLE_VERSION}" STREQUAL "")
set(tmp_version "${GIT_TAG_NAME}~1")
endif()
else()
# otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest
set(PADDLE_VERSION "0.0.0")
execute_process(
COMMAND ${GIT_EXECUTABLE} describe --exact-match --tags ${tmp_version}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_EXACT_TAG_NAME
RESULT_VARIABLE GIT_EXACT_TAG_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if (NOT ${GIT_EXACT_TAG_NAME})
# Check if current branch is tag branch
if (${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
string(REPLACE "v" "" PADDLE_VERSION ${GIT_EXACT_TAG_NAME})
else()
set(PADDLE_VERSION "0.0.0")
endif()
else()
# otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest
set(PADDLE_VERSION "0.0.0")
endif()
endif()
else()
set(PADDLE_VERSION "0.0.0")

@ -403,18 +403,20 @@ void GraphView::Build(ir::Graph* g) {
// 2. track the nodes which used by parameter server.
// these node can not be inplaced, otherwise trainer
// pserver can not find each other name.
for (auto& node : g->Nodes()) {
if (!node->IsOp()) continue;
if (node->Name() == "send") {
for (auto& in : node->inputs) {
dup_nodes_.emplace(in->Name());
}
auto update_skip_set = [&](ir::Node* node) {
for (auto& in : node->inputs) {
if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
}
if (node->Name() == "recv") {
for (auto& out : node->outputs) {
for (auto& out : node->outputs) {
if (out->IsVar() && out->Var() != nullptr)
dup_nodes_.emplace(out->Name());
}
}
};
for (auto& node : g->Nodes()) {
if (!node->IsOp()) continue;
if (node->Name() == "send") update_skip_set(node);
if (node->Name() == "recv") update_skip_set(node);
if (node->Name() == "prefetch") update_skip_set(node);
}
}

@ -51,8 +51,7 @@ static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto nodes = graph->Nodes();
auto subblock_vars = GetSubBlockVars(nodes);
skip_set_.insert(subblock_vars.begin(), subblock_vars.end());
CollectSkipVarsSet(nodes);
cfg_.reset(new details::ControlFlowGraph(*graph));
cfg_->LiveVariableAnalysis();
@ -224,20 +223,27 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
}
}
std::unordered_set<std::string> MemoryOptimizePass::GetSubBlockVars(
void MemoryOptimizePass::CollectSkipVarsSet(
const std::unordered_set<ir::Node*>& nodes) const {
std::unordered_set<std::string> vars;
auto update_skip_set = [&](OpDesc* op_desc) {
auto inputs = op_desc->InputArgumentNames();
auto outputs = op_desc->OutputArgumentNames();
skip_set_.insert(inputs.begin(), inputs.end());
skip_set_.insert(outputs.begin(), outputs.end());
};
for (auto& op : nodes) {
if (!op->IsOp() || op->Op() == nullptr) continue;
auto* op_desc = op->Op();
if (OpHasSubBlock(op_desc)) {
auto inputs = op_desc->InputArgumentNames();
auto outputs = op_desc->OutputArgumentNames();
vars.insert(inputs.begin(), inputs.end());
vars.insert(outputs.begin(), outputs.end());
}
// NOTE(dzhwinter):
// current block can not reuse next level block vars.
if (OpHasSubBlock(op_desc)) update_skip_set(op_desc);
// NOTE(dzhwinter):
// distributed ops input/output name need to
// keep same bettwen trainer/pserver
if (op_desc->Type() == "send") update_skip_set(op_desc);
if (op_desc->Type() == "recv") update_skip_set(op_desc);
if (op_desc->Type() == "prefetch") update_skip_set(op_desc);
}
return vars;
}
void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,

@ -55,9 +55,10 @@ class MemoryOptimizePass : public ir::Pass {
ir::Graph* graph) const;
void SubGraphOptimize(OpDesc* op_desc) const;
// scan subblock and collect the output/input variables.
std::unordered_set<std::string> GetSubBlockVars(
const std::unordered_set<ir::Node*>&) const;
// 1. scan op with subblock and collect the output/input vars.
// while, while_grad, conditional_block
// 2. scan distributed ops and collect the output/input vars
void CollectSkipVarsSet(const std::unordered_set<ir::Node*>&) const;
private:
// Reuse Node Pool, Owned.

@ -276,6 +276,7 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
EXPECT_EQ(in_to_outs.size(), 3ul);
std::unordered_map<std::string, std::string> expects = {
{"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},

@ -141,7 +141,8 @@ class Graph {
ir::Node *CreateControlDepVar() {
// TODO(panyx0718): control var name should be really unique.
const std::string name = string::Sprintf(
"%s@%llu", ir::Node::kControlDepVarName, node_set_.size());
"%s@%llu", static_cast<const char *>(ir::Node::kControlDepVarName),
node_set_.size());
auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable));
x->SetId(num_node_created_++);
return x;

@ -22,7 +22,11 @@ limitations under the License. */
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/string/printf.h"
DECLARE_bool(benchmark);
DEFINE_bool(benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, "
"and add some memory usage logs."
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
DEFINE_bool(
eager_delete_scope, true,

@ -1,5 +1,5 @@
if(WITH_PYTHON)
cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas)
cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context)
cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind)
cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
cc_library(engine SRCS engine.cc)
endif()

@ -58,12 +58,13 @@ if(WIN32)
sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
analysis_config paddle_pass_builder)
target_link_libraries(paddle_fluid_shared shlwapi)
else(WIN32)
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
analysis_config paddle_pass_builder)
endif()
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(paddle_fluid_shared ${os_dependency_modules})
set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
if(NOT APPLE AND NOT WIN32)

@ -1,4 +1,7 @@
cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
if(WITH_TESTING)
add_dependencies(subgraph_detector gtest)
endif()
if (WITH_GPU AND TENSORRT_FOUND)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)

@ -18,6 +18,7 @@
#include <limits>
#include <map>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
@ -168,7 +169,11 @@ bool FindSuitableTensorToReuse(
if (!cluster->count(candidate)) continue;
size_t space = space_table.at(candidate);
size_t space_diff = std::abs<size_t>(space - space_required);
PADDLE_ENFORCE(
space <= std::numeric_limits<std::make_signed<size_t>::type>::max(),
"space overload");
size_t space_diff =
std::abs((std::make_signed<size_t>::type)space - space_required);
if (space_diff < best_fit.second) {
best_fit.first = candidate;
best_fit.second = space_diff;

@ -35,7 +35,6 @@ DEFINE_bool(init_allocated_mem, false,
"To find this error in time, we use init_allocated_mem to indicate "
"that initializing the allocated memory with a small value "
"during unit testing.");
DECLARE_bool(benchmark);
DECLARE_double(fraction_of_gpu_memory_to_use);
namespace paddle {
@ -188,21 +187,20 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
platform::SetDeviceId(place.device);
size_t avail, total;
platform::GpuMemoryUsage(&avail, &total);
LOG(WARNING) << "Cannot allocate " << string::HumanReadableSize(size)
<< " in GPU " << place.device << ", available "
<< string::HumanReadableSize(avail);
LOG(WARNING) << "total " << total;
LOG(WARNING) << "GpuMinChunkSize "
<< string::HumanReadableSize(
buddy_allocator->GetMinChunkSize());
LOG(WARNING) << "GpuMaxChunkSize "
<< string::HumanReadableSize(
buddy_allocator->GetMaxChunkSize());
LOG(WARNING) << "GPU memory used: "
<< string::HumanReadableSize(Used<platform::CUDAPlace>(place));
LOG(FATAL) << "Cannot allocate " << string::HumanReadableSize(size)
<< " in GPU " << place.device << ", available "
<< string::HumanReadableSize(avail) << "total " << total
<< "GpuMinChunkSize "
<< string::HumanReadableSize(buddy_allocator->GetMinChunkSize())
<< "GpuMaxChunkSize "
<< string::HumanReadableSize(buddy_allocator->GetMaxChunkSize())
<< "GPU memory used: "
<< string::HumanReadableSize(Used<platform::CUDAPlace>(place));
platform::SetDeviceId(cur_dev);
} else {
if (FLAGS_benchmark) allocation::GPUMemMonitor.Add(place.device, size);
if (VLOG_IS_ON(3)) {
allocation::GPUMemMonitor.Add(place.device, size);
}
if (FLAGS_init_allocated_mem) {
cudaMemset(ptr, 0xEF, size);
}
@ -218,7 +216,9 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
size_t size) {
#ifdef PADDLE_WITH_CUDA
GetGPUBuddyAllocator(place.device)->Free(p);
if (FLAGS_benchmark) allocation::GPUMemMonitor.Minus(place.device, size);
if (VLOG_IS_ON(3)) {
allocation::GPUMemMonitor.Minus(place.device, size);
}
#else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif

@ -38,20 +38,12 @@ class BoxCoderOp : public framework::OperatorWithKernel {
"The shape of PriorBox is [N, 4]");
if (ctx->HasInput("PriorBoxVar")) {
auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
PADDLE_ENFORCE(
prior_box_var_dims.size() == 1 || prior_box_var_dims.size() == 2,
"Input(PriorBoxVar) of BoxCoderOp should be 1 or 2.");
if (prior_box_var_dims.size() == 1) {
PADDLE_ENFORCE_EQ(
prior_box_var_dims[0], 4,
"The 1st dimension of Input(PriorBoxVar) should be 4"
"when the rank is 1.");
} else {
PADDLE_ENFORCE_EQ(
prior_box_dims, prior_box_var_dims,
"The dimension of Input(PriorBoxVar) should be equal to"
"the dimension of Input(PriorBox when the rank is 2.)");
}
PADDLE_ENFORCE(prior_box_var_dims.size() == 2,
"Input(PriorBoxVar) of BoxCoderOp should be 2.");
PADDLE_ENFORCE_EQ(
prior_box_dims, prior_box_var_dims,
"The dimension of Input(PriorBoxVar) should be equal to"
"the dimension of Input(PriorBox) when the rank is 2.");
}
}

@ -56,10 +56,7 @@ __global__ void EncodeCenterSizeKernel(
output[idx * len + 2] = log(fabs(target_box_width / prior_box_width));
output[idx * len + 3] = log(fabs(target_box_height / prior_box_height));
if (prior_box_var_data) {
int prior_var_offset = 0;
if (prior_box_var_size == 2) {
prior_var_offset = col_idx * len;
}
int prior_var_offset = col_idx * len;
output[idx * len] /= prior_box_var_data[prior_var_offset];
output[idx * len + 1] /= prior_box_var_data[prior_var_offset + 1];
output[idx * len + 2] /= prior_box_var_data[prior_var_offset + 2];
@ -99,10 +96,7 @@ __global__ void DecodeCenterSizeKernel(
T box_var_x = T(1), box_var_y = T(1);
T box_var_w = T(1), box_var_h = T(1);
if (prior_box_var_data) {
int prior_var_offset = 0;
if (prior_box_var_size == 2) {
prior_var_offset = axis == 0 ? col_idx * len : row_idx * len;
}
int prior_var_offset = axis == 0 ? col_idx * len : row_idx * len;
box_var_x = prior_box_var_data[prior_var_offset];
box_var_y = prior_box_var_data[prior_var_offset + 1];
box_var_w = prior_box_var_data[prior_var_offset + 2];

@ -79,10 +79,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
output[offset + 3] =
std::log(std::fabs(target_box_height / prior_box_height));
if (prior_box_var) {
int prior_var_offset = 0;
if (prior_box_var->dims().size() == 2) {
prior_var_offset = j * len;
}
int prior_var_offset = j * len;
output[offset] /= prior_box_var_data[prior_var_offset];
output[offset + 1] /= prior_box_var_data[prior_var_offset + 1];
output[offset + 2] /= prior_box_var_data[prior_var_offset + 2];
@ -95,11 +92,12 @@ class BoxCoderKernel : public framework::OpKernel<T> {
}
}
}
template <int axis, int var_size>
void DecodeCenterSize(const framework::Tensor* target_box,
const framework::Tensor* prior_box,
const framework::Tensor* prior_box_var,
const bool normalized, const int axis,
const std::vector<float> variance, T* output) const {
const bool normalized, std::vector<float> variance,
T* output) const {
int64_t row = target_box->dims()[0];
int64_t col = target_box->dims()[1];
int64_t len = target_box->dims()[2];
@ -107,19 +105,17 @@ class BoxCoderKernel : public framework::OpKernel<T> {
auto* target_box_data = target_box->data<T>();
auto* prior_box_data = prior_box->data<T>();
const T* prior_box_var_data = nullptr;
if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
if (var_size == 2) prior_box_var_data = prior_box_var->data<T>();
int prior_box_offset = 0;
T var_data[4] = {1., 1., 1., 1.};
T* var_ptr = var_data;
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
size_t offset = i * col * len + j * len;
if (axis == 0) {
prior_box_offset = j * len;
} else if (axis == 1) {
prior_box_offset = i * len;
}
prior_box_offset = axis == 0 ? j * len : i * len;
T prior_box_width = prior_box_data[prior_box_offset + 2] -
prior_box_data[prior_box_offset] +
(normalized == false);
@ -133,26 +129,18 @@ class BoxCoderKernel : public framework::OpKernel<T> {
T target_box_center_x = 0, target_box_center_y = 0;
T target_box_width = 0, target_box_height = 0;
T box_var_x = T(1), box_var_y = T(1);
T box_var_w = T(1), box_var_h = T(1);
if (prior_box_var) {
int prior_var_offset = 0;
if (prior_box_var->dims().size() == 2) {
if (axis == 0)
prior_var_offset = j * len;
else if (axis == 1)
prior_var_offset = i * len;
}
box_var_x = prior_box_var_data[prior_var_offset];
box_var_y = prior_box_var_data[prior_var_offset + 1];
box_var_w = prior_box_var_data[prior_var_offset + 2];
box_var_h = prior_box_var_data[prior_var_offset + 3];
} else if (!(variance.empty())) {
box_var_x = static_cast<T>(variance[0]);
box_var_y = static_cast<T>(variance[1]);
box_var_w = static_cast<T>(variance[2]);
box_var_h = static_cast<T>(variance[3]);
int prior_var_offset = axis == 0 ? j * len : i * len;
if (var_size == 2) {
std::memcpy(var_ptr, prior_box_var_data + prior_var_offset,
4 * sizeof(T));
} else if (var_size == 1) {
var_ptr = reinterpret_cast<T*>(variance.data());
}
T box_var_x = *var_ptr;
T box_var_y = *(var_ptr + 1);
T box_var_w = *(var_ptr + 2);
T box_var_h = *(var_ptr + 3);
target_box_center_x =
box_var_x * target_box_data[offset] * prior_box_width +
prior_box_center_x;
@ -211,8 +199,31 @@ class BoxCoderKernel : public framework::OpKernel<T> {
EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
variance, output);
} else if (code_type == BoxCodeType::kDecodeCenterSize) {
DecodeCenterSize(target_box, prior_box, prior_box_var, normalized, axis,
variance, output);
if (prior_box_var) {
if (axis == 0) {
DecodeCenterSize<0, 2>(target_box, prior_box, prior_box_var,
normalized, variance, output);
} else {
DecodeCenterSize<1, 2>(target_box, prior_box, prior_box_var,
normalized, variance, output);
}
} else if (!(variance.empty())) {
if (axis == 0) {
DecodeCenterSize<0, 1>(target_box, prior_box, prior_box_var,
normalized, variance, output);
} else {
DecodeCenterSize<1, 1>(target_box, prior_box, prior_box_var,
normalized, variance, output);
}
} else {
if (axis == 0) {
DecodeCenterSize<0, 0>(target_box, prior_box, prior_box_var,
normalized, variance, output);
} else {
DecodeCenterSize<1, 0>(target_box, prior_box, prior_box_var,
normalized, variance, output);
}
}
}
}
};

@ -37,7 +37,7 @@ math_library(concat_and_split)
math_library(context_project DEPS im2col math_function)
math_library(cross_entropy)
math_library(cos_sim_functor)
math_library(depthwise_conv)
math_library(depthwise_conv DEPS cub)
math_library(im2col)
math_library(sampler)

@ -31,6 +31,7 @@ std::map<std::string,
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = {
{"accuracy", NG_OPS::BuildAccuracyNode},
{"conv2d", NG_OPS::BuildConv2dNode},
{"conv2d_grad", NG_OPS::BuildConv2dGradNode},
{"elementwise_add", NG_OPS::BuildElementwiseAddNode},

@ -21,7 +21,8 @@ limitations under the License. */
#pragma once
#include "ops/binary_unnary_op.h"
#include "ops/accuracy_op.h"
#include "ops/binary_unary_op.h"
#include "ops/conv2d_op.h"
#include "ops/elementwise_add_op.h"
#include "ops/fill_constant_op.h"

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save