Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_bug_for_lstmp

Branch: revert-15774-anakin_subgraph_engine
Author: xuezhong, 6 years ago
Commit: 2ba256df40

@@ -25,12 +25,18 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
 message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
         "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
 if(WIN32)
+    set(CMAKE_SUPPRESS_REGENERATION ON)
     set(CMAKE_STATIC_LIBRARY_PREFIX lib)
     add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
     set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
     set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+    add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
+    set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
+    set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
 endif(WIN32)

 find_package(CUDA QUIET)

@@ -152,7 +152,12 @@ endif()
 if (WITH_MKLML AND MKLML_IOMP_LIB)
     message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
+    if(WIN32)
+        # openmp not support well for now on windows
+        set(OPENMP_FLAGS "")
+    else(WIN32)
         set(OPENMP_FLAGS "-fopenmp")
+    endif(WIN32)
     set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
     set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")

@@ -203,25 +203,26 @@ list(APPEND CUDA_NVCC_FLAGS "-w")
 list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
 if (NOT WIN32)
     if(CMAKE_BUILD_TYPE STREQUAL "Debug")
         list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
     elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
         list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
     elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
         list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
     elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
         # nvcc 9 does not support -Os. Use Release flags instead
         list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
     endif()
 else(NOT WIN32)
+    list(APPEND CUDA_NVCC_FLAGS "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"")
     list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj")
     if(CMAKE_BUILD_TYPE STREQUAL "Debug")
         list(APPEND CUDA_NVCC_FLAGS "-g -G")
         # match the cl's _ITERATOR_DEBUG_LEVEL
         list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG")
     elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
         list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
     else()
         message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.")
     endif()
 endif(NOT WIN32)

@@ -20,8 +20,10 @@ SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include dire
 IF(WIN32)
   SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE)
+  SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530")
 ELSE(WIN32)
   SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
+  SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
 ENDIF(WIN32)

 INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
@@ -39,7 +41,7 @@ ExternalProject_Add(
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                     -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS}
                     -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
                     -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}

@@ -49,6 +49,8 @@ IF(NOT WIN32)
     SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
     SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
     SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
+ELSE()
+    SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc")
 ENDIF(NOT WIN32)

 ExternalProject_Add(
@@ -61,7 +63,6 @@ ExternalProject_Add(
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
     CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
     CMAKE_ARGS      -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
     CMAKE_ARGS      -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
     CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}

@@ -20,6 +20,12 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
 set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
 set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)

+if(WIN32)
+    SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267")
+else()
+    SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+endif()
+
 ExternalProject_Add(
     extern_snappy
     GIT_REPOSITORY "https://github.com/google/snappy"
@@ -31,7 +37,7 @@ ExternalProject_Add(
                     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
                     -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
                     -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS}
                     -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
                     -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                     -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}

@@ -147,12 +147,6 @@ set(GPU_COMMON_FLAGS
     -Wno-error=unused-function  # Warnings in Numpy Header.
     -Wno-error=array-bounds     # Warnings in Eigen::array
 )
-else(NOT WIN32)
-set(COMMON_FLAGS
-    "/w")    #disable all warnings.
-set(GPU_COMMON_FLAGS
-    "/w")    #disable all warnings
 endif(NOT WIN32)

 if (APPLE)
@@ -193,8 +187,7 @@ safe_set_static_flag()
         CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
         CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
         CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-      if(${flag_var} MATCHES "/W3")
-        string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}")
-      endif(${flag_var} MATCHES "/W3")
+      string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
+      set(flag_var "${flag_var} /w")
     endforeach(flag_var)
 endif(WIN32)

@@ -30,10 +30,25 @@ while ("${PADDLE_VERSION}" STREQUAL "")
         else()  # otherwise, get the previous git tag name.
             set(tmp_version "${GIT_TAG_NAME}~1")
         endif()
+    else()
+        execute_process(
+            COMMAND ${GIT_EXECUTABLE} describe --exact-match --tags ${tmp_version}
+            WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+            OUTPUT_VARIABLE GIT_EXACT_TAG_NAME
+            RESULT_VARIABLE GIT_EXACT_TAG_RESULT
+            ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+        if (NOT ${GIT_EXACT_TAG_NAME})
+            # Check if current branch is tag branch
+            if (${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
+                string(REPLACE "v" "" PADDLE_VERSION ${GIT_EXACT_TAG_NAME})
+            else()
+                set(PADDLE_VERSION "0.0.0")
+            endif()
         else()
             # otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest
             set(PADDLE_VERSION "0.0.0")
         endif()
+    endif()
     else()
         set(PADDLE_VERSION "0.0.0")
         message(WARNING "Cannot add paddle version from git tag")

@@ -403,18 +403,20 @@ void GraphView::Build(ir::Graph* g) {
   // 2. track the nodes which used by parameter server.
   // these node can not be inplaced, otherwise trainer
   // pserver can not find each other name.
-  for (auto& node : g->Nodes()) {
-    if (!node->IsOp()) continue;
-    if (node->Name() == "send") {
-      for (auto& in : node->inputs) {
-        dup_nodes_.emplace(in->Name());
-      }
-    }
-    if (node->Name() == "recv") {
-      for (auto& out : node->outputs) {
-        dup_nodes_.emplace(out->Name());
-      }
-    }
+  auto update_skip_set = [&](ir::Node* node) {
+    for (auto& in : node->inputs) {
+      if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
+    }
+    for (auto& out : node->outputs) {
+      if (out->IsVar() && out->Var() != nullptr)
+        dup_nodes_.emplace(out->Name());
+    }
+  };
+  for (auto& node : g->Nodes()) {
+    if (!node->IsOp()) continue;
+    if (node->Name() == "send") update_skip_set(node);
+    if (node->Name() == "recv") update_skip_set(node);
+    if (node->Name() == "prefetch") update_skip_set(node);
   }
 }

@@ -51,8 +51,7 @@ static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
 std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   auto nodes = graph->Nodes();
-  auto subblock_vars = GetSubBlockVars(nodes);
-  skip_set_.insert(subblock_vars.begin(), subblock_vars.end());
+  CollectSkipVarsSet(nodes);

   cfg_.reset(new details::ControlFlowGraph(*graph));
   cfg_->LiveVariableAnalysis();
@@ -224,20 +223,27 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
     }
   }

-std::unordered_set<std::string> MemoryOptimizePass::GetSubBlockVars(
+void MemoryOptimizePass::CollectSkipVarsSet(
     const std::unordered_set<ir::Node*>& nodes) const {
-  std::unordered_set<std::string> vars;
+  auto update_skip_set = [&](OpDesc* op_desc) {
+    auto inputs = op_desc->InputArgumentNames();
+    auto outputs = op_desc->OutputArgumentNames();
+    skip_set_.insert(inputs.begin(), inputs.end());
+    skip_set_.insert(outputs.begin(), outputs.end());
+  };
   for (auto& op : nodes) {
     if (!op->IsOp() || op->Op() == nullptr) continue;
     auto* op_desc = op->Op();
-    if (OpHasSubBlock(op_desc)) {
-      auto inputs = op_desc->InputArgumentNames();
-      auto outputs = op_desc->OutputArgumentNames();
-      vars.insert(inputs.begin(), inputs.end());
-      vars.insert(outputs.begin(), outputs.end());
-    }
+    // NOTE(dzhwinter):
+    // current block can not reuse next level block vars.
+    if (OpHasSubBlock(op_desc)) update_skip_set(op_desc);
+    // NOTE(dzhwinter):
+    // distributed ops input/output name need to
+    // keep same bettwen trainer/pserver
+    if (op_desc->Type() == "send") update_skip_set(op_desc);
+    if (op_desc->Type() == "recv") update_skip_set(op_desc);
+    if (op_desc->Type() == "prefetch") update_skip_set(op_desc);
   }
-  return vars;
 }

 void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,

@@ -55,9 +55,10 @@ class MemoryOptimizePass : public ir::Pass {
                             ir::Graph* graph) const;
   void SubGraphOptimize(OpDesc* op_desc) const;
-  // scan subblock and collect the output/input variables.
-  std::unordered_set<std::string> GetSubBlockVars(
-      const std::unordered_set<ir::Node*>&) const;
+  // 1. scan op with subblock and collect the output/input vars.
+  // while, while_grad, conditional_block
+  // 2. scan distributed ops and collect the output/input vars
+  void CollectSkipVarsSet(const std::unordered_set<ir::Node*>&) const;

  private:
   // Reuse Node Pool, Owned.
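
Both this pass and the GraphView change above follow the same pattern: a small capturing lambda folds an op's input and output names into a member skip set, and the loop over nodes only decides which op types feed it. A self-contained sketch of that pattern, using simplified stand-in types rather than Paddle's OpDesc or ir::Node:

    #include <iostream>
    #include <string>
    #include <unordered_set>
    #include <vector>

    // Simplified stand-in for an operator description.
    struct Op {
      std::string type;
      std::vector<std::string> inputs;
      std::vector<std::string> outputs;
    };

    std::unordered_set<std::string> CollectSkipVars(const std::vector<Op>& ops) {
      std::unordered_set<std::string> skip_set;
      // One lambda shared by every op category whose variables must keep their names.
      auto update_skip_set = [&](const Op& op) {
        skip_set.insert(op.inputs.begin(), op.inputs.end());
        skip_set.insert(op.outputs.begin(), op.outputs.end());
      };
      for (const auto& op : ops) {
        // Ops with sub-blocks cannot reuse variables of the nested block.
        if (op.type == "while" || op.type == "conditional_block") update_skip_set(op);
        // Distributed ops must keep the same names on trainer and pserver.
        if (op.type == "send" || op.type == "recv" || op.type == "prefetch")
          update_skip_set(op);
      }
      return skip_set;
    }

    int main() {
      std::vector<Op> ops = {{"send", {"w"}, {}}, {"relu", {"x"}, {"y"}}};
      std::cout << CollectSkipVars(ops).count("w") << "\n";  // prints 1
      return 0;
    }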

@@ -276,6 +276,7 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
   auto in_to_outs = infer_inplace(*op, op->Block());
   EXPECT_EQ(in_to_outs.size(), 3ul);
+
   std::unordered_map<std::string, std::string> expects = {
       {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},

@@ -141,7 +141,8 @@ class Graph {
   ir::Node *CreateControlDepVar() {
     // TODO(panyx0718): control var name should be really unique.
     const std::string name = string::Sprintf(
-        "%s@%llu", ir::Node::kControlDepVarName, node_set_.size());
+        "%s@%llu", static_cast<const char *>(ir::Node::kControlDepVarName),
+        node_set_.size());
     auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable));
     x->SetId(num_node_created_++);
     return x;

@@ -22,7 +22,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/string/printf.h"

-DECLARE_bool(benchmark);
+DEFINE_bool(benchmark, false,
+            "Doing memory benchmark. It will make deleting scope synchronized, "
+            "and add some memory usage logs."
+            "Default cuda is asynchronous device, set to True will"
+            "force op run in synchronous mode.");

 DEFINE_bool(
     eager_delete_scope, true,
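
With this change, scope.cc becomes the translation unit that owns the benchmark flag (DEFINE_bool) rather than one that merely references it, which is why a later hunk drops the now-unneeded DECLARE_bool(benchmark) from the allocator. A minimal gflags sketch of that ownership split; the file names are illustrative only:

    // flag_owner.cc: exactly one translation unit defines the flag.
    #include <gflags/gflags.h>

    DEFINE_bool(benchmark, false,
                "Synchronize scope deletion and emit memory usage logs.");

    // Any other translation unit that reads the flag only declares it
    // and links against the owner:
    //
    //   DECLARE_bool(benchmark);
    //   void MaybeSync() {
    //     if (FLAGS_benchmark) { /* extra bookkeeping */ }
    //   }

    int main(int argc, char* argv[]) {
      gflags::ParseCommandLineFlags(&argc, &argv, true);
      return FLAGS_benchmark ? 0 : 1;
    }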

@@ -1,5 +1,5 @@
 if(WITH_PYTHON)
-cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas)
-cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context)
+cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind)
+cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
 cc_library(engine SRCS engine.cc)
 endif()

@@ -58,12 +58,13 @@ if(WIN32)
     sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
             DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
             analysis_config paddle_pass_builder)
-    target_link_libraries(paddle_fluid_shared shlwapi)
 else(WIN32)
     cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
             DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
             analysis_config paddle_pass_builder)
 endif()
+get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+target_link_libraries(paddle_fluid_shared ${os_dependency_modules})

 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 if(NOT APPLE AND NOT WIN32)

@@ -1,4 +1,7 @@
 cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
+if(WITH_TESTING)
+  add_dependencies(subgraph_detector gtest)
+endif()

 if (WITH_GPU AND TENSORRT_FOUND)
   cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)

@@ -18,6 +18,7 @@
 #include <limits>
 #include <map>
 #include <string>
+#include <type_traits>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph_helper.h"
@@ -168,7 +169,11 @@ bool FindSuitableTensorToReuse(
     if (!cluster->count(candidate)) continue;

     size_t space = space_table.at(candidate);
-    size_t space_diff = std::abs<size_t>(space - space_required);
+    PADDLE_ENFORCE(
+        space <= std::numeric_limits<std::make_signed<size_t>::type>::max(),
+        "space overload");
+    size_t space_diff =
+        std::abs((std::make_signed<size_t>::type)space - space_required);
     if (space_diff < best_fit.second) {
       best_fit.first = candidate;
       best_fit.second = space_diff;
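
The replacement above fixes an unsigned-arithmetic bug: with size_t operands, space - space_required wraps around to a number near SIZE_MAX whenever space < space_required, so the computed difference is meaningless. The new code asserts the value fits in the signed counterpart of size_t and does the subtraction in signed arithmetic. A minimal sketch of the same fix with hypothetical sizes:

    #include <cstddef>
    #include <cstdlib>
    #include <iostream>
    #include <type_traits>

    int main() {
      size_t space = 100, space_required = 160;
      // Buggy: unsigned subtraction wraps instead of going negative.
      size_t wrapped = space - space_required;  // a value near SIZE_MAX
      // Fixed: convert both operands to the signed counterpart first.
      using ssize = std::make_signed<size_t>::type;
      ssize diff =
          std::abs(static_cast<ssize>(space) - static_cast<ssize>(space_required));
      std::cout << wrapped << " vs " << diff << "\n";  // huge value vs 60
      return 0;
    }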

@@ -35,7 +35,6 @@ DEFINE_bool(init_allocated_mem, false,
             "To find this error in time, we use init_allocated_mem to indicate "
             "that initializing the allocated memory with a small value "
             "during unit testing.");
-DECLARE_bool(benchmark);
 DECLARE_double(fraction_of_gpu_memory_to_use);

 namespace paddle {
@@ -188,21 +187,20 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
     platform::SetDeviceId(place.device);
     size_t avail, total;
     platform::GpuMemoryUsage(&avail, &total);
-    LOG(WARNING) << "Cannot allocate " << string::HumanReadableSize(size)
-                 << " in GPU " << place.device << ", available "
-                 << string::HumanReadableSize(avail);
-    LOG(WARNING) << "total " << total;
-    LOG(WARNING) << "GpuMinChunkSize "
-                 << string::HumanReadableSize(
-                        buddy_allocator->GetMinChunkSize());
-    LOG(WARNING) << "GpuMaxChunkSize "
-                 << string::HumanReadableSize(
-                        buddy_allocator->GetMaxChunkSize());
-    LOG(WARNING) << "GPU memory used: "
-                 << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
+    LOG(FATAL) << "Cannot allocate " << string::HumanReadableSize(size)
+               << " in GPU " << place.device << ", available "
+               << string::HumanReadableSize(avail) << "total " << total
+               << "GpuMinChunkSize "
+               << string::HumanReadableSize(buddy_allocator->GetMinChunkSize())
+               << "GpuMaxChunkSize "
+               << string::HumanReadableSize(buddy_allocator->GetMaxChunkSize())
+               << "GPU memory used: "
+               << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
     platform::SetDeviceId(cur_dev);
   } else {
-    if (FLAGS_benchmark) allocation::GPUMemMonitor.Add(place.device, size);
+    if (VLOG_IS_ON(3)) {
+      allocation::GPUMemMonitor.Add(place.device, size);
+    }
     if (FLAGS_init_allocated_mem) {
       cudaMemset(ptr, 0xEF, size);
     }
@@ -218,7 +216,9 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
                                size_t size) {
 #ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
-  if (FLAGS_benchmark) allocation::GPUMemMonitor.Minus(place.device, size);
+  if (VLOG_IS_ON(3)) {
+    allocation::GPUMemMonitor.Minus(place.device, size);
+  }
 #else
   PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
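
The memory monitor updates above are now gated on glog's verbose-logging level instead of the benchmark flag: VLOG_IS_ON(3) is true only when the process runs with verbosity 3 or higher (for example GLOG_v=3 in the environment or --v=3 on the command line). A standalone sketch of that gating, independent of Paddle's allocator:

    #include <glog/logging.h>

    // Hypothetical stand-in for the GPU memory monitor used above.
    static long long g_tracked_bytes = 0;

    void TrackAlloc(long long size) {
      // Pay the bookkeeping cost only when verbose logging is enabled.
      if (VLOG_IS_ON(3)) {
        g_tracked_bytes += size;
        VLOG(3) << "tracked bytes: " << g_tracked_bytes;
      }
    }

    int main(int argc, char* argv[]) {
      google::InitGoogleLogging(argv[0]);
      TrackAlloc(1024);  // logs only under GLOG_v=3 (or higher)
      return 0;
    }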

@@ -38,20 +38,12 @@ class BoxCoderOp : public framework::OperatorWithKernel {
                         "The shape of PriorBox is [N, 4]");
       if (ctx->HasInput("PriorBoxVar")) {
         auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
-        PADDLE_ENFORCE(
-            prior_box_var_dims.size() == 1 || prior_box_var_dims.size() == 2,
-            "Input(PriorBoxVar) of BoxCoderOp should be 1 or 2.");
-        if (prior_box_var_dims.size() == 1) {
-          PADDLE_ENFORCE_EQ(
-              prior_box_var_dims[0], 4,
-              "The 1st dimension of Input(PriorBoxVar) should be 4"
-              "when the rank is 1.");
-        } else {
-          PADDLE_ENFORCE_EQ(
-              prior_box_dims, prior_box_var_dims,
-              "The dimension of Input(PriorBoxVar) should be equal to"
-              "the dimension of Input(PriorBox when the rank is 2.)");
-        }
+        PADDLE_ENFORCE(prior_box_var_dims.size() == 2,
+                       "Input(PriorBoxVar) of BoxCoderOp should be 2.");
+        PADDLE_ENFORCE_EQ(
+            prior_box_dims, prior_box_var_dims,
+            "The dimension of Input(PriorBoxVar) should be equal to"
+            "the dimension of Input(PriorBox) when the rank is 2.");
       }
     }

@@ -56,10 +56,7 @@ __global__ void EncodeCenterSizeKernel(
     output[idx * len + 2] = log(fabs(target_box_width / prior_box_width));
     output[idx * len + 3] = log(fabs(target_box_height / prior_box_height));
     if (prior_box_var_data) {
-      int prior_var_offset = 0;
-      if (prior_box_var_size == 2) {
-        prior_var_offset = col_idx * len;
-      }
+      int prior_var_offset = col_idx * len;
       output[idx * len] /= prior_box_var_data[prior_var_offset];
       output[idx * len + 1] /= prior_box_var_data[prior_var_offset + 1];
       output[idx * len + 2] /= prior_box_var_data[prior_var_offset + 2];
@@ -99,10 +96,7 @@ __global__ void DecodeCenterSizeKernel(
     T box_var_x = T(1), box_var_y = T(1);
     T box_var_w = T(1), box_var_h = T(1);
     if (prior_box_var_data) {
-      int prior_var_offset = 0;
-      if (prior_box_var_size == 2) {
-        prior_var_offset = axis == 0 ? col_idx * len : row_idx * len;
-      }
+      int prior_var_offset = axis == 0 ? col_idx * len : row_idx * len;
       box_var_x = prior_box_var_data[prior_var_offset];
       box_var_y = prior_box_var_data[prior_var_offset + 1];
       box_var_w = prior_box_var_data[prior_var_offset + 2];

@@ -79,10 +79,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
         output[offset + 3] =
             std::log(std::fabs(target_box_height / prior_box_height));
         if (prior_box_var) {
-          int prior_var_offset = 0;
-          if (prior_box_var->dims().size() == 2) {
-            prior_var_offset = j * len;
-          }
+          int prior_var_offset = j * len;
           output[offset] /= prior_box_var_data[prior_var_offset];
           output[offset + 1] /= prior_box_var_data[prior_var_offset + 1];
           output[offset + 2] /= prior_box_var_data[prior_var_offset + 2];
@@ -95,11 +92,12 @@ class BoxCoderKernel : public framework::OpKernel<T> {
       }
     }
   }
+  template <int axis, int var_size>
   void DecodeCenterSize(const framework::Tensor* target_box,
                         const framework::Tensor* prior_box,
                         const framework::Tensor* prior_box_var,
-                        const bool normalized, const int axis,
-                        const std::vector<float> variance, T* output) const {
+                        const bool normalized, std::vector<float> variance,
+                        T* output) const {
     int64_t row = target_box->dims()[0];
     int64_t col = target_box->dims()[1];
     int64_t len = target_box->dims()[2];
@@ -107,19 +105,17 @@ class BoxCoderKernel : public framework::OpKernel<T> {
     auto* target_box_data = target_box->data<T>();
     auto* prior_box_data = prior_box->data<T>();
     const T* prior_box_var_data = nullptr;
-    if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
+    if (var_size == 2) prior_box_var_data = prior_box_var->data<T>();
     int prior_box_offset = 0;
+    T var_data[4] = {1., 1., 1., 1.};
+    T* var_ptr = var_data;
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for collapse(2)
 #endif
     for (int64_t i = 0; i < row; ++i) {
       for (int64_t j = 0; j < col; ++j) {
         size_t offset = i * col * len + j * len;
-        if (axis == 0) {
-          prior_box_offset = j * len;
-        } else if (axis == 1) {
-          prior_box_offset = i * len;
-        }
+        prior_box_offset = axis == 0 ? j * len : i * len;
         T prior_box_width = prior_box_data[prior_box_offset + 2] -
                             prior_box_data[prior_box_offset] +
                             (normalized == false);
@@ -133,26 +129,18 @@ class BoxCoderKernel : public framework::OpKernel<T> {
         T target_box_center_x = 0, target_box_center_y = 0;
         T target_box_width = 0, target_box_height = 0;
-        T box_var_x = T(1), box_var_y = T(1);
-        T box_var_w = T(1), box_var_h = T(1);
-        if (prior_box_var) {
-          int prior_var_offset = 0;
-          if (prior_box_var->dims().size() == 2) {
-            if (axis == 0)
-              prior_var_offset = j * len;
-            else if (axis == 1)
-              prior_var_offset = i * len;
-          }
-          box_var_x = prior_box_var_data[prior_var_offset];
-          box_var_y = prior_box_var_data[prior_var_offset + 1];
-          box_var_w = prior_box_var_data[prior_var_offset + 2];
-          box_var_h = prior_box_var_data[prior_var_offset + 3];
-        } else if (!(variance.empty())) {
-          box_var_x = static_cast<T>(variance[0]);
-          box_var_y = static_cast<T>(variance[1]);
-          box_var_w = static_cast<T>(variance[2]);
-          box_var_h = static_cast<T>(variance[3]);
-        }
+        int prior_var_offset = axis == 0 ? j * len : i * len;
+        if (var_size == 2) {
+          std::memcpy(var_ptr, prior_box_var_data + prior_var_offset,
+                      4 * sizeof(T));
+        } else if (var_size == 1) {
+          var_ptr = reinterpret_cast<T*>(variance.data());
+        }
+        T box_var_x = *var_ptr;
+        T box_var_y = *(var_ptr + 1);
+        T box_var_w = *(var_ptr + 2);
+        T box_var_h = *(var_ptr + 3);
         target_box_center_x =
             box_var_x * target_box_data[offset] * prior_box_width +
             prior_box_center_x;
@@ -211,8 +199,31 @@ class BoxCoderKernel : public framework::OpKernel<T> {
       EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
                        variance, output);
     } else if (code_type == BoxCodeType::kDecodeCenterSize) {
-      DecodeCenterSize(target_box, prior_box, prior_box_var, normalized, axis,
-                       variance, output);
+      if (prior_box_var) {
+        if (axis == 0) {
+          DecodeCenterSize<0, 2>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        } else {
+          DecodeCenterSize<1, 2>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        }
+      } else if (!(variance.empty())) {
+        if (axis == 0) {
+          DecodeCenterSize<0, 1>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        } else {
+          DecodeCenterSize<1, 1>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        }
+      } else {
+        if (axis == 0) {
+          DecodeCenterSize<0, 0>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        } else {
+          DecodeCenterSize<1, 0>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        }
+      }
     }
   }
 };
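
The call site above now selects one of six DecodeCenterSize instantiations, so the axis and the variance source become compile-time constants inside the per-box loop instead of per-iteration branches. A minimal sketch of that dispatch pattern with a simplified stand-in function (not the real kernel signature):

    #include <cstdio>
    #include <vector>

    // var_size mirrors the hunk above: 0 = no variance, 1 = attribute vector,
    // 2 = per-box variance tensor.
    template <int Axis, int VarSize>
    void Decode(const std::vector<float>& variance) {
      // Resolved at compile time, so each instantiation keeps only its own path.
      if (VarSize == 1) {
        std::printf("axis=%d, variance from attribute (%zu values)\n", Axis,
                    variance.size());
      } else {
        std::printf("axis=%d, var_size=%d\n", Axis, VarSize);
      }
    }

    // Runtime dispatch happens once, outside the hot loop.
    void DecodeDispatch(int axis, bool has_var_tensor,
                        const std::vector<float>& variance) {
      if (has_var_tensor) {
        if (axis == 0) Decode<0, 2>(variance); else Decode<1, 2>(variance);
      } else if (!variance.empty()) {
        if (axis == 0) Decode<0, 1>(variance); else Decode<1, 1>(variance);
      } else {
        if (axis == 0) Decode<0, 0>(variance); else Decode<1, 0>(variance);
      }
    }

    int main() {
      DecodeDispatch(0, false, {0.1f, 0.1f, 0.2f, 0.2f});  // uses Decode<0, 1>
      return 0;
    }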

@@ -37,7 +37,7 @@ math_library(concat_and_split)
 math_library(context_project DEPS im2col math_function)
 math_library(cross_entropy)
 math_library(cos_sim_functor)
-math_library(depthwise_conv)
+math_library(depthwise_conv DEPS cub)
 math_library(im2col)
 math_library(sampler)

@@ -31,6 +31,7 @@ std::map<std::string,
                           std::shared_ptr<std::unordered_map<
                               std::string, std::shared_ptr<ngraph::Node>>>)>>
     NgraphBridge::NG_NODE_MAP = {
+        {"accuracy", NG_OPS::BuildAccuracyNode},
         {"conv2d", NG_OPS::BuildConv2dNode},
         {"conv2d_grad", NG_OPS::BuildConv2dGradNode},
         {"elementwise_add", NG_OPS::BuildElementwiseAddNode},

@@ -21,7 +21,8 @@ limitations under the License. */
 #pragma once

-#include "ops/binary_unnary_op.h"
+#include "ops/accuracy_op.h"
+#include "ops/binary_unary_op.h"
 #include "ops/conv2d_op.h"
 #include "ops/elementwise_add_op.h"
 #include "ops/fill_constant_op.h"

Some files were not shown because too many files have changed in this diff.