Compare commits

62 Commits

Author SHA1 Message Date
swtkiwi 13e906c162 delete matplotlib
6 years ago
danleifeng 3f8746842b
nccl init support hostname and ip; test=develop (#28163)
6 years ago
Wilber c008ae696a
update. (#28166)
6 years ago
lidanqing 56aba4b747
add FLAGS_use_mkldnn check (#28147)
6 years ago
wangguanzhong 86887b9b1f
Cherry pick fix generate proposals labels (#28165)
6 years ago
wangguanzhong 5178d9f832
support multiclass nms for multi-batch, test=develop (#28164)
6 years ago
liu zhengxi 5d94a5ca54
fix dynamic decode imperative (#28162)
6 years ago
李灿 4193ae0d02
fix bugs test=develop (#28158)
6 years ago
Zhou Wei dc53ba879f
fix dynamic_loader more safe and error message on windows (#28144)
6 years ago
WangXi 739043c6a6
【paddle.fleet】fleet add _get_applied_meta_list and _get_applied_graph_list, test=develop (#27952) (#28053)
6 years ago
Double_V 39745d448c
Roi align kl (#28128)
6 years ago
LielinJiang 172e76a852
transform add pil backend (#28132)
6 years ago
tianshuo78520a f63f8d730c
Add build paddle inference (#28137)
6 years ago
李灿 2477ccc378
fix bugs test=develop (#28113)
6 years ago
danleifeng 61162497b4
raise error if use multi-cards in fleet non_distributed mode;test=develop (#28093)
6 years ago
LielinJiang d4160941f3
fix dataloader (#28105)
6 years ago
yinhaofeng 0b294906f9
lookup_table_v2_op_xpu report errors;test=kunlun (#28064) (#28100)
6 years ago
yinhaofeng ea45fb90a4
【cherry-pick】xpu adam op (#28031) (#28097)
6 years ago
Double_V 4f43d51f43
add rois_num params for roi_align_xpu op, test=kunlun (#28094)
6 years ago
lilong12 c4e18dc093
build gloo from source code instead of using the pre-compiled library (#27930) (#28009)
6 years ago
Kaipeng Deng 6375ad39cc
[cherry-pick] lr scheduler epoch2step (#28056)
6 years ago
TeslaZhao 11adb0f373
[cherry-pick] Add xpu transpose2 op.test=kunlun (#28096)
6 years ago
lilong12 957e6fbe5f
add doc for ReduceOp (#28051) (#28082)
6 years ago
Chengmo 91727ac899
Fix xpu error message (#28061) (#28092)
6 years ago
liuyuhui 5c2852a330
[API 2.0: doc] transfer from paddle.fluid.layers.assign() into creation.py (#27999) (#28074)
6 years ago
huangxu96 6bb6cb27b8
Allclose op (#27891) (#28069)
6 years ago
xiaoting 905b076553
rm max_input in conv2d for kunlun, test=kunlun (#28063)
6 years ago
Double_V 8600f47439
error message opt for XPU, test=kunlun (#27972) (#28078)
6 years ago
Pei Yang d89deae9e0
reduce trt warning message (#28011) (#28075)
6 years ago
lilong12 316c97c7eb
put gloo initialization log to file (#27969) (#28076)
6 years ago
pangyoki 46a1f69b3c
cherry pick 27861 Add truncated_gaussian_random XPU kernel, test=kunlun (#28060)
6 years ago
pangyoki b21409e0c0
cherry pick 27853 Add gaussian_random XPU kernels, test=kunlun (#28059)
6 years ago
pangyoki 69ec13cdf0
cherry pick 27846 Add uniform_random XPU kernel, test=kunlun (#28057)
6 years ago
pangyoki 386429beb0
cherry pick 27946 Fix error message of multinomial op (#28080)
6 years ago
MRXLT e3513a6395
[cherry pick] Fix fleet (#28067)
6 years ago
LutaoChu e3a88eb450
Fix diag OP bug on Windows Python3.8, cherry-pick from #28034
6 years ago
littletomatodonkey edbaa027fe
fix pad api (#28045)
6 years ago
liuyuhui 77eddf9168
add cast/concat/assign xpu op (#27911) (#28050)
6 years ago
Kaipeng Deng f05b184f72
update yolo_box support h != w. test=develop (#28054)
6 years ago
Zhou Wei e9a1669e92
fix optimizer init (#27994)
6 years ago
xiaoting 5c1babde56
[cherry-pick] polish kunlun error message for 2.0 rc (#28048)
6 years ago
Guo Sheng 3f565903e8
[cherry-pick] Incorporate cudnn_lstm into LSTM api (#27217) (#28023)
6 years ago
Jack Zhou a6c18075d6
add xpu error message description in some ops
6 years ago
chentianyu03 2ca53fa65f
change paddle.fluid.layers.reduce_sum to paddle.sum in sample codes (#27998) (#28017)
6 years ago
LiuChiachi 4316bd4d5a
Clean text.py and decode.py for API 2.0 (#26853) (#27958)
6 years ago
lidanqing ea76fe310c
[oneDNN] Conv dilation support (#27914) (#28028)
6 years ago
tangwei12 c0550b54a5
Feature/large scale kv save base/delta (#27470) (#27990)
6 years ago
Aurelius84 772c266822
[Dy2Stat] Fix Error when generating train_program in eval mode (#27975) (#28019)
6 years ago
chentianyu03 77866effeb
change paddle.fluid.data to paddle.static.data in sample code (#27992) (#28014)
6 years ago
chentianyu03 cafa35e1ea
Change reduce mean (#27997) (#28004)
6 years ago
chentianyu03 3251f9c1eb
change paddle.fluid.layers.fill_constant to paddle.full in sample codes (#27993) (#28000)
6 years ago
Steffy-zxf 164b9aabc1
update code examples for paddle.sums
6 years ago
mapingshuo 50d24899cf
fix kunlun kernel of reshape op (#27989)
6 years ago
Zhou Wei b57254ed61
[cherry-pick2.0]Add tensor clone 2.0 (#27982)
6 years ago
123malin c0061ff56f
【paddle.fleet】geo send sparse optimize (#27719) (#27979)
6 years ago
Guanghua Yu 51dd268cfe
error message optimization in mean_xpu,softmax_with_cross_entropy_op_xpu,test=kunlun (#27968)
6 years ago
Feiyu Chan 429c0b62b1
support channel last in BatchNorm*d (#27961)
6 years ago
mapingshuo 39c31a20e5
reshape support bool, test=develop (#27944) (#27971)
6 years ago
Qinghe JING 1f45c06e92
add reduce xpu op test=develop;test=kunlun (#27960)
6 years ago
Yiqun Liu ef2b12f11c
Reimplement paddle.utils.install_check. (#27771) (#27963)
6 years ago
tianshuo78520a b7fd4f9224
fix norm code format error (#27973)
6 years ago
Thunderbrook ba6a29070a
fix slice doc (#27941) (#27976)
6 years ago

@@ -193,10 +193,19 @@ if(WITH_BRPC_RDMA)
endif()
endif()
# lite subgraph compilation depends on CUDNN_ROOT,
# so include(cudnn) needs to be in front of include(third_party/lite)
include(cudnn) # set cudnn libraries, must be before configure
include(third_party) # download, build, install third_party
if(WITH_GPU)
include(cuda)
# lite subgraph compilation depends on CUDNN_ROOT,
# so include(cudnn) needs to be in front of include(third_party/lite)
include(cudnn) # set cudnn libraries, must be before configure
include(tensorrt)
# there is no official support of nccl, cupti in windows
if(NOT WIN32)
include(cupti)
endif()
endif()
include(third_party) # download, build, install third_party; contains 20+ dependencies
if(WITH_DISTRIBUTE)
if(WITH_GRPC)
@@ -209,18 +218,8 @@ if(WITH_DISTRIBUTE)
endif()
endif()
# there is no official support of nccl, cupti in windows
if(NOT WIN32)
include(cupti)
endif()
include(flags) # set paddle compile flags
if(WITH_GPU)
include(cuda)
include(tensorrt)
endif()
if(WITH_PROFILER)
find_package(Gperftools REQUIRED)
include_directories(${GPERFTOOLS_INCLUDE_DIR})

@@ -198,7 +198,9 @@ elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.x
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
endif()
add_definitions("-DPADDLE_CUDA_BINVER=\"${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}\"")
add_definitions("-DCUDA_VERSION_MAJOR=\"${CUDA_VERSION_MAJOR}\"")
add_definitions("-DCUDA_VERSION_MINOR=\"${CUDA_VERSION_MINOR}\"")
add_definitions("-DCUDA_TOOLKIT_ROOT_DIR=\"${CUDA_TOOLKIT_ROOT_DIR}\"")
# setting nvcc arch flags
select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
@@ -249,3 +251,4 @@ endif()
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)

@@ -35,17 +35,18 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
)
set(CUDNN_LIB_NAME "")
if (LINUX)
set(CUDNN_LIB_NAME "libcudnn.so")
set(CUDNN_LIB_NAME "libcudnn.so")
endif(LINUX)
if(WIN32)
# only support cudnn7
set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll")
# only support cudnn7
set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll")
endif(WIN32)
if(APPLE)
set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so")
set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so")
endif(APPLE)
find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a
@@ -88,7 +89,7 @@ macro(find_cudnn_version cudnn_header_file)
if(NOT CUDNN_MAJOR_VERSION)
set(CUDNN_VERSION "???")
else()
add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"")
add_definitions("-DCUDNN_MAJOR_VERSION=\"${CUDNN_MAJOR_VERSION}\"")
math(EXPR CUDNN_VERSION
"${CUDNN_MAJOR_VERSION} * 1000 +
${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
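
The math() above packs the cuDNN version into one integer as major * 1000 + minor * 100 + patchlevel, so 7.6.5 becomes 7605. A minimal C++ sketch of the round trip (illustrative only, not Paddle code):

#include <cstdio>

int main() {
  // encode 7.6.5 the same way the CMake math() above does
  int version = 7 * 1000 + 6 * 100 + 5;  // -> 7605
  // decode by reversing the arithmetic
  std::printf("%d.%d.%d\n", version / 1000, (version % 1000) / 100,
              version % 100);
  return 0;
}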

@@ -14,55 +14,40 @@
INCLUDE(ExternalProject)
execute_process(COMMAND bash -c "gcc -dumpversion" OUTPUT_VARIABLE GCC_VERSION)
SET(GLOO_PROJECT "extern_gloo")
IF((NOT DEFINED GLOO_VER) OR (NOT DEFINED GLOO_URL))
MESSAGE(STATUS "use pre defined download url")
SET(GLOO_VER "master" CACHE STRING "" FORCE)
SET(GLOO_NAME "gloo" CACHE STRING "" FORCE)
if(${GCC_VERSION} VERSION_EQUAL "8.2.0")
SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc8" CACHE STRING "" FORCE)
else()
SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc482" CACHE STRING "" FORCE)
endif()
ENDIF()
MESSAGE(STATUS "GLOO_NAME: ${GLOO_NAME}, GLOO_URL: ${GLOO_URL}")
SET(GLOO_SOURCE_DIR "${THIRD_PARTY_PATH}/gloo")
SET(GLOO_DOWNLOAD_DIR "${GLOO_SOURCE_DIR}/src/${GLOO_PROJECT}")
SET(GLOO_DST_DIR "gloo")
SET(GLOO_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(GLOO_INSTALL_DIR ${GLOO_INSTALL_ROOT}/${GLOO_DST_DIR})
SET(GLOO_ROOT ${GLOO_INSTALL_DIR})
SET(GLOO_INC_DIR ${GLOO_ROOT}/include)
SET(GLOO_LIB_DIR ${GLOO_ROOT}/lib)
SET(GLOO_LIB ${GLOO_LIB_DIR}/libgloo.a)
#SET(GLOO_IOMP_LIB ${GLOO_LIB_DIR}/libiomp5.so) #todo what is this
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${GLOO_ROOT}/lib")
INCLUDE_DIRECTORIES(${GLOO_INC_DIR})
FILE(WRITE ${GLOO_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(GLOO)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY ${GLOO_NAME}/include ${GLOO_NAME}/lib \n"
" DESTINATION ${GLOO_DST_DIR})\n")
SET(GLOO_PREFIX_DIR ${THIRD_PARTY_PATH}/gloo)
SET(GLOO_SOURCE_DIR ${THIRD_PARTY_PATH}/gloo/src/extern_gloo/gloo)
SET(GLOO_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gloo)
SET(GLOO_INCLUDE_DIR "${GLOO_INSTALL_DIR}/include" CACHE PATH "gloo include directory." FORCE)
SET(GLOO_LIBRARY_DIR "${GLOO_INSTALL_DIR}/lib" CACHE PATH "gloo library directory." FORCE)
SET(GLOO_REPOSITORY https://github.com/sandyhouse/gloo.git)
SET(GLOO_TAG v0.0.2)
SET(GLOO_LIBRARIES "${GLOO_INSTALL_DIR}/lib/libgloo.a" CACHE FILEPATH "gloo library." FORCE)
INCLUDE_DIRECTORIES(${GLOO_INCLUDE_DIR})
cache_third_party(extern_gloo
REPOSITORY ${GLOO_REPOSITORY}
TAG ${GLOO_TAG}
DIR GLOO_SOURCE_DIR)
ExternalProject_Add(
${GLOO_PROJECT}
extern_gloo
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${GLOO_SOURCE_DIR}
DOWNLOAD_DIR ${GLOO_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${GLOO_URL} -c -q -O ${GLOO_NAME}.tar.gz
&& tar zxvf ${GLOO_NAME}.tar.gz
DOWNLOAD_NO_PROGRESS 1
${SHALLOW_CLONE}
"${GLOO_DOWNLOAD_CMD}"
PREFIX "${GLOO_PREFIX_DIR}"
SOURCE_DIR "${GLOO_SOURCE_DIR}"
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOO_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOO_INSTALL_ROOT}
CONFIGURE_COMMAND ""
BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build
&& cd ${GLOO_SOURCE_DIR}/build && cmake .. && make
&& mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo"
)
ADD_LIBRARY(gloo SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIB})
ADD_LIBRARY(gloo STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIBRARIES})
ADD_DEPENDENCIES(gloo ${GLOO_PROJECT})

@@ -22,7 +22,7 @@ if(XPU_SDK_ROOT)
set(LITE_WITH_XPU ON)
include_directories("${XPU_SDK_ROOT}/XTDK/include")
include_directories("${XPU_SDK_ROOT}/XTCL/include")
add_definitions(-DPADDLE_WITH_XPU)
add_definitions(-DLITE_SUBGRAPH_WITH_XPU)
LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/shlib/")
LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/runtime/shlib/")
endif()

@@ -84,19 +84,6 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
VLOG(3) << "do not perform " + type() + "+bias fuse";
return;
}
if (conv->Op()->HasAttr("dilations")) {
auto dilations =
BOOST_GET_CONST(std::vector<int>, conv->Op()->GetAttr("dilations"));
for (const auto& d : dilations) {
if (d != 1) {
LOG(WARNING)
<< "dilation conv not supported in MKLDNN, fuse not apply "
<< "and set conv attribute use_mkldnn = false";
conv->Op()->SetAttr("use_mkldnn", false);
return;
}
}
}
auto* eltwise_bias_tensor =
scope->FindVar(eltwise_bias->Name())->GetMutable<LoDTensor>();

@@ -84,6 +84,12 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
}
#endif
#ifdef PADDLE_WITH_CUDA
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr,
size);
}
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
@@ -285,6 +291,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
}
#endif
#ifdef PADDLE_WITH_CUDA
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr,
size);
}
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
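
Both hunks add the missing pinned-to-pinned branch: a copy whose source and destination are both page-locked host memory. A rough CUDA-runtime equivalent of that branch (illustrative only; Paddle routes this through its own memory::Copy overloads):

#include <cuda_runtime.h>
#include <cstddef>

// Pinned memory lives on the host, so pinned -> pinned is a host-to-host copy.
cudaError_t CopyPinnedToPinned(void* dst, const void* src, size_t bytes) {
  return cudaMemcpy(dst, src, bytes, cudaMemcpyHostToHost);
}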

@@ -100,7 +100,19 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep,
serv_addr.sin_family = AF_INET;
serv_addr.sin_port = htons(port);
if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0) {
char *ip = NULL;
struct hostent *hp;
if ((hp = gethostbyname(host.c_str())) == NULL) {
PADDLE_THROW(platform::errors::InvalidArgument(
"Fail to get host by name %s.", host));
}
int i = 0;
while (hp->h_addr_list[i] != NULL) {
ip = inet_ntoa(*(struct in_addr *)hp->h_addr_list[i]);
VLOG(3) << "gethostbyname host:" << host << " ->ip: " << ip;
break;
}
if (inet_pton(AF_INET, ip, &serv_addr.sin_addr) <= 0) {
PADDLE_THROW(platform::errors::Unavailable("Open address %s failed.", ep));
}

@@ -16,6 +16,7 @@
// network header files
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include <arpa/inet.h>
#include <netdb.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <sys/socket.h>
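
The change above lets SendNCCLID accept either an IPv4 literal or a hostname: when inet_pton fails, the host is resolved with gethostbyname and the first returned address is used. A self-contained sketch of that fallback, assuming a POSIX system (ResolveHost is a hypothetical helper, not Paddle's API):

#include <arpa/inet.h>
#include <netdb.h>
#include <netinet/in.h>
#include <cstring>
#include <string>

// Returns true and fills *addr if `host` is an IPv4 literal or a resolvable name.
bool ResolveHost(const std::string& host, struct in_addr* addr) {
  if (inet_pton(AF_INET, host.c_str(), addr) > 0) {
    return true;  // already dotted-quad, e.g. "127.0.0.1"
  }
  // fallback mirrors the patch: gethostbyname, first A record wins
  struct hostent* hp = gethostbyname(host.c_str());
  if (hp == nullptr || hp->h_addr_list[0] == nullptr) {
    return false;  // name lookup failed
  }
  std::memcpy(addr, hp->h_addr_list[0], sizeof(struct in_addr));
  return true;
}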

@@ -20,6 +20,8 @@
#include "paddle/fluid/imperative/infer_shape_context.h"
#include "paddle/fluid/imperative/infer_var_type_context.h"
DECLARE_bool(use_mkldnn);
namespace paddle {
namespace imperative {
@@ -91,8 +93,10 @@ PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
// MKLDNN variant of code reads attributes in some of GetKernelTypeForVar and
// GetKernelType functions, so we need to copy the attributes there.
// Const qualifier of Attrs had to be discarded to overwrite it.
auto& mutable_op_attrs = const_cast<framework::AttributeMap&>(op.Attrs());
mutable_op_attrs = attrs;
if (FLAGS_use_mkldnn) {
auto& mutable_op_attrs = const_cast<framework::AttributeMap&>(op.Attrs());
mutable_op_attrs = attrs;
}
#endif
auto expected_kernel_key =
op.GetExpectedKernelType(DygraphExecutionContext<VarType>(
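
DECLARE_bool(use_mkldnn) imports a gflags flag defined elsewhere, and the attribute copy is now guarded so the const_cast write only happens when oneDNN is actually enabled. A minimal sketch of that gflags declare/define split, assuming gflags is available (standalone, not Paddle's files):

#include <gflags/gflags.h>

DEFINE_bool(use_mkldnn, false, "Enable the oneDNN (MKL-DNN) code path.");
// In other translation units: DECLARE_bool(use_mkldnn);

void MaybeCopyAttrs() {
  if (FLAGS_use_mkldnn) {
    // copy operator attributes only when the oneDNN path is active
  }
}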

@@ -20,7 +20,7 @@ namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
imperative::ParallelStrategy GetStrategy(int local_rank) {
std::vector<std::string> eps = {"127.0.0.1:9866", "127.0.0.1:9867"};
std::vector<std::string> eps = {"127.0.0.1:9866", "localhost:9867"};
imperative::ParallelStrategy strategy;
strategy.trainer_endpoints_ = eps;
strategy.current_endpoint_ = eps[local_rank];

@@ -375,7 +375,7 @@ void AnalysisConfig::Update() {
}
if (use_xpu_) {
#ifndef PADDLE_WITH_XPU
#ifndef LITE_SUBGRAPH_WITH_XPU
PADDLE_THROW(platform::errors::Unavailable(
"You tried to use an XPU device, but Paddle was not compiled "
"with XPU-runtime."));

@@ -4,6 +4,6 @@ endif()
cc_library(lite_op_teller SRCS op_teller.cc DEPS lite_full_static framework_proto device_context boost xxhash)
cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto ${XPU_DEPS})
cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost device_context)
cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost device_context ${XPU_DEPS})
cc_test(test_lite_engine SRCS test_engine.cc DEPS lite_engine protobuf framework_proto glog gtest analysis)
cc_test(test_lite_tensor_utils SRCS test_tensor_utils.cc DEPS lite_engine lite_tensor_utils)

@@ -16,7 +16,7 @@
#define LITE_WITH_CUDA 1
#endif
#ifdef PADDLE_WITH_XPU
#ifdef LITE_SUBGRAPH_WITH_XPU
#define LITE_WITH_XPU 1
#endif
@@ -59,7 +59,7 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create(
cfg.cpu_math_library_num_threads);
#endif
#ifdef PADDLE_WITH_XPU
#ifdef LITE_SUBGRAPH_WITH_XPU
lite_cxx_config.set_xpu_workspace_l3_size_per_thread(
cfg.xpu_l3_workspace_size);
#endif

@@ -13,12 +13,49 @@
// limitations under the License.
#include "paddle/fluid/operators/allclose_op.h"
#include <cmath>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
template <typename T>
struct GetTensorValue<platform::CPUDeviceContext, T> {
T operator()(const platform::CPUDeviceContext& dev_ctx,
const framework::Tensor& tensor) const {
return *(tensor.data<T>());
}
};
template <typename T>
struct AllcloseFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& other,
const double rtol, const double atol, bool equal_nan,
framework::Tensor* output) {
auto* in_a = in.data<T>();
auto* in_b = other.data<T>();
auto* out_data = output->mutable_data<bool>(ctx.GetPlace());
auto num = in.numel();
*out_data = true;
for (int i = 0; i < num; i++) {
const T a = in_a[i], b = in_b[i];
bool val;
if (std::isnan(a) || std::isnan(b)) {
val = equal_nan && std::isnan(a) == std::isnan(b);
} else {
T left = (a > b ? a - b : b - a);
T right = atol + (b > 0 ? rtol * b : (-rtol) * b);
T diff = (left > right ? left - right : right - left);
val = a == b || left <= right || diff <= 1e-15;
}
*out_data &= val;
}
}
};
class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
@@ -26,12 +63,9 @@ class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker {
"The input tensor, it's data type should be float32, float64.");
AddInput("Other",
"The input tensor, it's data type should be float32, float64.");
AddInput("Rtol", "The relative tolerance.");
AddInput("Atol", "The absolute tolerance.");
AddOutput("Out", "The output tensor, it's data type is bool.");
AddAttr<float>("rtol", "The relative tolerance. Default: :math:`1e-5` .")
.SetDefault(1e-5);
AddAttr<float>("atol", "The absolute tolerance. Default: :math:`1e-8` .")
.SetDefault(1e-8);
AddAttr<bool>("equal_nan",
"If :math:`True` , then two :math:`NaNs` will be "
"compared as equal. Default: :math:`False` .")
@@ -54,16 +88,12 @@
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true,
platform::errors::NotFound(
"Input(Input) of allclose op should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Other"), true,
platform::errors::NotFound(
"Input(Other) of allclose op should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::NotFound(
"The output(Out) of allclose op must not be null."));
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Allclose");
OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Allclose");
OP_INOUT_CHECK(ctx->HasInput("Rtol"), "Input", "Rtol", "Allclose");
OP_INOUT_CHECK(ctx->HasInput("Atol"), "Input", "Atol", "Allclose");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Allclose");
auto input_dim = ctx->GetInputDim("Input");
auto other_dim = ctx->GetInputDim("Other");
@@ -96,7 +126,7 @@ class AllcloseOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "Input"),
ctx.device_context());
@@ -105,7 +135,7 @@ class AllcloseOp : public framework::OperatorWithKernel {
class AllcloseOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext *ctx) const override {
void operator()(framework::InferVarTypeContext* ctx) const override {
ctx->SetOutputDataType("Out", framework::proto::VarType::BOOL);
}
};

@@ -12,12 +12,70 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#define EIGEN_USE_GPU
#include <cuda_runtime.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/allclose_op.h"
namespace paddle {
namespace operators {
template <typename T>
struct GetTensorValue<platform::CUDADeviceContext, T> {
T operator()(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor& tensor) const {
const T* data = tensor.data<T>();
T value;
const auto gpu_place =
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T),
dev_ctx.stream());
return value;
}
};
template <typename T>
__global__ void AllcloseCUDAKernel(const T* in_data, const T* other_data,
const double rtol, const double atol,
bool equal_nan, int num, bool* out_data) {
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
bool val;
for (int i = idx; i < num; i += blockDim.x * gridDim.x) {
const T a = in_data[i], b = other_data[i];
if (isnan(a) || isnan(b)) {
val = equal_nan && isnan(a) == isnan(b);
} else {
T left = (a > b ? a - b : b - a);
T right = atol + (b > 0 ? rtol * b : (-rtol) * b);
T diff = (left > right ? left - right : right - left);
val = a == b || left <= right || diff <= 1e-15;
}
if (!val) *out_data = false;
}
}
template <typename T>
struct AllcloseFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor& in, const framework::Tensor& other,
const double rtol, const double atol, bool equal_nan,
framework::Tensor* output) {
int num = in.numel();
const T* in_data = in.data<T>();
const T* other_data = other.data<T>();
bool* out_data = output->mutable_data<bool>(dev_ctx.GetPlace());
int block = 1024;
int grid = (block - 1 + num) / block;
grid = (grid > block) ? block : grid;
cudaMemset(out_data, true, sizeof(bool));
AllcloseCUDAKernel<T><<<grid, block, 0, dev_ctx.stream()>>>(
in_data, other_data, rtol, atol, equal_nan, num, out_data);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(allclose, ops::AllcloseKernel<CUDA, float>,

@@ -22,38 +22,38 @@ namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
struct GetTensorValue {
T operator()(const platform::DeviceContext& ctx,
const framework::Tensor& tensor) const;
};
template <typename DeviceContext, typename T>
struct AllcloseFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in,
const framework::Tensor& other, const float rtol,
const float atol, bool equal_nan, framework::Tensor* output);
};
template <typename DeviceContext, typename T>
class AllcloseKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
// get attrs
float rtol = ctx.Attr<float>("rtol");
float atol = ctx.Attr<float>("atol");
bool equal_nan = ctx.Attr<bool>("equal_nan");
// get input/output
auto* input = ctx.Input<Tensor>("Input");
auto* other = ctx.Input<Tensor>("Other");
const auto* input = ctx.Input<Tensor>("Input");
const auto* other = ctx.Input<Tensor>("Other");
const auto* rtol = ctx.Input<Tensor>("Rtol");
const auto* atol = ctx.Input<Tensor>("Atol");
auto* out = ctx.Output<Tensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
// get place
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto input_v = framework::EigenVector<T>::Flatten(*input);
auto other_v = framework::EigenVector<T>::Flatten(*other);
auto out_v = framework::EigenScalar<bool>::From(*out);
auto left = (input_v - other_v).abs();
auto right = static_cast<T>(atol) + static_cast<T>(rtol) * other_v.abs();
auto compare_res = left <= right;
if (equal_nan) {
auto input_nan = input_v.isnan();
auto other_nan = other_v.isnan();
out_v.device(place) =
(input_nan == other_nan).all() && (compare_res != input_nan).all();
} else {
out_v.device(place) = compare_res.all();
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
GetTensorValue<DeviceContext, double> get_tensor_value;
double rtol_v = get_tensor_value(dev_ctx, *rtol);
double atol_v = get_tensor_value(dev_ctx, *atol);
AllcloseFunctor<DeviceContext, T>()(dev_ctx, *input, *other, rtol_v, atol_v,
equal_nan, out);
}
};
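
The kernel now reads rtol and atol from input tensors (fetched to the host by GetTensorValue) and defers the elementwise check to AllcloseFunctor. A host-side reference of the predicate the functors implement, |a - b| <= atol + rtol * |b| with optional NaN equality (a minimal sketch; the extra diff <= 1e-15 slack in the device kernels is omitted here):

#include <cmath>
#include <cstddef>
#include <vector>

bool AllcloseRef(const std::vector<double>& a, const std::vector<double>& b,
                 double rtol, double atol, bool equal_nan) {
  if (a.size() != b.size()) return false;
  for (size_t i = 0; i < a.size(); ++i) {
    if (std::isnan(a[i]) || std::isnan(b[i])) {
      // NaNs compare equal only when equal_nan is set and both sides are NaN
      if (!(equal_nan && std::isnan(a[i]) && std::isnan(b[i]))) return false;
    } else if (std::fabs(a[i] - b[i]) > atol + rtol * std::fabs(b[i])) {
      return false;  // outside the combined absolute/relative tolerance
    }
  }
  return true;
}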

@@ -54,7 +54,7 @@ class AssignFunctor {
out_rows.set_height(rows.height());
auto &t = rows.value();
auto *m = out_rows.mutable_value();
framework::TensorCopy(t, dev_ctx_.GetPlace(), dev_ctx_, m);
framework::TensorCopy(t, t.place(), m);
}
template <typename T>
@@ -70,7 +70,7 @@ class AssignFunctor {
framework::LoDTensor *out) const {
if (lod_tensor.numel() == 0) return;
auto &out_tensor = *out;
TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor);
TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor);
out_tensor.set_lod(lod_tensor.lod());
}

@@ -0,0 +1,161 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/assign_op.h"
#include <string>
namespace paddle {
namespace framework {
class OpDesc;
class Variable;
} // namespace framework
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct CPUPlace;
struct CUDAPlace;
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
class AssignOp : public framework::OperatorWithKernel {
public:
AssignOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
if (ctx->HasInput("X")) {
auto type = ctx->GetInputsVarType("X")[0];
if (type == framework::proto::VarType::SELECTED_ROWS ||
type == framework::proto::VarType::LOD_TENSOR) {
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
if (type == framework::proto::VarType::LOD_TENSOR) {
ctx->ShareLoD("X", /*->*/ "Out");
}
} else if (type == framework::proto::VarType::LOD_TENSOR_ARRAY) {
if (ctx->IsRuntime()) {
// The runtime output shape is determined in kernel.
return;
} else {
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
}
}
}
}
protected:
framework::OpKernelType GetKernelTypeForVar(
const std::string &var_name, const framework::Tensor &tensor,
const framework::OpKernelType &expected_kernel_type) const override {
return framework::OpKernelType(expected_kernel_type.data_type_,
expected_kernel_type.place_,
tensor.layout());
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
const framework::Variable *var = ctx.InputVar("X");
if (var->IsType<framework::LoDTensorArray>()) {
auto t_arr = var->Get<framework::LoDTensorArray>();
// NOTE(liym27): Support an empty tensor array as Input.
// And set the kernel type to float.
if (t_arr.size() == 0) {
return framework::OpKernelType(framework::proto::VarType::FP32,
ctx.device_context());
}
}
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"),
ctx.device_context());
}
};
class AssignInferVarType : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext *ctx) const override {
ctx->SyncTypeAndDataType("X", "Out");
}
};
class AssignKernel {
public:
void operator()(const framework::ExecutionContext &ctx) const {
auto *x = ctx.InputVar("X");
if (x == nullptr) {
return;
}
PADDLE_ENFORCE_EQ(
ctx.HasOutput("Out"), true,
platform::errors::NotFound("Output(Out) of assign_op is not found."));
auto *out = ctx.OutputVar("Out");
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(ctx.GetPlace());
framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
}
};
class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(LoDTensor, SelectedRows or LoDTensorArray) The input variable "
"could be LoDTensor, SelectedRows or LoDTensorArray.")
.AsDispensable();
AddOutput("Out",
"(LoDTensor, SelectedRows or LoDTensorArray) The type of output "
"is the same as input X.");
AddComment(R"DOC(Assign Operator
Out = X, when type in [LoDTensor/SelectedRows/LoDTensorArray]
raise error if the type is not listed above.
)DOC");
}
};
template <typename T>
class AssignGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("assign");
op->SetInput("X", this->OutputGrad("Out"));
op->SetOutput("Out", this->InputGrad("X"));
}
};
DECLARE_INPLACE_OP_INFERER(AssignOpInplaceInferer, {"X", "Out"});
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_XPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double,
ops::AssignKernel, int, ops::AssignKernel,
int64_t, ops::AssignKernel, bool,
ops::AssignKernel);
#endif

@@ -19,6 +19,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/layout_utils.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/norm_utils.h"
@@ -41,127 +42,6 @@ template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename DeviceContext, typename T>
inline void ResizeToChannelFirst(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[4];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
in_dims_vec[4] = input->dims()[3];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[3];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 1) {
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelFirst(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 4, 1, 2, 3};
math::Transpose<DeviceContext, T, 5> trans5;
trans5(dev_ctx, *input, transformed_input, axis);
} else if (dim == 2) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 3, 1, 2};
math::Transpose<DeviceContext, T, 4> trans4;
trans4(dev_ctx, *input, transformed_input, axis);
} else if (dim == 1) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 2, 1};
math::Transpose<DeviceContext, T, 3> trans3;
trans3(dev_ctx, *input, transformed_input, axis);
}
}
template <typename DeviceContext, typename T>
inline void ResizeToChannelLast(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[4];
in_dims_vec[4] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 2) {
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 1) {
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelLast(const framework::ExecutionContext& context,
const Tensor* input, Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 2, 3, 4, 1};
math::Transpose<DeviceContext, T, 5> trans5;
trans5(dev_ctx, *input, transformed_input, axis);
} else if (dim == 2) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 2, 3, 1};
math::Transpose<DeviceContext, T, 4> trans4;
trans4(dev_ctx, *input, transformed_input, axis);
} else if (dim == 1) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 2, 1};
math::Transpose<DeviceContext, T, 3> trans3;
trans3(dev_ctx, *input, transformed_input, axis);
}
}
class BatchNormOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;

@@ -0,0 +1,69 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/cast_op.h"
#include <memory>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename InT>
class CastXPUKernel : public framework::OpKernel<InT> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
auto in_type = static_cast<framework::proto::VarType::Type>(
context.Attr<int>("in_dtype"));
auto out_type = static_cast<framework::proto::VarType::Type>(
context.Attr<int>("out_dtype"));
auto* in_data = in->data<InT>();
auto numel = in->numel();
auto& dev_ctx = context.template device_context<DeviceContext>();
int r = -1;
if (out_type == framework::proto::VarType::FP32) {
auto* out_data = out->mutable_data<float>(context.GetPlace());
r = xpu::cast<InT, float>(dev_ctx.x_context(), in_data, out_data, numel);
} else if (out_type == framework::proto::VarType::INT32) {
auto* out_data = out->mutable_data<int>(context.GetPlace());
r = xpu::cast<InT, int>(dev_ctx.x_context(), in_data, out_data, numel);
} else if (out_type == framework::proto::VarType::INT64) {
auto* out_data = out->mutable_data<int64_t>(context.GetPlace());
r = xpu::cast<InT, int64_t>(dev_ctx.x_context(), in_data, out_data,
numel);
} else {
PADDLE_THROW(platform::errors::Unavailable("Not supported cast %d -> %d",
in_type, out_type));
}
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
cast, ops::CastXPUKernel<paddle::platform::XPUDeviceContext, int>,
ops::CastXPUKernel<paddle::platform::XPUDeviceContext, float>,
ops::CastXPUKernel<paddle::platform::XPUDeviceContext, int64_t>);
#endif

@@ -67,6 +67,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
}
auto in_tensors = context.MultiInput<framework::LoDTensor>("Input");
bool use_align = context.Attr<bool>("use_align");
if (context.Attr<bool>("check_name")) {
for (size_t i = 0; i < in_var_names.size(); ++i) {
@@ -93,7 +94,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
context.Attr<int>("dtype"));
size_t size_of_dtype = framework::SizeOfType(dtype);
GetMemSizeAndDtype(in_tensors, in_var_names, &numel, size_of_dtype,
context.GetPlace());
context.GetPlace(), use_align);
// Alloc the continuous space
auto fused_tensor = context.Output<framework::LoDTensor>("FusedOutput");
@@ -111,8 +112,11 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx,
&sub_tensor);
offset += platform::Alignment(len * size_of_dtype, context.GetPlace()) /
size_of_dtype;
offset +=
use_align
? platform::Alignment(len * size_of_dtype, context.GetPlace()) /
size_of_dtype
: len;
}
} else if (context.Attr<bool>("set_constant")) {
math::SetConstant<DeviceContext, T> set_constant;
@@ -131,8 +135,10 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
->ShareDataWith(fused_tensor->Slice(
static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
.Resize(dim);
len = platform::Alignment(len * size_of_dtype, context.GetPlace()) /
size_of_dtype;
len = use_align
? platform::Alignment(len * size_of_dtype, context.GetPlace()) /
size_of_dtype
: len;
offset += len;
ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")"
<< " address: " << out_tensors[i]->data<void>() << ", ";
@@ -144,7 +150,8 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
void GetMemSizeAndDtype(
const std::vector<const framework::LoDTensor *> &lod_tensors,
const std::vector<std::string> var_names, size_t *numel,
const size_t &size_of_dtype, const platform::Place &place) const {
const size_t &size_of_dtype, const platform::Place &place,
const bool use_align = true) const {
PADDLE_ENFORCE_EQ(
lod_tensors.size(), var_names.size(),
platform::errors::InvalidArgument(
@@ -167,9 +174,11 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
<< ") "
<< " addres:" << lod_tensors[i]->data<void>() << ", ";
*numel += platform::Alignment(static_cast<size_t>(size) * size_of_dtype,
place) /
size_of_dtype;
*numel += use_align
? platform::Alignment(
static_cast<size_t>(size) * size_of_dtype, place) /
size_of_dtype
: static_cast<size_t>(size);
}
VLOG(10) << ss.str();
@@ -223,6 +232,10 @@ class CoalesceTensorOpMaker : public framework::OpProtoAndCheckerMaker {
"Whether to check the name of Input and Output to ensure "
"they are the same separately.")
.SetDefault(false);
AddAttr<bool>("use_align",
"Whether to consider memory chunk and take alignment into "
"account for inputs and outputs.")
.SetDefault(true);
AddComment(R"DOC(
CoalesceTensor Operator.
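
With use_align on (the default), each chunk's length is rounded up so every fused sub-tensor starts on a platform-specific boundary; with it off, tensors pack back to back. A sketch of that arithmetic (the 256-byte alignment is a hypothetical stand-in for platform::Alignment):

#include <cstddef>

// Length, in elements, that one tensor occupies inside the fused buffer.
size_t ChunkLen(size_t numel, size_t size_of_dtype, bool use_align,
                size_t alignment = 256) {
  if (!use_align) return numel;  // pack tensors contiguously
  size_t bytes = numel * size_of_dtype;
  // round the byte count up to the next multiple of `alignment`
  size_t aligned_bytes = (bytes + alignment - 1) / alignment * alignment;
  return aligned_bytes / size_of_dtype;  // rounded-up length in elements
}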

@@ -0,0 +1,185 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/concat_op.h"
#include <memory>
#include <string>
#include <vector>
#ifdef PADDLE_WITH_MKLDNN
#include <paddle/fluid/platform/mkldnn_helper.h>
#endif
#ifdef PADDLE_WITH_XPU
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class ConcatXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto ins = ctx.MultiInput<framework::Tensor>("X");
framework::Tensor* out = ctx.Output<framework::Tensor>("Out");
int axis = ctx.Attr<int>("axis");
PADDLE_ENFORCE_NE(ins[0], nullptr, platform::errors::InvalidArgument(
"The input should not be null."));
PADDLE_ENFORCE_NE(ctx.HasInput("AxisTensor"), true,
platform::errors::InvalidArgument(
"XPU donot surpport AxisTensor for now"));
axis = ComputeAxis(static_cast<int64_t>(axis),
static_cast<int64_t>(ins[0]->dims().size()));
PADDLE_ENFORCE_GE(
axis, 0, platform::errors::InvalidArgument("concat: axis should >= 0!"));
PADDLE_ENFORCE_LT(axis, ins[0]->dims().size(),
platform::errors::InvalidArgument(
"concat: axis shoud < ins[0]->dims()!"));
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
std::vector<int> choose_idx;
int n = 0;
for (unsigned int i = 0; i < ins.size(); ++i) {
if (ins[i] && ins[i]->numel() > 0) {
choose_idx.push_back(i);
n++;
}
}
PADDLE_ENFORCE_LE(n, 8, platform::errors::InvalidArgument(
"XPU only surpport at most 8 tensors for now"));
PADDLE_ENFORCE_GT(
n, 0, platform::errors::InvalidArgument("No tensor need concat?"));
int h = 1;
int w_except_axis = 1;
for (int i = 0; i < axis; ++i) {
h *= (ins[choose_idx[0]]->dims())[i];
}
for (int i = axis + 1; i < ins[0]->dims().size(); ++i) {
w_except_axis *= (ins[choose_idx[0]]->dims())[i];
}
for (int i = 1; i < n; ++i) {
int hh = 1;
int ww = 1;
for (int j = 0; j < axis; ++j) {
hh *= (ins[choose_idx[i]]->dims())[j];
}
for (int j = axis + 1; j < ins[i]->dims().size(); ++j) {
ww *= (ins[choose_idx[i]]->dims())[j];
}
PADDLE_ENFORCE_EQ(hh, h, platform::errors::InvalidArgument(
"concat: h should be eual!"));
PADDLE_ENFORCE_EQ(ww, w_except_axis,
platform::errors::InvalidArgument(
"concat: w should be eual except for axis!"));
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
std::unique_ptr<int[]> in_w_host(new int[n]);
std::unique_ptr<const float* []> ptrs(new const float*[n]);
for (int i = 0; i < n; ++i) {
ptrs[i] = ins[choose_idx[i]]->data<T>();
in_w_host[i] = w_except_axis * (ins[choose_idx[i]]->dims())[axis];
}
int r =
xpu::concat<float>(dev_ctx.x_context(), h, (const int*)in_w_host.get(),
n, (const float**)ptrs.get(), out->data<T>());
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
};
template <typename DeviceContext, typename T>
class ConcatGradXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* out_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto ins = ctx.MultiInput<framework::LoDTensor>("X");
auto out_var_names = ctx.OutputNames(framework::GradVarName("X"));
auto outs =
ctx.MultiOutput<framework::LoDTensor>(framework::GradVarName("X"));
{
auto dx = outs;
auto x = ins;
for (size_t i = 0; i < dx.size(); ++i) {
if (dx[i] != nullptr) {
dx[i]->set_lod(x[i]->lod());
}
}
}
PADDLE_ENFORCE_NE(ins[0], nullptr, platform::errors::InvalidArgument(
"The input should not be null."));
auto axis = ctx.Attr<int>("axis");
if (ctx.HasInput("AxisTensor")) {
auto* axis_tensor = ctx.Input<framework::Tensor>("AxisTensor");
axis = GetDataFromTensor<int>(axis_tensor)[0];
}
axis = ComputeAxis(static_cast<int64_t>(axis),
static_cast<int64_t>(ins[0]->dims().size()));
// get output tensor that the name is not kEmptyVarName
std::vector<framework::Tensor*> outputs;
for (size_t j = 0; j < outs.size(); ++j) {
if (out_var_names[j] != framework::kEmptyVarName &&
outs[j]->numel() != 0UL) {
outs[j]->mutable_data<T>(ctx.GetPlace());
outputs.push_back(outs[j]);
} else {
outputs.push_back(nullptr);
}
}
PADDLE_ENFORCE_GE(axis, 0, platform::errors::InvalidArgument(
"concat_grad: axis shoud >= 0!"));
PADDLE_ENFORCE_LT(axis, out_grad->dims().size(),
platform::errors::InvalidArgument(
"concat_grad: axis shoud < ins[0]->dims()!"));
auto out_grad_stride = framework::stride_numel(out_grad->dims());
int n = outputs.size();
PADDLE_ENFORCE_LE(n, 16,
platform::errors::InvalidArgument(
"XPU only surpport at most 16 tensors for now"));
int h = out_grad_stride[0] / out_grad_stride[axis];
auto& dev_ctx = ctx.template device_context<DeviceContext>();
std::unique_ptr<int[]> in_w_host(new int[n]);
std::unique_ptr<float* []> ptrs(new float*[n]);
for (int i = 0; i < n; ++i) {
auto out_stride = framework::stride_numel(outputs[i]->dims());
ptrs[i] = outputs[i]->data<T>();
in_w_host[i] = out_stride[axis];
}
int r = xpu::concat_grad(dev_ctx.x_context(), h, in_w_host.get(), n,
reinterpret_cast<float**>(ptrs.get()),
out_grad->data<T>());
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
concat, ops::ConcatXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(
concat_grad,
ops::ConcatGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
#endif
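
The XPU concat treats every input as an h x w matrix: dimensions before `axis` collapse into h (which must match across inputs) and dimensions after it into w_except_axis, so each input contributes a row block of width w_except_axis * dims[axis]. A sketch of that flattening (helper name is illustrative):

#include <cstdint>
#include <vector>

void FlattenAroundAxis(const std::vector<int64_t>& dims, int axis,
                       int64_t* h, int64_t* w_except_axis) {
  *h = 1;
  *w_except_axis = 1;
  for (int i = 0; i < axis; ++i) *h *= dims[i];  // leading dims -> rows
  for (size_t i = axis + 1; i < dims.size(); ++i) *w_except_axis *= dims[i];
  // this input's row width in the concatenated output:
  // w_except_axis * dims[axis]
}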

@@ -20,6 +20,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/layout_utils.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h"
@@ -138,102 +139,6 @@ inline bool IsExpand(const std::vector<int64_t>& filter_dim,
return !(filter_1 && strides_1 && padding_0 && dilation_1);
}
template <typename DeviceContext, typename T>
inline void ResizeToChannelFirst(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[4];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
in_dims_vec[4] = input->dims()[3];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[3];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
}
template <typename DeviceContext, typename T>
inline void ResizeToChannelLast(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[4];
in_dims_vec[4] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelFirst(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 4, 1, 2, 3};
math::Transpose<DeviceContext, T, 5> trans5;
trans5(dev_ctx, *input, transformed_input, axis);
} else if (dim == 2) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 3, 1, 2};
math::Transpose<DeviceContext, T, 4> trans4;
trans4(dev_ctx, *input, transformed_input, axis);
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelLast(const framework::ExecutionContext& context,
const Tensor* input, Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 2, 3, 4, 1};
math::Transpose<DeviceContext, T, 5> trans5;
trans5(dev_ctx, *input, transformed_input, axis);
} else if (dim == 2) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{0, 2, 3, 1};
math::Transpose<DeviceContext, T, 4> trans4;
trans4(dev_ctx, *input, transformed_input, axis);
}
}
// Define Op classes in .h file so that other conv
// operator implementations can reuse the code.
class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {

@@ -27,10 +27,10 @@ class GemmConvXPUKernel : public framework::OpKernel<T> {
// that avoids modifying the variable in the Scope.
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
Tensor* max_input = context.Output<Tensor>("MaxInput");
Tensor* max_filter = context.Output<Tensor>("MaxFilter");
max_input->mutable_data<T>(context.GetPlace());
max_filter->mutable_data<T>(context.GetPlace());
// Tensor* max_input = context.Output<Tensor>("MaxInput");
// Tensor* max_filter = context.Output<Tensor>("MaxFilter");
// max_input->mutable_data<T>(context.GetPlace());
// max_filter->mutable_data<T>(context.GetPlace());
output->mutable_data<T>(context.GetPlace());
int groups = context.Attr<int>("groups");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
@@ -47,33 +47,47 @@ class GemmConvXPUKernel : public framework::OpKernel<T> {
dilations[0] == 1 && dilations[1] == 1, true,
platform::errors::InvalidArgument("XPU only support dilation == 1."));
auto& dev_ctx = context.template device_context<DeviceContext>();
PADDLE_ENFORCE_EQ(
xpu::findmax(dev_ctx.x_context(), input->data<T>(), input->numel(),
max_input->data<T>()) == xpu::Error_t::SUCCESS,
true, platform::errors::InvalidArgument("XPU kernel error!"));
PADDLE_ENFORCE_EQ(
xpu::findmax(dev_ctx.x_context(), filter.data<T>(), filter.numel(),
max_filter->data<T>()) == xpu::Error_t::SUCCESS,
true, platform::errors::InvalidArgument("XPU kernel error!"));
// PADDLE_ENFORCE_EQ(
// xpu::findmax(dev_ctx.x_context(), input->data<T>(), input->numel(),
// max_input->data<T>()) == xpu::Error_t::SUCCESS,
// true, platform::errors::InvalidArgument(
// "XPU conv kernel error,can not finde max_input,please "
// "check whether Baidu Kunlun "
// "Card is properly installed."));
// PADDLE_ENFORCE_EQ(
// xpu::findmax(dev_ctx.x_context(), filter.data<T>(), filter.numel(),
// max_filter->data<T>()) == xpu::Error_t::SUCCESS,
// true, platform::errors::InvalidArgument(
// "XPU conv kernel error,can not find max_filter,please "
// "check whether Baidu Kunlun "
// "Card is properly installed."));
if (groups == 1) {
int r = xpu::conv2d_forward_int16<float, float, float, float>(
dev_ctx.x_context(), batch_size, img_c, img_h, img_w, f, win_h, win_w,
strides[0], strides[1], paddings[0], paddings[1], dilations[0],
dilations[1], groups, input->data<float>(), filter.data<float>(),
output->data<float>(), nullptr, nullptr, xpu::Activation_t::LINEAR,
// nullptr, nullptr);
max_input->data<float>(), max_filter->data<float>());
PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
platform::errors::InvalidArgument("XPU kernel error!"));
nullptr, nullptr);
// max_input->data<float>(), max_filter->data<float>());
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU conv kernel return wrong value[%d], "
"please check whether Baidu Kunlun Card "
"is properly installed.",
r));
} else {
int r = xpu::conv2d_int16_with_group<float, float, float>(
dev_ctx.x_context(), input->data<float>(), filter.data<float>(),
output->data<float>(), batch_size, img_c, img_h, img_w, f, win_h,
win_w, groups, strides[0], strides[1], paddings[0], paddings[1],
// nullptr, nullptr);
max_input->data<float>(), max_filter->data<float>());
PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
platform::errors::InvalidArgument("XPU kernel error!"));
nullptr, nullptr);
// max_input->data<float>(), max_filter->data<float>());
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU conv kernel return wrong value[%d], "
"please check whether Baidu Kunlun Card "
"is properly installed.",
r));
}
}
};
@@ -82,9 +96,9 @@ class GemmConvGradXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* max_input = context.Input<Tensor>("MaxInput");
const Tensor* max_filter = context.Input<Tensor>("MaxFilter");
Tensor* max_output_grad = context.Output<Tensor>("MaxOutputGrad");
// const Tensor* max_input = context.Input<Tensor>("MaxInput");
// const Tensor* max_filter = context.Input<Tensor>("MaxFilter");
// Tensor* max_output_grad = context.Output<Tensor>("MaxOutputGrad");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
@@ -119,34 +133,45 @@ class GemmConvGradXPUKernel : public framework::OpKernel<T> {
filter_grad->mutable_data<T>(context.GetPlace());
}
auto& dev_ctx = context.template device_context<DeviceContext>();
max_output_grad->Resize({4});
max_output_grad->mutable_data<T>(context.GetPlace());
PADDLE_ENFORCE_EQ(
xpu::findmax(dev_ctx.x_context(), output_grad->data<T>(),
output_grad->numel(),
max_output_grad->data<T>()) == xpu::Error_t::SUCCESS,
true, platform::errors::InvalidArgument("XPU kernel error!"));
// max_output_grad->Resize({4});
// max_output_grad->mutable_data<T>(context.GetPlace());
// PADDLE_ENFORCE_EQ(
// xpu::findmax(dev_ctx.x_context(), output_grad->data<T>(),
// output_grad->numel(),
// max_output_grad->data<T>()) == xpu::Error_t::SUCCESS,
// true,
// platform::errors::External(
// "XPU conv kernel error, can not find max_output_grad, please
// check "
// "whether Baidu Kunlun Card is "
// "properly installed."));
if (input_grad) {
int r = xpu::conv2d_backward_int16(
dev_ctx.x_context(), batch_size, img_c, img_h, img_w, f, win_h, win_w,
strides[0], strides[1], paddings[0], paddings[1], dilations[0],
dilations[1], groups, output_grad->data<float>(),
filter.data<float>(), input_grad->data<float>(),
// nullptr, nullptr,
max_output_grad->data<float>(), max_filter->data<float>());
PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
platform::errors::InvalidArgument("XPU kernel error!"));
filter.data<float>(), input_grad->data<float>(), nullptr, nullptr);
// max_output_grad->data<float>(), max_filter->data<float>());
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU conv kernel return wrong value[%d], "
"please check whether Baidu Kunlun Card "
"is properly installed.",
r));
}
if (filter_grad) {
int r = xpu::conv2d_backward_weight_int16(
dev_ctx.x_context(), batch_size, img_c, img_h, img_w, f, win_h, win_w,
strides[0], strides[1], paddings[0], paddings[1], dilations[0],
dilations[1], groups, output_grad->data<float>(),
input->data<float>(), filter_grad->data<float>(),
// nullptr, nullptr,
max_output_grad->data<float>(), max_input->data<float>());
PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
platform::errors::InvalidArgument("XPU kernel error!"));
input->data<float>(), filter_grad->data<float>(), nullptr, nullptr);
// max_output_grad->data<float>(), max_input->data<float>());
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU conv kernel return wrong value[%d], "
"please check whether Baidu Kunlun Card "
"is properly installed.",
r));
}
}
};

Some files were not shown because too many files have changed in this diff.
