support reduce_max op on kunlun (#30581)
* support reduce_max op on kunlun
parent 5013c67644
commit 10271ddfc4
@@ -0,0 +1,149 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef PADDLE_WITH_XPU
#include <memory>
#include <set>
#include <string>
#include <vector>

#include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h"
#include "paddle/fluid/platform/xpu_header.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ReduceMaxXPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    XPUReduce<DeviceContext, T>(context, xpu::reduce_max<T>);
  }
};

template <typename DeviceContext, typename T>
class ReduceMaxGradXPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto dims = context.Attr<std::vector<int>>("dim");
    bool reduce_all = context.Attr<bool>("reduce_all");
    auto* x = context.Input<Tensor>("X");
    auto* out = context.Input<Tensor>("Out");
    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
    auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));

    int in_dtype = context.Attr<int>("in_dtype");
    PADDLE_ENFORCE_EQ(
        in_dtype == -1, true,
        platform::errors::InvalidArgument(
            "XPU only supports in_dtype == -1 in reduce_max_grad op."));

    auto& dev_ctx = context.template device_context<DeviceContext>();
    x_grad->mutable_data<T>(context.GetPlace());
    const T* x_data = x->data<T>();
    const T* out_data = out->data<T>();
    const T* out_grad_data = out_grad->data<T>();
    auto* x_grad_data = x_grad->data<T>();
    const auto& input_dim_size = x->dims().size();
    // Normalize negative axes to their non-negative counterparts.
    std::vector<int> true_dims;
    for (size_t i = 0; i < dims.size(); ++i) {
      if (dims[i] < 0) {
        true_dims.push_back(dims[i] + input_dim_size);
      } else {
        true_dims.push_back(dims[i]);
      }
    }
    // xdims is the shape of X; ydims is the keep-dim shape of Out, with
    // every reduced axis set to 1 so Out can be broadcast back to X.
    std::vector<int> ydims(input_dim_size);
    std::vector<int> xdims(input_dim_size);
    std::set<int> dims_set(true_dims.begin(), true_dims.end());
    for (auto i = 0; i < input_dim_size; i++) {
      xdims[i] = x->dims()[i];
      if (dims_set.find(i) != dims_set.end() || reduce_all) {
        ydims[i] = 1;
      } else {
        ydims[i] = x->dims()[i];
      }
    }

    T* broadcast1 = nullptr;
    T* broadcast2 = nullptr;
    bool* equal = nullptr;
    PADDLE_ENFORCE_EQ(
        xpu_malloc(reinterpret_cast<void**>(&broadcast1),
                   x->numel() * sizeof(T)),
        XPU_SUCCESS,
        platform::errors::ResourceExhausted("XPU does not have enough memory"));
    PADDLE_ENFORCE_EQ(
        xpu_malloc(reinterpret_cast<void**>(&equal), x->numel() * sizeof(bool)),
        XPU_SUCCESS,
        platform::errors::ResourceExhausted("XPU does not have enough memory"));
    PADDLE_ENFORCE_EQ(
        xpu_malloc(reinterpret_cast<void**>(&broadcast2),
                   x->numel() * sizeof(T)),
        XPU_SUCCESS,
        platform::errors::ResourceExhausted("XPU does not have enough memory"));

    // step 1. broadcast out and out_grad back to the shape of x
    int r = xpu::broadcast<T>(dev_ctx.x_context(), out_data, broadcast1, ydims,
                              xdims);
    PADDLE_ENFORCE_EQ(
        r == xpu::Error_t::SUCCESS, true,
        platform::errors::External("XPU broadcast in reduce_max_grad op"
                                   " returned wrong value[%d %s].",
                                   r, XPUAPIErrorMsg[r]));
    r = xpu::broadcast<T>(dev_ctx.x_context(), out_grad_data, broadcast2, ydims,
                          xdims);
    PADDLE_ENFORCE_EQ(
        r == xpu::Error_t::SUCCESS, true,
        platform::errors::External("XPU broadcast in reduce_max_grad op"
                                   " returned wrong value[%d %s].",
                                   r, XPUAPIErrorMsg[r]));
    // step 2. compare the broadcast out with x to mark the max positions
    r = xpu::elementwise_equal<T>(dev_ctx.x_context(), x_data, broadcast1,
                                  equal, x->numel());
    PADDLE_ENFORCE_EQ(
        r == xpu::Error_t::SUCCESS, true,
        platform::errors::External("XPU elementwise_equal in reduce_max_grad"
                                   " op returned wrong value[%d %s].",
                                   r, XPUAPIErrorMsg[r]));
    // step 3. get x_grad: out_grad at the max positions, zero elsewhere
    r = xpu::constant<T>(dev_ctx.x_context(), broadcast1, x->numel(), 0);
    PADDLE_ENFORCE_EQ(
        r == xpu::Error_t::SUCCESS, true,
        platform::errors::External("XPU constant in reduce_max_grad op"
                                   " returned wrong value[%d %s].",
                                   r, XPUAPIErrorMsg[r]));
    r = xpu::select<T>(dev_ctx.x_context(), equal, broadcast2, broadcast1,
                       x_grad_data, xdims, xdims);
    PADDLE_ENFORCE_EQ(
        r == xpu::Error_t::SUCCESS, true,
        platform::errors::External("XPU select in reduce_max_grad op"
                                   " returned wrong value[%d %s].",
                                   r, XPUAPIErrorMsg[r]));

    if (dev_ctx.x_context()->xpu_stream) {
      dev_ctx.Wait();
    }
    xpu_free(broadcast1);
    xpu_free(broadcast2);
    xpu_free(equal);
  }
};

}  // namespace operators
}  // namespace paddle

REGISTER_OP_XPU_KERNEL(
    reduce_max,
    ops::ReduceMaxXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(
    reduce_max_grad,
    ops::ReduceMaxGradXPUKernel<paddle::platform::XPUDeviceContext, float>);

#endif
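For reference, the broadcast/equal/select scheme used by ReduceMaxGradXPUKernel above can be sketched in NumPy (an illustration of the math only, not Paddle or XPU API code; the helper name reduce_max_grad_ref is invented for this sketch):

# Illustration only: NumPy reference for the grad kernel's scheme above.
import numpy as np

def reduce_max_grad_ref(x, axes, out_grad_keepdim):
    # step 1: broadcast Out and dOut (keep-dim shapes) back to x's shape
    out = x.max(axis=axes, keepdims=True)
    broadcast_out = np.broadcast_to(out, x.shape)
    broadcast_dout = np.broadcast_to(out_grad_keepdim, x.shape)
    # step 2: mark the positions that produced the maximum
    equal = x == broadcast_out
    # step 3: route dOut to those positions, zero everywhere else
    return np.where(equal, broadcast_dout, np.zeros_like(x))

x = np.random.random((5, 6, 10)).astype("float32")
dout = np.ones((5, 6, 1), dtype="float32")  # gradient of Out, keep-dim shape
dx = reduce_max_grad_ref(x, (2,), dout)
assert dx.shape == x.shape

As with xpu::select driven by the xpu::elementwise_equal mask, every element that ties for the maximum receives the upstream gradient.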
@@ -0,0 +1,100 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#ifdef PADDLE_WITH_XPU
#include <algorithm>
#include <functional>
#include <memory>
#include <set>
#include <string>
#include <vector>

#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
#include "paddle/fluid/platform/xpu_header.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
void XPUReduce(
    const framework::ExecutionContext& context,
    std::function<int(xpu::Context*, const T*, T*, const std::vector<int>&,
                      const std::vector<int>&)>
        func) {
  PADDLE_ENFORCE_EQ(
      platform::is_xpu_place(context.GetPlace()), true,
      platform::errors::Unavailable("This kernel only runs on XPU."));
  bool reduce_all = context.Attr<bool>("reduce_all");
  auto dims = context.Attr<std::vector<int>>("dim");
  auto* x = context.Input<Tensor>("X");
  auto* y = context.Output<Tensor>("Out");
  y->mutable_data<T>(context.GetPlace());
  auto& dev_ctx = context.template device_context<DeviceContext>();

  int out_dtype = context.Attr<int>("out_dtype");
  PADDLE_ENFORCE_EQ(out_dtype == -1, true,
                    platform::errors::InvalidArgument(
                        "XPU only supports out_dtype == -1 in reduce op."));

  const auto* x_data = x->data<T>();
  auto* y_data = y->data<T>();
  const auto& input_dim_size = x->dims().size();
  // Normalize negative axes to their non-negative counterparts.
  std::vector<int> true_dims;
  for (size_t i = 0; i < dims.size(); ++i) {
    if (dims[i] < 0) {
      true_dims.push_back(dims[i] + input_dim_size);
    } else {
      true_dims.push_back(dims[i]);
    }
  }

  std::vector<int> reduce_dims;
  std::vector<int> xdims(input_dim_size);
  for (int i = 0; i < input_dim_size; ++i) {
    xdims[i] = x->dims()[i];
  }
  if (reduce_all) {
    for (int i = 0; i < input_dim_size; ++i) {
      reduce_dims.push_back(i);
    }
  } else {
    std::set<int> dims_set(true_dims.begin(), true_dims.end());
    for (auto i = 0; i < input_dim_size; i++) {
      if (dims_set.find(i) != dims_set.end()) {
        // An axis of extent 1 needs no actual reduction.
        if (x->dims()[i] != 1) {
          reduce_dims.push_back(i);
        }
      }
    }
  }

  if (reduce_dims.size() == 0) {
    // Nothing left to reduce, so the output is a plain copy of the input.
    int r = xpu::copy<T>(dev_ctx.x_context(), x_data, y_data,
                         x->numel() * sizeof(T));
    PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
                      platform::errors::External("XPU copy in reduce op"
                                                 " returned wrong value[%d %s].",
                                                 r, XPUAPIErrorMsg[r]));
  } else {
    int r = func(dev_ctx.x_context(), x_data, y_data, xdims, reduce_dims);
    PADDLE_ENFORCE_EQ(
        r == xpu::Error_t::SUCCESS, true,
        platform::errors::External(
            "XPU reduce op returned wrong value[%d %s].", r,
            XPUAPIErrorMsg[r]));
  }
}

}  // namespace operators
}  // namespace paddle
#endif
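A small Python sketch of the axis bookkeeping XPUReduce performs before calling into the XPU API (the function name is invented for this illustration): negative axes are shifted by the tensor rank, size-1 axes are dropped, and an empty result triggers the copy fallback above.

# Illustration only: mirrors XPUReduce's reduce_dims computation.
def prepare_reduce_dims(shape, dims, reduce_all):
    rank = len(shape)
    if reduce_all:
        return list(range(rank))
    true_dims = [d + rank if d < 0 else d for d in dims]
    # axes of extent 1 are skipped; there is nothing to reduce there
    return [i for i in sorted(set(true_dims)) if shape[i] != 1]

assert prepare_reduce_dims((5, 6, 10), [-1], False) == [2]
assert prepare_reduce_dims((5, 1, 10), [1], False) == []  # copy fallback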
@@ -0,0 +1,74 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
import sys
sys.path.append("..")
from op_test_xpu import OpTest, XPUOpTest
from op_test import skip_check_grad_ci
import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
from paddle.fluid.framework import convert_np_dtype_to_dtype_


class TestXPUReduceMaxOp(XPUOpTest):
    def setUp(self):
        self.init_op_type()
        self.initTestCase()
        self.use_xpu = True
        self.use_mkldnn = False
        self.attrs = {
            'dim': self.axis,
            'keep_dim': self.keep_dim,
            'reduce_all': self.reduce_all
        }
        self.inputs = {'X': np.random.random(self.shape).astype("float32")}
        if self.attrs['reduce_all']:
            self.outputs = {'Out': self.inputs['X'].max()}
        else:
            self.outputs = {
                'Out': self.inputs['X'].max(
                    axis=self.axis, keepdims=self.attrs['keep_dim'])
            }

    def test_check_output(self):
        if paddle.is_compiled_with_xpu():
            paddle.enable_static()
            place = paddle.XPUPlace(0)
            self.check_output_with_place(place)

    def test_check_grad(self):
        if paddle.is_compiled_with_xpu():
            paddle.enable_static()
            place = paddle.XPUPlace(0)
            self.check_grad_with_place(place, ['X'], 'Out')

    def init_op_type(self):
        self.op_type = "reduce_max"
        self.use_mkldnn = False
        self.keep_dim = False
        self.reduce_all = False

    def initTestCase(self):
        self.shape = (5, 6, 10)
        self.axis = (-1, )


if __name__ == '__main__':
    unittest.main()
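Finally, a hedged end-to-end usage sketch of the op under test; it assumes a Paddle build compiled with XPU (Kunlun) support and an XPU device at index 0, and uses the fluid static-graph API of this era:

# Usage sketch, assuming an XPU-enabled build with a device at index 0.
import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
x = fluid.data(name='x', shape=[5, 6, 10], dtype='float32')
y = fluid.layers.reduce_max(x, dim=[-1])  # dispatches to the XPU kernel on XPUPlace
exe = fluid.Executor(paddle.XPUPlace(0))
out, = exe.run(feed={'x': np.random.random((5, 6, 10)).astype('float32')},
               fetch_list=[y])
print(out.shape)  # (5, 6)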