|
|
@ -34,32 +34,58 @@ using mindspore::schema::PrimitiveType_Transpose;
|
|
|
|
namespace mindspore::kernel {
|
|
|
|
namespace mindspore::kernel {
|
|
|
|
|
|
|
|
|
|
|
|
int TransposeOpenCLKernel::CheckSpecs() {
|
|
|
|
int TransposeOpenCLKernel::CheckSpecs() {
|
|
|
|
auto param = reinterpret_cast<TransposeParameter *>(op_parameter_);
|
|
|
|
if ((in_tensors_.size() != 1 && in_tensors_.size() != 2) || out_tensors_.size() != 1) {
|
|
|
|
if (in_tensors_[0]->shape().size() != 4 || in_tensors_[0]->shape()[0] > 1) {
|
|
|
|
MS_LOG(ERROR) << "Transpose input output size unsupported.";
|
|
|
|
MS_LOG(ERROR) << "Transpose only support 4d tensor and n = 1 yet.";
|
|
|
|
return RET_ERROR;
|
|
|
|
return mindspore::lite::RET_ERROR;
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (param->num_axes_ == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 &&
|
|
|
|
tensor_size_ = GpuTensorInfo(out_tensors_[0]);
|
|
|
|
param->perm_[3] == 2) {
|
|
|
|
if (tensor_size_.NDim > 4) {
|
|
|
|
type = TransposeType::AXIS0312;
|
|
|
|
MS_LOG(ERROR) << "Transpose don't support 5d tensor or higher.";
|
|
|
|
} else if (param->num_axes_ == 4 && param->perm_[0] == 0 && param->perm_[1] == 2 && param->perm_[2] == 3 &&
|
|
|
|
return RET_ERROR;
|
|
|
|
param->perm_[3] == 1) {
|
|
|
|
|
|
|
|
type = TransposeType::AXIS0231;
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
MS_LOG(ERROR) << "unsupported transpose axes.";
|
|
|
|
|
|
|
|
return mindspore::lite::RET_ERROR;
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return RET_OK;
|
|
|
|
return RET_OK;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int TransposeOpenCLKernel::Prepare() {
|
|
|
|
int TransposeOpenCLKernel::Prepare() {
|
|
|
|
|
|
|
|
auto param = reinterpret_cast<TransposeParameter *>(op_parameter_);
|
|
|
|
|
|
|
|
if (tensor_size_.NDim == 2) {
|
|
|
|
|
|
|
|
perm_4d_[0] = tensor_size_.AlignAxis(param->perm_[0]);
|
|
|
|
|
|
|
|
perm_4d_[1] = 1;
|
|
|
|
|
|
|
|
perm_4d_[2] = 2;
|
|
|
|
|
|
|
|
perm_4d_[3] = tensor_size_.AlignAxis(param->perm_[1]);
|
|
|
|
|
|
|
|
} else if (tensor_size_.NDim == 3) {
|
|
|
|
|
|
|
|
perm_4d_[0] = tensor_size_.AlignAxis(param->perm_[0]);
|
|
|
|
|
|
|
|
perm_4d_[1] = 1;
|
|
|
|
|
|
|
|
perm_4d_[2] = tensor_size_.AlignAxis(param->perm_[1]);
|
|
|
|
|
|
|
|
perm_4d_[3] = tensor_size_.AlignAxis(param->perm_[2]);
|
|
|
|
|
|
|
|
} else if (tensor_size_.NDim == 4) {
|
|
|
|
|
|
|
|
perm_4d_[0] = tensor_size_.AlignAxis(param->perm_[0]);
|
|
|
|
|
|
|
|
perm_4d_[1] = tensor_size_.AlignAxis(param->perm_[1]);
|
|
|
|
|
|
|
|
perm_4d_[2] = tensor_size_.AlignAxis(param->perm_[2]);
|
|
|
|
|
|
|
|
perm_4d_[3] = tensor_size_.AlignAxis(param->perm_[3]);
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
perm_4d_[0] = 0;
|
|
|
|
|
|
|
|
perm_4d_[0] = 1;
|
|
|
|
|
|
|
|
perm_4d_[0] = 2;
|
|
|
|
|
|
|
|
perm_4d_[0] = 3;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if (tensor_size_.N == 1 && perm_4d_[0] == 0 && perm_4d_[1] == 3 && perm_4d_[2] == 1 && perm_4d_[3] == 2) {
|
|
|
|
|
|
|
|
type_ = TransposeType::AXIS0312;
|
|
|
|
|
|
|
|
} else if (tensor_size_.N == 1 && perm_4d_[0] == 0 && perm_4d_[1] == 2 && perm_4d_[2] == 3 && perm_4d_[3] == 1) {
|
|
|
|
|
|
|
|
type_ = TransposeType::AXIS0231;
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
type_ = TransposeType::GENERAL;
|
|
|
|
|
|
|
|
}
|
|
|
|
std::string kernel_name = "transpose";
|
|
|
|
std::string kernel_name = "transpose";
|
|
|
|
if (type == TransposeType::AXIS0312) {
|
|
|
|
if (type_ == TransposeType::AXIS0312) {
|
|
|
|
kernel_name += "_0312";
|
|
|
|
kernel_name += "_0312";
|
|
|
|
} else if (type == TransposeType::AXIS0231) {
|
|
|
|
} else if (type_ == TransposeType::AXIS0231) {
|
|
|
|
kernel_name += "_0231";
|
|
|
|
kernel_name += "_0231";
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
kernel_name += "_general";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (in_tensors_[0]->shape()[2] * UP_DIV(in_tensors_[0]->shape()[3], C4NUM) > MAX_IMAGE2D_SIZE) {
|
|
|
|
if (in_tensors_[0]->shape().size() == 4 &&
|
|
|
|
|
|
|
|
in_tensors_[0]->shape()[2] * UP_DIV(in_tensors_[0]->shape()[3], C4NUM) > MAX_IMAGE2D_SIZE) {
|
|
|
|
// just for input
|
|
|
|
// just for input
|
|
|
|
kernel_name += "_oversize";
|
|
|
|
kernel_name += "_oversize";
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -80,27 +106,40 @@ int TransposeOpenCLKernel::Prepare() {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void TransposeOpenCLKernel::SetConstArgs() {
|
|
|
|
void TransposeOpenCLKernel::SetConstArgs() {
|
|
|
|
std::vector<int> shapex = out_tensors_[0]->shape();
|
|
|
|
size_t n = tensor_size_.N;
|
|
|
|
size_t n = shapex[0]; // n=1
|
|
|
|
size_t h = tensor_size_.H;
|
|
|
|
size_t h = shapex[1];
|
|
|
|
size_t w = tensor_size_.W;
|
|
|
|
size_t w = shapex[2];
|
|
|
|
size_t c = tensor_size_.C;
|
|
|
|
size_t c = shapex[3];
|
|
|
|
|
|
|
|
int arg_idx = 2;
|
|
|
|
int arg_idx = 2;
|
|
|
|
cl_int4 shape = {static_cast<int>(n), static_cast<int>(h), static_cast<int>(w), static_cast<int>(c)};
|
|
|
|
cl_int4 shape = {static_cast<int>(n), static_cast<int>(h), static_cast<int>(w), static_cast<int>(c)};
|
|
|
|
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, shape);
|
|
|
|
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, shape);
|
|
|
|
|
|
|
|
if (type_ == TransposeType::GENERAL) {
|
|
|
|
|
|
|
|
int de_perm[4]; // output to input perm
|
|
|
|
|
|
|
|
for (int i = 0; i < 4; i++) {
|
|
|
|
|
|
|
|
de_perm[perm_4d_[i]] = i;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
cl_int4 de_perm_cl = {de_perm[0], de_perm[1], de_perm[2], de_perm[3]};
|
|
|
|
|
|
|
|
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, de_perm_cl);
|
|
|
|
|
|
|
|
GpuTensorInfo in_shape = GpuTensorInfo(in_tensors_[0]);
|
|
|
|
|
|
|
|
cl_int4 out_shape = {static_cast<cl_int>(in_shape.N), static_cast<cl_int>(in_shape.H),
|
|
|
|
|
|
|
|
static_cast<cl_int>(in_shape.W), static_cast<cl_int>(in_shape.C)};
|
|
|
|
|
|
|
|
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_shape);
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void TransposeOpenCLKernel::SetGlobalLocal() {
|
|
|
|
void TransposeOpenCLKernel::SetGlobalLocal() {
|
|
|
|
std::vector<int> shapex = out_tensors_[0]->shape();
|
|
|
|
size_t n = tensor_size_.N;
|
|
|
|
size_t h = shapex[1];
|
|
|
|
size_t h = tensor_size_.H;
|
|
|
|
size_t w = shapex[2];
|
|
|
|
size_t w = tensor_size_.W;
|
|
|
|
size_t c = shapex[3];
|
|
|
|
size_t c = tensor_size_.C;
|
|
|
|
size_t c4 = UP_DIV(c, 4);
|
|
|
|
size_t c4 = UP_DIV(c, 4);
|
|
|
|
local_size_ = {};
|
|
|
|
local_size_ = {};
|
|
|
|
if (type == TransposeType::AXIS0312) { // NHWC -> NCHW
|
|
|
|
if (type_ == TransposeType::AXIS0312) { // NHWC -> NCHW
|
|
|
|
global_size_ = {UP_DIV(h, C4NUM), w, c4};
|
|
|
|
global_size_ = {UP_DIV(h, C4NUM), w, c4};
|
|
|
|
} else if (type == TransposeType::AXIS0231) { // NCHW -> NHWC
|
|
|
|
} else if (type_ == TransposeType::AXIS0231) { // NCHW -> NHWC
|
|
|
|
global_size_ = {h, UP_DIV(w, C4NUM), c4};
|
|
|
|
global_size_ = {h, UP_DIV(w, C4NUM), c4};
|
|
|
|
|
|
|
|
} else { // general
|
|
|
|
|
|
|
|
global_size_ = {n * h, w, c4};
|
|
|
|
}
|
|
|
|
}
|
|
|
|
AlignGlobalLocal(global_size_, local_size_);
|
|
|
|
AlignGlobalLocal(global_size_, local_size_);
|
|
|
|
}
|
|
|
|
}
|
|
|
|