From cf2b868892bc7d638d8db35497d1462b7cd52fe1 Mon Sep 17 00:00:00 2001
From: Pengyongrong
Date: Tue, 18 Aug 2020 20:48:41 -0700
Subject: [PATCH] solve some problems for power ops

---
 .../src/runtime/kernel/opencl/cl/power.cl     | 18 +++------
 .../runtime/kernel/opencl/kernel/conv2d.cc    | 16 ++++----
 .../kernel/opencl/kernel/conv2d_transpose.cc  | 18 ++++-----
 .../kernel/opencl/kernel/fullconnection.cc    | 10 ++---
 .../kernel/opencl/kernel/layer_norm.cc        | 38 +++++++++---------
 .../runtime/kernel/opencl/kernel/matmul.cc    |  8 ++--
 .../src/runtime/kernel/opencl/kernel/power.cc | 40 +------------------
 .../src/runtime/kernel/opencl/kernel/power.h  |  4 --
 .../src/runtime/kernel/opencl/power_tests.cc  | 16 ++++----
 9 files changed, 60 insertions(+), 108 deletions(-)

diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/power.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/power.cl
index 6d1e32566e..44691bcfd4 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/power.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/power.cl
@@ -24,27 +24,19 @@ FLT OptimizedPowerImpl(FLT x, int exponent) {
   return exponent >= 0 ? result : 1 / result;
 }
 
-__kernel void power(__read_only image2d_t input0, __global FLT *input1, __write_only image2d_t output,
+__kernel void power(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output,
                     int4 output_shape, FLT4 parameter) {
   CHECK_IDX;
   int n = X / output_shape.y;
   int h = X % output_shape.y;
-  int unalign_w = (int)parameter.w;
   FLT4 result;
   FLT4 result0 = READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (n * output_shape.y + h)));
-  int index_weight = (n * output_shape.y + h) * output_shape.z * unalign_w + Y * unalign_w + Z * C4NUM;
+  FLT4 result1 = READ_IMAGE(input1, smp_none, (int2)((Y)*output_shape.w + Z, (n * output_shape.y + h)));
+  FLT tmp_result[4];
   FLT tmp_result0[4] = {result0.x, result0.y, result0.z, result0.w};
-  FLT tmp_result1[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-  if ((Z + 1) * C4NUM <= unalign_w) {
-    for (int i = 0; i < C4NUM; ++i) {
-      tmp_result1[i] = input1[index_weight + i];
-    }
-  } else {
-    for (int i = 0; i < unalign_w % C4NUM; ++i) {
-      tmp_result1[i] = input1[index_weight + i];
-    }
-  }
+  FLT tmp_result1[4] = {result1.x, result1.y, result1.z, result1.w};
+
   for (int i = 0; i < 4; ++i) {
     tmp_result0[i] = tmp_result0[i] * parameter.z + parameter.y;
     if (floor(tmp_result1[i]) == tmp_result1[i]) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
index 3cf836ad2a..ca474e73e9 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
@@ -65,6 +65,14 @@ int Conv2DOpenCLKernel::CheckSpecs() {
     MS_LOG(ERROR) << "Conv2D only supports 4D output Tensor but get " << out_tensors_.front()->shape().size() << "D.";
     return RET_ERROR;
   }
+  if (!in_tensors_.at(1)->IsConst()) {
+    MS_LOG(ERROR) << "Conv2D don't support non-constant filter yet.";
+    return RET_ERROR;
+  }
+  if (in_tensors_.size() == 3 && !in_tensors_.at(2)->IsConst()) {
+    MS_LOG(ERROR) << "Conv2D don't support non-constant bias yet.";
+    return RET_ERROR;
+  }
   // for fusion: ActivationType_LEAKY_RELU ActivationType_TANH
   switch (static_cast<ActType>(param_->act_type_)) {
     case ActType_No:
@@ -302,16 +310,8 @@ int Conv2DOpenCLKernel::InitBias() {
 }
 
 int Conv2DOpenCLKernel::InitWeights() {
-  if (!in_tensors_.at(1)->IsConst()) {
-    MS_LOG(ERROR) << "Conv2D don't support non-constant filter yet.";
-    return RET_ERROR;
-  }
   InitFilter();
   if (has_bias_) {
-    if (!in_tensors_.at(2)->IsConst()) {
-      MS_LOG(ERROR) << "Conv2D don't support non-constant bias yet.";
-      return RET_ERROR;
-    }
     InitBias();
   }
   return RET_OK;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
index 6cd6af9fda..a3a0ee01bf 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
@@ -49,6 +49,14 @@ int Conv2dTransposeOpenCLKernel::CheckSpecs() {
     MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_;
     return RET_ERROR;
   }
+  if (!in_tensors_.at(1)->IsConst()) {
+    MS_LOG(ERROR) << "Conv2dTranspose don't support non-constant filter yet.";
+    return RET_ERROR;
+  }
+  if (in_tensors_.size() == 3 && !in_tensors_.at(2)->IsConst()) {
+    MS_LOG(ERROR) << "Conv2dTranspose don't support non-constant bias yet.";
+    return RET_ERROR;
+  }
   return RET_OK;
 }
 
@@ -117,10 +125,6 @@ void Conv2dTransposeOpenCLKernel::SetConstArgs() {
 }
 
 int Conv2dTransposeOpenCLKernel::InitWeights() {
-  if (!in_tensors_.at(1)->IsConst()) {
-    MS_LOG(ERROR) << "Conv2dTranspose don't support non-constant filter yet.";
-    return RET_ERROR;
-  }
   ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_);
   int ci = in_tensors_[0]->shape()[3];
   int co = out_tensors_[0]->shape()[3];
@@ -189,11 +193,7 @@ int Conv2dTransposeOpenCLKernel::InitWeights() {
   bias_ = allocator->Malloc(im_dst_x * im_dst_y * C4NUM * data_size, img_size);
   bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
   memset(bias_, 0x00, div_co * C4NUM * data_size);
-  if (in_tensors_.size() >= 3) {
-    if (!in_tensors_.at(2)->IsConst()) {
-      MS_LOG(ERROR) << "Conv2dTranspose don't support non-constant bias yet.";
-      return RET_ERROR;
-    }
+  if (in_tensors_.size() == 3) {
     auto bias_dtype = in_tensors_[2]->data_type();
     if (bias_dtype == kNumberTypeFloat32 && enable_fp16_) {
       for (int i = 0; i < co; i++) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
index 54e3ffa6b8..10662dce8c 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
@@ -92,6 +92,10 @@ int FullConnectionOpenCLKernel::CheckSpecs() {
       return RET_ERROR;
     }
   }
+  if (in_tensors_.size() == 3 && !in_tensors_.at(2)->IsConst()) {
+    MS_LOG(ERROR) << "FullConnection don't support non-constant bias yet.";
+    return RET_ERROR;
+  }
   CI_remainder_ = input_nhw / N_;
   return RET_OK;
 }
@@ -211,11 +215,7 @@ int FullConnectionOpenCLKernel::InitBias() {
   bias_ = allocator->Malloc(im_dst_x * im_dst_y * C4NUM * dtype_size, img_size);
   bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
   memset(bias_, 0x00, co4 * C4NUM * dtype_size);
-  if (in_tensors_.size() >= 3) {
-    if (!in_tensors_.at(2)->IsConst()) {
-      MS_LOG(ERROR) << "FullConnection don't support non-constant bias yet.";
-      return RET_ERROR;
-    }
+  if (in_tensors_.size() == 3) {
     if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
       for (int i = 0; i < CO_; i++) {
         reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc
index b98e24ff77..e6fc9e9b5a 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc
@@ -33,24 +33,24 @@ namespace mindspore::kernel {
 
 int LayerNormOpenCLKernel::CheckSpecs() {
   auto param = reinterpret_cast<LayerNormParameter *>(this->op_parameter_);
-  if (param->elementwise_mode_ == ELEMENTWISE_PER_CHANNEL) {
-    if (in_tensors_.size() != 3) {
-      MS_LOG(ERROR) << " invalid in_tensors_ size" << in_tensors_.size() << std::endl;
-      return RET_ERROR;
-    }
-    if (param->normalized_dims_ > in_tensors_.at(0)->shape().size()) {
-      MS_LOG(ERROR) << " invalid normalized_shape_ size" << param->normalized_dims_ << std::endl;
-      return RET_ERROR;
-    }
-  } else if (param->elementwise_mode_ == ELEMENTWISE_NOT) {
-    if (in_tensors_.size() != 1) {
-      MS_LOG(ERROR) << " invalid in_tensors_ size" << in_tensors_.size() << std::endl;
-      return RET_ERROR;
-    }
-  } else {
-    MS_LOG(ERROR) << "Unsupported elementwise_mode_" << param->elementwise_mode_;
-    return RET_ERROR;
-  }
+  // if (param->elementwise_mode_ == ELEMENTWISE_PER_CHANNEL) {
+  //   if (in_tensors_.size() != 3) {
+  //     MS_LOG(ERROR) << " invalid in_tensors_ size" << in_tensors_.size() << std::endl;
+  //     return RET_ERROR;
+  //   }
+  //   if (param->normalized_dims_ > in_tensors_.at(0)->shape().size()) {
+  //     MS_LOG(ERROR) << " invalid normalized_shape_ size" << param->normalized_dims_ << std::endl;
+  //     return RET_ERROR;
+  //   }
+  // } else if (param->elementwise_mode_ == ELEMENTWISE_NOT) {
+  //   if (in_tensors_.size() != 1) {
+  //     MS_LOG(ERROR) << " invalid in_tensors_ size" << in_tensors_.size() << std::endl;
+  //     return RET_ERROR;
+  //   }
+  // } else {
+  //   MS_LOG(ERROR) << "Unsupported elementwise_mode_" << param->elementwise_mode_;
+  //   return RET_ERROR;
+  // }
   if (in_tensors_.at(0)->shape().size() != 4 || out_tensors_.size() != 1) {
     MS_LOG(ERROR) << "UnSupported in_tensors_.shape.size: " << in_tensors_.at(0)->shape().size()
                   << " out_tensors_.size(): " << out_tensors_.size();
@@ -184,7 +184,7 @@ int LayerNormOpenCLKernel::Initweight() {
 int LayerNormOpenCLKernel::Prepare() {
   use_fp16_enable_ = ocl_runtime_->GetFp16Enable();
   auto param = reinterpret_cast<LayerNormParameter *>(this->op_parameter_);
-  elementwise_affine_ = param->elementwise_mode_;
+  elementwise_affine_ = true;  // param->elementwise_mode_;
   normalized_dims_ = param->normalized_dims_;
   epsilon_ = param->epsilon_;
   if (elementwise_affine_) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
index 9f10aabb7a..65623e513a 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@@ -48,6 +48,10 @@ int MatMulOpenCLKernel::CheckSpecs() {
     MS_LOG(ERROR) << "matmul only support input shape size= 2, 3 or 4.";
     return mindspore::lite::RET_ERROR;
   }
+  if (!in_tensors_.at(kWeightIndex)->IsConst()) {
+    MS_LOG(ERROR) << "Matmul don't support non-constant filter yet.";
+    return RET_ERROR;
+  }
   return RET_OK;
 }
 
@@ -80,10 +84,6 @@ int MatMulOpenCLKernel::Prepare() {
 
 int MatMulOpenCLKernel::InitWeights() {
   // ABMCI @ ABCICO = ABMCO
-  if (!in_tensors_.at(kWeightIndex)->IsConst()) {
-    MS_LOG(ERROR) << "Matmul don't support non-constant filter yet.";
-    return RET_ERROR;
-  }
   auto ret = DequantWeight();
   if (ret != RET_OK) {
     return ret;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc
index 2bc1c9afc7..a8e800682b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc
@@ -48,40 +48,6 @@ int PowerOpenCLKernel::CheckSpecs() {
   return RET_OK;
 }
 
-int PowerOpenCLKernel::Initweight() {
-  auto allocator = ocl_runtime_->GetAllocator();
-  GpuTensorInfo img_info(in_tensors_.at(1));
-  auto weight_tensor = in_tensors_.at(1);
-  size_t weight_size = img_info.OriginSize;
-  weight_ = allocator->Malloc(weight_size);
-  allocator->MapBuffer(weight_, CL_MAP_WRITE, nullptr, true);
-  memset(weight_, 0x00, weight_size);
-
-  if (weight_tensor->data_type() == kNumberTypeFloat16) {
-    if (use_fp16_enable_) {
-      memcpy(weight_, weight_tensor->data_c(), weight_size);
-    } else {
-      auto weight_fp32 = reinterpret_cast<float *>(weight_);
-      auto origin_bias_fp16 = reinterpret_cast<float16_t *>(weight_tensor->data_c());
-      for (int i = 0; i < img_info.ElementsNum; ++i) {
-        weight_fp32[i] = static_cast<float>(origin_bias_fp16[i]);
-      }
-    }
-  } else {
-    if (use_fp16_enable_) {
-      auto weight_fp16 = reinterpret_cast<float16_t *>(weight_);
-      auto origin_bias_fp32 = reinterpret_cast<float *>(weight_tensor->data_c());
-      for (int i = 0; i < img_info.ElementsNum; ++i) {
-        weight_fp16[i] = static_cast<float16_t>(origin_bias_fp32[i]);
-      }
-    } else {
-      memcpy(weight_, weight_tensor->data_c(), weight_size);
-    }
-  }
-  allocator->UnmapBuffer(weight_);
-  return RET_OK;
-}
-
 void PowerGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
   const int max_divider = 8;
   const int max_x = 2, max_y = 8;
@@ -145,11 +111,9 @@ int PowerOpenCLKernel::Prepare() {
   std::string kernel_name = "power";
   std::string source = power_source;
   std::string program_name = "power";
-  if (broadcast_ && in_tensors_.size() == 1) {
+  if (broadcast_) {
     power_ = param->power_;
     kernel_name += "_broadcast";
-  } else {
-    Initweight();
   }
   scale_ = param->scale_;
   shift_ = param->shift_;
@@ -168,7 +132,7 @@ int PowerOpenCLKernel::Run() {
     ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c());
   } else {
     ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c());
-    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_, lite::opencl::MemType::BUF);
+    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(1)->data_c());
   }
   ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(0)->data_c());
   ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/power.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/power.h
index bd86acedd5..04b2a7318a 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/power.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/power.h
@@ -37,14 +37,10 @@ class PowerOpenCLKernel : public OpenCLKernel {
   void SetGlobalLocal() override;
   int Run() override;
 
- private:
-  int Initweight();
-
  private:
   cl_int4 out_shape_{};
   bool broadcast_{false};
   bool use_fp16_enable_{false};
-  void *weight_{nullptr};
   float power_{1.0};
   float scale_{0.0};
   float shift_{1.0};
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/power_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/power_tests.cc
index 1d11eb0273..e4b55a67b3 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/power_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/power_tests.cc
@@ -48,8 +48,8 @@ TEST_F(TestPowerOpenCLCI, Int32CI) {
                          100.0, 121.0, 1728.0, 1.0, 196.0, 225.0, 16.0, 289.0};
   for (auto fp16_enable : {false, true}) {
     auto *param = CreateParameter(broadcast_, shift_, scale_);
-    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, CONST_TENSOR}}, {output_shape, output_data},
-             param, fp16_enable, fp16_enable ? 1e-3 : 1e-9);
+    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, VAR}}, {output_shape, output_data}, param,
+             fp16_enable, fp16_enable ? 1e-3 : 1e-9);
   }
 }
 
@@ -68,8 +68,8 @@ TEST_F(TestPowerOpenCLCI, Fp32CI) {
                   3.20657016, 0.64395994, 0.01526405, 0.13275899, 5.85509388, 0.16177453, 0.07150001, 0.0542811};
   for (auto fp16_enable : {false, true}) {
     auto *param = CreateParameter(broadcast_, shift_, scale_);
-    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, CONST_TENSOR}}, {output_shape, output_data},
-             param, fp16_enable, fp16_enable ? 1e-2 : 1e-6);
+    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, VAR}}, {output_shape, output_data}, param,
+             fp16_enable, fp16_enable ? 1e-2 : 1e-6);
   }
 }
 
@@ -87,8 +87,8 @@ TEST_F(TestPowerOpenCLCI, Fp32UnAlign) {
                   3.20657016, 0.64395994, 0.01526405, 0.13275899, 5.85509388, 0.16177453, 0.07150001};
   for (auto fp16_enable : {false, true}) {
     auto *param = CreateParameter(broadcast_, shift_, scale_);
-    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, CONST_TENSOR}}, {output_shape, output_data},
-             param, fp16_enable, fp16_enable ? 1e-2 : 1e-6);
+    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, VAR}}, {output_shape, output_data}, param,
+             fp16_enable, fp16_enable ? 1e-2 : 1e-6);
  }
 }
 
@@ -121,8 +121,8 @@ TEST_F(TestPowerOpenCLCI, Fp16CI) {
                   0.4856, 1.014, 0.2025, -1.736, 0.2134, 0.489, -0.596, 0.7466};
   for (auto fp16_enable : {true}) {
     auto *param = CreateParameter(broadcast_, shift_, scale_);
-    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, CONST_TENSOR}}, {output_shape, output_data},
-             param, fp16_enable, fp16_enable ? 1e-3 : 1e-6);
+    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, VAR}}, {output_shape, output_data}, param,
+             fp16_enable, fp16_enable ? 1e-3 : 1e-6);
   }
 }
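
Reviewer note (not part of the patch): the power.cl change replaces the unaligned __global weight buffer with a second image read, so both operands are now fetched element-wise and the kernel no longer needs the packed constant weight that Initweight() used to build; this is also why the tests can pass input1 as VAR instead of CONST_TENSOR. The sketch below is a minimal host-side C++ reference of the per-element math in the new kernel, under stated assumptions: OptimizedPowerImpl's body is outside the hunk, so the integer fast path here is assumed to be exponentiation by squaring (consistent with its visible "exponent >= 0 ? result : 1 / result" return), "scale" and "shift" stand for the kernel's parameter.z and parameter.y, and the names OptimizedPowerRef/PowerElementRef are illustrative only.

#include <cmath>
#include <cstdio>

// Assumed fast path for integral exponents: square-and-multiply on |exponent|,
// then invert for negative exponents (matches the visible return statement).
static float OptimizedPowerRef(float x, int exponent) {
  float result = 1.0f;
  float base = x;
  for (int e = exponent >= 0 ? exponent : -exponent; e > 0; e >>= 1) {
    if (e & 1) {
      result *= base;
    }
    base *= base;
  }
  return exponent >= 0 ? result : 1.0f / result;
}

// Per-element behaviour mirroring the kernel loop: input0 is scaled and
// shifted, then raised to the matching input1 value; integral exponents take
// the assumed fast path, everything else falls back to pow().
static float PowerElementRef(float in0, float in1, float scale, float shift) {
  float x = in0 * scale + shift;
  return std::floor(in1) == in1 ? OptimizedPowerRef(x, static_cast<int>(in1)) : std::pow(x, in1);
}

int main() {
  std::printf("%g\n", PowerElementRef(2.0f, 3.0f, 1.0f, 0.0f));   // 8
  std::printf("%g\n", PowerElementRef(9.0f, 0.5f, 1.0f, 0.0f));   // 3
  std::printf("%g\n", PowerElementRef(2.0f, -2.0f, 1.0f, 0.0f));  // 0.25
  return 0;
}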