diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl index c3dc183323..a2b67d3fa9 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl @@ -73,7 +73,7 @@ __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | int coordinate_x = Y * input_shape0.w + Z; \ int coordinate_y = X; \ result = READ_IMAGE(input0, smp_none, (int2)(coordinate_x, coordinate_y)); \ - } else { \ + } else if (Y < boundary1) { \ int coordinate_x = (Y - boundary0) * input_shape1.w + Z; \ int coordinate_y = X; \ result = READ_IMAGE(input1, smp_none, (int2)(coordinate_x, coordinate_y)); \ @@ -123,7 +123,7 @@ __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | int coordinate_x = Y * input_shape0.w + Z; \ int coordinate_y = X; \ result = READ_IMAGE(input0, smp_none, (int2)(coordinate_x, coordinate_y)); \ - } else { \ + } else if (Z < boundary1) { \ int coordinate_x = Y * input_shape1.w + Z - boundary0; \ int coordinate_y = X; \ result = READ_IMAGE(input1, smp_none, (int2)(coordinate_x, coordinate_y)); \ diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl index 3ac91e2df0..97a68da12d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl @@ -1,5 +1,5 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable - +#define C4NUM 4 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __kernel void gather(__write_only image2d_t dst_data, __read_only image2d_t src_data, __global int *indices, diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/split.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/split.cl new file mode 100644 index 0000000000..5bd65f0617 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/split.cl @@ 
-0,0 +1,114 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; +#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) +#define C4NUM 4 + +#define CHECK_IDX_ALIGN \ + const int X = get_global_id(0); \ + const int Y = get_global_id(1); \ + const int Z = get_global_id(2); \ + if (X > in_shape.x * in_shape.y || Y > in_shape.z || Z > in_shape.w || in_shape.y == 0) { \ + return; \ + } + +#define ARGS_ALIGN \ + const int IN = X / in_shape.y; \ + const int IH = X % in_shape.y; \ + int coordinate_x = IN * in_shape.y + IH; \ + int coordinate_y = Y * in_shape.w + Z; \ + FLT4 result = READ_IMAGE(input, smp_none, (int2)(coordinate_y, coordinate_x)); + +__kernel void split_out2_axis3(__read_only image2d_t input, __write_only image2d_t output1, + __write_only image2d_t output2, __global int *split_sizes_, int4 in_shape, + int4 out_shape1, int4 out_shape2) { + CHECK_IDX_ALIGN; + ARGS_ALIGN; + int boundary = UP_DIV(split_sizes_[0], C4NUM); + if (Z < boundary) { + coordinate_x = IN * out_shape1.y + IH; + coordinate_y = Y * out_shape1.w + Z; + WRITE_IMAGE(output1, (int2)(coordinate_y, coordinate_x), result); + } else { + coordinate_x = IN * out_shape2.y + IH; + coordinate_y = Y * out_shape2.w + Z - boundary; + WRITE_IMAGE(output2, (int2)(coordinate_y, coordinate_x), result); + } +} + +__kernel void split_out2_axis2(__read_only image2d_t input, __write_only image2d_t output1, + __write_only image2d_t output2, __global int *split_sizes_, int4 in_shape, + int4 out_shape1, int4 out_shape2) { + CHECK_IDX_ALIGN; + ARGS_ALIGN; + if (Y < split_sizes_[0]) { + coordinate_x = IN * out_shape1.y + IH; + coordinate_y = Y * out_shape1.w + Z; + WRITE_IMAGE(output1, (int2)(coordinate_y, coordinate_x), result); + } else { + coordinate_x = IN * out_shape2.y + IH; + coordinate_y = (Y - split_sizes_[0]) * out_shape2.w + Z; + WRITE_IMAGE(output2, (int2)(coordinate_y, coordinate_x), result); + } +} + +__kernel 
void split_out2_axis1(__read_only image2d_t input, __write_only image2d_t output1, + __write_only image2d_t output2, __global int *split_sizes_, int4 in_shape, + int4 out_shape1, int4 out_shape2) { + CHECK_IDX_ALIGN; + ARGS_ALIGN; + if (IH < split_sizes_[0]) { + coordinate_x = IN * out_shape1.y + IH; + coordinate_y = Y * out_shape1.w + Z; + WRITE_IMAGE(output1, (int2)(coordinate_y, coordinate_x), result); + } else { + coordinate_x = IN * out_shape2.y + IH - split_sizes_[0]; + coordinate_y = Y * out_shape2.w + Z; + WRITE_IMAGE(output2, (int2)(coordinate_y, coordinate_x), result); + } +} + +// UnAlign in Axis C for concat +#define CHECK_IDX_UNALIGN \ + const int X = get_global_id(0); \ + const int Y = get_global_id(1); \ + if (X >= in_shape.x * in_shape.y || Y >= in_shape.z || in_shape.y == 0) { \ + return; \ + } + +#define ARGS_UNALIGN \ + const int IN = X / in_shape.y, IH = X % in_shape.y; \ + const int IW = Y; \ + const int Align_inShape = UP_DIV(in_shape.w, C4NUM); \ + int index_input = (IN * in_shape.y + IH) * stride_w + IW * Align_inShape * C4NUM; + +int dosplit(__global FLT *input, __write_only image2d_t output, int4 out_shape, int IN, int IH, int IW, + int index_input) { + int Remainder = out_shape.w % C4NUM; + int coordinate_x = IN * out_shape.y + IH; + int align_w = UP_DIV(out_shape.w, C4NUM); + for (int i = 0; i < align_w; ++i) { + int coordinate_y = IW * align_w + i; + if ((i + 1) * C4NUM <= out_shape.w) { + FLT4 result = {input[index_input], input[index_input + 1], input[index_input + 2], input[index_input + 3]}; + WRITE_IMAGE(output, (int2)(coordinate_y, coordinate_x), result); + index_input += 4; + } else { + FLT result_temp[4] = {}; + for (int j = 0; j < Remainder; ++j) { + result_temp[j] = input[index_input++]; + } + FLT4 result = {result_temp[0], result_temp[1], result_temp[2], result_temp[3]}; + WRITE_IMAGE(output, (int2)(coordinate_y, coordinate_x), result); + } + } + return index_input; +} + +__kernel void split_out2_axis3_unalign(__global FLT 
*input, __write_only image2d_t output1, + __write_only image2d_t output2, __global int *split_sizes_, int4 in_shape, + int4 out_shape1, int4 out_shape2, int stride_w) { + CHECK_IDX_UNALIGN; + ARGS_UNALIGN; + index_input = dosplit(input, output1, out_shape1, IN, IH, IW, index_input); + index_input = dosplit(input, output2, out_shape2, IN, IH, IW, index_input); +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc index a030f2ff3f..bab94aee26 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc @@ -161,6 +161,14 @@ void ConcatOpenCLKernel::SetGlobalLocal() { } int ConcatOpenCLKernel::Prepare() { + if (axis_ == 0) { + for (int i = 0; i < in_tensors_.size(); ++i) { + if (in_tensors_.at(0)->shape().size() != 1) { + return RET_OK; + } + } + axis_ = 3; + } for (int i = 0; i < in_tensors_.size(); ++i) { int length = in_tensors_[0]->shape().size(); if (in_tensors_[i]->shape()[length - 1] % C4NUM != 0) { diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc index cc06be8534..a43a93e179 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc @@ -99,6 +99,9 @@ void GatherOpenCLKernel::SetGlobalLocal() { int GatherOpenCLKernel::Prepare() { std::string kernel_name = "gather"; + if (in_tensors_.at(0)->shape().size() == 1 && axis_ == 0) { + axis_ = 3; + } #ifdef PROGRAM_WITH_IL kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); #else @@ -106,17 +109,13 @@ int GatherOpenCLKernel::Prepare() { ocl_runtime_->LoadSource(program_name, gather_source); ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); #endif - if (!in_tensors_.at(1)->IsConst()) { - intensor1_is_tensor = true; - } - - if (!intensor1_is_tensor) { + if (in_tensors_.at(1)->IsConst()) { + 
intensor1_is_tensor = false; int ret = InitWeights(); if (ret != RET_OK) { return ret; } } - SetGlobalLocal(); SetConstArgs(); MS_LOG(DEBUG) << kernel_name << " Init Done!"; @@ -125,7 +124,6 @@ int GatherOpenCLKernel::Prepare() { int GatherOpenCLKernel::ConvertTensorToweight() { auto allocator = ocl_runtime_->GetAllocator(); - GpuTensorInfo img_info(in_tensors_[1]); auto indices_tensor = in_tensors_.at(1); auto indices_num = indices_tensor->ElementsNum(); indices_data_ = reinterpret_cast(allocator->Malloc(sizeof(int32_t) * indices_num)); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h index d61395a85e..bde9e4515f 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h @@ -41,13 +41,10 @@ class GatherOpenCLKernel : public OpenCLKernel { int Tune() override { return lite::RET_OK; } int ConvertTensorToweight(); - protected: - int UpdateWeights(); - private: int32_t *indices_data_{nullptr}; int axis_ = {0}; - bool intensor1_is_tensor{false}; + bool intensor1_is_tensor{true}; bool enable_fp16_{false}; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc index cd9d785b76..a36d16c88a 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc @@ -17,17 +17,12 @@ #include #include #include -#include "nnacl/fp32/common_func_fp32.h" #include "src/kernel_registry.h" -#include "src/runtime/kernel/opencl/utils.h" #include "src/runtime/kernel/opencl/kernel/matmul.h" -#include "src/common/utils.h" +#include "src/runtime/kernel/opencl/kernel/strassen.h" #ifndef PROGRAM_WITH_IL - #include "src/runtime/kernel/opencl/cl/matmul.cl.inc" -#include "src/runtime/kernel/opencl/cl/strassen.cl.inc" - #endif using mindspore::kernel::KERNEL_ARCH::kGPU; 
@@ -36,6 +31,21 @@ using mindspore::schema::PrimitiveType_MatMul; namespace mindspore::kernel { +bool IsUseStrassenMatmul(const std::vector &in_tensors_) { + if (in_tensors_.at(0)->shape().size() == 2) { + auto shape0 = in_tensors_.at(0)->shape(); + auto shape1 = in_tensors_.at(1)->shape(); + if (in_tensors_.at(1)->IsConst() && (shape0[0] == shape0[1]) && (shape1[0] == shape1[1]) && + (shape0[0] == shape1[0]) && (shape0[0] % 8 == 0)) { + return true; + } else { + return false; + } + } else { + return false; + } +} + int MatMulOpenCLKernel::CheckSpecs() { if (in_tensors_.size() != 2 || out_tensors_.size() != 1) { MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size(); @@ -74,35 +84,14 @@ int MatMulOpenCLKernel::Prepare() { } std::map dims2str = {{2, "_2d"}, {3, "_4d"}, {4, "_4d"}}; kernel_name += dims2str[dims]; - if (in_tensors_.at(0)->shape().size() == 2) { - auto shape0 = in_tensors_.at(0)->shape(); - auto shape1 = in_tensors_.at(1)->shape(); - if (in_tensors_.at(1)->IsConst() && (shape0[0] == shape0[1]) && (shape1[0] == shape1[1]) && - (shape0[0] == shape1[0]) && (shape0[0] % 8 == 0)) { - use_strassen = true; - } - } #ifdef PROGRAM_WITH_IL kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); #else std::string source = matmul_source; - if (use_strassen) { - source.clear(); - source = strassen_source; - } std::string program_name = "MatMul"; ocl_runtime_->LoadSource(program_name, source); - if (use_strassen) { - kernel_name = "MatMul_Strassen_NHWC4_2d"; - ocl_runtime_->BuildKernel(kernel_IMG_add_sub_2, program_name, "MatMul_IMG_Add_Sub_2"); - ocl_runtime_->BuildKernel(kernel_BUF_add_sub_2, program_name, "MatMul_BUF_Add_Sub_2"); - ocl_runtime_->BuildKernel(kernel_back_result, program_name, "Strassen_Back_Result"); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); - ocl_runtime_->BuildKernel(MatMul_StrassenBUFFilled, program_name, "MatMul_BUF_Filled"); - ocl_runtime_->BuildKernel(MatMul_StrassenIMGFilled, 
program_name, "MatMul_IMG_Filled"); - } else { - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); - } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + #endif auto ret = InitWeights(); if (ret != RET_OK) { @@ -114,31 +103,6 @@ int MatMulOpenCLKernel::Prepare() { return mindspore::lite::RET_OK; } -void MatMulOpenCLKernel::AllocatorMemoryForStrassen(int NumA, int NumB) { - std::vector img_size; - img_size.push_back(UP_DIV(NumA, C4NUM)); - img_size.push_back(NumA); - size_t img_dtype = enable_fp16_ ? CL_HALF_FLOAT : CL_FLOAT; - size_t dtype_size = enable_fp16_ ? sizeof(CL_HALF_FLOAT) : sizeof(CL_FLOAT); - img_size.push_back(img_dtype); - auto allocator = ocl_runtime_->GetAllocator(); - size_t memA = NumA * NumA; - - size_t memB = NumB * NumB * dtype_size; - for (int depth = 0; depth < MAXDEPTH; depth++) { - B_temp[depth] = allocator->Malloc(memB); - A_temp[depth] = allocator->Malloc(memA, img_size); - - M1[depth] = allocator->Malloc(memA, img_size); - M2[depth] = allocator->Malloc(memA, img_size); - M3[depth] = allocator->Malloc(memA, img_size); - M4[depth] = allocator->Malloc(memA, img_size); - M5[depth] = allocator->Malloc(memA, img_size); - M6[depth] = allocator->Malloc(memA, img_size); - M7[depth] = allocator->Malloc(memA, img_size); - } -} - int MatMulOpenCLKernel::InitWeights() { if (act_weight_) { return RET_OK; @@ -165,64 +129,39 @@ int MatMulOpenCLKernel::InitWeights() { auto originWeightFp32 = reinterpret_cast(in_tensors_.at(kWeightIndex)->data_c()); auto originWeightFp16 = reinterpret_cast(in_tensors_.at(kWeightIndex)->data_c()); bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16; - if (use_strassen) { - int NumA = in_tensors_[0]->shape()[0]; - int NumB = in_tensors_[1]->shape()[0]; - AllocatorMemoryForStrassen(NumA / 2, NumB / 2); - size_t size = NumA * NumB * dtype_size; - transposeB = false; - if (isModelFp16) { - if (enable_fp16_) { - memcpy(padWeightFp16, originWeightFp16, size); - } else { 
- for (int i = 0; i < NumA * NumB; ++i) { - padWeightFp32[i] = static_cast(originWeightFp16[i]); - } - } - } else { - if (enable_fp16_) { - for (int i = 0; i < NumA * NumB; ++i) { - padWeightFp16[i] = static_cast(originWeightFp32[i]); - } - } else { - memcpy(padWeightFp32, originWeightFp32, size); - } - } - } else { - // pad weight - // ABCICO -> AB(CI4)(CO4)(4 from CO)(4 from CI) - // if tranposeB, ABCOCI -> AB(CI4)(CO4)(4 from CO)(4 from CI) - int index = 0; - for (int aa = 0; aa < a; aa++) { - for (int bb = 0; bb < b; bb++) { - int baseAB = (aa * b + bb) * ci * co; - for (int i = 0; i < ci4; ++i) { - for (int j = 0; j < co4; ++j) { - for (int k = 0; k < C4NUM; ++k) { - for (int l = 0; l < C4NUM; ++l) { - int src_ci = i * C4NUM + l; - int src_co = j * C4NUM + k; - if (src_ci < ci && src_co < co) { - int originId = baseAB + src_ci * co + src_co; - if (transposeB) { - originId = baseAB + src_co * ci + src_ci; - } - if (enable_fp16_) { - if (!isModelFp16) { - padWeightFp16[index++] = originWeightFp32[originId]; - } else { - padWeightFp16[index++] = originWeightFp16[originId]; - } + // pad weight + // ABCICO -> AB(CI4)(CO4)(4 from CO)(4 from CI) + // if tranposeB, ABCOCI -> AB(CI4)(CO4)(4 from CO)(4 from CI) + int index = 0; + for (int aa = 0; aa < a; aa++) { + for (int bb = 0; bb < b; bb++) { + int baseAB = (aa * b + bb) * ci * co; + for (int i = 0; i < ci4; ++i) { + for (int j = 0; j < co4; ++j) { + for (int k = 0; k < C4NUM; ++k) { + for (int l = 0; l < C4NUM; ++l) { + int src_ci = i * C4NUM + l; + int src_co = j * C4NUM + k; + if (src_ci < ci && src_co < co) { + int originId = baseAB + src_ci * co + src_co; + if (transposeB) { + originId = baseAB + src_co * ci + src_ci; + } + if (enable_fp16_) { + if (!isModelFp16) { + padWeightFp16[index++] = originWeightFp32[originId]; } else { - if (!isModelFp16) { - padWeightFp32[index++] = originWeightFp32[originId]; - } else { - padWeightFp32[index++] = originWeightFp16[originId]; - } + padWeightFp16[index++] = 
originWeightFp16[originId]; } } else { - index++; + if (!isModelFp16) { + padWeightFp32[index++] = originWeightFp32[originId]; + } else { + padWeightFp32[index++] = originWeightFp16[originId]; + } } + } else { + index++; } } } @@ -236,266 +175,67 @@ int MatMulOpenCLKernel::InitWeights() { return RET_OK; } -void AlignStrassenGlobalLocal(const std::vector &global, const std::vector &local, - cl::NDRange *global_range, cl::NDRange *local_range) { - *local_range = cl::NDRange(local[0], local[1], local[2]); - *global_range = - cl::NDRange(UP_ROUND(global[0], local[0]), UP_ROUND(global[1], local[1]), UP_ROUND(global[2], local[2])); -} - -// 0 : global_size_, 1: global_size_add_sub -void MatMulOpenCLKernel::StrassenSetGlobalLocal(size_t strassen_size, int type_flag) { - size_t strassen_size_C4 = UP_DIV(strassen_size, C4NUM); - local_size_add_sub = {16, 1, 16}; - if (type_flag == 0) { - global_size_ = {strassen_size_C4, 1, strassen_size}; - AlignGlobalLocal(global_size_, local_size_); - } else { - global_size_add_sub = {strassen_size_C4, 1, strassen_size}; - AlignStrassenGlobalLocal(global_size_add_sub, local_size_add_sub, &global_add_sub_, &local_add_sub_); - } -} - void MatMulOpenCLKernel::SetGlobalLocal() { // local size should less than MAX_GROUP_SIZE local_size_ = {32, 4, 1}; global_size_ = {1, 1, 1}; - if (use_strassen) { - size_t strassen_size = outShape[3] / 2; - StrassenSetGlobalLocal(strassen_size, 0); // set global_ and local - StrassenSetGlobalLocal(strassen_size, 1); // set global_size_add_sub - StrassenSetGlobalLocal(strassen_size, 2); // set global_size_weights - } else { - global_size_ = {UP_DIV(static_cast(outShape[3]), C4NUM), - 4 * static_cast(outShape[0]) * static_cast(outShape[1]), - static_cast(outShape[2])}; - AlignGlobalLocal(global_size_, local_size_); - } -} - -void MatMulOpenCLKernel::StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size, bool is_matmul_kernel) { - cl_int4 shape; - if (is_matmul_kernel) { - shape = {1, 1, 
strassen_size, strassen_size}; - } else { - shape = {strassen_size, 1, 1, UP_DIV(strassen_size, C4NUM)}; - } - ocl_runtime_->SetKernelArg(*kernel, index, shape); + global_size_ = {UP_DIV(static_cast(outShape[3]), C4NUM), + 4 * static_cast(outShape[0]) * static_cast(outShape[1]), + static_cast(outShape[2])}; + AlignGlobalLocal(global_size_, local_size_); } void MatMulOpenCLKernel::SetConstArgs() { int arg_count = 2; cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]}; cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]}; - cl_int4 shape_offset = {0, 0, 0, 0}; - if (use_strassen) { - int strassen_size = inShape[3] / 2; - out_shape.s[2] = in_shape.s[2] = in_shape.s[2] / 2; - out_shape.s[3] = in_shape.s[3] = in_shape.s[3] / 2; - StrassenSetConstArgs(&kernel_IMG_add_sub_2, 3, strassen_size, false); - StrassenSetConstArgs(&kernel_BUF_add_sub_2, 2, strassen_size, false); + if (act_weight_) { + arg_count++; } else { - if (act_weight_) { - arg_count++; - } else { - ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF); - } + ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF); } ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape); ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape); - ocl_runtime_->SetKernelArg(kernel_, arg_count++, shape_offset); } -// OriginSize = N*H*W*C typesize = sizeof(type data) width = W * UP_DIV(C,C4NUM) size = N -void MatMulOpenCLKernel::PrintImage2d(void *IMGData, size_t typesize, size_t width, size_t size) { - auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); - int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment(); - auto runtime = runtime_wrapper.GetInstance(); - runtime->SyncCommandQueue(); - MS_ASSERT(alignment); - size_t row_pitch = UP_ROUND(width, alignment) * typesize * C4NUM; - size_t OriginSize = size * size * typesize; - std::vector data(OriginSize); - auto row_size = width * typesize * 
C4NUM; - - for (int i = 0; i < size; ++i) { - memcpy(reinterpret_cast(data.data()) + i * row_size, static_cast(IMGData) + i * row_pitch, - row_size); - } - for (int i = 0; i < size * size; ++i) { - if ((i + 1) % size == 0) { - std::cout << std::endl; - } +int MatMulOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " Running!"; + int arg_count = 0; + ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); + ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); + if (act_weight_) { + ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[1]->data_c()); } + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); + return mindspore::lite::RET_OK; } -void MatMulOpenCLKernel::StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size, - cl_int2 offset, lite::opencl::MemType mem_type) { - if (input == nullptr || output == nullptr) { - MS_LOG(ERROR) << "StrassenDataFilled input or output can not nullptr"; - return; - } - if (mem_type == lite::opencl::MemType::IMG) { - ocl_runtime_->SetKernelArg(*kernel, 0, input); - ocl_runtime_->SetKernelArg(*kernel, 1, output); +kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector &inputs, + const std::vector &outputs, OpParameter *opParameter, + const lite::InnerContext *ctx, const kernel::KernelKey &desc, + const mindspore::lite::PrimitiveC *primitive) { + kernel::OpenCLKernel *kernel; + if (IsUseStrassenMatmul(inputs)) { + MS_LOG(DEBUG) << "use_matmul_strassen"; + kernel = new (std::nothrow) StrassenOpenCLKernel(opParameter, inputs, outputs); } else { - ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF); - ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF); + kernel = new (std::nothrow) MatMulOpenCLKernel(opParameter, inputs, outputs); } - StrassenSetConstArgs(kernel, 2, size, false); - ocl_runtime_->SetKernelArg(*kernel, 3, offset); - ocl_runtime_->RunKernel(*kernel, 
global_add_sub_, local_add_sub_, nullptr, &event_); -} - -void MatMulOpenCLKernel::StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset, - int flag, lite::opencl::MemType mem_type) { - if (input == nullptr || output == nullptr) { - MS_LOG(ERROR) << "StrassenAddSub input or output can not nullptr"; - return; - } - if (mem_type == lite::opencl::MemType::IMG) { - ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::IMG); - ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::IMG); - } else { - ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF); - ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF); + if (kernel == nullptr) { + MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; + free(opParameter); + return nullptr; } - StrassenSetConstArgs(kernel, 2, size, false); - ocl_runtime_->SetKernelArg(*kernel, 3, offset); - ocl_runtime_->SetKernelArg(*kernel, 4, flag); - ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_); -} - -void MatMulOpenCLKernel::StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3, void *input4, - void *input5, void *input6, void *input7, void *output, const int size) { - if (input1 == nullptr || input2 == nullptr || input3 == nullptr || input4 == nullptr || input5 == nullptr || - input6 == nullptr || input7 == nullptr || output == nullptr) { - MS_LOG(ERROR) << "StrassenBackResult input or output can not nullptr"; - return; + auto ret = kernel->CheckSpecs(); + if (ret != mindspore::lite::RET_OK) { + MS_LOG(ERROR) << "Check " << opParameter->name_ << " specification failed!"; + delete kernel; + return nullptr; } - ocl_runtime_->SetKernelArg(*kernel, 0, input1); - ocl_runtime_->SetKernelArg(*kernel, 1, input2); - ocl_runtime_->SetKernelArg(*kernel, 2, input3); - ocl_runtime_->SetKernelArg(*kernel, 3, input4); - ocl_runtime_->SetKernelArg(*kernel, 4, input5); - 
ocl_runtime_->SetKernelArg(*kernel, 5, input6); - ocl_runtime_->SetKernelArg(*kernel, 6, input7); - ocl_runtime_->SetKernelArg(*kernel, 7, output); - StrassenSetConstArgs(kernel, 8, size, false); - ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_); + return kernel; } -void MatMulOpenCLKernel::StrassenRunMmatmul(void *input, void *weight, void *output, const int size) { - if (input == nullptr || weight == nullptr || output == nullptr) { - MS_LOG(ERROR) << "StrassenRunMmatmul input ,weight or output can not nullptr"; - return; - } - ocl_runtime_->SetKernelArg(kernel_, 0, input); - ocl_runtime_->SetKernelArg(kernel_, 1, output); - ocl_runtime_->SetKernelArg(kernel_, 2, weight, lite::opencl::MemType::BUF); - StrassenSetConstArgs(&kernel_, 3, size, true); - StrassenSetConstArgs(&kernel_, 4, size, true); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); -} - -void MatMulOpenCLKernel::DoStrassen(void *data, void *weight, void *result, const int size, const int depth, - const int threshold) { - const int size_2 = size / 2; - int C4 = UP_DIV(size_2, C4NUM); - if (size <= threshold) { - // run matmul; - StrassenSetGlobalLocal(size, 0); - StrassenRunMmatmul(data, weight, result, size); - return; - } - // flag = 0 : add otherwise flag = 1 : sub - // M1 = A11 * ( B12- B22) - StrassenSetGlobalLocal(size_2, 1); - StrassenDataFilled(&MatMul_StrassenIMGFilled, data, A_temp[depth + 1], size_2, {0, 0}, lite::opencl::MemType::IMG); - StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {0, C4, size_2, C4}, 1, - lite::opencl::MemType::BUF); - DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M1[depth + 1], size_2, depth + 1, threshold); - - // M2 = (A11 + A12) * B22 - StrassenSetGlobalLocal(size_2, 1); - StrassenDataFilled(&MatMul_StrassenBUFFilled, weight, B_temp[depth + 1], size_2, {size_2, C4}, - lite::opencl::MemType::BUF); - StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, 
{0, 0, 0, C4}, 0, lite::opencl::MemType::IMG); - DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M2[depth + 1], size_2, depth + 1, threshold); - - // M3 = (A21 + A22) * B11 - StrassenSetGlobalLocal(size_2, 1); - StrassenDataFilled(&MatMul_StrassenBUFFilled, weight, B_temp[depth + 1], size_2, {0, 0}, lite::opencl::MemType::BUF); - StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {size_2, 0, size_2, C4}, 0, - lite::opencl::MemType::IMG); - DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M3[depth + 1], size_2, depth + 1, threshold); - - // M4 = A22 * (B21 - B11) - StrassenSetGlobalLocal(size_2, 1); - StrassenDataFilled(&MatMul_StrassenIMGFilled, data, A_temp[depth + 1], size_2, {size_2, C4}, - lite::opencl::MemType::IMG); - StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {size_2, 0, 0, 0}, 1, - lite::opencl::MemType::BUF); - DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M4[depth + 1], size_2, depth + 1, threshold); - - // M5 = (A11 + A22) * (B11 + B22) - StrassenSetGlobalLocal(size_2, 1); - StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, 0, size_2, C4}, 0, - lite::opencl::MemType::IMG); - // (B11 + B22) - StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {0, 0, size_2, C4}, 0, - lite::opencl::MemType::BUF); - DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M5[depth + 1], size_2, depth + 1, threshold); - - // M6 = (A12 - A22) * (B21 + B22) - StrassenSetGlobalLocal(size_2, 1); - StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, C4, size_2, C4}, 1, - lite::opencl::MemType::IMG); - StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {size_2, 0, size_2, C4}, 0, - lite::opencl::MemType::BUF); - DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M6[depth + 1], size_2, depth + 1, threshold); - - // M7 = (A11 - A21) * (B11 + B12) - StrassenSetGlobalLocal(size_2, 1); - StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], 
size_2, {0, 0, size_2, 0}, 1, - lite::opencl::MemType::IMG); - StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {0, 0, 0, C4}, 0, - lite::opencl::MemType::BUF); - DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M7[depth + 1], size_2, depth + 1, threshold); - - // BackResult - StrassenSetGlobalLocal(size_2, 1); - StrassenBackResult(&kernel_back_result, M1[depth + 1], M2[depth + 1], M3[depth + 1], M4[depth + 1], M5[depth + 1], - M6[depth + 1], M7[depth + 1], result, size_2); -} - -int MatMulOpenCLKernel::Run() { - MS_LOG(DEBUG) << this->name() << " Running!"; - if (use_strassen) { - int threshold = 0; - const int up_bound = 1024; - const int down_bound = 256; - if (in_tensors_.at(0)->shape()[0] >= up_bound) { - threshold = UP_DIV(in_tensors_.at(0)->shape()[0], C4NUM) / 2; - } else if (in_tensors_.at(0)->shape()[0] <= down_bound) { - threshold = in_tensors_.at(0)->shape()[0]; - } else { - threshold = UP_DIV(in_tensors_.at(0)->shape()[0], C4NUM); - } - DoStrassen(in_tensors_.at(0)->data_c(), padWeight_, out_tensors_.at(0)->data_c(), in_tensors_.at(0)->shape()[0], 0, - threshold); - } else { - int arg_count = 0; - ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); - ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); - if (act_weight_) { - ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[1]->data_c()); - } - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); - } - return mindspore::lite::RET_OK; -} +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_MatMul, OpenCLMatMulKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_MatMul, OpenCLMatMulKernelCreator) -REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_MatMul, OpenCLKernelCreator) -REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_MatMul, OpenCLKernelCreator) } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h 
b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h index e3630a45fa..ab9ec6807f 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h @@ -41,7 +41,7 @@ class MatMulOpenCLKernel : public OpenCLKernel { void SetGlobalLocal() override; int Tune() override { return lite::RET_OK; } - private: + protected: void *padWeight_{nullptr}; bool enable_fp16_{false}; bool transposeA{false}; @@ -51,43 +51,6 @@ class MatMulOpenCLKernel : public OpenCLKernel { bool act_weight_{false}; std::vector inShape{std::vector(MAX_DIMS, 1)}; std::vector outShape{std::vector(MAX_DIMS, 1)}; - - // strassen - private: - void AllocatorMemoryForStrassen(int NumA, int NumB); - void DoStrassen(void *data, void *weight, void *result, const int size, const int depth, const int threshold); - void StrassenSetGlobalLocal(size_t strassen_size, int type_flag); - void StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size, bool is_matmul_kernel); - void StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size, cl_int2 offset, - lite::opencl::MemType mem_type); - void StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset, int flag, - lite::opencl::MemType mem_type); - void StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3, void *input4, void *input5, - void *input6, void *input7, void *output, const int size); - void StrassenRunMmatmul(void *input, void *weight, void *output, const int size); - void PrintImage2d(void *IMGData, size_t typesize, size_t width, size_t size); - bool use_strassen{false}; - cl::Kernel kernel_IMG_add_sub_2; - cl::Kernel MatMul_StrassenBUFFilled; - cl::Kernel MatMul_StrassenIMGFilled; - cl::Kernel kernel_BUF_add_sub_2; - cl::Kernel kernel_back_result; - cl::NDRange global_add_sub_, local_add_sub_; - std::vector global_size_add_sub; - std::vector local_size_add_sub; - // image 2d - void 
*A_temp[MAXDEPTH] = {nullptr}; - - void *M1[MAXDEPTH] = {nullptr}; - void *M2[MAXDEPTH] = {nullptr}; - void *M3[MAXDEPTH] = {nullptr}; - void *M4[MAXDEPTH] = {nullptr}; - void *M5[MAXDEPTH] = {nullptr}; - void *M6[MAXDEPTH] = {nullptr}; - void *M7[MAXDEPTH] = {nullptr}; - - // buffer - void *B_temp[MAXDEPTH] = {nullptr}; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/split.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/split.cc new file mode 100644 index 0000000000..27d45f6023 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/split.cc @@ -0,0 +1,206 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/kernel/opencl/kernel/split.h" +#include +#include +#include +#include +#include "src/kernel_registry.h" +#include "src/runtime/kernel/opencl/cl/split.cl.inc" + +using mindspore::kernel::KERNEL_ARCH::kGPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_Split; +namespace mindspore::kernel { + +int SplitOpenCLKernel::RunAxis0() { + auto allocator_ = ocl_runtime_->GetAllocator(); + std::vector img_size; + auto src_data = in_tensors_[0]->data_c(); + cl::Image2D *in_image = reinterpret_cast(allocator_->GetImage(src_data)); + if (in_image == nullptr) { + MS_LOG(ERROR) << "RunAxis0 in_image can not be nullptr"; + return RET_ERROR; + } + auto src_area = cl::array{0, 0, 0}; + for (int i = 0; i < out_tensors_.size(); i++) { + auto dst_data = out_tensors_[i]->data_c(); + allocator_->GetImageSize(dst_data, &img_size); + auto dst_area = cl::array{0, 0, 0}; + auto region = cl::array{img_size[0], img_size[1], 1}; + cl::Image2D *out_image = reinterpret_cast(allocator_->GetImage(dst_data)); + if (out_image == nullptr) { + MS_LOG(ERROR) << "RunAxis0 out_image can not be nullptr"; + return RET_ERROR; + } + ocl_runtime_->GetDefaultCommandQueue()->enqueueCopyImage(*in_image, *out_image, src_area, dst_area, region); + src_area[1] += region[1]; + } + return RET_OK; +} + +int SplitOpenCLKernel::CheckSpecs() { + if (out_tensors_.size() != 2 || in_tensors_.size() != 1) { + MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size(); + return RET_ERROR; + } + if (in_tensors_.at(0)->IsConst()) { + MS_LOG(ERROR) << "in_tensors_ must be tensor"; + return RET_ERROR; + } + for (auto &out_tensor : out_tensors_) { + if (out_tensor->IsConst()) { + MS_LOG(ERROR) << "out_tensor must be tensor"; + return RET_ERROR; + } + } + auto param = reinterpret_cast(this->op_parameter_); + if (param->num_split_ != 2 && param->num_split_ != 1) { + 
MS_LOG(ERROR) << "num_split_ only supported 1 or 2 yet"; + return RET_ERROR; + } + if (param->split_dim_ < 0 || param->split_dim_ > 3) { + MS_LOG(ERROR) << "split_dim_ must between 0~3"; + return RET_ERROR; + } + if (param->split_sizes_ == nullptr) { + MS_LOG(ERROR) << "split_sizes_ can not nullptr"; + return RET_ERROR; + } + return RET_OK; +} + +void SplitOpenCLKernel::AlignSplitSizes(SplitParameter *param, const std::vector &in_shape) { + auto allocator = ocl_runtime_->GetAllocator(); + int shape_dim = in_shape.at(param->split_dim_); + if (num_split_ == 1) { + size_t num_split = UP_DIV(shape_dim, param->split_sizes_[0]); + split_sizes_ = reinterpret_cast(allocator->Malloc(num_split * sizeof(int))); + for (int i = 0; i < num_split - 1; ++i) { + split_sizes_[i] = (i + 1) * param->split_sizes_[0]; + } + } else { + int sum = 0; + split_sizes_ = reinterpret_cast(allocator->Malloc(num_split_ * sizeof(int))); + for (int i = 0; i < num_split_ - 1; ++i) { + sum += param->split_sizes_[i]; + split_sizes_[i] = sum; + } + } +} + +int SplitOpenCLKernel::Prepare() { + auto param = reinterpret_cast(this->op_parameter_); + auto in_shape = in_tensors_.at(0)->shape(); + int increment_dim = C4NUM - in_shape.size(); + split_dim_ = param->split_dim_ == 0 ? param->split_dim_ : param->split_dim_ + increment_dim; + num_split_ = param->num_split_; + if (split_dim_ == 0) { + return RET_OK; + } + for (int i = 0; i < out_tensors_.size(); ++i) { + int length = out_tensors_[0]->shape().size(); + if (split_dim_ == 3) { + if (out_tensors_[i]->shape()[length - 1] % C4NUM != 0) { + Align_ = false; + } + } + } + AlignSplitSizes(param, in_shape); + std::string kernel_name = "split_out"; + kernel_name += num_split_ == 1 ? 
std::to_string(out_tensors().size()) : std::to_string(num_split_); + kernel_name += "_axis" + std::to_string(split_dim_); + if (!Align_) { + kernel_name += "_unalign"; + } + MS_LOG(DEBUG) << "kernel_name=: " << kernel_name; + std::string source = split_source; + std::string program_name = "split"; + ocl_runtime_->LoadSource(program_name, source); + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + MS_LOG(DEBUG) << kernel_name << " Init Done!"; + SetConstArgs(); + SetGlobalLocal(); + return RET_OK; +} + +void SplitOpenCLKernel::SetConstArgs() { + int arg_cn = out_tensors_.size() + 2; + cl_int4 shape = {}; + for (int i = 0; i < in_tensors_[0]->shape().size(); ++i) { + shape.s[i] = in_tensors_[0]->shape()[i]; + } + Broadcast2GpuShape(in_shape_.s, shape.s, out_tensors_[0]->shape().size(), 1); + if (Align_) { + in_shape_.s[3] = UP_DIV(in_shape_.s[3], C4NUM); + } + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_); + + for (int i = 0; i < out_tensors_.size(); ++i) { + cl_int4 temp = {}; + for (int j = 0; j < out_tensors_[i]->shape().size(); ++j) { + temp.s[j] = out_tensors_[i]->shape()[j]; + } + Broadcast2GpuShape(out_shape_.s, temp.s, out_tensors_[i]->shape().size(), 1); + if (Align_) { + out_shape_.s[3] = UP_DIV(out_shape_.s[3], C4NUM); + } + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_); + } + GpuTensorInfo img_info(in_tensors_.at(0)); + size_t dtype = enable_fp16_ ? 
sizeof(cl_half) : sizeof(cl_float); + stride_w = img_info.RowPitch() / dtype; + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w); + return; +} + +void SplitOpenCLKernel::SetGlobalLocal() { + OH = in_shape_.s[0] * in_shape_.s[1]; + OW = in_shape_.s[2]; + if (Align_) { + OC = in_shape_.s[3]; + } + global_size_ = {OH, OW, OC}; + local_size_ = {1, 1, 1}; + OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); + return; +} + +int SplitOpenCLKernel::Run() { + if (split_dim_ == 0) { + RunAxis0(); + return RET_OK; + } + int arg_cn = 0; + if (Align_) { + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c()); + } else { + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c(), lite::opencl::MemType::BUF); + } + for (int i = 0; i < out_tensors_.size(); ++i) { + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(i)->data_c()); + } + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, split_sizes_, lite::opencl::MemType::BUF); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); + return RET_OK; +} + +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Split, OpenCLKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Split, OpenCLKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/split.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/split.h new file mode 100644 index 0000000000..6892e1d305 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/split.h @@ -0,0 +1,60 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SPLIT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SPLIT_H_ + +#include +#include "src/runtime/kernel/opencl/opencl_kernel.h" +#include "nnacl/split_parameter.h" + +namespace mindspore::kernel { + +class SplitOpenCLKernel : public OpenCLKernel { + public: + SplitOpenCLKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs) + : OpenCLKernel(parameter, inputs, outputs) {} + + ~SplitOpenCLKernel() override = default; + + int Prepare() override; + + int CheckSpecs() override; + void SetConstArgs() override; + void SetGlobalLocal() override; + int Run() override; + + private: + void AlignSplitSizes(SplitParameter *param, const std::vector &in_shape); + int RunAxis0(); + + private: + cl_int4 in_shape_{}; + cl_int4 out_shape_ = {}; + bool Align_{true}; + bool enable_fp16_{false}; + size_t num_split_ = 1; + int *split_sizes_{nullptr}; + int split_dim_ = 0; + cl_int stride_w{1}; + uint32_t OH = {1}; + uint32_t OW = {1}; + uint32_t OC = {1}; +}; + +} // namespace mindspore::kernel +#endif diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc new file mode 100644 index 0000000000..5be0ad12a6 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc @@ -0,0 +1,361 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "src/runtime/kernel/opencl/kernel/matmul.h" +#include "src/runtime/kernel/opencl/kernel/strassen.h" +#include "src/common/utils.h" + +#ifndef PROGRAM_WITH_IL + +#include "src/runtime/kernel/opencl/cl/strassen.cl.inc" + +#endif + +namespace mindspore::kernel { + +int StrassenOpenCLKernel::Prepare() { +#ifdef PROGRAM_WITH_IL + kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); +#else + std::string kernel_name = "MatMul_Strassen_NHWC4_2d"; + std::string source = strassen_source; + std::string program_name = "MatMul"; + ocl_runtime_->LoadSource(program_name, source); + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + ocl_runtime_->BuildKernel(kernel_IMG_add_sub_2, program_name, "MatMul_IMG_Add_Sub_2"); + ocl_runtime_->BuildKernel(kernel_BUF_add_sub_2, program_name, "MatMul_BUF_Add_Sub_2"); + ocl_runtime_->BuildKernel(kernel_back_result, program_name, "Strassen_Back_Result"); + ocl_runtime_->BuildKernel(MatMul_StrassenBUFFilled, program_name, "MatMul_BUF_Filled"); + ocl_runtime_->BuildKernel(MatMul_StrassenIMGFilled, program_name, "MatMul_IMG_Filled"); + +#endif + auto ret = InitWeights(); + if (ret != RET_OK) { + return ret; + } + SetConstArgs(); + SetGlobalLocal(); + MS_LOG(DEBUG) << kernel_name << " Init Done!"; + return mindspore::lite::RET_OK; +} + +void StrassenOpenCLKernel::AllocatorMemoryForStrassen(int NumA, int NumB) { + std::vector img_size; + img_size.push_back(UP_DIV(NumA, C4NUM)); + img_size.push_back(NumA); + size_t img_dtype = enable_fp16_ ? 
CL_HALF_FLOAT : CL_FLOAT; + size_t dtype_size = enable_fp16_ ? sizeof(CL_HALF_FLOAT) : sizeof(CL_FLOAT); + img_size.push_back(img_dtype); + auto allocator = ocl_runtime_->GetAllocator(); + size_t memA = NumA * NumA; + + size_t memB = NumB * NumB * dtype_size; + for (int depth = 0; depth < MAXDEPTH; depth++) { + B_temp[depth] = allocator->Malloc(memB); + A_temp[depth] = allocator->Malloc(memA, img_size); + + M1[depth] = allocator->Malloc(memA, img_size); + M2[depth] = allocator->Malloc(memA, img_size); + M3[depth] = allocator->Malloc(memA, img_size); + M4[depth] = allocator->Malloc(memA, img_size); + M5[depth] = allocator->Malloc(memA, img_size); + M6[depth] = allocator->Malloc(memA, img_size); + M7[depth] = allocator->Malloc(memA, img_size); + } +} + +int StrassenOpenCLKernel::InitWeights() { + // ABMCI @ ABCICO = ABMCO + auto ret = DequantWeight(); + if (ret != RET_OK) { + return ret; + } + auto allocator = ocl_runtime_->GetAllocator(); + int NumA = in_tensors_[0]->shape()[0]; + int NumB = in_tensors_[1]->shape()[0]; + size_t dtype_size = enable_fp16_ ? 
sizeof(uint16_t) : sizeof(float); + padWeight_ = allocator->Malloc(NumA * NumB * dtype_size); + padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true); + auto padWeightFp32 = reinterpret_cast(padWeight_); + auto padWeightFp16 = reinterpret_cast(padWeight_); + memset(padWeight_, 0x00, NumA * NumB * dtype_size); + auto originWeightFp32 = reinterpret_cast(in_tensors_.at(kWeightIndex)->data_c()); + auto originWeightFp16 = reinterpret_cast(in_tensors_.at(kWeightIndex)->data_c()); + bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16; + AllocatorMemoryForStrassen(NumA / 2, NumB / 2); + size_t size = NumA * NumB * dtype_size; + if (isModelFp16) { + if (enable_fp16_) { + memcpy(padWeightFp16, originWeightFp16, size); + } else { + for (int i = 0; i < NumA * NumB; ++i) { + padWeightFp32[i] = static_cast(originWeightFp16[i]); + } + } + } else { + if (enable_fp16_) { + for (int i = 0; i < NumA * NumB; ++i) { + padWeightFp16[i] = static_cast(originWeightFp32[i]); + } + } else { + memcpy(padWeightFp32, originWeightFp32, size); + } + } + allocator->UnmapBuffer(padWeight_); + FreeDequantedWeight(); + return RET_OK; +} + +void AlignStrassenGlobalLocal(const std::vector &global, const std::vector &local, + cl::NDRange *global_range, cl::NDRange *local_range) { + *local_range = cl::NDRange(local[0], local[1], local[2]); + *global_range = + cl::NDRange(UP_ROUND(global[0], local[0]), UP_ROUND(global[1], local[1]), UP_ROUND(global[2], local[2])); +} + +// 0 : global_size_, 1: global_size_add_sub +void StrassenOpenCLKernel::StrassenSetGlobalLocal(size_t strassen_size, int type_flag) { + size_t strassen_size_C4 = UP_DIV(strassen_size, C4NUM); + local_size_add_sub = {16, 1, 16}; + if (type_flag == 0) { + global_size_ = {strassen_size_C4, 1, strassen_size}; + AlignGlobalLocal(global_size_, local_size_); + } else { + global_size_add_sub = {strassen_size_C4, 1, strassen_size}; + AlignStrassenGlobalLocal(global_size_add_sub, 
local_size_add_sub, &global_add_sub_, &local_add_sub_); + } +} + +void StrassenOpenCLKernel::SetGlobalLocal() { + // local size should less than MAX_GROUP_SIZE + local_size_ = {32, 4, 1}; + global_size_ = {1, 1, 1}; + size_t strassen_size = outShape[3] / 2; + StrassenSetGlobalLocal(strassen_size, 0); // set global_ and local + StrassenSetGlobalLocal(strassen_size, 1); // set global_size_add_sub + StrassenSetGlobalLocal(strassen_size, 2); // set global_size_weights +} + +void StrassenOpenCLKernel::StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size, + bool is_matmul_kernel) { + cl_int4 shape; + if (is_matmul_kernel) { + shape = {1, 1, strassen_size, strassen_size}; + } else { + shape = {strassen_size, 1, 1, UP_DIV(strassen_size, C4NUM)}; + } + ocl_runtime_->SetKernelArg(*kernel, index, shape); +} + +void StrassenOpenCLKernel::SetConstArgs() { + int arg_count = 2; + cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]}; + cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]}; + cl_int4 shape_offset = {0, 0, 0, 0}; + int strassen_size = inShape[3] / 2; + out_shape.s[2] = in_shape.s[2] = in_shape.s[2] / 2; + out_shape.s[3] = in_shape.s[3] = in_shape.s[3] / 2; + StrassenSetConstArgs(&kernel_IMG_add_sub_2, 3, strassen_size, false); + StrassenSetConstArgs(&kernel_BUF_add_sub_2, 2, strassen_size, false); + ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape); + ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape); + ocl_runtime_->SetKernelArg(kernel_, arg_count++, shape_offset); +} + +// OriginSize = N*H*W*C typesize = sizeof(type data) width = W * UP_DIV(C,C4NUM) size = N +void StrassenOpenCLKernel::PrintImage2d(void *IMGData, size_t typesize, size_t width, size_t size) { + auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); + int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment(); + auto runtime = runtime_wrapper.GetInstance(); + runtime->SyncCommandQueue(); + MS_ASSERT(alignment); 
+ size_t row_pitch = UP_ROUND(width, alignment) * typesize * C4NUM; + size_t OriginSize = size * size * typesize; + std::vector data(OriginSize); + auto row_size = width * typesize * C4NUM; + + for (int i = 0; i < size; ++i) { + memcpy(reinterpret_cast(data.data()) + i * row_size, static_cast(IMGData) + i * row_pitch, + row_size); + } + for (int i = 0; i < size * size; ++i) { + if ((i + 1) % size == 0) { + std::cout << std::endl; + } + } +} + +void StrassenOpenCLKernel::StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size, + cl_int2 offset, lite::opencl::MemType mem_type) { + if (input == nullptr || output == nullptr) { + MS_LOG(ERROR) << "StrassenDataFilled input or output can not nullptr"; + return; + } + if (mem_type == lite::opencl::MemType::IMG) { + ocl_runtime_->SetKernelArg(*kernel, 0, input); + ocl_runtime_->SetKernelArg(*kernel, 1, output); + } else { + ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF); + ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF); + } + StrassenSetConstArgs(kernel, 2, size, false); + ocl_runtime_->SetKernelArg(*kernel, 3, offset); + ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_); +} + +void StrassenOpenCLKernel::StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset, + int flag, lite::opencl::MemType mem_type) { + if (input == nullptr || output == nullptr) { + MS_LOG(ERROR) << "StrassenAddSub input or output can not nullptr"; + return; + } + if (mem_type == lite::opencl::MemType::IMG) { + ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::IMG); + ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::IMG); + } else { + ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF); + ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF); + } + StrassenSetConstArgs(kernel, 2, size, false); + 
ocl_runtime_->SetKernelArg(*kernel, 3, offset); + ocl_runtime_->SetKernelArg(*kernel, 4, flag); + ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_); +} + +void StrassenOpenCLKernel::StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3, + void *input4, void *input5, void *input6, void *input7, void *output, + const int size) { + if (input1 == nullptr || input2 == nullptr || input3 == nullptr || input4 == nullptr || input5 == nullptr || + input6 == nullptr || input7 == nullptr || output == nullptr) { + MS_LOG(ERROR) << "StrassenBackResult input or output can not nullptr"; + return; + } + ocl_runtime_->SetKernelArg(*kernel, 0, input1); + ocl_runtime_->SetKernelArg(*kernel, 1, input2); + ocl_runtime_->SetKernelArg(*kernel, 2, input3); + ocl_runtime_->SetKernelArg(*kernel, 3, input4); + ocl_runtime_->SetKernelArg(*kernel, 4, input5); + ocl_runtime_->SetKernelArg(*kernel, 5, input6); + ocl_runtime_->SetKernelArg(*kernel, 6, input7); + ocl_runtime_->SetKernelArg(*kernel, 7, output); + StrassenSetConstArgs(kernel, 8, size, false); + ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_); +} + +void StrassenOpenCLKernel::StrassenRunMmatmul(void *input, void *weight, void *output, const int size) { + if (input == nullptr || weight == nullptr || output == nullptr) { + MS_LOG(ERROR) << "StrassenRunMmatmul input ,weight or output can not nullptr"; + return; + } + ocl_runtime_->SetKernelArg(kernel_, 0, input); + ocl_runtime_->SetKernelArg(kernel_, 1, output); + ocl_runtime_->SetKernelArg(kernel_, 2, weight, lite::opencl::MemType::BUF); + StrassenSetConstArgs(&kernel_, 3, size, true); + StrassenSetConstArgs(&kernel_, 4, size, true); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); +} + +void StrassenOpenCLKernel::DoStrassen(void *data, void *weight, void *result, const int size, const int depth, + const int threshold) { + const int size_2 = size / 2; + int 
C4 = UP_DIV(size_2, C4NUM); + if (size <= threshold) { + // run matmul; + StrassenSetGlobalLocal(size, 0); + StrassenRunMmatmul(data, weight, result, size); + return; + } + // flag = 0 : add otherwise flag = 1 : sub + // M1 = A11 * ( B12- B22) + StrassenSetGlobalLocal(size_2, 1); + StrassenDataFilled(&MatMul_StrassenIMGFilled, data, A_temp[depth + 1], size_2, {0, 0}, lite::opencl::MemType::IMG); + StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {0, C4, size_2, C4}, 1, + lite::opencl::MemType::BUF); + DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M1[depth + 1], size_2, depth + 1, threshold); + + // M2 = (A11 + A12) * B22 + StrassenSetGlobalLocal(size_2, 1); + StrassenDataFilled(&MatMul_StrassenBUFFilled, weight, B_temp[depth + 1], size_2, {size_2, C4}, + lite::opencl::MemType::BUF); + StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, 0, 0, C4}, 0, lite::opencl::MemType::IMG); + DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M2[depth + 1], size_2, depth + 1, threshold); + + // M3 = (A21 + A22) * B11 + StrassenSetGlobalLocal(size_2, 1); + StrassenDataFilled(&MatMul_StrassenBUFFilled, weight, B_temp[depth + 1], size_2, {0, 0}, lite::opencl::MemType::BUF); + StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {size_2, 0, size_2, C4}, 0, + lite::opencl::MemType::IMG); + DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M3[depth + 1], size_2, depth + 1, threshold); + + // M4 = A22 * (B21 - B11) + StrassenSetGlobalLocal(size_2, 1); + StrassenDataFilled(&MatMul_StrassenIMGFilled, data, A_temp[depth + 1], size_2, {size_2, C4}, + lite::opencl::MemType::IMG); + StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {size_2, 0, 0, 0}, 1, + lite::opencl::MemType::BUF); + DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M4[depth + 1], size_2, depth + 1, threshold); + + // M5 = (A11 + A22) * (B11 + B22) + StrassenSetGlobalLocal(size_2, 1); + StrassenAddSub(&kernel_IMG_add_sub_2, data, 
A_temp[depth + 1], size_2, {0, 0, size_2, C4}, 0, + lite::opencl::MemType::IMG); + // (B11 + B22) + StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {0, 0, size_2, C4}, 0, + lite::opencl::MemType::BUF); + DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M5[depth + 1], size_2, depth + 1, threshold); + + // M6 = (A12 - A22) * (B21 + B22) + StrassenSetGlobalLocal(size_2, 1); + StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, C4, size_2, C4}, 1, + lite::opencl::MemType::IMG); + StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {size_2, 0, size_2, C4}, 0, + lite::opencl::MemType::BUF); + DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M6[depth + 1], size_2, depth + 1, threshold); + + // M7 = (A11 - A21) * (B11 + B12) + StrassenSetGlobalLocal(size_2, 1); + StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, 0, size_2, 0}, 1, + lite::opencl::MemType::IMG); + StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {0, 0, 0, C4}, 0, + lite::opencl::MemType::BUF); + DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M7[depth + 1], size_2, depth + 1, threshold); + + // BackResult + StrassenSetGlobalLocal(size_2, 1); + StrassenBackResult(&kernel_back_result, M1[depth + 1], M2[depth + 1], M3[depth + 1], M4[depth + 1], M5[depth + 1], + M6[depth + 1], M7[depth + 1], result, size_2); +} + +int StrassenOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " Running!"; + int threshold = 0; + const int up_bound = 1024; + const int down_bound = 256; + if (in_tensors_.at(0)->shape()[0] >= up_bound) { + threshold = UP_DIV(in_tensors_.at(0)->shape()[0], C4NUM) / 2; + } else if (in_tensors_.at(0)->shape()[0] <= down_bound) { + threshold = in_tensors_.at(0)->shape()[0]; + } else { + threshold = UP_DIV(in_tensors_.at(0)->shape()[0], C4NUM); + } + DoStrassen(in_tensors_.at(0)->data_c(), padWeight_, out_tensors_.at(0)->data_c(), in_tensors_.at(0)->shape()[0], 0, + threshold); + 
return mindspore::lite::RET_OK; +} +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.h new file mode 100644 index 0000000000..3210848d85 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.h @@ -0,0 +1,77 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STRASSEN_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STRASSEN_H_ + +#include +#include +#include "src/runtime/kernel/opencl/kernel/matmul.h" + +namespace mindspore::kernel { + +class StrassenOpenCLKernel : public MatMulOpenCLKernel { + public: + StrassenOpenCLKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs) + : MatMulOpenCLKernel(parameter, inputs, outputs) {} + ~StrassenOpenCLKernel() override = default; + + public: + int Run() override; + int Prepare() override; + int InitWeights() override; + void SetConstArgs() override; + void SetGlobalLocal() override; + + // strassen + private: + void AllocatorMemoryForStrassen(int NumA, int NumB); + void DoStrassen(void *data, void *weight, void *result, const int size, const int depth, const int threshold); + void StrassenSetGlobalLocal(size_t strassen_size, int type_flag); + void StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size, bool 
 is_matmul_kernel); + void StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size, cl_int2 offset, + lite::opencl::MemType mem_type); + void StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset, int flag, + lite::opencl::MemType mem_type); + void StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3, void *input4, void *input5, + void *input6, void *input7, void *output, const int size); + void StrassenRunMmatmul(void *input, void *weight, void *output, const int size); + void PrintImage2d(void *IMGData, size_t typesize, size_t width, size_t size); + cl::Kernel kernel_IMG_add_sub_2; + cl::Kernel MatMul_StrassenBUFFilled; + cl::Kernel MatMul_StrassenIMGFilled; + cl::Kernel kernel_BUF_add_sub_2; + cl::Kernel kernel_back_result; + cl::NDRange global_add_sub_, local_add_sub_; + std::vector global_size_add_sub; + std::vector local_size_add_sub; + // image 2d + void *A_temp[MAXDEPTH] = {nullptr}; + void *M1[MAXDEPTH] = {nullptr}; + void *M2[MAXDEPTH] = {nullptr}; + void *M3[MAXDEPTH] = {nullptr}; + void *M4[MAXDEPTH] = {nullptr}; + void *M5[MAXDEPTH] = {nullptr}; + void *M6[MAXDEPTH] = {nullptr}; + void *M7[MAXDEPTH] = {nullptr}; + // buffer + void *B_temp[MAXDEPTH] = {nullptr}; +}; + +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STRASSEN_H_ diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.cc index 484175ac6f..fd554ebc11 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.cc @@ -26,7 +26,156 @@ using mindspore::lite::KernelRegistry; using mindspore::schema::Format::Format_NHWC; namespace mindspore::lite::opencl::test { +// multi-output +void TestMain(const std::vector &input_infos, const std::vector &output_info, + OpParameter *op_parameter, bool fp16_enable, float atol,
float rtol, bool print_data) { + std::vector input_infos_new; + auto transform_fun = [](ArgsTuple in) -> ArgsTupleWithDtype { + return ArgsTupleWithDtype(std::get<0>(in), std::get<1>(in), std::get<2>(in), kNumberTypeFloat32); + }; + std::transform(input_infos.begin(), input_infos.end(), std::back_inserter(input_infos_new), transform_fun); + TestMain(input_infos_new, output_info, op_parameter, fp16_enable, atol, rtol, print_data); +} + +void TestMain(const std::vector &input_infos, const std::vector &output_info, + OpParameter *op_parameter, bool fp16_enable, float atol, float rtol, bool print_data) { + auto primitive_type = static_cast(op_parameter->type_); + static std::set packed_op = { + schema::PrimitiveType_Conv2D, schema::PrimitiveType_DeConv2D, schema::PrimitiveType_DepthwiseConv2D, + schema::PrimitiveType_DeDepthwiseConv2D, schema::PrimitiveType_MatMul}; + + // simulating benchmark: session::LiteSession::CreateSession() -> session->Init() + MS_LOG(DEBUG) << "initialize OpenCLRuntime and OpenCLAllocator"; + auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); + auto ocl_runtime = runtime_wrapper.GetInstance(); + ocl_runtime->SetFp16Enable(fp16_enable); + EXPECT_TRUE(ocl_runtime->Init() == RET_OK); + + // simulating benchmark: session_->CompileGraph() -> ConvertTensors() + MS_LOG(DEBUG) << "create Tensors & init weight data"; + std::vector> in_tensors; + std::vector> out_tensors; + // firstly, create all Tensors + in_tensors.reserve(input_infos.size()); // vector's capacity() is 0, so call reserve() avoiding vector re-malloc + for (auto input_info : input_infos) { + auto &shape = std::get<0>(input_info); + auto category = std::get<2>(input_info); + auto data_type = std::get<3>(input_info); + in_tensors.emplace_back(std::make_shared(data_type, shape, Format_NHWC, category)); + } + for (auto out_info : output_info) { + const std::vector &output_shape = std::get<0>(out_info); + out_tensors.emplace_back(std::make_shared(kNumberTypeFloat32, 
output_shape, Format_NHWC, VAR)); + } + // secondly, init weight Tensor's data + std::vector kernel_inputs; + std::vector subgraph_inputs; + std::vector outputs; + std::map subgraph_inputs_data; + for (int i = 0; i < in_tensors.size(); ++i) { + auto tensor = in_tensors[i]; + auto *input_data = std::get<1>(input_infos[i]); + kernel_inputs.push_back(tensor.get()); + if (tensor->category() != VAR) { // tensor is weight + // simulating src/lite_session.cc:WeightTensorNeedCopy() + if (packed_op.count(primitive_type)) { + tensor->set_data(input_data); + } else { + memcpy(tensor->MutableData(), input_data, tensor->Size()); + } + } else { + EXPECT_TRUE(tensor->data_type() == kNumberTypeFloat32 || tensor->data_type() == kNumberTypeInt32); + subgraph_inputs.push_back(tensor.get()); + subgraph_inputs_data[tensor.get()] = reinterpret_cast(input_data); + } + } + for (int i = 0; i < out_tensors.size(); ++i) { + auto out_tensor = out_tensors[i]; + outputs.push_back(out_tensor.get()); + } + + // simulating benchmark: session_->CompileGraph() -> scheduler.Schedule() -> BuildKernels() + MS_LOG(DEBUG) << "create OpenCLKernel"; + kernel::KernelKey key{kernel::kGPU, kernel_inputs.front()->data_type(), primitive_type}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + if (creator == nullptr) { + std::cerr << "can't get registry function for: " << schema::EnumNamePrimitiveType(primitive_type) + << ". Maybe you forget setting op_parameter_.type_ for OpParameter." 
<< std::endl; + free(op_parameter); + FAIL(); + } + auto *kernel = creator(kernel_inputs, outputs, op_parameter, nullptr, key, nullptr); + if (kernel == nullptr) { + std::cerr << "call registry function error: " << schema::EnumNamePrimitiveType(primitive_type) << std::endl; + free(op_parameter); + FAIL(); + } + kernel->set_name(schema::EnumNamesPrimitiveType()[primitive_type]); + + // simulating benchmark: session_->CompileGraph() -> scheduler.Schedule() -> ConstructSubGraphs() + MS_LOG(DEBUG) << "create SubGraph"; + std::vector kernels{kernel}; + auto sub_graph = new (std::nothrow) OpenCLSubGraph(subgraph_inputs, outputs, kernels, kernels, kernels); + if (sub_graph == nullptr) { + return; + } + + // call sub_graph->Init() after construct subgraph like scheduler.cc + MS_LOG(DEBUG) << "call sub_graph->Init()"; + EXPECT_TRUE(sub_graph->Init() == RET_OK); + + // simulating benchmark: session_->CompileGraph() -> PrepareKernels() -> OpenCLSubGraph.Prepare() + MS_LOG(DEBUG) << "call sub_graph->Prepare()"; + EXPECT_TRUE(sub_graph->Prepare() == RET_OK); // will set Tensor's allocator be OpenCLAllocator + + // simulating benchmark: model->Free(), clear weight data in input_infos + std::vector> saved_weights; + for (int i = 0; i < in_tensors.size(); ++i) { + auto &tensor = in_tensors[i]; + if (tensor->category() != VAR) { + saved_weights.emplace_back(new uint8_t[tensor->Size()]); + auto *weight_data = std::get<1>(input_infos[i]); + memcpy(saved_weights.back().get(), weight_data, tensor->Size()); + srand(time(nullptr)); + memset(weight_data, rand(), tensor->Size()); + } + } + + // simulating benchmark: LoadInput() + MS_LOG(DEBUG) << "malloc and init input data"; + for (auto input : subgraph_inputs) { + EXPECT_TRUE(input->MutableData() != nullptr); // malloc Image2D & call MapBuffer() + memcpy(input->data_c(), subgraph_inputs_data[input], input->Size()); + } + + // simulating benchmark: MarkAccuracy() -> session_->RunGraph() -> executor_->Run() -> OpenCLSubGraph->Run() + 
MS_LOG(DEBUG) << "run SubGraph & compare result"; + EXPECT_TRUE(sub_graph->Run() == RET_OK); // will call UnmapBuffer() for input + + for (int i = 0; i < outputs.size(); ++i) { + ocl_runtime->GetAllocator()->MapBuffer(outputs[i]->data_c(), CL_MAP_READ, nullptr, true); + float *expect_data = reinterpret_cast(std::get<1>(output_info[i])); + CompareOutput(outputs[i]->data_c(), expect_data, outputs[i]->ElementsNum(), atol, rtol, print_data); + ocl_runtime->GetAllocator()->UnmapBuffer(outputs[i]->data_c()); + } + + MS_LOG(DEBUG) << "release resources"; + for (auto &tensor : in_tensors) { + if (tensor->category() != VAR && packed_op.count(primitive_type)) { + tensor->set_data(nullptr); + } + } + for (int i = 0, j = 0; i < in_tensors.size(); ++i) { // resume weight data to input_infos + auto &tensor = in_tensors[i]; + if (tensor->category() != VAR) { + auto *weight_data = std::get<1>(input_infos[i]); + memcpy(weight_data, saved_weights[j++].get(), tensor->Size()); + } + } + delete sub_graph; +} +// single-output void TestMain(const std::vector &input_infos, std::tuple, float *> output_info, OpParameter *op_parameter, bool fp16_enable, float atol, float rtol, bool print_data) { auto primitive_type = static_cast(op_parameter->type_); diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.h b/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.h index 75cc0186d3..58c8ab6a1b 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.h +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.h @@ -31,6 +31,7 @@ using Tensor = mindspore::lite::Tensor; using ArgsTuple = std::tuple, void *, Tensor::Category>; +using ArgsTupleOut = std::tuple, void *>; using ArgsTupleWithDtype = std::tuple, void *, Tensor::Category, mindspore::TypeId>; constexpr Tensor::Category VAR = Tensor::VAR; constexpr Tensor::Category CONST_TENSOR = Tensor::Category::CONST_TENSOR; @@ -89,10 +90,17 @@ T *CreateParameter(schema::PrimitiveType type) { return param; } -void 
TestMain(const std::vector &input_infos, std::tuple, float *> output_info, +void TestMain(const std::vector &input_infos, const std::vector &output_info, + OpParameter *op_parameter, bool fp16_enable = false, float atol = 1e-9, float rtol = 1e-9, + bool print_output = false); + +void TestMain(const std::vector &input_infos, const std::vector &output_info, OpParameter *op_parameter, bool fp16_enable = false, float atol = 1e-9, float rtol = 1e-9, bool print_output = false); +void TestMain(const std::vector &input_infos, std::tuple, float *> output_info, + OpParameter *op_parameter, bool fp16_enable = false, float atol = 1e-9, float rtol = 1e-9, + bool print_output = false); void TestMain(const std::vector &input_infos, std::tuple, float *> output_info, OpParameter *op_parameter, bool fp16_enable = false, float atol = 1e-9, float rtol = 1e-9, bool print_output = false); diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc index 4efdbc98b0..37536eb38a 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc @@ -44,6 +44,21 @@ TEST_F(TestOpenCL_Concat, input2_axis0) { } } +TEST_F(TestOpenCL_Concat, input2_axis0_shape1) { + std::vector input0_shape = {1}; + std::vector input1_shape = {1}; + std::vector output_shape = {2}; + int axis = 0; + float input0_data[] = {0.75}; + float input1_data[] = {0.5}; + float output_data[] = {0.75, 0.5}; + for (auto fp16_enable : {false}) { + auto *param = CreateParameter(axis); + TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, VAR}}, {output_shape, output_data}, param, + fp16_enable, fp16_enable ? 
1e-3 : 1e-9); + } +} + TEST_F(TestOpenCL_Concat, input2_axis1_Align) { std::vector input0_shape = {2, 2, 2, 8}; std::vector input1_shape = {2, 2, 2, 8}; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/gather_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/gather_tests.cc index ba04555630..258205be5e 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/gather_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/gather_tests.cc @@ -32,13 +32,30 @@ OpParameter *CreateParameter(int axis) { TEST_F(TestOpenCL_Gather, Axis0) { int axis = 0; std::vector input_shape = {10}; - std::vector indices_shape = {2}; - std::vector output_shape = {2}; + std::vector indices_shape = {5}; + std::vector output_shape = {5}; float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - int32_t indices[] = {1, 3}; - float output_data[] = {1, 3}; + int32_t indices[] = {1, 3, 5, 7, 9}; + float output_data[] = {1, 3, 5, 7, 9}; - for (auto fp16_enable : {false, true}) { + for (auto fp16_enable : {false}) { + auto *param = CreateParameter(axis); + TestMain( + {{input_shape, input_data, VAR, kNumberTypeFloat32}, {indices_shape, indices, CONST_TENSOR, kNumberTypeInt32}}, + {output_shape, output_data}, param, fp16_enable); + } +} + +TEST_F(TestOpenCL_Gather, Axis0ConstTensor) { + int axis = 0; + std::vector input_shape = {10}; + std::vector indices_shape = {1}; + std::vector output_shape = {1}; + float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + int32_t indices[] = {1}; + float output_data[] = {1}; + + for (auto fp16_enable : {false}) { auto *param = CreateParameter(axis); TestMain( {{input_shape, input_data, VAR, kNumberTypeFloat32}, {indices_shape, indices, CONST_TENSOR, kNumberTypeInt32}}, @@ -49,11 +66,11 @@ TEST_F(TestOpenCL_Gather, Axis0) { TEST_F(TestOpenCL_Gather, Axis0_Tensor) { int axis = 0; std::vector input_shape = {10}; - std::vector indices_shape = {2}; - std::vector output_shape = {2}; + std::vector indices_shape = {1}; + std::vector 
output_shape = {1}; float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - int32_t indices[] = {1, 3}; - float output_data[] = {1, 3}; + int32_t indices[] = {1}; + float output_data[] = {1}; for (auto fp16_enable : {false}) { auto *param = CreateParameter(axis); diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc index 208f9d8190..cbfdc04f5e 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc @@ -45,7 +45,7 @@ TEST_F(TestOpenCL_MatMul, 2Dfile) { auto output_data = reinterpret_cast(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); for (auto fp16_enable : {false}) { - auto *param = CreateParameter(); + auto *param = CreateParameter(false, false); TestMain({{input_shape, input_data, VAR}, {weight_shape, weight_data, CONST_TENSOR}}, {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-3 : 1e-3); } diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/split_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/split_tests.cc new file mode 100644 index 0000000000..51ce3e0e1d --- /dev/null +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/split_tests.cc @@ -0,0 +1,57 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "ut/src/runtime/kernel/opencl/common.h" +#include "nnacl/split_parameter.h" + +namespace mindspore::lite::opencl::test { + +class TestOpenCL_Split : public CommonTest {}; + +namespace { +// PrimitiveType_Split: src/ops/populate/split_populate.cc +OpParameter *CreateParameter(int split_dim_, int num_split_, std::vector split_sizes_) { + auto *param = test::CreateParameter(schema::PrimitiveType_Split); + param->split_dim_ = split_dim_; + param->num_split_ = num_split_; + param->split_sizes_ = reinterpret_cast(malloc(param->num_split_ * sizeof(int))); + for (int i = 0; i < param->num_split_; ++i) { + param->split_sizes_[i] = split_sizes_[i]; + } + return reinterpret_cast(param); +} +} // namespace + +TEST_F(TestOpenCL_Split, input2_axis3) { + std::vector input_shape = {2, 2, 2, 12}; + std::vector output_shape1 = {2, 2, 2, 6}; + std::vector output_shape2 = {2, 2, 2, 6}; + int split_dim_ = 3; + int num_split_ = 2; // len of split_sizes_ + std::vector split_sizes_{6, 6}; + size_t input_size, output1_size, output2_size; + std::string inputPath = "./test_data/splitfp32_input.bin"; + std::string output1Path = "./test_data/splitfp32_output1.bin"; + std::string output2Path = "./test_data/splitfp32_output2.bin"; + auto input_data = reinterpret_cast(mindspore::lite::ReadFile(inputPath.c_str(), &input_size)); + auto output_data1 = reinterpret_cast(mindspore::lite::ReadFile(output1Path.c_str(), &output1_size)); + auto output_data2 = reinterpret_cast(mindspore::lite::ReadFile(output2Path.c_str(), &output2_size)); + for (auto fp16_enable : {false}) { + auto *param = CreateParameter(split_dim_, num_split_, split_sizes_); + TestMain({{input_shape, input_data, VAR}}, {{output_shape1, output_data1}, {output_shape2, output_data2}}, param, + fp16_enable, fp16_enable ? 
1e-3 : 1e-9); + } +} +} // namespace mindspore::lite::opencl::test diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/split/splitfp32_input.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/split/splitfp32_input.bin new file mode 100644 index 0000000000..46ea373338 Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/split/splitfp32_input.bin differ diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/split/splitfp32_output1.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/split/splitfp32_output1.bin new file mode 100644 index 0000000000..cc47aff50a Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/split/splitfp32_output1.bin differ diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/split/splitfp32_output2.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/split/splitfp32_output2.bin new file mode 100644 index 0000000000..70866cda49 Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/split/splitfp32_output2.bin differ