!4169 Add OpenCL benchmark and fix ops bugs

Merge pull request !4169 from wandongdong/master
pull/4169/MERGE
mindspore-ci-bot 5 years ago committed by Gitee
commit 2fbff42b2c

@ -1,4 +1,4 @@
__kernel void AvgPooling2d(__global float4 *input, __global float4 *output, const int4 input_shape,
__kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape,
const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(0);
@ -31,7 +31,7 @@ __kernel void AvgPooling2d(__global float4 *input, __global float4 *output, cons
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void AvgPooling2dImage2d(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
__kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
const int4 output_shape, const int2 stride, const int2 kernel_size,
const int2 padding) {
// axis to dst tensor coordinate

@ -148,8 +148,6 @@ int ArithmeticOpenCLKernel::Run() {
}
runtime_->SetKernelArg(kernel_, arg_idx++, weight_);
runtime_->SetKernelArg(kernel_, arg_idx++, bias_);
MS_LOG(DEBUG) << arg_idx-2 << " " << weight_;
MS_LOG(DEBUG) << arg_idx-1 << " " << bias_;
}
runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data());
int H = outputs_[0]->Batch() * outputs_[0]->Height();

@ -147,21 +147,18 @@ void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *
local->push_back(z);
}
int ConcatOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->Name() << " Running!";
auto param = reinterpret_cast<ConcatParameter *>(this->opParameter);
if (param->axis_ == 0) {
return Run_axis0();
}
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
MS_LOG(INFO) << " judge the numbers of input vector";
auto input0_shape = inputs_[0]->shape();
auto input1_shape = inputs_[1]->shape();
auto input2_shape = inputs_[2]->shape();
auto output_shape = outputs_[0]->shape();
cl_int2 input0_shape2_ = {DivideRoundUp(input0_shape[3], 4), DivideRoundUp(input1_shape[3], 4)}; // change
cl_int3 input0_shape3_ = {DivideRoundUp(input0_shape[3], 4), DivideRoundUp(input1_shape[3], 4),
DivideRoundUp(input2_shape[3], 4)};
cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], DivideRoundUp(output_shape[3], 4)};
uint32_t OH = output_shape[0] * output_shape[1]; // N*H
@ -173,14 +170,15 @@ int ConcatOpenCLKernel::Run() {
int arg_cn = 0;
if (inputs_.size() == 2) {
MS_LOG(INFO) << " SetKernelArg";
ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data());
ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data());
ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[1]->Data());
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input0_shape2_);
ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_);
} else if (inputs_.size() == 3) {
MS_LOG(INFO) << " SetKernelArg";
auto input2_shape = inputs_[2]->shape();
cl_int3 input0_shape3_ = {DivideRoundUp(input0_shape[3], 4), DivideRoundUp(input1_shape[3], 4),
DivideRoundUp(input2_shape[3], 4)};
ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data());
ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data());
ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[1]->Data());

@ -228,6 +228,9 @@ static int GetBiggestDivider(int x, int y) {
int ConvolutionOpenCLKernel::GetGlobalLocal(std::vector<size_t> *global, std::vector<size_t> *local) {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
auto param = reinterpret_cast<ConvParameter *>(opParameter);
param->output_h_ = outputs_[0]->Height();
param->output_w_ = outputs_[0]->Width();
param->output_channel_ = outputs_[0]->Channel();
constexpr size_t work_group_size[] = {4, 4, 1};
auto max_work_item_sizes = ocl_runtime->GetWorkItemSize();
@ -287,7 +290,7 @@ int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_s
}
int ConvolutionOpenCLKernel::Run() {
std::cout << "ConvolutionOpenCLKernel::Run()\n";
MS_LOG(DEBUG) << this->Name() << " Running!";
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
int arg_cn = 0;

@ -24,6 +24,7 @@ SubGraphOpenCLKernel::~SubGraphOpenCLKernel() { UnInit(); }
int SubGraphOpenCLKernel::Init() {
allocator_ = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
MS_LOG(DEBUG) << "input num=" << inputs_.size() << ", output num=" << outputs_.size();
for (const auto tensor : inputs_) {
tensor->set_allocator(allocator_);
}
@ -38,8 +39,7 @@ int SubGraphOpenCLKernel::Init() {
data = allocator_->MapBuffer(data, CL_MAP_WRITE, nullptr, true);
tensor->SetData(data);
} else {
MS_LOG(ERROR) << "OpenCL kernel must use GPU buffer pointer, "
<< "please make sure that this buffer allocate by OpenCLAllocator!";
MS_LOG(ERROR) << "SubGraphOpenCLKernel input nullptr!";
}
}
return 0;

@ -109,7 +109,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t>& img_size)
auto mem_buf = iter->second;
bool is_match{mem_buf->img_size.size() == img_size.size()};
for (int i = 0; i < img_size.size() && is_match; ++i) {
is_match = img_size[i] == mem_buf->img_size[i];
is_match &= img_size[i] == mem_buf->img_size[i];
}
if (is_match) {
free_list_.erase(iter);
@ -166,7 +166,7 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v
auto mem_buf = iter->second;
bool is_match{mem_buf->img_size.size() == img_size.size()};
for (int i = 0; i < img_size.size() && is_match; ++i) {
is_match = img_size[i] == mem_buf->img_size[i];
is_match &= img_size[i] == mem_buf->img_size[i];
}
if (is_match) {
free_list_.erase(iter);

@ -168,6 +168,7 @@ int OpenCLExecutor::TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema
int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema::Format src_format,
schema::Format dst_format) {
if (dst_format == schema::Format_NHWC4) {
tensor->SetFormat(schema::Format_NHWC4);
// convert to nhwc4
auto *src_data = tensor->Data();
auto *dst_data{src_data};
@ -190,7 +191,6 @@ int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema:
dst_data = allocator_->CreateImageFromHost(src_data, tensor->Size(), img_size);
tensor->SetData(dst_data);
allocator_->Free(src_data);
tensor->SetFormat(schema::Format_NHWC4);
return RET_OK;
} else {
MS_LOG(ERROR) << "Unsupport layout transform: " << schema::EnumNameFormat(tensor->GetFormat()) << " to "

@ -142,10 +142,26 @@ kernel::LiteKernel *Scheduler::CreateSubKernel(const std::vector<kernel::LiteKer
kernel::LiteKernel *sub_kernel = nullptr;
#if SUPPORT_GPU
if (arch == kernel::KERNEL_ARCH::kGPU) {
std::vector<tensor::Tensor *> input_tensors = kernel::LiteKernelUtil::SubgraphInputTensors(kernels);
std::vector<tensor::Tensor *> output_tensors = kernel::LiteKernelUtil::SubgraphOutputTensors(kernels);
std::vector<kernel::LiteKernel *> input_kernels = kernel::LiteKernelUtil::SubgraphInputKernels(kernels);
std::vector<kernel::LiteKernel *> output_kernels = kernel::LiteKernelUtil::SubgraphOutputKernels(kernels);
auto head_kernel = kernels.front();
auto tail_kernel = kernels.back();
std::vector<kernel::LiteKernel *> input_kernels{head_kernel};
std::vector<kernel::LiteKernel *> output_kernels{tail_kernel};
std::vector<tensor::Tensor *> input_tensors;
std::vector<tensor::Tensor *> output_tensors;
for (auto tensor : head_kernel->GetInputs()) {
if (tensor->Data() == nullptr) {
input_tensors.emplace_back(tensor);
}
}
for (auto tensor : tail_kernel->GetInputs()) {
if (tensor->Data() == nullptr) {
output_tensors.emplace_back(tensor);
}
}
// std::vector<tensor::Tensor *> input_tensors = kernel::LiteKernelUtil::SubgraphInputTensors(kernels);
// std::vector<tensor::Tensor *> output_tensors = kernel::LiteKernelUtil::SubgraphOutputTensors(kernels);
// std::vector<kernel::LiteKernel *> input_kernels = kernel::LiteKernelUtil::SubgraphInputKernels(kernels);
// std::vector<kernel::LiteKernel *> output_kernels = kernel::LiteKernelUtil::SubgraphOutputKernels(kernels);
sub_kernel =
new kernel::SubGraphOpenCLKernel(input_tensors, output_tensors, input_kernels, output_kernels, kernels);
sub_kernel->Init();

Loading…
Cancel
Save