fix opencl depthwise1x1

pull/9215/head
wandongdong 4 years ago
parent ff2c44c935
commit 15546ddea8

@ -182,37 +182,55 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_b221(__write_only image2d_t dst_data, __
WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y + 1), r[3]);
}
}
__kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__write_only image2d_t dst_data, __read_only image2d_t src_data,
__global FLT4 *filter, __global FLT4 *bias, int2 kernel_size, int2 stride,
int2 padding, int2 dilation, int4 src_size, int4 dst_size,
float relu_clip_min, float relu_clip_max) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2) * 2;
__kernel void DepthwiseConv2d_IMG_NHWC4_1x1_b221(__write_only image2d_t dst_data, __read_only image2d_t src_data,
__global FLT4 *filter, __global FLT4 *bias, int2 kernel_size,
int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size,
float relu_clip_min, float relu_clip_max) {
int X = get_global_id(1) * 2;
int Y = get_global_id(2) * 2;
int Z = get_global_id(0);
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
FLT4 r[2] = {(FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f)};
FLT4 r[4] = {(FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f),
(FLT4)(0.0f, 0.0f, 0.0f, 0.0f)};
int x_offset = X * stride.x + padding.x;
int y_offset = Y * stride.y + padding.y;
int fx_c = Z;
int f_len = kernel_size.x * kernel_size.y;
int fx_c = Z * f_len;
bool last_x = (get_global_id(1) == (dst_size.x + 1) / 2) && ((dst_size.x & 0x1) == 1);
bool last_y = (get_global_id(2) == (dst_size.y + 1) / 2) && ((dst_size.y & 0x1) == 1);
int y_c = y_offset;
bool outside_y = y_c < 0 || y_c >= src_size.y;
int y_c_a1 = y_c + stride.y;
int x_c = x_offset;
bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) {
FLT4 flt_p0 = filter[fx_c];
FLT4 flt_p1 = filter[fx_c + 1];
FLT4 src_p0 = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c));
FLT4 src_p1 = READ_IMAGE(src_data, smp_zero, (int2)(Z + 1 + x_c * src_size.z, y_c));
r[0] += TO_FLT4(src_p0 * flt_p0);
r[1] += TO_FLT4(src_p1 * flt_p1);
}
int x_c_a1 = x_c + stride.x;
int x_sign = x_c < 0 ? -1 : 1;
int x_a1_sign = x_c_a1 < 0 ? -1 : 1;
FLT4 flt_p0 = filter[fx_c];
FLT4 src_p00_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_sign + x_c * src_size.z, y_c));
r[0] += TO_FLT4(src_p00_c0 * flt_p0);
FLT4 src_p01_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c));
r[1] += TO_FLT4(src_p01_c0 * flt_p0);
FLT4 src_p10_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c_a1));
r[2] += TO_FLT4(src_p10_c0 * flt_p0);
FLT4 src_p11_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c_a1));
r[3] += TO_FLT4(src_p11_c0 * flt_p0);
r[0] += bias[Z];
r[1] += bias[Z];
r[2] += bias[Z];
r[3] += bias[Z];
r[0] = clamp(r[0], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
r[1] += bias[Z + 1];
r[1] = clamp(r[1], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
r[2] = clamp(r[2], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
r[3] = clamp(r[3], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y), r[0]);
if ((dst_size.z & 0x1) == 0) {
WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z + 1, Y), r[1]);
if (!last_x) {
WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y), r[1]);
}
if (!last_y) {
WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y + 1), r[2]);
}
if (!last_y && !last_x) {
WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y + 1), r[3]);
}
}
__kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *dst_data, __global FLT4 *src_data, __global FLT4 *filter,

@ -35,6 +35,35 @@ using mindspore::schema::PrimitiveType_BiasAdd;
namespace mindspore::kernel {
int BiasAddOpenCLKernel::CheckSpecs() {
if (in_tensors_.size() == 0) {
MS_LOG(ERROR) << "Input data size must be greater than 0, but your size is " << in_tensors_.size();
return RET_ERROR;
}
if (in_tensors_[0]->shape()[0] > 1) {
MS_LOG(ERROR) << "Input data size unsupported multi-batch.";
return RET_ERROR;
}
return RET_OK;
}
void BiasAddOpenCLKernel::SetConstArgs() {
int arg_idx = 2;
std::map<schema::Format, int> data_type{
{schema::Format::Format_NC4, 1}, {schema::Format::Format_NHWC4, 2}, {schema::Format::Format_NC4HW4, 3}};
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, BiasAdd_);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, data_type[schema::Format::Format_NHWC4]);
}
void BiasAddOpenCLKernel::SetGlobalLocal() {
cl_int4 global_size = input_shape_;
global_size.s[2] = UP_DIV(global_size.s[3], C4NUM) * global_size.s[2];
std::vector<size_t> local = {1, 1};
std::vector<size_t> global = {static_cast<size_t>(global_size.s[1]), static_cast<size_t>(global_size.s[2])};
OpenCLKernel::AlignGlobalLocal(global, local);
}
int BiasAddOpenCLKernel::InitWeights() {
int C = in_tensors_[1]->shape()[0];
int div_ci = UP_DIV(C, C4NUM);
@ -52,7 +81,7 @@ int BiasAddOpenCLKernel::InitWeights() {
return RET_OK;
}
int BiasAddOpenCLKernel::Init() {
int BiasAddOpenCLKernel::Prepare() {
in_size_ = in_tensors_[0]->shape().size();
out_size_ = out_tensors_[0]->shape().size();
for (int i = 0; i < in_size_; ++i) {
@ -77,67 +106,27 @@ int BiasAddOpenCLKernel::Init() {
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
auto ret = InitWeights();
if (ret != RET_OK) {
return ret;
}
SetGlobalLocal();
SetConstArgs();
MS_LOG(DEBUG) << program_name << " Init Done!";
return mindspore::lite::RET_OK;
}
int BiasAddOpenCLKernel::Run() {
cl_int4 global_size = GetGlobalshape();
MS_LOG(DEBUG) << op_parameter_->name_ << " Running!";
int arg_idx = 0;
std::map<schema::Format, int> data_type{
{schema::Format::Format_NC4, 1}, {schema::Format::Format_NHWC4, 2}, {schema::Format::Format_NC4HW4, 3}};
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, BiasAdd_);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, data_type[schema::Format::Format_NHWC4]);
std::vector<size_t> local = {1, 1};
std::vector<size_t> global = {static_cast<size_t>(global_size.s[1]), static_cast<size_t>(global_size.s[2])};
auto ret = ocl_runtime_->RunKernel(kernel_, global, local);
if (ret != mindspore::lite::RET_OK) {
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Run kernel " << op_parameter_->name_ << " error.";
return mindspore::lite::RET_ERROR;
return RET_ERROR;
}
return mindspore::lite::RET_OK;
}
cl_int4 BiasAddOpenCLKernel::GetGlobalshape() {
cl_int4 global_shape = input_shape_;
global_shape.s[2] = UP_DIV(global_shape.s[3], C4NUM) * global_shape.s[2];
return global_shape;
}
kernel::LiteKernel *OpenCLBiasAddKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
const lite::PrimitiveC *primitive) {
if (inputs.size() == 0) {
MS_LOG(ERROR) << "Input data size must be greater than 0, but your size is " << inputs.size();
free(opParameter);
return nullptr;
}
if (inputs[0]->shape()[0] > 1) {
MS_LOG(ERROR) << "Input data size unsupported multi-batch.";
free(opParameter);
return nullptr;
}
auto *kernel = new (std::nothrow) BiasAddOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << "Kernel " << opParameter->name_ << "is nullptr.";
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != mindspore::lite::RET_OK) {
MS_LOG(ERROR) << "Init BiasAdd kernel failed!";
delete kernel;
return nullptr;
}
return kernel;
return RET_OK;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BiasAdd, OpenCLBiasAddKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BiasAdd, OpenCLBiasAddKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BiasAdd, OpenCLKernelCreator<BiasAddOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BiasAdd, OpenCLKernelCreator<BiasAddOpenCLKernel>)
} // namespace mindspore::kernel

@ -33,13 +33,15 @@ class BiasAddOpenCLKernel : public OpenCLKernel {
: OpenCLKernel(parameter, inputs, outputs) {}
~BiasAddOpenCLKernel() override = default;
int Init() override;
int Run() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
int InitWeights() override;
int Run() override;
private:
cl_int4 GetGlobalshape();
void *BiasAdd_{nullptr};
int in_size_{};
int out_size_{};

@ -131,8 +131,8 @@ int PowerOpenCLKernel::Run() {
cl_float4 parameter = {power_, shift_, scale_, 1};
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, parameter);
}
ocl_runtime_->RunKernel(kernel_, global, local);
AlignGlobalLocal(global, local);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
return RET_OK;
}

@ -132,7 +132,8 @@ int PReluOpenCLKernel::Run() {
std::vector<size_t> local = {4, 4, 1};
std::vector<size_t> global = {static_cast<size_t>(H_), static_cast<size_t>(W_), static_cast<size_t>(CO_SLICES_)};
auto ret = ocl_runtime_->RunKernel(kernel_, global, local);
AlignGlobalLocal(global, local);
auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
if (ret != mindspore::lite::RET_OK) {
MS_LOG(ERROR) << "Run kernel " << op_parameter_->name_ << " error.";
return mindspore::lite::RET_ERROR;

@ -60,6 +60,7 @@ void ScaleOpenCLKernel::Image2dGetWorkGroupSize() {
local_size_ = {16, 16};
auto image2d_info = GpuTensorInfo(out_tensors_[0]);
global_size_ = {image2d_info.width, image2d_info.height};
OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
}
int ScaleOpenCLKernel::InitWeights() {
@ -245,7 +246,7 @@ int ScaleOpenCLKernel::Run() {
}
}
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, param->activation_type_);
ocl_runtime_->RunKernel(kernel_, global_size_, local_size_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
return RET_OK;
}

@ -195,7 +195,6 @@ class OpenCLKernel : public LiteKernel {
return RET_OK;
}
int Init() override { return RET_ERROR; } // !!!To be deleted
int Prepare() override { return RET_OK; }
int PreProcess() override { return RET_ERROR; }
int ReSize() override { return RET_ERROR; }
@ -235,7 +234,7 @@ class OpenCLKernel : public LiteKernel {
tuning_params.push_back(default_tuning_param);
std::vector<size_t> max_work_items = ocl_runtime_->GetWorkItemSize();
size_t max_workgroup_size = ocl_runtime_->GetMaxWorkGroupSize(kernel_);
size_t MIN_WORKGROUP_SIZE = 8;
const size_t MIN_WORKGROUP_SIZE = 8;
std::set<size_t> candidate_x = GenerateLocalByGlobal(global_size_[0]);
std::set<size_t> candidate_y = {1};
std::set<size_t> candidate_z = {1};

@ -229,16 +229,11 @@ int OpenCLSubGraph::Init() {
return ret;
}
auto opencl_exec = reinterpret_cast<lite::opencl::OpenCLExecutor *>(executor_);
ocl_runtime_->SetProfiling(true);
ret = opencl_exec->RunOrTune(in_tensors_, out_tensors_, nodes_, allocator_, nullptr, nullptr, true);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
return ret;
}
ocl_runtime_->SetProfiling(false);
#ifdef Debug
ocl_runtime_->SetProfiling(true);
#endif
return RET_OK;
}
@ -262,50 +257,6 @@ void OpenCLSubGraph::UpdateTensorDataType() {
}
}
int OpenCLSubGraph::MallocTensorWithReuse() {
int ret;
kernel::LiteKernelUtil::InitTensorRefCount(nodes_);
for (auto *kernel : nodes_) {
MS_ASSERT(kernel);
auto *op_kernel = reinterpret_cast<kernel::OpenCLKernel *>(kernel);
auto outputs = kernel->out_tensors();
for (auto i = 0; i < outputs.size(); ++i) {
auto *output = outputs.at(i);
MS_ASSERT(output);
if (op_kernel->GetMemType() == MemType::IMG) {
std::vector<size_t> img_size;
ret = op_kernel->GetImageSize(i, &img_size);
if (ret != RET_OK) {
MS_LOG(WARNING) << "GetImageSize failed";
}
auto data_ptr = allocator_->Malloc(output->Size(), img_size);
output->set_data(data_ptr);
} else {
ret = output->MallocData(allocator_);
if (ret != RET_OK) {
MS_LOG(WARNING) << "MallocData failed";
}
}
output->set_allocator(allocator_);
}
for (auto input_kernel : kernel->in_kernels()) {
MS_ASSERT(input_kernel);
ret = input_kernel->DecOutTensorRefCount();
if (ret != RET_OK) {
MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
}
}
}
for (auto kernel : out_kernels_) {
MS_ASSERT(kernel);
ret = kernel->DecOutTensorRefCount();
if (ret != RET_OK) {
MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
}
}
return RET_OK;
}
void OpenCLSubGraph::GetKernelFromToTensor(const std::vector<lite::Tensor *> &in_tensors,
const std::vector<kernel::LiteKernel *> &in_kernels,
std::vector<std::vector<kernel::LiteKernel *>> *out_kernels, bool is_from) {
@ -379,8 +330,6 @@ void OpenCLSubGraph::UnInit() {
delete this->executor_;
}
int OpenCLSubGraph::InferShape() { return RET_OK; }
int OpenCLSubGraph::ReSize() { return RET_OK; }
int OpenCLSubGraph::Run() {
@ -395,7 +344,6 @@ int OpenCLSubGraph::Run() {
MS_LOG(ERROR) << "OpenCL subgraph input tensor data is null";
return RET_ERROR;
}
allocator_->UnmapBuffer(tensor->data_c());
ret = allocator_->UnmapBuffer(tensor->data_c());
if (ret != RET_OK) {
return ret;

@ -43,7 +43,6 @@ class OpenCLSubGraph : public SubGraphKernel {
int PostProcess() override { return mindspore::lite::RET_OK; }
int Prepare() override;
int Init() override;
int InferShape();
int ReSize() override;
int Run() override;
int Run(const KernelCallBack &before, const KernelCallBack &after) override { return this->Run(); };
@ -51,7 +50,6 @@ class OpenCLSubGraph : public SubGraphKernel {
private:
void UnInit();
void UpdateTensorDataType();
int MallocTensorWithReuse();
void ReplaceOutTensorAndKernelToNull(const std::vector<lite::Tensor *> &in_tensors,
const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
lite::opencl::MemType mem_type);
@ -66,6 +64,9 @@ class OpenCLSubGraph : public SubGraphKernel {
void GetKernelFromToTensor(const std::vector<lite::Tensor *> &in_tensors,
const std::vector<kernel::LiteKernel *> &in_kernels,
std::vector<std::vector<kernel::LiteKernel *>> *out_kernels, bool is_from);
void Fusion();
private:
lite::opencl::OpenCLAllocator *allocator_{nullptr};
std::vector<lite::Tensor *> in_convert_tensors_;
std::vector<lite::Tensor *> out_convert_tensors_;
@ -78,9 +79,6 @@ class OpenCLSubGraph : public SubGraphKernel {
std::set<LiteKernel *> nodes_set_;
lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;
lite::opencl::OpenCLRuntime *ocl_runtime_{nullptr};
private:
void Fusion();
};
} // namespace mindspore::kernel

@ -30,7 +30,9 @@ int OpenCLExecutor::Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &ou
int OpenCLExecutor::RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs,
std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator,
const KernelCallBack &before, const KernelCallBack &after, bool is_tune) {
int ret;
int ret{RET_OK};
ocl_runtime.GetInstance()->SetProfiling(is_tune);
kernel::LiteKernelUtil::InitTensorRefCount(kernels);
for (auto *kernel : kernels) {
MS_ASSERT(kernel);
@ -77,11 +79,6 @@ int OpenCLExecutor::RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor
MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name();
return ret;
}
#ifdef Debug
MS_LOG(INFO) << "OpenCl kernel " << kernel->name() << "(" << kernel->type_str()
<< ") execute time is: " << op_kernel->GetProfilingTimeMs() << "ms";
#endif
}
if (after != nullptr) {
if (!after(TensorVectorCast(kernel->in_tensors()), TensorVectorCast(kernel->out_tensors()), callbackParam)) {
@ -95,7 +92,16 @@ int OpenCLExecutor::RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor
MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
}
}
#ifdef Debug
MS_LOG(INFO) << "OpenCl kernel " << kernel->name() << "(" << kernel->type_str()
<< ") execute time is: " << op_kernel->GetProfilingTimeMs() << "ms";
#endif
}
return RET_OK;
#ifdef Debug
ocl_runtime.GetInstance()->SetProfiling(true);
#else
ocl_runtime.GetInstance()->SetProfiling(false);
#endif
return ret;
}
} // namespace mindspore::lite::opencl

@ -406,76 +406,6 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na
return RET_OK;
}
// Run Kernel with 1D, 2D, 3D group size, and local size can be empty.
int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const std::vector<size_t> &global,
const std::vector<size_t> &local, cl::CommandQueue *command_queue) {
if (command_queue == nullptr) {
command_queue = default_command_queue_;
}
MS_ASSERT(local.size() == 0 || local.size() == global.size());
std::vector<size_t> internal_global_ws = global;
for (size_t i = 0; i < local.size(); ++i) {
internal_global_ws[i] = ROUND_UP(global[i], local[i]);
}
MS_LOG(DEBUG) << "global size: " << global.size() << ", local size: " << local.size();
for (size_t i = 0; i < global.size(); i++) {
MS_LOG(DEBUG) << "global[" << i << "] = " << global[i];
}
for (size_t i = 0; i < local.size(); i++) {
MS_LOG(DEBUG) << "local[" << i << "] = " << local[i];
}
cl::NDRange global_range = cl::NullRange;
cl::NDRange local_range = cl::NullRange;
if (global.size() == 1) {
global_range = cl::NDRange(internal_global_ws[0]);
if (!local.empty()) {
local_range = cl::NDRange(local[0]);
}
} else if (global.size() == 2) {
global_range = cl::NDRange(internal_global_ws[0], internal_global_ws[1]);
if (!local.empty()) {
local_range = cl::NDRange(local[0], local[1]);
}
} else if (global.size() == 3) {
global_range = cl::NDRange(internal_global_ws[0], internal_global_ws[1], internal_global_ws[2]);
if (!local.empty()) {
local_range = cl::NDRange(local[0], local[1], local[2]);
}
} else {
MS_LOG(ERROR) << "Not supported NDRange!";
return RET_ERROR;
}
cl::Event event;
cl_int ret = CL_SUCCESS;
ret = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global_range, local_range, nullptr, &event);
if (ret != CL_SUCCESS) {
MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(ret);
return RET_ERROR;
}
static int cnt = 0;
const int flush_period = 10;
if (cnt % flush_period == 0) {
auto flush_ret = command_queue->flush();
if (flush_ret != CL_SUCCESS) {
MS_LOG(WARNING) << "CL Flush failed:" << CLErrorCode(ret);
}
}
cnt++;
MS_LOG(DEBUG) << "RunKernel success!";
#ifdef Debug
event.wait();
cl_ulong time_start;
cl_ulong time_end;
event.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start);
event.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
double nanoSeconds = time_end - time_start;
MS_LOG(INFO) << "OpenCl Execution time is: " << nanoSeconds / 1000000.0 << "ms";
#endif
return RET_OK;
}
// Run Kernel with 1D, 2D, 3D group size, and local size can be empty.
int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local,
cl::CommandQueue *command_queue, cl::Event *event) {

@ -115,8 +115,6 @@ class OpenCLRuntime {
bool LoadSource(const std::string &program_name, const std::string &source);
int BuildKernel(cl::Kernel &kernel, const std::string &program_name, const std::string &kernel_name,
const std::set<std::string> &build_options = {});
int RunKernel(const cl::Kernel &kernel, const std::vector<size_t> &global, const std::vector<size_t> &local,
cl::CommandQueue *command_queue = nullptr); // !!!To be deleted
int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local,
cl::CommandQueue *command_queue = nullptr, cl::Event *event = nullptr);
bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr,

Loading…
Cancel
Save