Fix OpenCL infer-shape bug

pull/13998/head
wangdongxu 4 years ago
parent 7324ee14c6
commit d4f49b3a10

@ -68,12 +68,14 @@ void *OpenCLAllocator::MinimumFit(MemType mem_type, size_t size, const ImageSize
void *OpenCLAllocator::CreateBuffer(size_t size, void *data, size_t flags, cl::Buffer **buffer) {
cl_int ret = CL_SUCCESS;
MS_ASSERT(buffer);
MS_ASSERT(size > 0);
*buffer = new (std::nothrow) cl::Buffer(*ocl_runtime_->Context(), static_cast<cl_mem_flags>(flags), size, data, &ret);
if (*buffer == nullptr) {
MS_LOG(ERROR) << "Create OpenCL buffer failed! (ERROR CODE: " << ret << ")";
return nullptr;
}
void *host_ptr = ocl_runtime_->MapBuffer(**buffer, CL_MAP_READ | CL_MAP_WRITE, size);
MS_ASSERT(host_ptr);
if (host_ptr == nullptr) {
delete *buffer;
MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << *buffer << ", host_ptr=" << host_ptr;

File diff suppressed because it is too large Load Diff

@ -44,6 +44,19 @@ using mindspore::schema::PrimitiveType_Eltwise;
namespace mindspore::kernel {
int ArithmeticOpenCLKernel::CheckSpecs() {
for (auto &tensor : in_tensors_) {
if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) {
MS_LOG(ERROR) << "ArithmeticOpenCLKernel only support fp32/fp16 input";
return RET_ERROR;
}
}
for (auto &tensor : out_tensors_) {
if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) {
MS_LOG(ERROR) << "ArithmeticOpenCLKernel only support fp32/fp16 output";
return RET_ERROR;
}
}
if (in_tensors_.size() != 2 || out_tensors_.size() != 1) {
MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
return RET_ERROR;

@ -286,6 +286,7 @@ void Conv2DOpenCLKernel::InitFilter() {
}
FreeDequantedWeight();
FreeTmpWeight(in_tensors_.at(kWeightIndex)->data_c());
}
void Conv2DOpenCLKernel::InitBias() {
@ -322,6 +323,7 @@ void Conv2DOpenCLKernel::InitBias() {
}
}
allocator->UnmapBuffer(packed_bias_);
FreeTmpWeight(in_tensors_.at(kBiasIndex)->data_c());
}
void Conv2DOpenCLKernel::SetConstArgs() {
@ -480,11 +482,9 @@ kernel::LiteKernel *OpenCLConv2DCreator(const std::vector<lite::Tensor *> &input
MS_ASSERT(!inputs.empty());
MS_ASSERT(!outputs.empty());
MS_ASSERT(opParameter);
MS_ASSERT(inputs.front()->shape().size() == 4);
MS_ASSERT(outputs.front()->shape().size() == 4);
auto *conv_param = reinterpret_cast<ConvParameter *>(opParameter);
int input_channel = inputs.front()->shape().at(3);
int output_channel = outputs.front()->shape().at(3);
int input_channel = conv_param->input_channel_;
int output_channel = conv_param->output_channel_;
int group = conv_param->group_;
// case 1: depthwise conv2d
@ -529,6 +529,10 @@ kernel::LiteKernel *OpenCLConv2DCreator(const std::vector<lite::Tensor *> &input
}
}
if (!infer_shape_done) {
StoreTmpWeight(inputs.at(kWeightIndex));
if (inputs.size() > kBiasIndex) {
StoreTmpWeight(inputs.at(kBiasIndex));
}
MS_LOG(WARNING) << "kernel don't infer shape yet!";
return kernel;
}

@ -97,10 +97,11 @@ int ReshapeOpenCLKernel::Run() {
}
int ReshapeOpenCLKernel::PreProcess() {
if (Type() == PrimitiveType_Reshape && !infer_shape_flag_) {
if (Type() == PrimitiveType_Reshape && !op_parameter_->infer_flag_) {
auto shape_tensor = in_tensors_[1];
if (!shape_tensor->IsConst()) {
ocl_runtime_->SyncCommandQueue();
shape_tensor->MutableData();
}
}
return OpenCLKernel::PreProcess();

@ -33,7 +33,7 @@ using mindspore::schema::PrimitiveType_Resize;
namespace mindspore::kernel {
int ResizeOpenCLKernel::CheckSpecs() {
if (in_tensors_.size() != 1 || out_tensors_.size() != 1) {
if (in_tensors_.size() != 2 || out_tensors_.size() != 1) {
MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
return RET_ERROR;
}
@ -119,6 +119,17 @@ int ResizeOpenCLKernel::Run() {
return RET_OK;
}
int ResizeOpenCLKernel::PreProcess() {
  // If shape inference has not run yet for this Resize op, the runtime shape
  // tensor (input #1) may only exist on the device. Flush pending GPU work and
  // touch the tensor's host data first — presumably so the base-class
  // PreProcess can read valid shape values (mirrors ReshapeOpenCLKernel).
  const bool shape_pending = !op_parameter_->infer_flag_;
  if (shape_pending && Type() == PrimitiveType_Resize) {
    auto *dynamic_shape = in_tensors_[1];
    if (!dynamic_shape->IsConst()) {
      ocl_runtime_->SyncCommandQueue();
      dynamic_shape->MutableData();
    }
  }
  return OpenCLKernel::PreProcess();
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Resize, OpenCLKernelCreator<ResizeOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Resize, OpenCLKernelCreator<ResizeOpenCLKernel>)
} // namespace mindspore::kernel

@ -34,6 +34,7 @@ class ResizeOpenCLKernel : public OpenCLKernel {
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
int PreProcess() override;
private:
float getResizeScaleFactor(int input_size, int output_size);

@ -71,8 +71,20 @@ int StackOpenCLKernel::CheckSpecs() {
MS_LOG(ERROR) << " only support input size = 2 and output size = 1";
return RET_ERROR;
}
for (auto &tensor : in_tensors_) {
if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) {
MS_LOG(ERROR) << " only support fp32/fp16 input";
return RET_ERROR;
}
}
for (auto &tensor : out_tensors_) {
if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) {
MS_LOG(ERROR) << " only support fp32/fp16 output";
return RET_ERROR;
}
}
if (in_tensors_[0]->shape().size() > 4 || in_tensors_[0]->shape().size() <= 0) {
MS_LOG(ERROR) << " only support dim <= 4 ";
MS_LOG(ERROR) << " only support 0<dim<=4";
return RET_ERROR;
}
axis_ = axis_ < 0 ? axis_ + in_tensors_[0]->shape().size() : axis_;

@ -64,32 +64,22 @@ void ToFormatOpenCLKernel::SetGlobalLocal() {
}
int ToFormatOpenCLKernel::Prepare() {
std::map<TypeId, std::string> dtype_str{
{kNumberTypeFloat32, "float"}, {kNumberTypeFloat16, "half"}, {kNumberTypeInt32, "float"}};
std::string kernel_name;
if (out_mem_type_ == MemType::IMG) {
kernel_name = "to_format_NHWC_to_NHWC4_IMG_" + dtype_str[in_tensors_.front()->data_type()];
} else {
kernel_name = "to_format_NHWC4_to_NHWC_BUF_" + dtype_str[out_tensors_.front()->data_type()];
}
static std::map<TypeId, std::string> dtype_str{{kNumberTypeFloat32, "float32"},
{kNumberTypeFloat16, "float16"},
{kNumberTypeInt32, "int32"},
{kNumberTypeUInt32, "uint32"}};
auto in_tensor = in_tensors_.front();
auto out_tensor = out_tensors_.front();
std::string kernel_name = out_mem_type_ == MemType::IMG ? "BUF_to_IMG_" : "IMG_to_BUF_";
kernel_name += dtype_str[in_tensor->data_type()] + "_" + dtype_str[out_tensor->data_type()];
this->set_name(kernel_name);
#ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else
std::string program_name = "to_format";
std::string source = to_format_source;
ocl_runtime_->LoadSource(program_name, source);
std::vector<std::string> ext_build_opt;
if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
ext_build_opt.push_back("-DREAD_IMAGEIN=read_imagef");
} else {
ext_build_opt.push_back("-DREAD_IMAGEIN=read_imageh");
}
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, ext_build_opt);
#endif
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
auto output = GpuTensorInfo(out_tensors_.front());
auto output = GpuTensorInfo(out_tensor);
N_ = output.N;
H_ = output.H;
W_ = output.W;
@ -112,15 +102,8 @@ int ToFormatOpenCLKernel::Run() {
}
int ToFormatOpenCLKernel::InferShape() {
if (infer_shape_flag_) {
return RET_OK;
}
if (in_tensors_[0]->shape().size() == 0 || in_tensors_[0]->ElementsNum() < 0) {
MS_LOG(ERROR) << "to_format op in tensor shape is 0, infer shape failed!";
return RET_ERROR;
}
out_tensors_[0]->set_shape(in_tensors_[0]->shape());
infer_shape_flag_ = true;
op_parameter_->infer_flag_ = false;
return RET_OK;
}

@ -97,7 +97,7 @@ int OpenCLKernel::GetImageSize(size_t idx, lite::opencl::ImageSize *img_size) {
}
void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) {
printf("%-30s", name().c_str());
printf("%-30s ", name().c_str());
if (out_tensors().empty()) {
return;
}
@ -134,7 +134,9 @@ void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) {
auto total_num = mem_type == lite::opencl::MemType::BUF ? img_info.ElementsNum : img_info.ElementsC4Num;
for (int i = 0; i < print_num && i < total_num; ++i) {
if (tensor->data_type() == kNumberTypeFloat16) {
if (tensor->data_type() == kNumberTypeInt32) {
printf("%d %7d | ", i, reinterpret_cast<int32_t *>(data.data())[i]);
} else if (tensor->data_type() == kNumberTypeFloat16) {
printf("%d %7.3f | ", i, reinterpret_cast<float16_t *>(data.data())[i]);
} else {
printf("%d %7.3f | ", i, reinterpret_cast<float *>(data.data())[i]);
@ -191,7 +193,7 @@ int OpenCLKernel::PostProcess() {
}
int OpenCLKernel::InferShape() {
if (infer_shape_flag_) {
if (op_parameter_->infer_flag_) {
return RET_OK;
}
op_parameter_->infer_flag_ = true;
@ -202,12 +204,11 @@ int OpenCLKernel::InferShape() {
op_parameter_->infer_flag_ = false;
return ret;
}
infer_shape_flag_ = true;
return RET_OK;
}
int OpenCLKernel::ReSize() {
if (infer_shape_flag_) {
if (op_parameter_->infer_flag_) {
return RET_OK;
}
auto ret = InferShape();

@ -27,6 +27,7 @@
#include "src/runtime/gpu/opencl/opencl_runtime.h"
#include "mindspore/lite/src/dequant.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "nnacl/resize_parameter.h"
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
@ -35,15 +36,15 @@ namespace mindspore::kernel {
// Parameter block for the layout-conversion ("to format") kernels.
// Leads with a plain OpParameter so the struct can travel through the common
// kernel-creation path that expects an OpParameter* head (GenToFormatOp sets
// op_parameter.type_ = PRIM_TO_FORMAT before use).
struct OpenCLToFormatParameter {
  OpParameter op_parameter{};
  schema::Format src_format{schema::Format::Format_NHWC};   // layout before conversion
  schema::Format dst_format{schema::Format::Format_NHWC4};  // layout after conversion
  lite::opencl::MemType out_mem_type{lite::opencl::MemType::IMG};  // output memory object kind (image vs buffer)
};
template <typename SrcT, typename DstT>
void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) {
MS_ASSERT(dst);
MS_ASSERT(src);
if (src == nullptr || src_num <= 0) {
return;
}
auto *N = dst;
auto *H = dst + 1;
auto *W = dst + 2;
@ -70,10 +71,12 @@ void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) {
// Fill dst[0..3] (the 4-element GPU shape, N/H/W/C order per the 3-argument
// overload) with default_value, then overwrite the leading entries from src.
// A null or empty src is a legal input and simply leaves every entry at
// default_value — therefore src must NOT be asserted non-null here; the old
// MS_ASSERT(src) contradicted the explicit null guard below and would abort
// debug builds on a valid call.
template <typename SrcT, typename DstT>
void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num, DstT default_value) {
  MS_ASSERT(dst);
  for (int i = 0; i < 4; ++i) {
    dst[i] = default_value;
  }
  if (src == nullptr || src_num <= 0) {
    return;  // keep the defaults when no source shape is available
  }
  Broadcast2GpuShape(dst, src, src_num);
}
@ -92,6 +95,10 @@ struct GpuTensorInfo {
H = shape.s[1];
W = shape.s[2];
C = shape.s[3];
MS_ASSERT(N > 0);
MS_ASSERT(H > 0);
MS_ASSERT(W > 0);
MS_ASSERT(C > 0);
Slice = UP_DIV(C, C4NUM);
FLT_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half) : sizeof(cl_float);
@ -167,7 +174,6 @@ class OpenCLKernel : public LiteKernel {
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: LiteKernel(parameter, inputs, outputs, ctx) {
ocl_runtime_ = ocl_runtime_wrap_.GetInstance();
infer_shape_flag_ = parameter->infer_flag_;
}
~OpenCLKernel() override = default;
int AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local);
@ -199,8 +205,6 @@ class OpenCLKernel : public LiteKernel {
int DequantWeight();
void FreeDequantedWeight();
virtual int InferShape();
bool GetInferShapeFlag() { return infer_shape_flag_; }
void SetInferShapeFlag(bool flag) { infer_shape_flag_ = flag; }
protected:
static std::set<size_t> GenerateLocalByGlobal(size_t global_i);
@ -225,7 +229,6 @@ class OpenCLKernel : public LiteKernel {
cl::Event event_;
void *restore_quant_data_{nullptr};
bool dequant_flag_{false};
bool infer_shape_flag_{false};
private:
lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;
@ -241,7 +244,7 @@ kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &input
free(opParameter);
return nullptr;
}
if (!reinterpret_cast<kernel::OpenCLKernel *>(kernel)->GetInferShapeFlag()) {
if (!opParameter->infer_flag_) {
MS_LOG(WARNING) << "kernel don't infer shape yet!";
return kernel;
}

@ -121,8 +121,6 @@ int OpenCLSubGraph::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
for (size_t i = 0; i < in_tensors.size(); ++i) {
auto *in_tensor = in_tensors.at(i);
auto dst_format = (mem_type == MemType::IMG) ? schema::Format::Format_NHWC4 : schema::Format::Format_NHWC;
auto src_format = (mem_type == MemType::IMG) ? schema::Format::Format_NHWC : schema::Format::Format_NHWC4;
auto *new_tensor = new (std::nothrow)
lite::Tensor(in_tensor->data_type(), in_tensor->shape(), in_tensor->format(), lite::Tensor::VAR);
MS_ASSERT(new_tensor);
@ -130,20 +128,9 @@ int OpenCLSubGraph::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
MS_LOG(ERROR) << "OpenCLSubGraph new tensor failed!";
return RET_ERROR;
}
if (mem_type == MemType::IMG) {
new_tensor->set_format(dst_format);
in_tensor->set_format(src_format);
} else {
new_tensor->set_format(src_format);
in_tensor->set_format(dst_format);
}
out_tensors->emplace_back(new_tensor);
KernelKey desc{kGPU, kNumberTypeFloat32, PRIM_TO_FORMAT};
if (mem_type == MemType::IMG && ocl_runtime_->GetFp16Enable()) {
desc.data_type = kNumberTypeFloat16;
new_tensor->set_data_type(kNumberTypeFloat16);
}
auto *parameter = static_cast<OpenCLToFormatParameter *>(malloc(sizeof(OpenCLToFormatParameter)));
MS_ASSERT(parameter);
if (parameter == nullptr) {
@ -153,16 +140,7 @@ int OpenCLSubGraph::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
return RET_ERROR;
}
parameter->op_parameter.type_ = PRIM_TO_FORMAT;
bool output_shape_setted = true;
for (auto output : *out_tensors) {
if (output->shape().empty() || output->ElementsNum() < 0) {
output_shape_setted = false;
break;
}
}
parameter->op_parameter.infer_flag_ = output_shape_setted;
parameter->src_format = src_format;
parameter->dst_format = dst_format;
parameter->op_parameter.infer_flag_ = false;
parameter->out_mem_type = mem_type;
out_parameters->emplace_back(parameter);
LiteKernel *in_convert_op = nullptr;
@ -255,8 +233,7 @@ int OpenCLSubGraph::Init() {
int OpenCLSubGraph::UpdateTensorDataTypePass() {
bool is_fp16 = ocl_runtime_->GetFp16Enable();
MS_ASSERT(in_tensors_[0]);
if (is_fp16 && (in_tensors_[0]->data_type() == kNumberTypeFloat32)) {
if (is_fp16) {
std::set<lite::Tensor *> out_set;
out_set.insert(in_tensors_.begin(), in_tensors_.end());
out_set.insert(out_tensors_.begin(), out_tensors_.end());
@ -330,16 +307,6 @@ void OpenCLSubGraph::GetInOutNodes() {
}
}
bool OpenCLSubGraph::IsSubGraphInferShapeDone() {
for (auto node : this->nodes_) {
auto opencl_kernel = reinterpret_cast<kernel::OpenCLKernel *>(node);
if (!opencl_kernel->GetInferShapeFlag()) {
return false;
}
}
return true;
}
int OpenCLSubGraph::Prepare() {
for (const auto tensor : in_tensors_) {
MS_ASSERT(tensor);
@ -354,7 +321,6 @@ int OpenCLSubGraph::Prepare() {
MS_LOG(ERROR) << "Create OpenCLExecutor fail";
return RET_ERROR;
}
auto ret = RET_OK;
for (auto node : this->nodes_) {
if (node == nullptr) {
MS_LOG(ERROR) << "node in Subgraph is nullptr";
@ -363,26 +329,28 @@ int OpenCLSubGraph::Prepare() {
auto opencl_kernel = reinterpret_cast<kernel::OpenCLKernel *>(node);
std::set<int> pre_init_weight_list = {schema::PrimitiveType_MatMul, schema::PrimitiveType_BiasAdd};
if (pre_init_weight_list.find(opencl_kernel->Type()) != pre_init_weight_list.end()) {
ret = opencl_kernel->InitWeights();
auto ret = opencl_kernel->InitWeights();
if (ret != RET_OK) {
MS_LOG(ERROR) << "init weights " << node->name() << " failed";
return ret;
}
}
if (opencl_kernel->GetInferShapeFlag()) {
ret = node->Prepare();
if (opencl_kernel->op_parameter()->infer_flag_) {
auto ret = node->Prepare();
if (ret != RET_OK) {
MS_LOG(ERROR) << "prepare node " << node->name() << " failed";
return ret;
}
}
}
auto opencl_exec = reinterpret_cast<lite::opencl::OpenCLExecutor *>(executor_);
// If tuning_mode is DEFAULT, just malloc memory for reuse.
ret = opencl_exec->RunOrTune(in_tensors_, out_tensors_, nodes_, allocator_, nullptr, nullptr, true);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
return ret;
if (all_kernels_infer_done_) {
auto opencl_exec = reinterpret_cast<lite::opencl::OpenCLExecutor *>(executor_);
// If tuning_mode is DEFAULT, just malloc memory for reuse.
auto ret = opencl_exec->RunOrTune(in_tensors_, out_tensors_, nodes_, allocator_, nullptr, nullptr, true);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
return ret;
}
}
return RET_OK;
}
@ -423,7 +391,7 @@ int OpenCLSubGraph::ReSize(bool interrupt) {
for (auto &output : outputs) {
output->FreeData();
}
opencl_kernel->SetInferShapeFlag(false);
opencl_kernel->op_parameter()->infer_flag_ = false;
}
for (auto kernel : nodes_) {
auto opencl_kernel = reinterpret_cast<kernel::OpenCLKernel *>(kernel);

@ -36,6 +36,9 @@ class OpenCLSubGraph : public SubGraphKernel {
subgraph_type_ = kGpuSubGraph;
this->name_ = "GpuSubGraph";
nodes_set_.insert(nodes.begin(), nodes.end());
all_kernels_infer_done_ = std::all_of(nodes_.begin(), nodes_.end(), [](const kernel::LiteKernel *kernel) {
return kernel && kernel->op_parameter() && kernel->op_parameter()->infer_flag_;
});
}
~OpenCLSubGraph() override;
@ -48,7 +51,6 @@ class OpenCLSubGraph : public SubGraphKernel {
int Run() override;
int Run(const KernelCallBack &before, const KernelCallBack &after) override;
int InsertOpsPass();
bool IsSubGraphInferShapeDone();
private:
void UnInit();
@ -83,6 +85,7 @@ class OpenCLSubGraph : public SubGraphKernel {
std::set<LiteKernel *> nodes_set_;
lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;
lite::opencl::OpenCLRuntime *ocl_runtime_{nullptr};
bool all_kernels_infer_done_ = false;
};
} // namespace mindspore::kernel

@ -296,4 +296,27 @@ int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tens
return RET_OK;
}
static std::set<void *> tmp_weights;
// Take a private heap copy of a weight tensor's data so it remains valid until
// the kernel can actually run shape inference (the original buffer may be
// freed or repacked before then). The copy is recorded in tmp_weights and must
// later be released through FreeTmpWeight().
void StoreTmpWeight(lite::Tensor *tensor) {
  MS_LOG(WARNING) << "store weight when kernel don't infer shape!";
  // Nothing to copy for a null tensor or one without sized host data.
  if (tensor == nullptr || tensor->data_c() == nullptr || tensor->Size() == 0) {
    return;
  }
  void *new_data = malloc(tensor->Size());
  MS_ASSERT(new_data);
  if (new_data == nullptr) {
    // Was a silent return: log the allocation failure so it is diagnosable,
    // consistent with the error logging used elsewhere in this file.
    MS_LOG(ERROR) << "malloc tmp weight data failed, size=" << tensor->Size();
    return;
  }
  memcpy(new_data, tensor->data_c(), tensor->Size());
  // Repoint the tensor at our copy; ownership of new_data is tracked in
  // tmp_weights so FreeTmpWeight() can release it later.
  tensor->set_data(new_data);
  tmp_weights.insert(new_data);
}
// Release a pointer previously registered by StoreTmpWeight(). Pointers that
// are not tracked in tmp_weights are ignored, so calling this on ordinary
// (non-copied) weight data is harmless.
void FreeTmpWeight(void *data) {
  // Single lookup: find + erase-by-iterator instead of count() followed by
  // erase(key), which searched the set twice for the same element.
  auto it = tmp_weights.find(data);
  if (it != tmp_weights.end()) {
    free(data);
    tmp_weights.erase(it);
  }
}
} // namespace mindspore::kernel

@ -63,6 +63,9 @@ void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, c
int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tensor_name, lite::Tensor *tensor,
TypeId expect_data_type, const std::vector<int> &expect_shape);
void StoreTmpWeight(lite::Tensor *tensor);
void FreeTmpWeight(void *tensor);
template <class T1, class T2>
void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane_in, int plane_out, int channel,
const std::function<T2(T1)> &to_dtype) {

@ -224,7 +224,7 @@ int32_t Tensor::ElementsC4Num() const {
if (this->category_ == CONST_SCALAR) {
return 1;
}
int32_t result = 0;
int32_t result = 1;
if (this->shape_.size() == 4) {
result = Batch() * Height() * Width() * ((Channel() + 3) / 4 * 4);
} else if (this->shape_.size() == 2) {

Loading…
Cancel
Save