fix gpu weight quant bug

pull/14724/head
yeyunpeng2020 4 years ago
parent a3fc997c4e
commit 9c4c5cc47e

@ -44,13 +44,13 @@ __kernel void conv2d_transpose(__read_only image2d_t src_data, __write_only imag
for (int ci = 0; ci < src_size.z; ++ci) { for (int ci = 0; ci < src_size.z; ++ci) {
FLT4 x0 = (FLT4)0.f; FLT4 x0 = (FLT4)0.f;
FLT4 x2 = (FLT4)0.f; FLT4 x2 = (FLT4)0.f;
if (out0_src_h < src_size.x) { if (out0_src_h < src_size.x && out0_src_h >= 0) {
x0 = READ_IMAGE(src_data, smp_zero, (int2)(out0_src_w * src_size.z + ci, n * src_size.x + out0_src_h)); x0 = READ_IMAGE(src_data, smp_zero, (int2)(out0_src_w * src_size.z + ci, n * src_size.x + out0_src_h));
x2 = READ_IMAGE(src_data, smp_zero, (int2)(out1_src_w * src_size.z + ci, n * src_size.x + out0_src_h)); x2 = READ_IMAGE(src_data, smp_zero, (int2)(out1_src_w * src_size.z + ci, n * src_size.x + out0_src_h));
} }
FLT4 x1 = (FLT4)0.f; FLT4 x1 = (FLT4)0.f;
FLT4 x3 = (FLT4)0.f; FLT4 x3 = (FLT4)0.f;
if (out1_src_h < src_size.x) { if (out1_src_h < src_size.x && out1_src_h >= 0) {
x1 = READ_IMAGE(src_data, smp_zero, (int2)(out0_src_w * src_size.z + ci, n * src_size.x + out1_src_h)); x1 = READ_IMAGE(src_data, smp_zero, (int2)(out0_src_w * src_size.z + ci, n * src_size.x + out1_src_h));
x3 = READ_IMAGE(src_data, smp_zero, (int2)(out1_src_w * src_size.z + ci, n * src_size.x + out1_src_h)); x3 = READ_IMAGE(src_data, smp_zero, (int2)(out1_src_w * src_size.z + ci, n * src_size.x + out1_src_h));
} }

@ -245,11 +245,6 @@ void ConvertFilter(void *src, void *dst, TypeId src_dtype, TypeId dst_dtype, Fil
void Conv2DOpenCLKernel::InitFilter() { void Conv2DOpenCLKernel::InitFilter() {
auto allocator = ocl_runtime_->GetAllocator(); auto allocator = ocl_runtime_->GetAllocator();
auto ret = DequantWeight();
if (ret != RET_OK) {
return;
}
// allocate opencl memory: buffer or image2d // allocate opencl memory: buffer or image2d
size_t size = 0; size_t size = 0;
int Ogroup = block_size_.C; int Ogroup = block_size_.C;
@ -285,7 +280,6 @@ void Conv2DOpenCLKernel::InitFilter() {
allocator->UnmapBuffer(packed_filter_); allocator->UnmapBuffer(packed_filter_);
} }
FreeDequantedWeight();
FreeTmpWeight(in_tensors_.at(kWeightIndex)->data_c()); FreeTmpWeight(in_tensors_.at(kWeightIndex)->data_c());
} }

@ -132,10 +132,6 @@ int Conv2dTransposeOpenCLKernel::InitWeights() {
} }
int Conv2dTransposeOpenCLKernel::InitFilter() { int Conv2dTransposeOpenCLKernel::InitFilter() {
auto ret = DequantWeight();
if (ret != RET_OK) {
return ret;
}
auto *param = reinterpret_cast<ConvParameter *>(op_parameter_); auto *param = reinterpret_cast<ConvParameter *>(op_parameter_);
int ci = in_tensors_[0]->shape()[3]; int ci = in_tensors_[0]->shape()[3];
int co = out_tensors_[0]->shape()[3]; int co = out_tensors_[0]->shape()[3];
@ -191,7 +187,6 @@ int Conv2dTransposeOpenCLKernel::InitFilter() {
} }
} }
allocator->UnmapBuffer(padWeight_); allocator->UnmapBuffer(padWeight_);
FreeDequantedWeight();
return RET_OK; return RET_OK;
} }

@ -102,10 +102,6 @@ int DepthwiseConv2dOpenCLKernel::Prepare() {
} }
int DepthwiseConv2dOpenCLKernel::InitWeights() { int DepthwiseConv2dOpenCLKernel::InitWeights() {
auto ret = DequantWeight();
if (ret != RET_OK) {
return ret;
}
auto parameter = reinterpret_cast<ConvParameter *>(op_parameter_); auto parameter = reinterpret_cast<ConvParameter *>(op_parameter_);
auto allocator = ocl_runtime_->GetAllocator(); auto allocator = ocl_runtime_->GetAllocator();
bool is_fp16 = ocl_runtime_->GetFp16Enable(); bool is_fp16 = ocl_runtime_->GetFp16Enable();
@ -162,7 +158,6 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() {
} else { } else {
packed_weight_ = allocator->Malloc(pack_weight_size, temp_filter.data()); packed_weight_ = allocator->Malloc(pack_weight_size, temp_filter.data());
} }
FreeDequantedWeight();
if (packed_weight_ == nullptr) { if (packed_weight_ == nullptr) {
return RET_ERROR; return RET_ERROR;
} }

@ -58,18 +58,6 @@ int FullConnectionOpenCLKernel::CheckSpecs() {
MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_; MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_;
return RET_ERROR; return RET_ERROR;
} }
// for fusion: ActivationType_TANH
switch (static_cast<int>(param->act_type_)) {
case ActType_No:
case ActType_Relu:
case ActType_Relu6:
case ActivationType_TANH:
break;
default: {
MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_;
return RET_ERROR;
}
}
N_ = out_gpu_info.N; N_ = out_gpu_info.N;
CO_ = out_gpu_info.C; CO_ = out_gpu_info.C;
auto intensor_shape = GpuTensorInfo(in_tensors_[0]); auto intensor_shape = GpuTensorInfo(in_tensors_[0]);
@ -140,10 +128,6 @@ int FullConnectionOpenCLKernel::InitWeights() {
} // namespace mindspore::kernel } // namespace mindspore::kernel
int FullConnectionOpenCLKernel::InitFilter() { int FullConnectionOpenCLKernel::InitFilter() {
auto ret = DequantWeight();
if (ret != RET_OK) {
return ret;
}
auto allocator = ocl_runtime_->GetAllocator(); auto allocator = ocl_runtime_->GetAllocator();
auto intensor_shape = GpuTensorInfo(in_tensors_[0]); auto intensor_shape = GpuTensorInfo(in_tensors_[0]);
int co4 = UP_DIV(CO_, C4NUM); int co4 = UP_DIV(CO_, C4NUM);
@ -196,7 +180,6 @@ int FullConnectionOpenCLKernel::InitFilter() {
} }
} }
allocator->UnmapBuffer(padWeight_); allocator->UnmapBuffer(padWeight_);
FreeDequantedWeight();
return RET_OK; return RET_OK;
} }

@ -106,10 +106,6 @@ int MatMulOpenCLKernel::InitWeights() {
return RET_OK; return RET_OK;
} }
// ABMCI @ ABCICO = ABMCO // ABMCI @ ABCICO = ABMCO
auto ret = DequantWeight();
if (ret != RET_OK) {
return ret;
}
auto allocator = ocl_runtime_->GetAllocator(); auto allocator = ocl_runtime_->GetAllocator();
auto weight_shape = in_tensors_[1]->shape(); auto weight_shape = in_tensors_[1]->shape();
int weight_ndim = weight_shape.size(); int weight_ndim = weight_shape.size();
@ -184,7 +180,6 @@ int MatMulOpenCLKernel::InitWeights() {
} }
allocator->UnmapBuffer(padWeight_); allocator->UnmapBuffer(padWeight_);
FreeDequantedWeight();
return RET_OK; return RET_OK;
} }

@ -69,10 +69,6 @@ void StrassenOpenCLKernel::AllocatorMemoryForStrassen(int NumA, int NumB) {
int StrassenOpenCLKernel::InitWeights() { int StrassenOpenCLKernel::InitWeights() {
// ABMCI @ ABCICO = ABMCO // ABMCI @ ABCICO = ABMCO
auto ret = DequantWeight();
if (ret != RET_OK) {
return ret;
}
auto allocator = ocl_runtime_->GetAllocator(); auto allocator = ocl_runtime_->GetAllocator();
int NumA = in_tensors_[0]->shape()[0]; int NumA = in_tensors_[0]->shape()[0];
int NumB = in_tensors_[1]->shape()[0]; int NumB = in_tensors_[1]->shape()[0];
@ -105,7 +101,6 @@ int StrassenOpenCLKernel::InitWeights() {
} }
} }
allocator->UnmapBuffer(padWeight_); allocator->UnmapBuffer(padWeight_);
FreeDequantedWeight();
return RET_OK; return RET_OK;
} }

@ -91,11 +91,6 @@ void WinogradOpenCLKernel::BuildKernel() {
void WinogradOpenCLKernel::InitFilter() { void WinogradOpenCLKernel::InitFilter() {
auto allocator = ocl_runtime_->GetAllocator(); auto allocator = ocl_runtime_->GetAllocator();
auto ret = DequantWeight();
if (ret != RET_OK) {
return;
}
// allocate opencl memory: buffer or image2d // allocate opencl memory: buffer or image2d
size_t size = 0; size_t size = 0;
int Ogroup = 2; int Ogroup = 2;
@ -140,8 +135,6 @@ void WinogradOpenCLKernel::InitFilter() {
memcpy(packed_filter_, tmp.data(), size); memcpy(packed_filter_, tmp.data(), size);
allocator->UnmapBuffer(packed_filter_); allocator->UnmapBuffer(packed_filter_);
} }
FreeDequantedWeight();
} }
void WinogradOpenCLKernel::AllocateMemory() { void WinogradOpenCLKernel::AllocateMemory() {

@ -351,57 +351,6 @@ std::set<size_t> OpenCLKernel::GenerateLocalByGlobal(size_t global_i) {
return local_; return local_;
} }
int OpenCLKernel::DequantWeight() {
bool is_fp16 = ocl_runtime_->GetFp16Enable();
auto *weight_tensor = in_tensors_.at(kWeightIndex);
restore_quant_data_ = weight_tensor->data_c();
dequant_flag_ = !weight_tensor->quant_params().empty() && weight_tensor->quant_params().front().inited &&
restore_quant_data_ != nullptr;
if (dequant_flag_) {
void *dequant_weight{nullptr};
bool set_flag{true};
if (is_fp16) {
#ifdef ENABLE_ARM64
if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt8) {
dequant_weight = lite::DequantUtil::DequantData<int8_t, float16_t>(weight_tensor);
weight_tensor->set_data_type(kNumberTypeFloat16);
} else if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt16) {
dequant_weight = lite::DequantUtil::DequantData<int16_t, float16_t>(weight_tensor);
weight_tensor->set_data_type(kNumberTypeFloat16);
} else {
set_flag = false;
}
#else
set_flag = false;
#endif
} else {
if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt8) {
dequant_weight = lite::DequantUtil::DequantData<int8_t, float>(weight_tensor);
weight_tensor->set_data_type(kNumberTypeFloat32);
} else if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt16) {
dequant_weight = lite::DequantUtil::DequantData<int16_t, float>(weight_tensor);
weight_tensor->set_data_type(kNumberTypeFloat32);
} else {
set_flag = false;
}
}
if (set_flag && dequant_weight == nullptr) {
MS_LOG(ERROR) << "dequant data failed.";
return RET_ERROR;
}
weight_tensor->set_data(dequant_weight);
}
return RET_OK;
}
void OpenCLKernel::FreeDequantedWeight() {
auto *weight_tensor = in_tensors_.at(kWeightIndex);
if (dequant_flag_) {
free(weight_tensor->data_c());
weight_tensor->set_data(restore_quant_data_);
}
}
int OpenCLKernel::CheckSpecs() { int OpenCLKernel::CheckSpecs() {
if (out_mem_type_ == lite::opencl::MemType::IMG) { if (out_mem_type_ == lite::opencl::MemType::IMG) {
if (!GpuTensorInfo(out_tensors_[0]).IsImageSizeValid()) { if (!GpuTensorInfo(out_tensors_[0]).IsImageSizeValid()) {

@ -202,8 +202,6 @@ class OpenCLKernel : public LiteKernel {
void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; } void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; }
OpParameter *GetParameter() { return op_parameter_; } OpParameter *GetParameter() { return op_parameter_; }
virtual double GetProfilingTimeMs(); virtual double GetProfilingTimeMs();
int DequantWeight();
void FreeDequantedWeight();
virtual int InferShape(); virtual int InferShape();
protected: protected:

@ -345,9 +345,22 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in
kernel::KernelKey gpu_desc{kGPU, kNumberTypeFloat32, desc.type}; kernel::KernelKey gpu_desc{kGPU, kNumberTypeFloat32, desc.type};
if (context_->IsGpuFloat16Enabled()) gpu_desc.data_type = kNumberTypeFloat16; if (context_->IsGpuFloat16Enabled()) gpu_desc.data_type = kNumberTypeFloat16;
if (in_tensors.front()->data_type() == kNumberTypeInt8) gpu_desc.data_type = kNumberTypeInt8; if (in_tensors.front()->data_type() == kNumberTypeInt8) gpu_desc.data_type = kNumberTypeInt8;
// weight quant
std::map<Tensor *, Tensor *> restored_origin_tensors;
for (auto &tensor : in_tensors) {
int index = 0;
auto channel_first = IsChannelFirst(index++, op_parameter);
auto *restore_tensor = DequantUtil::DequantTensor(tensor, desc.data_type, channel_first, kNumberTypeFloat32);
if (restore_tensor != nullptr) {
restored_origin_tensors[tensor] = restore_tensor;
}
}
auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, gpu_desc, op_parameter); auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, gpu_desc, op_parameter);
if (kernel != nullptr) { if (kernel != nullptr) {
MS_LOG(DEBUG) << "Get gpu op success: " << PrimitiveCurVersionTypeName(gpu_desc.type) << " " << node->name_; MS_LOG(DEBUG) << "Get gpu op success: " << PrimitiveCurVersionTypeName(gpu_desc.type) << " " << node->name_;
FreeRestoreTensors(&restored_origin_tensors);
return kernel; return kernel;
} else { } else {
MS_LOG(DEBUG) << "Get gpu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(gpu_desc.type) << " " MS_LOG(DEBUG) << "Get gpu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(gpu_desc.type) << " "
@ -356,6 +369,7 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in
if (ret == RET_INFER_INVALID || ret == RET_OK) { if (ret == RET_INFER_INVALID || ret == RET_OK) {
op_parameter = op_parameters_[node->output_indices_.at(0)]; op_parameter = op_parameters_[node->output_indices_.at(0)];
} else { } else {
RestoreTensorData(restored_origin_tensors);
MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_; MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_;
return nullptr; return nullptr;
} }

@ -1 +1 @@
ml_face_openclose.tflite ml_face_openclose.tflite

Loading…
Cancel
Save