concat_optimization

pull/6778/head
Pengyongrong 4 years ago
parent 859e704644
commit a7c88c0643

@ -480,9 +480,9 @@ gene_clhpp() {
do
file="$(basename ${file_path})"
inc_file=$(echo ${CL_SRC_DIR}/${file} | sed 's/$/.inc/')
sed 's/^/\"/;s/$/ \\n\" \\/' ${CL_SRC_DIR}/${file} > ${inc_file}
sed 's/\\/\\\\/g;s/\"/\\\"/g;s/^/\"/;s/$/\\n\" \\/' ${CL_SRC_DIR}/${file} > ${inc_file}
kernel_name=$(echo ${file} | sed s'/.\{3\}$//')
sed -i "1i\static const char *${kernel_name}_source =\"\\n\" \\" ${inc_file}
sed -i "1i\static const char *${kernel_name}_source =\"\\n\" \\" ${inc_file}
sed -i '$a\;' ${inc_file}
done
}

File diff suppressed because it is too large Load Diff

@ -93,13 +93,19 @@ int ConcatOpenCLKernel::Init() {
std::string kernel_name = "Concat";
if (in_tensors_.size() == 2) {
kernel_name += "2input";
kernel_name += "2inputaxis";
kernel_name += std::to_string(param->axis_);
} else if (in_tensors_.size() == 3) {
kernel_name += "3input";
kernel_name += "3inputaxis";
kernel_name += std::to_string(param->axis_);
} else if (in_tensors_.size() == 4) {
kernel_name += "4input";
kernel_name += "4inputaxis";
kernel_name += std::to_string(param->axis_);
} else if (in_tensors_.size() == 6) {
kernel_name += "6inputaxis";
kernel_name += std::to_string(param->axis_);
} else {
MS_LOG(ERROR) << " input must be 2 3 or 4";
MS_LOG(ERROR) << " input must be 2 , 3 , 4 or 6";
return RET_ERROR;
}
if (in_format == schema::Format_NC4HW4) {
@ -107,6 +113,7 @@ int ConcatOpenCLKernel::Init() {
} else if (in_format == schema::Format_NHWC4) {
kernel_name += "_NHWC4";
}
MS_LOG(DEBUG) << "kernel_name=: " << kernel_name;
std::set<std::string> build_options;
std::string source = concat_source;
std::string program_name = "Concat";
@ -118,16 +125,36 @@ int ConcatOpenCLKernel::Init() {
int ConcatOpenCLKernel::ReSize() {
  // All shape-dependent state is rebuilt lazily at Run() time, so a resize
  // needs no work here.
  return RET_OK;
}
int ConcatOpenCLKernel::GetSumShape(std::vector<int> *sum_shape, std::vector<int> *in_shape) {
std::vector<int> temp_sum = {0, 0, 0, 0};
for (int i = 0; i < in_tensors_.size(); ++i) {
auto temp = in_tensors_[i]->shape();
for (int j = 0; j < temp.size(); ++j) {
in_shape->push_back(temp[j]);
temp_sum.at(j) += temp[j];
sum_shape->push_back(temp_sum.at(j));
int ConcatOpenCLKernel::IntegraShapeToXYZ() {
auto in_format = op_format_;
if (out_tensors_[0]->shape().size() > 4 || out_tensors_[0]->shape().size() <= 0) {
MS_LOG(ERROR) << "in_tensors_.shape() must between 0~4";
return RET_ERROR;
}
if (in_format == schema::Format_NHWC4 || in_format == schema::Format_NC4HW4) {
for (int i = 0; i < in_tensors_.size(); ++i) {
cl_int4 temp_cl;
auto temp = in_tensors_[i]->shape();
temp_cl = {temp[0], temp[1], temp[2], UP_DIV(temp[3], C4NUM)};
XYZShape.push_back(temp_cl);
}
} else {
for (int i = 0; i < in_tensors_.size(); ++i) {
auto temp = in_tensors_[i]->shape();
for (int j = temp.size(); j < C4NUM; ++j) {
temp.push_back(1);
}
cl_int4 temp_cl = {temp[0], temp[1], temp[2], UP_DIV(temp[3], C4NUM)};
XYZShape.push_back(temp_cl);
}
auto temp = out_tensors_[0]->shape();
for (int i = out_tensors_[0]->shape().size(); i < C4NUM; ++i) {
temp.push_back(1);
}
}
shape_nhwc = {out_tensors_[0]->shape()[0] * out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2],
UP_DIV(out_tensors_[0]->shape()[3], C4NUM)};
return RET_OK;
}
@ -151,70 +178,31 @@ int ConcatOpenCLKernel::Run() {
if (param->axis_ == 0) {
return RunAxis0();
}
auto input1_shape = in_tensors_[0]->shape();
auto input2_shape = in_tensors_[1]->shape();
auto output_shape = out_tensors_[0]->shape();
cl_int4 input_shape1_ = {input1_shape[0], input1_shape[1], input1_shape[2], UP_DIV(input1_shape[3], C4NUM)};
cl_int4 input_shape2_ = {input2_shape[0], input2_shape[1], input2_shape[2], UP_DIV(input2_shape[3], C4NUM)};
cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)};
uint32_t OH = output_shape[0] * output_shape[1]; // N*H
uint32_t OW = output_shape[2];
uint32_t OC = UP_DIV(output_shape[3], C4NUM);
IntegraShapeToXYZ();
const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
std::vector<size_t> local = {1, 1, 1}; // init local
std::vector<size_t> global = {OH, OW, OC};
std::vector<size_t> local = {1, 1, 1};
std::vector<size_t> global = {static_cast<size_t>(shape_nhwc.s[0]), static_cast<size_t>(shape_nhwc.s[1]),
static_cast<size_t>(shape_nhwc.s[2])};
ConcatGetWorkGroup(global, &local, max_global[0]);
GetSumShape(&sum_shape, &in_shape);
int arg_cn = 0;
if (in_tensors_.size() == 2) {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape1_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape2_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
} else if (in_tensors_.size() == 3) {
auto input3_shape = in_tensors_[2]->shape();
cl_int4 input_shape3_ = {input3_shape[0], input3_shape[1], input3_shape[2], UP_DIV(input3_shape[3], C4NUM)};
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape1_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape2_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape3_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
} else if (in_tensors_.size() == 4) {
auto input3_shape = in_tensors_[2]->shape();
auto input4_shape = in_tensors_[3]->shape();
cl_int4 input_shape3_ = {input3_shape[0], input3_shape[1], input3_shape[2], UP_DIV(input3_shape[3], C4NUM)};
cl_int4 input_shape4_ = {input4_shape[0], input4_shape[1], input4_shape[2], UP_DIV(input4_shape[3], C4NUM)};
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c());
if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 6) {
int arg_cn = 0;
for (int i = 0; i < in_tensors_.size(); ++i) {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c());
}
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape1_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape2_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape3_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape4_);
for (int i = 0; i < XYZShape.size(); ++i) {
cl_int4 temp = {XYZShape[i].s[0], XYZShape[i].s[1], XYZShape[i].s[2], XYZShape[i].s[3]};
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, temp);
}
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
} else {
MS_LOG(ERROR) << " input sizes must 2 or 3 or 4";
MS_LOG(ERROR) << "unsupported input size :" << in_tensors_.size();
return RET_ERROR;
}
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
return RET_OK;
}

@ -41,12 +41,12 @@ class ConcatOpenCLKernel : public OpenCLKernel {
int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
int GetSumShape(std::vector<int> *sum_shape, std::vector<int> *in_shape);
int IntegraShapeToXYZ();
private:
cl::Kernel kernel_;
std::vector<int> sum_shape;
std::vector<int> in_shape;
std::vector<cl_int3> XYZShape;
cl_int4 shape_nhwc;
};
} // namespace mindspore::kernel

@ -146,7 +146,7 @@ TEST_F(TestConcatOpenCLCI, ConcatFp32_2inputforCI) {
delete sub_graph;
}
TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis1) {
TEST_F(TestConcatOpenCLfp16, ConcatFp16_4input_dim4_axis1) {
MS_LOG(INFO) << " begin test ";
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
ocl_runtime->SetFp16Enable(true);
@ -276,7 +276,7 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis1) {
delete sub_graph;
}
TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) {
TEST_F(TestConcatOpenCLfp32, ConcatFp32_3input_dim4_axis1) {
MS_LOG(INFO) << " begin test ";
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
ocl_runtime->Init();
@ -396,4 +396,146 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) {
}
delete sub_graph;
}
// Verifies fp16 Concat over six NHWC inputs on axis 1 against golden output
// data generated offline (test_data/concatfp16_*.bin).
TEST_F(TestConcatOpenCLfp16, ConcatFp16_6input_dim4_axis1) {
  MS_LOG(INFO) << " begin test ";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  ocl_runtime->SetFp16Enable(true);
  ocl_runtime->Init();
  auto allocator = ocl_runtime->GetAllocator();
  // get the input from .bin
  size_t input1_size, input2_size, input3_size, input4_size, input5_size, input6_size, output_size;
  std::string input1Ppath = "./test_data/concatfp16_input1.bin";
  std::string input2Ppath = "./test_data/concatfp16_input2.bin";
  std::string input3Ppath = "./test_data/concatfp16_input3.bin";
  std::string input4Ppath = "./test_data/concatfp16_input4.bin";
  std::string input5Ppath = "./test_data/concatfp16_input5.bin";
  std::string input6Ppath = "./test_data/concatfp16_input6.bin";
  std::string correctOutputPath = "./test_data/concatfp16_output.bin";
  auto input_data1 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size));
  auto input_data2 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size));
  auto input_data3 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input3Ppath.c_str(), &input3_size));
  auto input_data4 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input4Ppath.c_str(), &input4_size));
  auto input_data5 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input5Ppath.c_str(), &input5_size));
  auto input_data6 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input6Ppath.c_str(), &input6_size));
  auto correctOutput =
    reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
  // NOTE(review): the ReadFile buffers are never freed in these tests; confirm
  // the intended ownership before adding delete[] here.
  MS_LOG(INFO) << " init tensors ";
  constexpr int INPUT_NUM = 6;
  std::array<std::vector<int>, INPUT_NUM> input_shapes = {
    std::vector<int>{1, 1200, 3, 4}, std::vector<int>{1, 600, 3, 4}, std::vector<int>{1, 150, 3, 4},
    std::vector<int>{1, 50, 3, 4},   std::vector<int>{1, 30, 3, 4},  std::vector<int>{1, 4, 3, 4}};
  // Axis-1 concat: 1200 + 600 + 150 + 50 + 30 + 4 = 2034.
  std::vector<int> output_shape = {1, 2034, 3, 4};
  auto data_type = kNumberTypeFloat16;
  auto tensor_type = lite::TensorCategory(schema::NodeType_ValueNode);
  std::vector<lite::Tensor *> inputs;
  for (auto &shape : input_shapes) {
    auto input_temp = new (std::nothrow) lite::Tensor(data_type, shape, schema::Format_NHWC, tensor_type);
    if (input_temp == nullptr) {
      MS_LOG(INFO) << " new input_tensor failed ";
      // Fix: release tensors allocated on earlier iterations before bailing
      // out (the original pushed first and leaked the previous ones).
      for (auto tensor : inputs) {
        delete tensor;
      }
      return;
    }
    inputs.push_back(input_temp);
  }
  auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type);
  if (output_tensor == nullptr) {
    MS_LOG(INFO) << " new output_tensor failed ";
    for (auto tensor : inputs) {
      delete tensor;
    }
    return;
  }
  std::vector<lite::Tensor *> outputs{output_tensor};
  MS_LOG(INFO) << " input_shapes size =: " << input_shapes.size();
  MS_LOG(INFO) << " initialize tensors ";
  auto param = reinterpret_cast<ConcatParameter *>(malloc(sizeof(ConcatParameter)));
  if (param == nullptr) {
    MS_LOG(INFO) << " new ConcatParameter failed ";
    for (auto tensor : inputs) {
      delete tensor;
    }
    for (auto tensor : outputs) {
      delete tensor;
    }
    return;
  }
  param->axis_ = 1;
  auto *concat_kernel =
    new (std::nothrow) kernel::ConcatOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
  if (concat_kernel == nullptr) {
    MS_LOG(INFO) << " new kernel::ConcatOpenCLKernel failed ";
    for (auto tensor : inputs) {
      delete tensor;
    }
    for (auto tensor : outputs) {
      delete tensor;
    }
    delete param;
    return;
  }
  concat_kernel->SetFormatType(schema::Format_NC4HW4);
  concat_kernel->Init();
  // to do allocate memory for inputs and outputs
  for (auto &input_tensor : inputs) {
    input_tensor->MallocData(allocator);
  }
  MS_LOG(INFO) << " initialize sub_graph ";
  std::vector<kernel::LiteKernel *> kernels{concat_kernel};
  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
  if (sub_graph == nullptr) {
    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
    for (auto tensor : inputs) {
      delete tensor;
    }
    for (auto tensor : outputs) {
      delete tensor;
    }
    delete param;
    delete concat_kernel;
    return;
  }
  sub_graph->Init();
  MS_LOG(INFO) << " initialize input data ";
  // Copy all six host buffers in order.  The original if/else ladder keyed on
  // inputs.size() was dead code for this fixed six-input test, and its error
  // branch claimed "input size must be 2 or 3 or 4" — wrong for this test.
  float16_t *input_datas[INPUT_NUM] = {input_data1, input_data2, input_data3,
                                       input_data4, input_data5, input_data6};
  size_t input_sizes[INPUT_NUM] = {input1_size, input2_size, input3_size,
                                   input4_size, input5_size, input6_size};
  for (int i = 0; i < INPUT_NUM; ++i) {
    memcpy(inputs[i]->data_c(), input_datas[i], input_sizes[i]);
  }
  std::cout << "==================output data================" << std::endl;
  sub_graph->Run();
  auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->MutableData());
  // NOTE(review): 1e-6 is tighter than fp16 precision; confirm how
  // CompareOutputData1 interprets this tolerance.
  CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001);
  lite::opencl::OpenCLRuntime::DeleteInstance();
  // Tensor memory is owned by the allocator/sub-graph; detach before delete.
  for (auto tensor : inputs) {
    tensor->SetData(nullptr);
    delete tensor;
  }
  for (auto tensor : outputs) {
    tensor->SetData(nullptr);
    delete tensor;
  }
  delete sub_graph;
}
} // namespace mindspore

Loading…
Cancel
Save