From 937dbf8b03f4172897804bfd3b334227128356ee Mon Sep 17 00:00:00 2001
From: wandongdong
Date: Wed, 2 Sep 2020 04:08:33 -0700
Subject: [PATCH] add gpu benchmark

---
 mindspore/lite/src/lite_kernel.h                  |  5 ---
 .../src/runtime/kernel/opencl/cl/transpose.cl     | 18 +++++++--
 .../kernel/opencl/kernel/activation.cc            |  2 +-
 .../kernel/opencl/kernel/conv2d_transpose.cc      |  2 +-
 .../runtime/kernel/opencl/kernel/matmul.cc        |  2 +-
 .../runtime/kernel/opencl/kernel/reshape.cc       |  3 ++
 mindspore/lite/test/models_tflite_gpu.cfg         |  3 ++
 mindspore/lite/test/run_benchmark_nets.sh         | 37 +++++++++++++++++++
 .../src/runtime/kernel/opencl/matmul_tests.cc     |  2 +-
 9 files changed, 61 insertions(+), 13 deletions(-)
 create mode 100644 mindspore/lite/test/models_tflite_gpu.cfg

diff --git a/mindspore/lite/src/lite_kernel.h b/mindspore/lite/src/lite_kernel.h
index d646135267..713257ac2d 100644
--- a/mindspore/lite/src/lite_kernel.h
+++ b/mindspore/lite/src/lite_kernel.h
@@ -27,11 +27,6 @@
 #include "src/ir/tensor.h"
 #include "include/errorcode.h"
 
-#ifdef ENABLE_FP16
-using FLOAT_t = float16_t;
-#else
-using FLOAT_t = float;
-#endif
 
 // using mindspore::kernel::AddressPtr;
 namespace mindspore::kernel {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl
index 7be8e94b74..4b07a1e89e 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl
@@ -55,10 +55,20 @@ __kernel void transpose_NHWC4_BUF(__read_only image2d_t src_data, global FLT4 *d
   result[1] = (FLT4)(0.0f);
   result[2] = (FLT4)(0.0f);
   result[3] = (FLT4)(0.0f);
-  FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X) % W * C.y + Y, (4 * X) / W));
-  FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 1) % W * C.y + Y, (4 * X + 1) / W));
-  FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 2) % W * C.y + Y, (4 * X + 2) / W));
-  FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 3) % W * C.y + Y, (4 * X + 3) / W));
+  bool over_size = W * C.y > 65535;
+  FLT4 x0, x1, x2, x3;
+  if (over_size) {
+    x0 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X));
+    x1 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X + 1));
+    x2 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X + 2));
+    x3 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X + 3));
+  } else {
+    x0 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X) % W * C.y + Y, (4 * X) / W));
+    x1 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 1) % W * C.y + Y, (4 * X + 1) / W));
+    x2 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 2) % W * C.y + Y, (4 * X + 2) / W));
+    x3 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 3) % W * C.y + Y, (4 * X + 3) / W));
+  }
+
   result[0].x = x0.x;
   result[0].y = x1.x;
   result[0].z = x2.x;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
index d38b43da44..19685cec4f 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
@@ -44,7 +44,7 @@ void ActivationOpenClKernel::InitBuffer() {
   alpha_buff_ = allocator->MapBuffer(alpha_buff_, CL_MAP_WRITE, nullptr, true);
   memset(alpha_buff_, 0x00, fp_size);
   if (enable_fp16_) {
-    auto fp16 = (float16_t)alpha_;
+    auto fp16 = (int16_t)alpha_;
     memcpy(alpha_buff_, &fp16, fp_size);
   } else {
     memcpy(alpha_buff_, &alpha_, fp_size);
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
index 27106da1fd..e9385cef48 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
@@ -72,7 +72,7 @@ void Conv2dTransposeOpenCLKernel::PadWeight() {
   int div_ci = UP_DIV(ci, C4NUM);
   int div_co = UP_DIV(co, C4NUM);
   auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
-  auto data_size = enable_fp16_ ? sizeof(float16_t) : sizeof(float);
+  auto data_size = enable_fp16_ ? sizeof(int16_t) : sizeof(float);
 
   // IHWO to OHWI4(I)4(O)(converter format is IHWO)
   // init padWeight_(buffer mem)
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
index 9431e555d5..af81b3747b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@@ -75,7 +75,7 @@ int MatMulOpenCLKernel::ReSize() { return RET_OK; }
 
 void MatMulOpenCLKernel::PadWeight() {
   auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
-  size_t dtype_size = enable_fp16_ ? sizeof(float16_t) : sizeof(float);
+  size_t dtype_size = enable_fp16_ ? sizeof(int16_t) : sizeof(float);
   padWeight_ = allocator->Malloc(sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size);
   padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
   memset(padWeight_, 0x00, sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size);
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
index d3d21a8199..c4124a743c 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
@@ -27,6 +27,7 @@
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Reshape;
+using mindspore::schema::PrimitiveType_Squeeze;
 
 namespace mindspore::kernel {
@@ -142,4 +143,6 @@ kernel::LiteKernel *OpenCLReshapeKernelCreator(const std::vector
diff --git a/mindspore/lite/test/run_benchmark_nets.sh b/mindspore/lite/test/run_benchmark_nets.sh
--- a/mindspore/lite/test/run_benchmark_nets.sh
+++ b/mindspore/lite/test/run_benchmark_nets.sh
+        echo ${model_name} >> "${run_benchmark_log_file}"
+        echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> "${run_benchmark_log_file}"
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> adb_run_cmd.txt
+        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
+        if [ $? = 0 ]; then
+            run_result='arm64_gpu: '${model_name}' pass'
+            echo ${run_result} >> ${run_benchmark_result_file}
+        else
+            run_result='arm64_gpu: '${model_name}' failed'
+            echo ${run_result} >> ${run_benchmark_result_file}
+            return 1
+        fi
+        # run benchmark test without clib data
+        #echo ${model_name}
+        echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2' >> "${run_benchmark_log_file}"
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2' >> adb_run_cmd.txt
+        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
+        if [ $? = 0 ]; then
+            run_result='arm64_gpu: '${model_name}' pass'
+            echo ${run_result} >> ${run_benchmark_result_file}
+        else
+            run_result='arm64_gpu: '${model_name}' failed'
+            echo ${run_result} >> ${run_benchmark_result_file}
+            return 1
+        fi
+        #sleep 1
+    done < ${models_tflite_gpu_config}
+}
 
 # Print start msg before run testcase
@@ -397,6 +433,7 @@ models_tflite_posttraining_config=${basepath}/models_tflite_posttraining.cfg
 models_onnx_config=${basepath}/models_onnx.cfg
 models_fp16_config=${basepath}/models_fp16.cfg
 models_mindspore_config=${basepath}/models_mindspore.cfg
+models_tflite_gpu_config=${basepath}/models_tflite_gpu.cfg
 
 Convert_status=0
 rm -rf ${basepath}/ms_models
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
index 8b21e5f845..5a6ea2ef89 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
@@ -36,7 +36,7 @@ void RunTestCaseMatMul(const std::vector &shape, void *input_data, void *we
   size_t dtype_size = sizeof(float);
   if (enable_fp16) {
     ocl_runtime->SetFp16Enable(true);
-    dtype_size = sizeof(float16_t);
+    dtype_size = sizeof(int16_t);
   }
   auto allocator = ocl_runtime->GetAllocator();
   int ci = shape[0];