From e2e222f4d231cd768f21ccbb22f09f5ca13705bc Mon Sep 17 00:00:00 2001 From: lzk Date: Sun, 7 Mar 2021 17:51:18 -0800 Subject: [PATCH] transpose op --- .../fp32/convolution_depthwise_fp32_coder.cc | 3 +- mindspore/lite/nnacl/fp16/transpose_fp16.c | 48 ++ mindspore/lite/nnacl/fp32/pack_fp32.c | 467 ++++++++++++------ mindspore/lite/nnacl/fp32/pack_fp32.h | 22 +- mindspore/lite/nnacl/fp32/transpose_fp32.c | 77 ++- mindspore/lite/nnacl/fp32/transpose_fp32.h | 6 +- .../nnacl/intrinsics/sse/PackNHWCToNCHWFp32.c | 140 ------ mindspore/lite/nnacl/transpose.h | 2 +- .../runtime/kernel/arm/fp32/transpose_fp32.cc | 99 ++-- .../runtime/kernel/arm/fp32/transpose_fp32.h | 12 +- .../runtime/kernel/arm/int8/transpose_int8.cc | 25 +- .../runtime/kernel/arm/int8/transpose_int8.h | 7 + .../kernel/npu/convolution_base_npu.cc | 2 +- .../arm/fp32/deconvolution_fp32_tests.cc | 19 +- .../kernel/arm/fp32/transpose_fp32_tests.cc | 6 +- 15 files changed, 566 insertions(+), 369 deletions(-) delete mode 100644 mindspore/lite/nnacl/intrinsics/sse/PackNHWCToNCHWFp32.c diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_depthwise_fp32_coder.cc b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_depthwise_fp32_coder.cc index 27e45cfc98..200d8b2d47 100644 --- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_depthwise_fp32_coder.cc +++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_depthwise_fp32_coder.cc @@ -39,7 +39,8 @@ int ConvolutionDepthwiseFP32Coder::InitWeightBias() { MS_CHECK_PTR(packed_weight_); MS_CHECK_RET_CODE(memset_s(packed_weight_, packed_weight_data_size, 0, packed_weight_data_size), "memset packed weight failed!"); - PackNCHWToNHWCFp32(origin_weight, packed_weight_, 1, filter_tensor_->Height() * filter_tensor_->Width(), channel); + PackNCHWToNHWCFp32(origin_weight, packed_weight_, 1, filter_tensor_->Height() * filter_tensor_->Width(), channel, 0, + 0); auto channel_size = static_cast(channel); auto bias_size = static_cast(channel_size * sizeof(float)); diff --git a/mindspore/lite/nnacl/fp16/transpose_fp16.c b/mindspore/lite/nnacl/fp16/transpose_fp16.c index aa62d715e9..98c0efa86a 100644 --- a/mindspore/lite/nnacl/fp16/transpose_fp16.c +++ b/mindspore/lite/nnacl/fp16/transpose_fp16.c @@ -127,6 +127,52 @@ void Fp16TransposeDim5(const float16_t *in_data, float16_t *out_data, int *strid } } +void Fp16TransposeDim6(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm, + const int *output_shape) { + const int stride0 = strides[perm[0]]; + const int stride1 = strides[perm[1]]; + const int stride2 = strides[perm[2]]; + const int stride3 = strides[perm[3]]; + const int stride4 = strides[perm[4]]; + const int stride5 = strides[perm[5]]; + const int out_stride0 = out_strides[0]; + const int out_stride1 = out_strides[1]; + const int out_stride2 = out_strides[2]; + const int out_stride3 = out_strides[3]; + const int out_stride4 = out_strides[4]; + const int output0 = output_shape[0]; + const int output1 = output_shape[1]; + const int output2 = output_shape[2]; + const int output3 = output_shape[3]; + const int output4 = output_shape[4]; + const int output5 = output_shape[5]; + + for (int i = 0; i < output0; ++i) { + int out_stride0_i = i * out_stride0; + int stride0_i = i * stride0; + for (int j = 0; j < output1; ++j) { + int out_stride1_j = j * out_stride1; + int stride1_j = j * stride1; + for (int k = 0; k < output2; ++k) { + int out_stride2_k = k * out_stride2; + int stride2_k = k * stride2; + for (int m = 0; m < output3; ++m) { + int out_stride3_m = m * out_stride3; + int stride3_m = m * stride3; + for (int n = 0; n < output4; ++n) { + int out_stride4_n = n * out_stride4; + int stride4_n = n * stride4; + for (int g = 0; g < output5; ++g) { + out_data[out_stride0_i + out_stride1_j + out_stride2_k + out_stride3_m + out_stride4_n + g] = + in_data[stride0_i + stride1_j + stride2_k + stride3_m + stride4_n + g * stride5]; + } + } + } + } + } + } +} + void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *strides, const int *out_strides, const int *perm, const int *output_shape, int dims, int *size, int *position) { *(size + dims - 1) = 1; @@ -190,6 +236,8 @@ int Fp16DoTranspose(const float16_t *in_data, float16_t *out_data, const int *ou Fp16TransposeDim4(in_data, out_data, strides, out_strides, perm, output_shape); } else if (num_axes == 5) { Fp16TransposeDim5(in_data, out_data, strides, out_strides, perm, output_shape); + } else if (num_axes == 6) { + Fp16TransposeDim6(in_data, out_data, strides, out_strides, perm, output_shape); } else { TransposeDimsFp16(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position); } diff --git a/mindspore/lite/nnacl/fp32/pack_fp32.c b/mindspore/lite/nnacl/fp32/pack_fp32.c index ff192c154a..aa4f970b91 100644 --- a/mindspore/lite/nnacl/fp32/pack_fp32.c +++ b/mindspore/lite/nnacl/fp32/pack_fp32.c @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2020-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ #include "nnacl/fp32/pack_fp32.h" void PackWeightKHWToHWKFp32(const void *src, void *dst, int plane, int channel) { - return PackNCHWToNHWCFp32(src, dst, 1, plane, channel); + return PackNCHWToNHWCFp32(src, dst, 1, plane, channel, 0, 0); } void PackHWCToWHC(const float *src, float *dst, int height, int width, int channel) { @@ -286,166 +286,45 @@ void PackDepthwiseIndirectWeightC8Fp32(const void *src, void *dst, int height, i } } -#ifndef ENABLE_SSE -void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int channel) { - int hw8 = plane / C8NUM * C8NUM; +void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int channel, int task_id, + int thread_count) { +#ifdef ENABLE_ARM64 + Transpose8X8Fp32Func Transpose8X8Fp32Func_ = Transpose8X8Fp32Arm64; +#elif defined(ENABLE_ARM32) + Transpose8X8Fp32Func Transpose8X8Fp32Func_ = Transpose8X8Fp32Arm32; +#elif defined(ENABLE_AVX) + Transpose8X8Fp32Func Transpose8X8Fp32Func_ = Transpose8X8Fp32Avx; +#elif defined(ENABLE_SSE) && !defined(ENABLE_AVX) + Transpose8X8Fp32Func Transpose8X8Fp32Func_ = Transpose8X8Fp32Sse; +#endif + int hw8 = plane / C8NUM; + int task_start = 0; + int task_end = plane; + if (thread_count > 0) { + int offset_hw = UP_DIV(hw8, thread_count) * C8NUM; + task_start = offset_hw * task_id; + int count = plane - task_start; + if (count <= 0) { + return; + } + task_end = (task_id + 1) == thread_count ? plane : MSMIN(plane, task_start + offset_hw); + hw8 = task_start + (task_end - task_start) > offset_hw ? offset_hw : 0; + } else { + hw8 *= C8NUM; + } int c8 = channel / C8NUM * C8NUM; int batch = plane * channel; for (int n = 0; n < batches; n++) { const float *src_batch = (const float *)src + n * batch; float *dst_batch = (float *)dst + n * batch; - int hw = 0; + int hw = task_start; for (; hw < hw8; hw += C8NUM) { int c = 0; for (; c < c8; c += C8NUM) { const float *src_ptr = src_batch + hw * channel + c; float *dst_ptr = dst_batch + c * plane + hw; -#ifdef ENABLE_ARM64 - size_t srcStride = channel * sizeof(float); - size_t dstStride = plane * sizeof(float); - asm volatile( - "mov x10, %[src_ptr]\n" - "mov x11, %[dst_ptr]\n" - - "ld1 {v0.4s, v1.4s}, [x10], %[srcStride]\n" - "ld1 {v2.4s, v3.4s}, [x10], %[srcStride]\n" - - "zip1 v8.4s, v0.4s, v2.4s\n" - "zip2 v9.4s, v0.4s, v2.4s\n" - "zip1 v12.4s, v1.4s, v3.4s\n" - "zip2 v13.4s, v1.4s, v3.4s\n" - - "ld1 {v4.4s, v5.4s}, [x10], %[srcStride]\n" - "ld1 {v6.4s, v7.4s}, [x10], %[srcStride]\n" - - "zip1 v10.4s, v4.4s, v6.4s\n" - "zip2 v11.4s, v4.4s, v6.4s\n" - "zip1 v14.4s, v5.4s, v7.4s\n" - "zip2 v15.4s, v5.4s, v7.4s\n" - - "ld1 {v0.4s, v1.4s}, [x10], %[srcStride]\n" - "ld1 {v2.4s, v3.4s}, [x10], %[srcStride]\n" - - "trn1 v16.2d, v8.2d, v10.2d\n" - "trn2 v18.2d, v8.2d, v10.2d\n" - "trn1 v20.2d, v9.2d, v11.2d\n" - "trn2 v22.2d, v9.2d, v11.2d\n" - - "ld1 {v4.4s, v5.4s}, [x10], %[srcStride]\n" - "ld1 {v6.4s, v7.4s}, [x10], %[srcStride]\n" - - "trn1 v24.2d, v12.2d, v14.2d\n" - "trn2 v26.2d, v12.2d, v14.2d\n" - "trn1 v28.2d, v13.2d, v15.2d\n" - "trn2 v30.2d, v13.2d, v15.2d\n" - - "zip1 v8.4s, v0.4s, v2.4s\n" - "zip2 v9.4s, v0.4s, v2.4s\n" - "zip1 v12.4s, v1.4s, v3.4s\n" - "zip2 v13.4s, v1.4s, v3.4s\n" - - "zip1 v10.4s, v4.4s, v6.4s\n" - "zip2 v11.4s, v4.4s, v6.4s\n" - "zip1 v14.4s, v5.4s, v7.4s\n" - "zip2 v15.4s, v5.4s, v7.4s\n" - - "trn1 v17.2d, v8.2d, v10.2d\n" - "trn2 v19.2d, v8.2d, v10.2d\n" - "trn1 v21.2d, v9.2d, v11.2d\n" - "trn2 v23.2d, v9.2d, v11.2d\n" - - "trn1 v25.2d, v12.2d, v14.2d\n" - "trn2 v27.2d, v12.2d, v14.2d\n" - "trn1 v29.2d, v13.2d, v15.2d\n" - "trn2 v31.2d, v13.2d, v15.2d\n" - - "st1 {v16.4s, v17.4s}, [x11], %[dstStride]\n" - "st1 {v18.4s, v19.4s}, [x11], %[dstStride]\n" - "st1 {v20.4s, v21.4s}, [x11], %[dstStride]\n" - "st1 {v22.4s, v23.4s}, [x11], %[dstStride]\n" - "st1 {v24.4s, v25.4s}, [x11], %[dstStride]\n" - "st1 {v26.4s, v27.4s}, [x11], %[dstStride]\n" - "st1 {v28.4s, v29.4s}, [x11], %[dstStride]\n" - "st1 {v30.4s, v31.4s}, [x11], %[dstStride]\n" - - : - : - [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride) - : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", - "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", - "v30", "v31"); -#elif ENABLE_ARM32 - size_t srcStride = channel * sizeof(float); - size_t dstStride = plane * sizeof(float); - asm volatile( - "mov r10, %[src_ptr]\n" - "mov r12, %[dst_ptr]\n" - - "vld1.32 {q0, q1}, [r10], %[srcStride]\n" - "vld1.32 {q2, q3}, [r10], %[srcStride]\n" - - "vtrn.32 d0, d4\n" - "vtrn.32 d1, d5\n" - "vtrn.32 d2, d6\n" - "vtrn.32 d3, d7\n" - - "vld1.32 {q4, q5}, [r10], %[srcStride]\n" - "vld1.32 {q6, q7}, [r10], %[srcStride]\n" - - "vtrn.32 d8, d12\n" - "vtrn.32 d9, d13\n" - "vtrn.32 d10, d14\n" - "vtrn.32 d11, d15\n" - - "vld1.32 {q8, q9}, [r10], %[srcStride]\n" - "vld1.32 {q10, q11}, [r10], %[srcStride]\n" - - "vswp d1, d8\n" - "vswp d3, d10\n" - "vswp d5, d12\n" - "vswp d7, d14\n" - - "vtrn.32 d16, d20\n" - "vtrn.32 d17, d21\n" - "vtrn.32 d18, d22\n" - "vtrn.32 d19, d23\n" - - "vld1.32 {q12, q13}, [r10], %[srcStride]\n" - "vld1.32 {q14, q15}, [r10], %[srcStride]\n" - - "vtrn.32 d24, d28\n" - "vtrn.32 d25, d29\n" - "vtrn.32 d26, d30\n" - "vtrn.32 d27, d31\n" - - "vswp d17, d24\n" - "vswp d19, d26\n" - "vswp d21, d28\n" - "vswp d23, d30\n" - - "add r10, r12, #16\n" - "vst1.32 {q0}, [r12], %[dstStride]\n" - "vst1.32 {q8}, [r10], %[dstStride]\n" - "vst1.32 {q2}, [r12], %[dstStride]\n" - "vst1.32 {q10}, [r10], %[dstStride]\n" - "vst1.32 {q4}, [r12], %[dstStride]\n" - "vst1.32 {q12}, [r10], %[dstStride]\n" - "vst1.32 {q6}, [r12], %[dstStride]\n" - "vst1.32 {q14}, [r10], %[dstStride]\n" - "vst1.32 {q1}, [r12], %[dstStride]\n" - "vst1.32 {q9}, [r10], %[dstStride]\n" - "vst1.32 {q3}, [r12], %[dstStride]\n" - "vst1.32 {q11}, [r10], %[dstStride]\n" - "vst1.32 {q5}, [r12], %[dstStride]\n" - "vst1.32 {q13}, [r10], %[dstStride]\n" - "vst1.32 {q7}, [r12], %[dstStride]\n" - "vst1.32 {q15}, [r10], %[dstStride]\n" - - : - : - [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride) - : "r10", "r12", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", - "q15"); +#if defined(ENABLE_ARM64) || defined(ENABLE_AVX) || defined(ENABLE_SSE) || defined(ENABLE_ARM32) + Transpose8X8Fp32Func_(src_ptr, dst_ptr, channel, plane); #else for (int tr = 0; tr < C8NUM; tr++) { for (int tc = 0; tc < C8NUM; tc++) { @@ -462,7 +341,7 @@ void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int } } } - for (; hw < plane; hw++) { + for (; hw < task_end; hw++) { const float *src_ptr = src_batch + hw * channel; float *dst_ptr = dst_batch + hw; for (size_t i = 0; i < channel; i++) { @@ -470,10 +349,286 @@ void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int } } } - return; +} + +void PackNCHWToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count) { + return PackNHWCToNCHWFp32(src, dst, batch, channel, plane, task_id, thread_count); +} + +#ifdef ENABLE_ARM64 +inline void Transpose8X8Fp32Arm64(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride) { + size_t srcStride = src_stride * sizeof(float); + size_t dstStride = dst_stride * sizeof(float); + asm volatile( + "mov x10, %[src_ptr]\n" + "mov x11, %[dst_ptr]\n" + + "ld1 {v0.4s, v1.4s}, [x10], %[srcStride]\n" + "ld1 {v2.4s, v3.4s}, [x10], %[srcStride]\n" + + "zip1 v8.4s, v0.4s, v2.4s\n" + "zip2 v9.4s, v0.4s, v2.4s\n" + "zip1 v12.4s, v1.4s, v3.4s\n" + "zip2 v13.4s, v1.4s, v3.4s\n" + + "ld1 {v4.4s, v5.4s}, [x10], %[srcStride]\n" + "ld1 {v6.4s, v7.4s}, [x10], %[srcStride]\n" + + "zip1 v10.4s, v4.4s, v6.4s\n" + "zip2 v11.4s, v4.4s, v6.4s\n" + "zip1 v14.4s, v5.4s, v7.4s\n" + "zip2 v15.4s, v5.4s, v7.4s\n" + + "ld1 {v0.4s, v1.4s}, [x10], %[srcStride]\n" + "ld1 {v2.4s, v3.4s}, [x10], %[srcStride]\n" + + "trn1 v16.2d, v8.2d, v10.2d\n" + "trn2 v18.2d, v8.2d, v10.2d\n" + "trn1 v20.2d, v9.2d, v11.2d\n" + "trn2 v22.2d, v9.2d, v11.2d\n" + + "ld1 {v4.4s, v5.4s}, [x10], %[srcStride]\n" + "ld1 {v6.4s, v7.4s}, [x10], %[srcStride]\n" + + "trn1 v24.2d, v12.2d, v14.2d\n" + "trn2 v26.2d, v12.2d, v14.2d\n" + "trn1 v28.2d, v13.2d, v15.2d\n" + "trn2 v30.2d, v13.2d, v15.2d\n" + + "zip1 v8.4s, v0.4s, v2.4s\n" + "zip2 v9.4s, v0.4s, v2.4s\n" + "zip1 v12.4s, v1.4s, v3.4s\n" + "zip2 v13.4s, v1.4s, v3.4s\n" + + "zip1 v10.4s, v4.4s, v6.4s\n" + "zip2 v11.4s, v4.4s, v6.4s\n" + "zip1 v14.4s, v5.4s, v7.4s\n" + "zip2 v15.4s, v5.4s, v7.4s\n" + + "trn1 v17.2d, v8.2d, v10.2d\n" + "trn2 v19.2d, v8.2d, v10.2d\n" + "trn1 v21.2d, v9.2d, v11.2d\n" + "trn2 v23.2d, v9.2d, v11.2d\n" + + "trn1 v25.2d, v12.2d, v14.2d\n" + "trn2 v27.2d, v12.2d, v14.2d\n" + "trn1 v29.2d, v13.2d, v15.2d\n" + "trn2 v31.2d, v13.2d, v15.2d\n" + + "st1 {v16.4s, v17.4s}, [x11], %[dstStride]\n" + "st1 {v18.4s, v19.4s}, [x11], %[dstStride]\n" + "st1 {v20.4s, v21.4s}, [x11], %[dstStride]\n" + "st1 {v22.4s, v23.4s}, [x11], %[dstStride]\n" + "st1 {v24.4s, v25.4s}, [x11], %[dstStride]\n" + "st1 {v26.4s, v27.4s}, [x11], %[dstStride]\n" + "st1 {v28.4s, v29.4s}, [x11], %[dstStride]\n" + "st1 {v30.4s, v31.4s}, [x11], %[dstStride]\n" + + : + : [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride) + : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); +} +#endif + +#ifdef ENABLE_ARM32 +inline void Transpose8X8Fp32Arm32(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride) { + size_t srcStride = src_stride * sizeof(float); + size_t dstStride = dst_stride * sizeof(float); + asm volatile( + "mov r10, %[src_ptr]\n" + "mov r12, %[dst_ptr]\n" + + "vld1.32 {q0, q1}, [r10], %[srcStride]\n" + "vld1.32 {q2, q3}, [r10], %[srcStride]\n" + + "vtrn.32 d0, d4\n" + "vtrn.32 d1, d5\n" + "vtrn.32 d2, d6\n" + "vtrn.32 d3, d7\n" + + "vld1.32 {q4, q5}, [r10], %[srcStride]\n" + "vld1.32 {q6, q7}, [r10], %[srcStride]\n" + + "vtrn.32 d8, d12\n" + "vtrn.32 d9, d13\n" + "vtrn.32 d10, d14\n" + "vtrn.32 d11, d15\n" + + "vld1.32 {q8, q9}, [r10], %[srcStride]\n" + "vld1.32 {q10, q11}, [r10], %[srcStride]\n" + + "vswp d1, d8\n" + "vswp d3, d10\n" + "vswp d5, d12\n" + "vswp d7, d14\n" + + "vtrn.32 d16, d20\n" + "vtrn.32 d17, d21\n" + "vtrn.32 d18, d22\n" + "vtrn.32 d19, d23\n" + + "vld1.32 {q12, q13}, [r10], %[srcStride]\n" + "vld1.32 {q14, q15}, [r10], %[srcStride]\n" + + "vtrn.32 d24, d28\n" + "vtrn.32 d25, d29\n" + "vtrn.32 d26, d30\n" + "vtrn.32 d27, d31\n" + + "vswp d17, d24\n" + "vswp d19, d26\n" + "vswp d21, d28\n" + "vswp d23, d30\n" + + "add r10, r12, #16\n" + "vst1.32 {q0}, [r12], %[dstStride]\n" + "vst1.32 {q8}, [r10], %[dstStride]\n" + "vst1.32 {q2}, [r12], %[dstStride]\n" + "vst1.32 {q10}, [r10], %[dstStride]\n" + "vst1.32 {q4}, [r12], %[dstStride]\n" + "vst1.32 {q12}, [r10], %[dstStride]\n" + "vst1.32 {q6}, [r12], %[dstStride]\n" + "vst1.32 {q14}, [r10], %[dstStride]\n" + "vst1.32 {q1}, [r12], %[dstStride]\n" + "vst1.32 {q9}, [r10], %[dstStride]\n" + "vst1.32 {q3}, [r12], %[dstStride]\n" + "vst1.32 {q11}, [r10], %[dstStride]\n" + "vst1.32 {q5}, [r12], %[dstStride]\n" + "vst1.32 {q13}, [r10], %[dstStride]\n" + "vst1.32 {q7}, [r12], %[dstStride]\n" + "vst1.32 {q15}, [r10], %[dstStride]\n" + + : + : [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride) + : "r10", "r12", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15"); } #endif -void PackNCHWToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel) { - return PackNHWCToNCHWFp32(src, dst, batch, channel, plane); +#ifdef ENABLE_AVX +inline void Transpose8X8Fp32Avx(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride) { + LOAD256X8_F32(src, src_ptr, src_stride) + __m256 r1 = _mm256_unpacklo_ps(src1, src2); + __m256 r2 = _mm256_unpackhi_ps(src1, src2); + __m256 r3 = _mm256_unpacklo_ps(src3, src4); + __m256 r4 = _mm256_unpackhi_ps(src3, src4); + __m256 r5 = _mm256_unpacklo_ps(src5, src6); + __m256 r6 = _mm256_unpackhi_ps(src5, src6); + __m256 r7 = _mm256_unpacklo_ps(src7, src8); + __m256 r8 = _mm256_unpackhi_ps(src7, src8); + + __m256 v; + v = _mm256_shuffle_ps(r1, r3, 0x4E); + src1 = _mm256_blend_ps(r1, v, 0xCC); + src2 = _mm256_blend_ps(r3, v, 0x33); + + v = _mm256_shuffle_ps(r2, r4, 0x4E); + src3 = _mm256_blend_ps(r2, v, 0xCC); + src4 = _mm256_blend_ps(r4, v, 0x33); + + v = _mm256_shuffle_ps(r5, r7, 0x4E); + src5 = _mm256_blend_ps(r5, v, 0xCC); + src6 = _mm256_blend_ps(r7, v, 0x33); + + v = _mm256_shuffle_ps(r6, r8, 0x4E); + src7 = _mm256_blend_ps(r6, v, 0xCC); + src8 = _mm256_blend_ps(r8, v, 0x33); + + r1 = _mm256_permute2f128_ps(src1, src5, 0x20); + r2 = _mm256_permute2f128_ps(src2, src6, 0x20); + r3 = _mm256_permute2f128_ps(src3, src7, 0x20); + r4 = _mm256_permute2f128_ps(src4, src8, 0x20); + r5 = _mm256_permute2f128_ps(src1, src5, 0x31); + r6 = _mm256_permute2f128_ps(src2, src6, 0x31); + r7 = _mm256_permute2f128_ps(src3, src7, 0x31); + r8 = _mm256_permute2f128_ps(src4, src8, 0x31); + + STORE256X8_F32(dst_ptr, dst_stride, r); } +#endif + +#if defined(ENABLE_SSE) && !defined(ENABLE_AVX) +inline void Transpose8X8Fp32Sse(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride) { + __m128 v0_ma = _mm_loadu_ps(src_ptr); + __m128 v1_ma = _mm_loadu_ps(src_ptr + src_stride); + __m128 v2_ma = _mm_loadu_ps(src_ptr + 2 * src_stride); + __m128 v3_ma = _mm_loadu_ps(src_ptr + 3 * src_stride); + + __m128 v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma); + __m128 v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma); + __m128 v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma); + __m128 v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma); + + __m128 v8_ma = _mm_movelh_ps(v4_ma, v6_ma); + __m128 v9_ma = _mm_movehl_ps(v6_ma, v4_ma); + __m128 v10_ma = _mm_movelh_ps(v5_ma, v7_ma); + __m128 v11_ma = _mm_movehl_ps(v7_ma, v5_ma); + + _mm_storeu_ps(dst_ptr, v8_ma); + _mm_storeu_ps(dst_ptr + dst_stride, v9_ma); + _mm_storeu_ps(dst_ptr + 2 * dst_stride, v10_ma); + _mm_storeu_ps(dst_ptr + 3 * dst_stride, v11_ma); + + v0_ma = _mm_loadu_ps(src_ptr + C4NUM); + v1_ma = _mm_loadu_ps(src_ptr + src_stride + C4NUM); + v2_ma = _mm_loadu_ps(src_ptr + 2 * src_stride + C4NUM); + v3_ma = _mm_loadu_ps(src_ptr + 3 * src_stride + C4NUM); + + v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma); + v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma); + v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma); + v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma); + + v8_ma = _mm_movelh_ps(v4_ma, v6_ma); + v9_ma = _mm_movehl_ps(v6_ma, v4_ma); + v10_ma = _mm_movelh_ps(v5_ma, v7_ma); + v11_ma = _mm_movehl_ps(v7_ma, v5_ma); + + _mm_storeu_ps(dst_ptr + C4NUM * dst_stride, v8_ma); + _mm_storeu_ps(dst_ptr + (C4NUM + 1) * dst_stride, v9_ma); + _mm_storeu_ps(dst_ptr + (C4NUM + 2) * dst_stride, v10_ma); + _mm_storeu_ps(dst_ptr + (C4NUM + 3) * dst_stride, v11_ma); + + v0_ma = _mm_loadu_ps(src_ptr + C4NUM * src_stride); + v1_ma = _mm_loadu_ps(src_ptr + (C4NUM + 1) * src_stride); + v2_ma = _mm_loadu_ps(src_ptr + (C4NUM + 2) * src_stride); + v3_ma = _mm_loadu_ps(src_ptr + (C4NUM + 3) * src_stride); + + v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma); + v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma); + v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma); + v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma); + + v8_ma = _mm_movelh_ps(v4_ma, v6_ma); + v9_ma = _mm_movehl_ps(v6_ma, v4_ma); + v10_ma = _mm_movelh_ps(v5_ma, v7_ma); + v11_ma = _mm_movehl_ps(v7_ma, v5_ma); + + _mm_storeu_ps(dst_ptr + C4NUM, v8_ma); + _mm_storeu_ps(dst_ptr + dst_stride + C4NUM, v9_ma); + _mm_storeu_ps(dst_ptr + 2 * dst_stride + C4NUM, v10_ma); + _mm_storeu_ps(dst_ptr + 3 * dst_stride + C4NUM, v11_ma); + + v0_ma = _mm_loadu_ps(src_ptr + C4NUM * src_stride + C4NUM); + v1_ma = _mm_loadu_ps(src_ptr + (C4NUM + 1) * src_stride + C4NUM); + v2_ma = _mm_loadu_ps(src_ptr + (C4NUM + 2) * src_stride + C4NUM); + v3_ma = _mm_loadu_ps(src_ptr + (C4NUM + 3) * src_stride + C4NUM); + + v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma); + v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma); + v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma); + v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma); + + v8_ma = _mm_movelh_ps(v4_ma, v6_ma); + v9_ma = _mm_movehl_ps(v6_ma, v4_ma); + v10_ma = _mm_movelh_ps(v5_ma, v7_ma); + v11_ma = _mm_movehl_ps(v7_ma, v5_ma); + + _mm_storeu_ps(dst_ptr + C4NUM * dst_stride + C4NUM, v8_ma); + _mm_storeu_ps(dst_ptr + (C4NUM + 1) * dst_stride + C4NUM, v9_ma); + _mm_storeu_ps(dst_ptr + (C4NUM + 2) * dst_stride + C4NUM, v10_ma); + _mm_storeu_ps(dst_ptr + (C4NUM + 3) * dst_stride + C4NUM, v11_ma); +} +#endif diff --git a/mindspore/lite/nnacl/fp32/pack_fp32.h b/mindspore/lite/nnacl/fp32/pack_fp32.h index 7bca84490f..cf89227846 100644 --- a/mindspore/lite/nnacl/fp32/pack_fp32.h +++ b/mindspore/lite/nnacl/fp32/pack_fp32.h @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2020-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,8 +30,9 @@ void PackNHWCToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int void PackNCHWToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int channel); void PackNHWCToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int channel); void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel); -void PackNHWCToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel); -void PackNCHWToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel); +// Note: If not multithreaded, please set task_id = 0 and thread_count = 0; +void PackNHWCToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count); +void PackNCHWToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count); void PackNHWC4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel); void PackNC4HW4ToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int channel); void PackNC4HW4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel); @@ -43,6 +44,21 @@ void PackDepthwiseIndirectWeightC8Fp32(const void *src, void *dst, int height, i void Im2ColPackUnitFp32(const float *input_data, const ConvParameter *conv_param, float *packed_input, int real_cal_num, int block_index); +// Transpose 8X8 Fp32 block data +typedef void (*Transpose8X8Fp32Func)(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride); +#ifdef ENABLE_ARM64 +void Transpose8X8Fp32Arm64(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride); +#endif +#ifdef ENABLE_ARM32 +void Transpose8X8Fp32Arm32(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride); +#endif +#ifdef ENABLE_AVX +void Transpose8X8Fp32Avx(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride); +#endif +#if defined(ENABLE_SSE) && !defined(ENABLE_AVX) +void Transpose8X8Fp32Sse(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride); +#endif + #ifdef __cplusplus } #endif diff --git a/mindspore/lite/nnacl/fp32/transpose_fp32.c b/mindspore/lite/nnacl/fp32/transpose_fp32.c index af8ec38e95..8b96d51d25 100644 --- a/mindspore/lite/nnacl/fp32/transpose_fp32.c +++ b/mindspore/lite/nnacl/fp32/transpose_fp32.c @@ -125,20 +125,73 @@ void TransposeDim5Fp32(const float *in_data, float *out_data, const int *strides } } -void TransposeDimsFp32(const float *in_data, float *out_data, const int *strides, const int *out_strides, - const int *perm, const int *output_shape, int dims, int *size, int *position) { - *(size + dims - 1) = 1; - for (int i = dims - 1; i > 0; --i) { - *(size + i - 1) = *(size + i) * output_shape[i]; +void TransposeDim6Fp32(const float *in_data, float *out_data, const int *strides, const int *out_strides, + const int *perm, const int *output_shape) { + const int stride0 = strides[perm[0]]; + const int stride1 = strides[perm[1]]; + const int stride2 = strides[perm[2]]; + const int stride3 = strides[perm[3]]; + const int stride4 = strides[perm[4]]; + const int stride5 = strides[perm[5]]; + const int out_stride0 = out_strides[0]; + const int out_stride1 = out_strides[1]; + const int out_stride2 = out_strides[2]; + const int out_stride3 = out_strides[3]; + const int out_stride4 = out_strides[4]; + const int output0 = output_shape[0]; + const int output1 = output_shape[1]; + const int output2 = output_shape[2]; + const int output3 = output_shape[3]; + const int output4 = output_shape[4]; + const int output5 = output_shape[5]; + + for (int i = 0; i < output0; ++i) { + int out_stride0_i = i * out_stride0; + int stride0_i = i * stride0; + for (int j = 0; j < output1; ++j) { + int out_stride1_j = j * out_stride1; + int stride1_j = j * stride1; + for (int k = 0; k < output2; ++k) { + int out_stride2_k = k * out_stride2; + int stride2_k = k * stride2; + for (int m = 0; m < output3; ++m) { + int out_stride3_m = m * out_stride3; + int stride3_m = m * stride3; + for (int n = 0; n < output4; ++n) { + int out_stride4_m = n * out_stride4; + int stride4_m = n * stride4; + for (int g = 0; g < output5; ++g) { + out_data[out_stride0_i + out_stride1_j + out_stride2_k + out_stride3_m + out_stride4_m + g] = + in_data[stride0_i + stride1_j + stride2_k + stride3_m + stride4_m + g * stride5]; + } + } + } + } + } } +} - for (size_t idx = 0; idx < (*size) * output_shape[0]; ++idx) { +void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape, int *size, int *position, + TransposeParameter *transpose_param, int task_id, int thread_num) { + int *perm = transpose_param->perm_; + int *strides = transpose_param->strides_; + int *out_strides = transpose_param->out_strides_; + int num_axes = transpose_param->num_axes_; + size_t data_size = (*size) * output_shape[0]; + size_t offset_size = UP_DIV(data_size, thread_num); + size_t task_offset = offset_size * task_id; + int count = data_size - task_offset; + if (count <= 0) { + return; + } + count = MSMIN(offset_size, count); + for (size_t idx = task_offset; idx < task_offset + count; ++idx) { int pos = idx; int output_idx = 0; int input_idx = 0; - for (int i = 0; i < dims; ++i) { + for (int i = 0; i < num_axes; ++i) { *(position + i) = pos / *(size + i); - int out_stride = i < dims - 1 ? out_strides[i] : 1; + int out_stride = i < num_axes - 1 ? out_strides[i] : 1; output_idx += (*(position + i) * out_stride); input_idx += (*(position + i) * strides[perm[i]]); pos -= *(position + i) * (*(size + i)); @@ -147,8 +200,8 @@ void TransposeDimsFp32(const float *in_data, float *out_data, const int *strides } } -int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *transpose_param, - int *size, int *position) { +int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, + TransposeParameter *transpose_param) { if (in_data == NULL || out_data == NULL) { return NNACL_ERR; } @@ -188,8 +241,10 @@ int DoTransposeFp32(const float *in_data, float *out_data, const int *output_sha TransposeDim4Fp32(in_data, out_data, strides, out_strides, perm, output_shape); } else if (num_axes == 5) { TransposeDim5Fp32(in_data, out_data, strides, out_strides, perm, output_shape); + } else if (num_axes == 6) { + TransposeDim6Fp32(in_data, out_data, strides, out_strides, perm, output_shape); } else { - TransposeDimsFp32(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position); + return NNACL_ERR; } return NNACL_OK; } diff --git a/mindspore/lite/nnacl/fp32/transpose_fp32.h b/mindspore/lite/nnacl/fp32/transpose_fp32.h index 9027577b06..b3e6a5fb3b 100644 --- a/mindspore/lite/nnacl/fp32/transpose_fp32.h +++ b/mindspore/lite/nnacl/fp32/transpose_fp32.h @@ -25,9 +25,9 @@ extern "C" { #endif -int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *transpose_param, - int *size, int *position); - +int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *param); +void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape, int *size, int *position, + TransposeParameter *transpose_param, int task_id, int thread_num); #ifdef __cplusplus } #endif diff --git a/mindspore/lite/nnacl/intrinsics/sse/PackNHWCToNCHWFp32.c b/mindspore/lite/nnacl/intrinsics/sse/PackNHWCToNCHWFp32.c deleted file mode 100644 index 26f602610e..0000000000 --- a/mindspore/lite/nnacl/intrinsics/sse/PackNHWCToNCHWFp32.c +++ /dev/null @@ -1,140 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifdef ENABLE_SSE -#include -#include "nnacl/pack.h" -#include "nnacl/int8/conv_int8.h" - -void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int channel) { - int hw8 = plane / C8NUM * C8NUM; - int c8 = channel / C8NUM * C8NUM; - int batch = plane * channel; - for (int n = 0; n < batches; n++) { - const float *src_batch = (const float *)src + n * batch; - float *dst_batch = (float *)dst + n * batch; - int hw = 0; - for (; hw < hw8; hw += C8NUM) { - int c = 0; - for (; c < c8; c += C8NUM) { - const float *src_ptr = src_batch + hw * channel + c; - float *dst_ptr = dst_batch + c * plane + hw; - - // 11-14 - __m128 v0_ma = _mm_loadu_ps(src_ptr); - __m128 v1_ma = _mm_loadu_ps(src_ptr + channel); - __m128 v2_ma = _mm_loadu_ps(src_ptr + 2 * channel); - __m128 v3_ma = _mm_loadu_ps(src_ptr + 3 * channel); - - __m128 v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma); - __m128 v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma); - __m128 v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma); - __m128 v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma); - - __m128 v8_ma = _mm_movelh_ps(v4_ma, v6_ma); - __m128 v9_ma = _mm_movehl_ps(v6_ma, v4_ma); - __m128 v10_ma = _mm_movelh_ps(v5_ma, v7_ma); - __m128 v11_ma = _mm_movehl_ps(v7_ma, v5_ma); - - _mm_storeu_ps(dst_ptr, v8_ma); - _mm_storeu_ps(dst_ptr + plane, v9_ma); - _mm_storeu_ps(dst_ptr + 2 * plane, v10_ma); - _mm_storeu_ps(dst_ptr + 3 * plane, v11_ma); - - // 15-18 - v0_ma = _mm_loadu_ps(src_ptr + C4NUM); - v1_ma = _mm_loadu_ps(src_ptr + channel + C4NUM); - v2_ma = _mm_loadu_ps(src_ptr + 2 * channel + C4NUM); - v3_ma = _mm_loadu_ps(src_ptr + 3 * channel + C4NUM); - - v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma); - v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma); - v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma); - v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma); - - v8_ma = _mm_movelh_ps(v4_ma, v6_ma); - v9_ma = _mm_movehl_ps(v6_ma, v4_ma); - v10_ma = _mm_movelh_ps(v5_ma, v7_ma); - v11_ma = _mm_movehl_ps(v7_ma, v5_ma); - - _mm_storeu_ps(dst_ptr + C4NUM * plane, v8_ma); - _mm_storeu_ps(dst_ptr + (C4NUM + 1) * plane, v9_ma); - _mm_storeu_ps(dst_ptr + (C4NUM + 2) * plane, v10_ma); - _mm_storeu_ps(dst_ptr + (C4NUM + 3) * plane, v11_ma); - - // 21-24 - v0_ma = _mm_loadu_ps(src_ptr + C4NUM * channel); - v1_ma = _mm_loadu_ps(src_ptr + (C4NUM + 1) * channel); - v2_ma = _mm_loadu_ps(src_ptr + (C4NUM + 2) * channel); - v3_ma = _mm_loadu_ps(src_ptr + (C4NUM + 3) * channel); - - v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma); - v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma); - v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma); - v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma); - - v8_ma = _mm_movelh_ps(v4_ma, v6_ma); - v9_ma = _mm_movehl_ps(v6_ma, v4_ma); - v10_ma = _mm_movelh_ps(v5_ma, v7_ma); - v11_ma = _mm_movehl_ps(v7_ma, v5_ma); - - _mm_storeu_ps(dst_ptr + C4NUM, v8_ma); - _mm_storeu_ps(dst_ptr + plane + C4NUM, v9_ma); - _mm_storeu_ps(dst_ptr + 2 * plane + C4NUM, v10_ma); - _mm_storeu_ps(dst_ptr + 3 * plane + C4NUM, v11_ma); - - // 25-28 - v0_ma = _mm_loadu_ps(src_ptr + C4NUM * channel + C4NUM); - v1_ma = _mm_loadu_ps(src_ptr + (C4NUM + 1) * channel + C4NUM); - v2_ma = _mm_loadu_ps(src_ptr + (C4NUM + 2) * channel + C4NUM); - v3_ma = _mm_loadu_ps(src_ptr + (C4NUM + 3) * channel + C4NUM); - - v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma); - v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma); - v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma); - v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma); - - v8_ma = _mm_movelh_ps(v4_ma, v6_ma); - v9_ma = _mm_movehl_ps(v6_ma, v4_ma); - v10_ma = _mm_movelh_ps(v5_ma, v7_ma); - v11_ma = _mm_movehl_ps(v7_ma, v5_ma); - - _mm_storeu_ps(dst_ptr + C4NUM * plane + C4NUM, v8_ma); - _mm_storeu_ps(dst_ptr + (C4NUM + 1) * plane + C4NUM, v9_ma); - _mm_storeu_ps(dst_ptr + (C4NUM + 2) * plane + C4NUM, v10_ma); - _mm_storeu_ps(dst_ptr + (C4NUM + 3) * plane + C4NUM, v11_ma); - } - - for (; c < channel; c++) { - const float *src_ptr = src_batch + hw * channel + c; - float *dst_ptr = dst_batch + c * plane + hw; - for (size_t i = 0; i < C8NUM; i++) { - dst_ptr[i] = src_ptr[i * channel]; - } - } - } - for (; hw < plane; hw++) { - const float *src_ptr = src_batch + hw * channel; - float *dst_ptr = dst_batch + hw; - for (size_t i = 0; i < channel; i++) { - dst_ptr[i * plane] = src_ptr[i]; - } - } - } - return; -} - -#endif diff --git a/mindspore/lite/nnacl/transpose.h b/mindspore/lite/nnacl/transpose.h index c22e6e70e1..b69ddabfb1 100644 --- a/mindspore/lite/nnacl/transpose.h +++ b/mindspore/lite/nnacl/transpose.h @@ -19,7 +19,7 @@ #include "nnacl/op_base.h" -#define MAX_TRANSPOSE_DIM_SIZE 5 +#define MAX_TRANSPOSE_DIM_SIZE 6 typedef struct TransposeParameter { // primitive parameter diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.cc index fa5e55e9c4..cff1de66c0 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.cc @@ -22,6 +22,7 @@ using mindspore::lite::KernelRegistrar; using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_NULL_PTR; using mindspore::lite::RET_OK; using mindspore::lite::RET_OP_EXECUTE_FAILURE; using mindspore::schema::PrimitiveType_Transpose; @@ -82,31 +83,46 @@ TransposeCPUKernel::~TransposeCPUKernel() { } } -int TransposeCPUKernel::NhNcTranspose(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param) { +void TransposeCPUKernel::GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor, + TransposeParameter *param) { auto out_shape = out_tensor->shape(); if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 2 && param->perm_[2] == 3 && param->perm_[3] == 1) { + nhnc_param_[0] = out_shape[0]; + nhnc_param_[1] = out_shape[1] * out_shape[2]; + nhnc_param_[2] = out_shape[3]; if (in_tensor->data_type() == kNumberTypeFloat32) { - PackNCHWToNHWCFp32(in_tensor->MutableData(), out_tensor->MutableData(), out_shape[0], out_shape[1] * out_shape[2], - out_shape[3]); - } else if (in_tensor->data_type() == kNumberTypeInt8) { - PackNCHWToNHWCInt8(in_tensor->MutableData(), out_tensor->MutableData(), out_shape[0], out_shape[1] * out_shape[2], - out_shape[3]); + NHNCTransposeFunc_ = PackNCHWToNHWCFp32; } - return RET_OK; } if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 && param->perm_[3] == 2) { + nhnc_param_[0] = out_shape[0]; + nhnc_param_[1] = out_shape[2] * out_shape[3]; + nhnc_param_[2] = out_shape[1]; if (in_tensor->data_type() == kNumberTypeFloat32) { - PackNHWCToNCHWFp32(in_tensor->MutableData(), out_tensor->MutableData(), out_shape[0], out_shape[2] * out_shape[3], - out_shape[1]); - } else if (in_tensor->data_type() == kNumberTypeInt8) { - PackNHWCToNCHWInt8(in_tensor->MutableData(), out_tensor->MutableData(), out_shape[0], out_shape[2] * out_shape[3], - out_shape[1]); + NHNCTransposeFunc_ = PackNHWCToNCHWFp32; } - return RET_OK; } - return RET_ERROR; +} + +int TransposeCPUKernel::RunImpl(int task_id) { + if (NHNCTransposeFunc_ != nullptr) { + NHNCTransposeFunc_(in_data_, out_data_, nhnc_param_[0], nhnc_param_[1], nhnc_param_[2], task_id, thread_count_); + } else { + TransposeDimsFp32(in_data_, out_data_, out_shape_, dim_size_, position_ + dims_ * task_id, param_, task_id, + thread_count_); + } + return RET_OK; +} + +int TransposeImpl(void *kernel, int task_id) { + auto transpose = reinterpret_cast(kernel); + auto ret = transpose->RunImpl(task_id); + if (ret != RET_OK) { + MS_LOG(ERROR) << "TransposeImpl Run error task_id[" << task_id << "] error_code[" << ret << "]"; + } + return ret; } int TransposeCPUKernel::Run() { @@ -123,8 +139,8 @@ int TransposeCPUKernel::Run() { MS_ASSERT(in_data_); MS_ASSERT(out_data_); - TransposeParameter *param = reinterpret_cast(this->op_parameter_); - if (in_tensor->shape().size() != static_cast(param->num_axes_)) { + param_ = reinterpret_cast(this->op_parameter_); + if (in_tensor->shape().size() != static_cast(param_->num_axes_)) { memcpy(out_data_, in_data_, in_tensor->ElementsNum() * sizeof(float)); return RET_OK; } @@ -134,40 +150,48 @@ int TransposeCPUKernel::Run() { MS_ASSERT(input_perm->data_c() != nullptr); int *perm_data = reinterpret_cast(input_perm->data_c()); for (int i = 0; i < input_perm->ElementsNum(); ++i) { - param->perm_[i] = perm_data[i]; + param_->perm_[i] = perm_data[i]; } for (int i = input_perm->ElementsNum(); i < MAX_SHAPE_SIZE; ++i) { - param->perm_[i] = 0; + param_->perm_[i] = 0; } } - auto ret = NhNcTranspose(in_tensor, out_tensor, param); - if (ret == RET_OK) { + thread_count_ = op_parameter_->thread_num_; + GetNHNCTransposeFunc(in_tensor, out_tensor, param_); + if (NHNCTransposeFunc_ != nullptr) { + auto ret = ParallelLaunch(this->context_->thread_pool_, TransposeImpl, this, thread_count_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "NHNCTransposeFunc_ is error!"; + } return ret; } - if (in_tensor->data_type() == kNumberTypeInt8) { - MS_LOG(ERROR) << "not support now"; - return RET_ERROR; - } - int dims = out_tensor->shape().size(); - if (dims > MAX_TRANSPOSE_DIM_SIZE) { - dim_size_ = reinterpret_cast(context_->allocator->Malloc(dims * sizeof(int))); + MS_ASSERT(out_shape_); + dims_ = out_tensor->shape().size(); + if (dims_ > MAX_TRANSPOSE_DIM_SIZE) { + dim_size_ = reinterpret_cast(context_->allocator->Malloc(dims_ * sizeof(int))); if (dim_size_ == nullptr) { MS_LOG(ERROR) << "Malloc data failed"; - return RET_ERROR; + return RET_NULL_PTR; } - position_ = reinterpret_cast(context_->allocator->Malloc(dims * sizeof(int))); + *(dim_size_ + dims_ - 1) = 1; + for (int i = dims_ - 1; i > 0; --i) { + *(dim_size_ + i - 1) = *(dim_size_ + i) * out_shape_[i]; + } + position_ = reinterpret_cast(context_->allocator->Malloc(dims_ * sizeof(int) * thread_count_)); if (position_ == nullptr) { - MS_LOG(ERROR) << "Malloc data failed"; context_->allocator->Free(dim_size_); - dim_size_ = nullptr; - return RET_ERROR; + MS_LOG(ERROR) << "Malloc data failed"; + return RET_NULL_PTR; } } - - MS_ASSERT(out_shape_); - ret = DoTransposeFp32(in_data_, out_data_, out_shape_, param, dim_size_, position_); - if (dims > MAX_TRANSPOSE_DIM_SIZE) { + int ret; + if (dims_ > MAX_TRANSPOSE_DIM_SIZE) { + ret = ParallelLaunch(this->context_->thread_pool_, TransposeImpl, this, thread_count_); + } else { + ret = DoTransposeFp32(in_data_, out_data_, out_shape_, param_); + } + if (dims_ > MAX_TRANSPOSE_DIM_SIZE) { context_->allocator->Free(dim_size_); context_->allocator->Free(position_); dim_size_ = nullptr; @@ -175,13 +199,10 @@ int TransposeCPUKernel::Run() { } if (ret != RET_OK) { MS_LOG(ERROR) << "Transpose run failed"; - return RET_ERROR; } - return ret; } REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Transpose, LiteKernelCreator) REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Transpose, LiteKernelCreator) -REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Transpose, LiteKernelCreator) } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.h index 90926625f2..9aa20507a7 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.h @@ -25,6 +25,10 @@ #include "src/kernel_registry.h" namespace mindspore::kernel { + +typedef void (*TransposeFunc)(const void *src, void *dst, int batch, int plane, int channel, int thread_num, + int task_id); + class TransposeCPUKernel : public LiteKernel { public: explicit TransposeCPUKernel(OpParameter *param, const std::vector &inputs, @@ -35,14 +39,20 @@ class TransposeCPUKernel : public LiteKernel { int Init() override; int ReSize() override; int Run() override; - int NhNcTranspose(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param); + int RunImpl(int task_id); protected: + void GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param); float *in_data_ = nullptr; float *out_data_ = nullptr; int *out_shape_ = nullptr; int *dim_size_ = nullptr; int *position_ = nullptr; + TransposeParameter *param_ = nullptr; + TransposeFunc NHNCTransposeFunc_ = nullptr; + int thread_count_ = 0; + int nhnc_param_[3]; + int dims_ = 0; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/transpose_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/transpose_int8.cc index 6a870a97f3..e49423c9d3 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/transpose_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/transpose_int8.cc @@ -141,6 +141,25 @@ int TransposeInt8CPUKernel::DoTranspose(int task_id) { return RET_OK; } +void TransposeInt8CPUKernel::GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor, + TransposeParameter *param) { + auto out_shape = out_tensor->shape(); + if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 2 && param->perm_[2] == 3 && + param->perm_[3] == 1) { + nhnc_param_[0] = out_shape[0]; + nhnc_param_[1] = out_shape[1] * out_shape[2]; + nhnc_param_[2] = out_shape[3]; + NHNCTransposeFunc_ = PackNCHWToNHWCInt8; + } + if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 && + param->perm_[3] == 2) { + nhnc_param_[0] = out_shape[0]; + nhnc_param_[1] = out_shape[2] * out_shape[3]; + nhnc_param_[2] = out_shape[1]; + NHNCTransposeFunc_ = PackNHWCToNCHWInt8; + } +} + int TransposeInt8CPUKernel::Run() { auto in_tensor = in_tensors_.front(); auto out_tensor = out_tensors_.front(); @@ -150,7 +169,11 @@ int TransposeInt8CPUKernel::Run() { in_ptr_ = reinterpret_cast(in_tensor->data_c()); out_ptr_ = reinterpret_cast(out_tensor->data_c()); - + GetNHNCTransposeFunc(in_tensor, out_tensor, transpose_param_); + if (NHNCTransposeFunc_ != nullptr) { + NHNCTransposeFunc_(in_ptr_, out_ptr_, nhnc_param_[0], nhnc_param_[1], nhnc_param_[2]); + return RET_OK; + } memcpy(in_shape_, in_dims.data(), in_dims.size() * sizeof(int)); memcpy(out_shape_, out_dims.data(), out_dims.size() * sizeof(int)); diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/transpose_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/transpose_int8.h index c78c826935..a0f6366ea5 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/transpose_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/transpose_int8.h @@ -17,12 +17,16 @@ #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_TRANSPOSE_INT8_H_ #include +#include "nnacl/int8/pack_int8.h" #include "nnacl/int8/transpose_int8.h" #include "src/kernel_registry.h" #include "src/lite_kernel.h" #include "include/errorcode.h" namespace mindspore::kernel { + +typedef void (*TransposeFunc)(const void *src, void *dst, int batch, int plane, int channel); + class TransposeInt8CPUKernel : public LiteKernel { public: TransposeInt8CPUKernel(OpParameter *parameter, const std::vector &inputs, @@ -44,7 +48,9 @@ class TransposeInt8CPUKernel : public LiteKernel { void FreeTmpBuf(); private: + void GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param); TransposeParameter *transpose_param_; + TransposeFunc NHNCTransposeFunc_ = nullptr; int8_t *in_ptr_ = nullptr; int8_t *out_ptr_ = nullptr; int *dim_size_ = nullptr; @@ -56,6 +62,7 @@ class TransposeInt8CPUKernel : public LiteKernel { int num_unit_ = 0; int in_shape_[8] = {0}; int out_shape_[8] = {0}; + int nhnc_param_[3]; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/npu/convolution_base_npu.cc b/mindspore/lite/src/runtime/kernel/npu/convolution_base_npu.cc index 6f134709c0..02fd22d694 100644 --- a/mindspore/lite/src/runtime/kernel/npu/convolution_base_npu.cc +++ b/mindspore/lite/src/runtime/kernel/npu/convolution_base_npu.cc @@ -47,7 +47,7 @@ int ConvolutionBaseNPUKernel::InitWeightConst(const std::vector MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - PackNHWCToNCHWFp32(nhwc_data, nchw_data, w_shape[0], w_shape[1] * w_shape[2], w_shape[3]); + PackNHWCToNCHWFp32(nhwc_data, nchw_data, w_shape[0], w_shape[1] * w_shape[2], w_shape[3], 0, 0); std::shared_ptr weight_tensor = std::shared_ptr(new (std::nothrow) ge::Tensor()); if (weight_tensor == nullptr) { diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/deconvolution_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/deconvolution_fp32_tests.cc index b4cfb75286..2f34ae980d 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/deconvolution_fp32_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/deconvolution_fp32_tests.cc @@ -335,7 +335,8 @@ int DeConvTestInit1(std::vector *inputs_, std::vectorMutableData(), in_t->Batch(), in_t->Width() * in_t->Height(), in_t->Channel()); + PackNCHWToNHWCFp32(in_nchw, in_t->MutableData(), in_t->Batch(), in_t->Width() * in_t->Height(), in_t->Channel(), 0, + 0); inputs_->push_back(in_t); std::vector weight_dims_nhwc = {2, 3, 3, 6}; @@ -358,7 +359,7 @@ int DeConvTestInit1(std::vector *inputs_, std::vectorMutableData(), weight_t->Batch(), weight_t->Width() * weight_t->Height(), - weight_t->Channel()); + weight_t->Channel(), 0, 0); inputs_->push_back(weight_t); auto *bias_t = new lite::Tensor(kNumberTypeFloat, {6}, schema::Format_NHWC, lite::Tensor::Category::CONST_TENSOR); @@ -463,7 +464,7 @@ int DeConvTestInit1(std::vector *inputs_, std::vectorBatch(), out_t->Width() * out_t->Height(), out_t->Channel()); + PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel(), 0, 0); conv_param->kernel_h_ = conv_param->kernel_w_ = 3; conv_param->stride_h_ = conv_param->stride_w_ = 2; @@ -531,7 +532,7 @@ int DeConvTestInit2(std::vector *inputs_, std::vectorBatch(), out_t->Width() * out_t->Height(), out_t->Channel()); + PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel(), 0, 0); conv_param->kernel_h_ = conv_param->kernel_w_ = 3; conv_param->stride_h_ = conv_param->stride_w_ = 2; @@ -571,7 +572,7 @@ int DeConvTestInit3(std::vector *inputs_, std::vector(in_t->MutableData()), in_t->Batch(), - in_t->Width() * in_t->Height(), in_t->Channel()); + in_t->Width() * in_t->Height(), in_t->Channel(), 0, 0); inputs_->push_back(in_t); std::vector w_dims_nhwc = {2, 2, 2, 2}; @@ -582,7 +583,7 @@ int DeConvTestInit3(std::vector *inputs_, std::vectorMutableData(), weight_t->Batch(), weight_t->Width() * weight_t->Height(), - weight_t->Channel()); + weight_t->Channel(), 0, 0); inputs_->push_back(weight_t); std::vector out_dims_nhwc = {1, 9, 9, 2}; @@ -609,7 +610,7 @@ int DeConvTestInit3(std::vector *inputs_, std::vectorBatch(), out_t->Width() * out_t->Height(), out_t->Channel()); + PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel(), 0, 0); conv_param->kernel_h_ = conv_param->kernel_w_ = 2; conv_param->stride_h_ = conv_param->stride_w_ = 3; @@ -658,7 +659,7 @@ int DeConvTestInit4(std::vector *inputs_, std::vector(mindspore::lite::ReadFile(weight_path.c_str(), &buffer_size)); PackNCHWToNHWCFp32(weight_nchw, weight_t->MutableData(), weight_t->Batch(), weight_t->Width() * weight_t->Height(), - weight_t->Channel()); + weight_t->Channel(), 0, 0); inputs_->push_back(weight_t); auto *bias_t = new lite::Tensor(kNumberTypeFloat, {40}, schema::Format_NHWC, lite::Tensor::Category::CONST_TENSOR); @@ -676,7 +677,7 @@ int DeConvTestInit4(std::vector *inputs_, std::vector(malloc(buffer_size)); - PackNCHWToNHWCFp32(out_nchw, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel()); + PackNCHWToNHWCFp32(out_nchw, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel(), 0, 0); conv_param->kernel_h_ = conv_param->kernel_w_ = 3; conv_param->stride_h_ = conv_param->stride_w_ = 1; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/transpose_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/transpose_fp32_tests.cc index c0c63da22c..12e965431d 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/transpose_fp32_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/transpose_fp32_tests.cc @@ -63,7 +63,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes4) { param->out_strides_[i] = out_strides[i]; } - auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr); + auto ret = DoTransposeFp32(in, out, output_shape, param); ASSERT_EQ(ret, 0); delete param; ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001)); @@ -102,7 +102,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes3) { param->out_strides_[i] = out_strides[i]; } - auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr); + auto ret = DoTransposeFp32(in, out, output_shape, param); ASSERT_EQ(ret, 0); delete param; ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001)); @@ -142,7 +142,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes2) { param->out_strides_[i] = out_strides[i]; } - auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr); + auto ret = DoTransposeFp32(in, out, output_shape, param); ASSERT_EQ(ret, 0); delete param; ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001));