From 08ed63ff91b675e6904c7422428f7d6089f3c4b0 Mon Sep 17 00:00:00 2001 From: lixian Date: Tue, 23 Mar 2021 09:40:31 +0800 Subject: [PATCH] fix write for big endian devices --- include/api/context.h | 2 +- mindspore/lite/include/ms_tensor.h | 4 +- .../lite/nnacl/assembly/arm64/AdderFp32.S | 72 +-- .../assembly/arm64/ConvDwFp32Indirect3x3.S | 2 +- .../assembly/arm64/ConvDwFp32Indirect5x5.S | 2 +- .../lite/nnacl/assembly/arm64/MatmulFp32Opt.S | 145 ++---- .../assembly/fp16/IndirectGemmFp16_16x8.S | 444 ++++++++---------- .../lite/nnacl/assembly/fp16/MatmulFp16Opt.S | 322 +++++-------- mindspore/lite/src/cxx_api/context.cc | 4 +- mindspore/lite/src/tensor.h | 2 +- 10 files changed, 407 insertions(+), 592 deletions(-) diff --git a/include/api/context.h b/include/api/context.h index 635faa250f..591c3b08a7 100644 --- a/include/api/context.h +++ b/include/api/context.h @@ -82,7 +82,7 @@ class MS_API CPUDeviceInfo : public DeviceInfoContext { public: enum DeviceType GetDeviceType() const override { return DeviceType::kCPU; }; - /// \brief Set the thread affinity of CPU cores. + /// \brief Set the thread affinity to CPU cores. /// /// \param mode: 0: no affinities, 1: big cores first, 2: little cores first void SetThreadAffinity(int mode); diff --git a/mindspore/lite/include/ms_tensor.h b/mindspore/lite/include/ms_tensor.h index 1a6f3112b8..ec3af53bac 100644 --- a/mindspore/lite/include/ms_tensor.h +++ b/mindspore/lite/include/ms_tensor.h @@ -53,7 +53,7 @@ class MS_API MSTensor { virtual Vector shape() const = 0; /// \brief Set the shape of MSTensor. - virtual void set_shape(const Vector &name) = 0; + virtual void set_shape(const Vector &shape) = 0; /// \brief Get number of element in MSTensor. /// @@ -71,7 +71,7 @@ class MS_API MSTensor { virtual String tensor_name() const = 0; /// \brief Set the name of MSTensor. - virtual void set_tensor_name(const String name) = 0; + virtual void set_tensor_name(const String &name) = 0; /// \brief Get the pointer of data in MSTensor. 
/// diff --git a/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S b/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S index 985074fed6..df9e94fffa 100644 --- a/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S +++ b/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S @@ -458,115 +458,91 @@ LoopRow4: b WriteEnd Write2: add x2, x2, #8 - str d9, [x11] + st1 {v9.2s}, [x11], x8 cmp x6, #1 beq WriteEnd - add x11, x11, x8 - str d11, [x11] + st1 {v11.2s}, [x11], x8 cmp x6, #2 beq WriteEnd - add x11, x11, x8 - str d13, [x11] + st1 {v13.2s}, [x11], x8 cmp x6, #3 beq WriteEnd - add x11, x11, x8 - str d15, [x11] + st1 {v15.2s}, [x11], x8 cmp x6, #4 beq WriteEnd - add x11, x11, x8 - str d17, [x11] + st1 {v17.2s}, [x11], x8 cmp x6, #5 beq WriteEnd - add x11, x11, x8 - str d19, [x11] + st1 {v19.2s}, [x11], x8 cmp x6, #6 beq WriteEnd - add x11, x11, x8 - str d21, [x11] + st1 {v21.2s}, [x11], x8 cmp x6, #7 beq WriteEnd - add x11, x11, x8 - str d23, [x11] + st1 {v23.2s}, [x11], x8 cmp x6, #8 beq WriteEnd - add x11, x11, x8 - str d25, [x11] + st1 {v25.2s}, [x11], x8 cmp x6, #9 beq WriteEnd - add x11, x11, x8 - str d27, [x11] + st1 {v27.2s}, [x11], x8 cmp x6, #10 beq WriteEnd - add x11, x11, x8 - str d29, [x11] + st1 {v29.2s}, [x11], x8 cmp x6, #11 beq WriteEnd - add x11, x11, x8 - str d31, [x11] - add x11, x11, x8 + st1 {v31.2s}, [x11], x8 add x11, x11, #8 b WriteEnd Write3: add x2, x2, #12 add x19, x11, #8 - str d9, [x11] + st1 {v9.2s}, [x11], x8 st1 {v9.s}[2], [x19], x8 cmp x6, #1 beq WriteEnd - add x11, x11, x8 - str d11, [x11] + st1 {v11.2s}, [x11], x8 st1 {v11.s}[2], [x19], x8 cmp x6, #2 beq WriteEnd - add x11, x11, x8 - str d13, [x11] + st1 {v13.2s}, [x11], x8 st1 {v13.s}[2], [x19], x8 cmp x6, #3 beq WriteEnd - add x11, x11, x8 - str d15, [x11] + st1 {v15.2s}, [x11], x8 st1 {v15.s}[2], [x19], x8 cmp x6, #4 beq WriteEnd - add x11, x11, x8 - str d17, [x11] + st1 {v17.2s}, [x11], x8 st1 {v17.s}[2], [x19], x8 cmp x6, #5 beq WriteEnd - add x11, x11, x8 - str d19, [x11] + st1 {v19.2s}, [x11], 
x8 st1 {v19.s}[2], [x19], x8 cmp x6, #6 beq WriteEnd - add x11, x11, x8 - str d21, [x11] + st1 {v21.2s}, [x11], x8 st1 {v21.s}[2], [x19], x8 cmp x6, #7 beq WriteEnd - add x11, x11, x8 - str d23, [x11] + st1 {v23.2s}, [x11], x8 st1 {v23.s}[2], [x19], x8 cmp x6, #8 beq WriteEnd - add x11, x11, x8 - str d25, [x11] + st1 {v25.2s}, [x11], x8 st1 {v25.s}[2], [x19], x8 cmp x6, #9 beq WriteEnd - add x11, x11, x8 - str d27, [x11] + st1 {v27.2s}, [x11], x8 st1 {v27.s}[2], [x19], x8 cmp x6, #10 beq WriteEnd - add x11, x11, x8 - str d29, [x11] + st1 {v29.2s}, [x11], x8 st1 {v29.s}[2], [x19], x8 cmp x6, #11 beq WriteEnd - add x11, x11, x8 - str d31, [x11] + st1 {v31.2s}, [x11], x8 st1 {v31.s}[2], [x19] - add x11, x11, x8 add x11, x11, #12 b WriteEnd Write4: diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S index 5be857a793..a60a27fe05 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S @@ -129,7 +129,7 @@ asm_function ConvDwFp32Indirect3x3 tbnz w11, #1, Write2 tbnz w11, #0, Write1 Write2: - str d29, [x0], #8 + st1 {v29.2s}, [x0], #8 ext v29.16b, v29.16b, v29.16b, #8 tbz w11, #0, NextPixel Write1: diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S index 2ffb4a041a..5e1045aa72 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S @@ -260,7 +260,7 @@ asm_function ConvDwFp32Indirect5x5 tbnz w2, #1, Write2 tbnz w2, #0, Write1 Write2: - str d29, [x0], #8 + st1 {v29.2s}, [x0], #8 ext v29.16b, v29.16b, v29.16b, #8 tbz w2, #0, NextPixel Write1: diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S index 7a103239b5..e495feec78 100644 --- 
a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S +++ b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S @@ -740,115 +740,91 @@ LoopRow4: b WriteEnd Write2: add x2, x2, #8 - str d8, [x11] + st1 {v8.2s}, [x11], x8 cmp x6, #1 beq WriteEnd - add x11, x11, x8 - str d10, [x11] + st1 {v10.2s}, [x11], x8 cmp x6, #2 beq WriteEnd - add x11, x11, x8 - str d12, [x11] + st1 {v12.2s}, [x11], x8 cmp x6, #3 beq WriteEnd - add x11, x11, x8 - str d14, [x11] + st1 {v14.2s}, [x11], x8 cmp x6, #4 beq WriteEnd - add x11, x11, x8 - str d16, [x11] + st1 {v16.2s}, [x11], x8 cmp x6, #5 beq WriteEnd - add x11, x11, x8 - str d18, [x11] + st1 {v18.2s}, [x11], x8 cmp x6, #6 beq WriteEnd - add x11, x11, x8 - str d20, [x11] + st1 {v20.2s}, [x11], x8 cmp x6, #7 beq WriteEnd - add x11, x11, x8 - str d22, [x11] + st1 {v22.2s}, [x11], x8 cmp x6, #8 beq WriteEnd - add x11, x11, x8 - str d24, [x11] + st1 {v24.2s}, [x11], x8 cmp x6, #9 beq WriteEnd - add x11, x11, x8 - str d26, [x11] + st1 {v26.2s}, [x11], x8 cmp x6, #10 beq WriteEnd - add x11, x11, x8 - str d28, [x11] + st1 {v28.2s}, [x11], x8 cmp x6, #11 beq WriteEnd - add x11, x11, x8 - str d30, [x11] - add x11, x11, x8 + st1 {v30.2s}, [x11], x8 add x11, x11, #8 b WriteEnd Write3: add x2, x2, #12 add x19, x11, #8 - str d8, [x11] + st1 {v8.2s}, [x11], x8 st1 {v8.s}[2], [x19], x8 cmp x6, #1 beq WriteEnd - add x11, x11, x8 - str d10, [x11] + st1 {v10.2s}, [x11], x8 st1 {v10.s}[2], [x19], x8 cmp x6, #2 beq WriteEnd - add x11, x11, x8 - str d12, [x11] + st1 {v12.2s}, [x11], x8 st1 {v12.s}[2], [x19], x8 cmp x6, #3 beq WriteEnd - add x11, x11, x8 - str d14, [x11] + st1 {v14.2s}, [x11], x8 st1 {v14.s}[2], [x19], x8 cmp x6, #4 beq WriteEnd - add x11, x11, x8 - str d16, [x11] + st1 {v16.2s}, [x11], x8 st1 {v16.s}[2], [x19], x8 cmp x6, #5 beq WriteEnd - add x11, x11, x8 - str d18, [x11] + st1 {v18.2s}, [x11], x8 st1 {v18.s}[2], [x19], x8 cmp x6, #6 beq WriteEnd - add x11, x11, x8 - str d20, [x11] + st1 {v20.2s}, [x11], x8 st1 {v20.s}[2], [x19], x8 cmp 
x6, #7 beq WriteEnd - add x11, x11, x8 - str d22, [x11] + st1 {v22.2s}, [x11], x8 st1 {v22.s}[2], [x19], x8 cmp x6, #8 beq WriteEnd - add x11, x11, x8 - str d24, [x11] + st1 {v24.2s}, [x11], x8 st1 {v24.s}[2], [x19], x8 cmp x6, #9 beq WriteEnd - add x11, x11, x8 - str d26, [x11] + st1 {v26.2s}, [x11], x8 st1 {v26.s}[2], [x19], x8 cmp x6, #10 beq WriteEnd - add x11, x11, x8 - str d28, [x11] + st1 {v28.2s}, [x11], x8 st1 {v28.s}[2], [x19], x8 cmp x6, #11 beq WriteEnd - add x11, x11, x8 - str d30, [x11] + st1 {v30.2s}, [x11], x8 st1 {v30.s}[2], [x19] - add x11, x11, x8 add x11, x11, #12 b WriteEnd Write4: @@ -955,62 +931,51 @@ LoopRow4: add x2, x2, #24 add x19, x11, #16 st1 {v8.4s}, [x11], x8 - str d9, [x19] + st1 {v9.2s}, [x19], x8 cmp x6, #1 beq WriteEnd - add x19, x19, x8 st1 {v10.4s}, [x11], x8 - str d11, [x19] + st1 {v11.2s}, [x19], x8 cmp x6, #2 beq WriteEnd - add x19, x19, x8 st1 {v12.4s}, [x11], x8 - str d13, [x19] + st1 {v13.2s}, [x19], x8 cmp x6, #3 beq WriteEnd - add x19, x19, x8 st1 {v14.4s}, [x11], x8 - str d15, [x19] + st1 {v15.2s}, [x19], x8 cmp x6, #4 beq WriteEnd - add x19, x19, x8 st1 {v16.4s}, [x11], x8 - str d17, [x19] + st1 {v17.2s}, [x19], x8 cmp x6, #5 beq WriteEnd - add x19, x19, x8 st1 {v18.4s}, [x11], x8 - str d19, [x19] + st1 {v19.2s}, [x19], x8 cmp x6, #6 beq WriteEnd - add x19, x19, x8 st1 {v20.4s}, [x11], x8 - str d21, [x19] + st1 {v21.2s}, [x19], x8 cmp x6, #7 beq WriteEnd - add x19, x19, x8 st1 {v22.4s}, [x11], x8 - str d23, [x19] + st1 {v23.2s}, [x19], x8 cmp x6, #8 beq WriteEnd - add x19, x19, x8 st1 {v24.4s}, [x11], x8 - str d25, [x19] + st1 {v25.2s}, [x19], x8 cmp x6, #9 beq WriteEnd - add x19, x19, x8 st1 {v26.4s}, [x11], x8 - str d27, [x19] + st1 {v27.2s}, [x19], x8 cmp x6, #10 beq WriteEnd - add x19, x19, x8 st1 {v28.4s}, [x11], x8 - str d29, [x19] + st1 {v29.2s}, [x19], x8 cmp x6, #11 beq WriteEnd - add x19, x19, x8 st1 {v30.4s}, [x11], x8 - str d31, [x19] + st1 {v31.2s}, [x19] add x11, x11, #24 b WriteEnd Write7: @@ -1018,75 
+983,63 @@ LoopRow4: add x19, x11, #16 add x20, x11, #24 st1 {v8.4s}, [x11], x8 - str d9, [x19] + st1 {v9.2s}, [x19], x8 st1 {v9.s}[2], [x20], x8 cmp x6, #1 beq WriteEnd - add x19, x19, x8 st1 {v10.4s}, [x11], x8 - str d11, [x19] + st1 {v11.2s}, [x19], x8 st1 {v11.s}[2], [x20], x8 cmp x6, #2 beq WriteEnd - add x19, x19, x8 st1 {v12.4s}, [x11], x8 - str d13, [x19] + st1 {v13.2s}, [x19], x8 st1 {v13.s}[2], [x20], x8 cmp x6, #3 beq WriteEnd - add x19, x19, x8 st1 {v14.4s}, [x11], x8 - str d15, [x19] + st1 {v15.2s}, [x19], x8 st1 {v15.s}[2], [x20], x8 cmp x6, #4 beq WriteEnd - add x19, x19, x8 st1 {v16.4s}, [x11], x8 - str d17, [x19] + st1 {v17.2s}, [x19], x8 st1 {v17.s}[2], [x20], x8 cmp x6, #5 beq WriteEnd - add x19, x19, x8 st1 {v18.4s}, [x11], x8 - str d19, [x19] + st1 {v19.2s}, [x19], x8 st1 {v19.s}[2], [x20], x8 cmp x6, #6 beq WriteEnd - add x19, x19, x8 st1 {v20.4s}, [x11], x8 - str d21, [x19] + st1 {v21.2s}, [x19], x8 st1 {v21.s}[2], [x20], x8 cmp x6, #7 beq WriteEnd - add x19, x19, x8 st1 {v22.4s}, [x11], x8 - str d23, [x19] + st1 {v23.2s}, [x19], x8 st1 {v23.s}[2], [x20], x8 cmp x6, #8 beq WriteEnd - add x19, x19, x8 st1 {v24.4s}, [x11], x8 - str d25, [x19] + st1 {v25.2s}, [x19], x8 st1 {v25.s}[2], [x20], x8 cmp x6, #9 beq WriteEnd - add x19, x19, x8 st1 {v26.4s}, [x11], x8 - str d27, [x19] + st1 {v27.2s}, [x19], x8 st1 {v27.s}[2], [x20], x8 cmp x6, #10 beq WriteEnd - add x19, x19, x8 st1 {v28.4s}, [x11], x8 - str d29, [x19] + st1 {v29.2s}, [x19], x8 st1 {v29.s}[2], [x20], x8 cmp x6, #11 beq WriteEnd - add x19, x19, x8 st1 {v30.4s}, [x11], x8 - str d31, [x19] - add x19, x19, x8 - st1 {v31.s}[2], [x20], x8 + st1 {v31.2s}, [x19] + st1 {v31.s}[2], [x20] add x11, x11, #28 b WriteEnd WriteC8: diff --git a/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S b/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S index e1f2498278..5f2c7e641e 100644 --- a/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S +++ 
b/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S @@ -334,353 +334,301 @@ IndirectGemmStart: add x0, x0, #2 b WriteEnd Write2: - str s16, [x15] - add x15, x15, x7 - str s17, [x15] - add x15, x15, x7 - str s18, [x15] - add x15, x15, x7 - str s19, [x15] - add x15, x15, x7 - str s20, [x15] - add x15, x15, x7 - str s21, [x15] - add x15, x15, x7 - str s22, [x15] - add x15, x15, x7 - str s23, [x15] - add x15, x15, x7 - str s24, [x15] - add x15, x15, x7 - str s25, [x15] - add x15, x15, x7 - str s26, [x15] - add x15, x15, x7 - str s27, [x15] - add x15, x15, x7 - str s28, [x15] - add x15, x15, x7 - str s29, [x15] - add x15, x15, x7 - str s30, [x15] - add x15, x15, x7 - str s31, [x15] + add x17, x15, #2 + st1 {v16.h}[0], [x15], x7 + st1 {v16.h}[1], [x17], x7 + st1 {v17.h}[0], [x15], x7 + st1 {v17.h}[1], [x17], x7 + st1 {v18.h}[0], [x15], x7 + st1 {v18.h}[1], [x17], x7 + st1 {v19.h}[0], [x15], x7 + st1 {v19.h}[1], [x17], x7 + st1 {v20.h}[0], [x15], x7 + st1 {v20.h}[1], [x17], x7 + st1 {v21.h}[0], [x15], x7 + st1 {v21.h}[1], [x17], x7 + st1 {v22.h}[0], [x15], x7 + st1 {v22.h}[1], [x17], x7 + st1 {v23.h}[0], [x15], x7 + st1 {v23.h}[1], [x17], x7 + st1 {v24.h}[0], [x15], x7 + st1 {v24.h}[1], [x17], x7 + st1 {v25.h}[0], [x15], x7 + st1 {v25.h}[1], [x17], x7 + st1 {v26.h}[0], [x15], x7 + st1 {v26.h}[1], [x17], x7 + st1 {v27.h}[0], [x15], x7 + st1 {v27.h}[1], [x17], x7 + st1 {v28.h}[0], [x15], x7 + st1 {v28.h}[1], [x17], x7 + st1 {v29.h}[0], [x15], x7 + st1 {v29.h}[1], [x17], x7 + st1 {v30.h}[0], [x15], x7 + st1 {v30.h}[1], [x17], x7 + st1 {v31.h}[0], [x15] + st1 {v31.h}[1], [x17] add x0, x0, #4 b WriteEnd Write3: add x17, x15, #4 - str s16, [x15] - add x15, x15, x7 + add x16, x15, #2 + st1 {v16.h}[0], [x15], x7 + st1 {v16.h}[1], [x16], x7 st1 {v16.h}[2], [x17], x7 - str s17, [x15] - add x15, x15, x7 + st1 {v17.h}[0], [x15], x7 + st1 {v17.h}[1], [x16], x7 st1 {v17.h}[2], [x17], x7 - str s18, [x15] - add x15, x15, x7 + st1 {v18.h}[0], [x15], x7 + st1 {v18.h}[1], [x16], x7 
st1 {v18.h}[2], [x17], x7 - str s19, [x15] - add x15, x15, x7 + st1 {v19.h}[0], [x15], x7 + st1 {v19.h}[1], [x16], x7 st1 {v19.h}[2], [x17], x7 - str s20, [x15] - add x15, x15, x7 + st1 {v20.h}[0], [x15], x7 + st1 {v20.h}[1], [x16], x7 st1 {v20.h}[2], [x17], x7 - str s21, [x15] - add x15, x15, x7 + st1 {v21.h}[0], [x15], x7 + st1 {v21.h}[1], [x16], x7 st1 {v21.h}[2], [x17], x7 - str s22, [x15] - add x15, x15, x7 + st1 {v22.h}[0], [x15], x7 + st1 {v22.h}[1], [x16], x7 st1 {v22.h}[2], [x17], x7 - str s23, [x15] - add x15, x15, x7 + st1 {v23.h}[0], [x15], x7 + st1 {v23.h}[1], [x16], x7 st1 {v23.h}[2], [x17], x7 - str s24, [x15] - add x15, x15, x7 + st1 {v24.h}[0], [x15], x7 + st1 {v24.h}[1], [x16], x7 st1 {v24.h}[2], [x17], x7 - str s25, [x15] - add x15, x15, x7 + st1 {v25.h}[0], [x15], x7 + st1 {v25.h}[1], [x16], x7 st1 {v25.h}[2], [x17], x7 - str s26, [x15] - add x15, x15, x7 + st1 {v26.h}[0], [x15], x7 + st1 {v26.h}[1], [x16], x7 st1 {v26.h}[2], [x17], x7 - str s27, [x15] - add x15, x15, x7 + st1 {v27.h}[0], [x15], x7 + st1 {v27.h}[1], [x16], x7 st1 {v27.h}[2], [x17], x7 - str s28, [x15] - add x15, x15, x7 + st1 {v28.h}[0], [x15], x7 + st1 {v28.h}[1], [x16], x7 st1 {v28.h}[2], [x17], x7 - str s29, [x15] - add x15, x15, x7 + st1 {v29.h}[0], [x15], x7 + st1 {v29.h}[1], [x16], x7 st1 {v29.h}[2], [x17], x7 - str s30, [x15] - add x15, x15, x7 + st1 {v30.h}[0], [x15], x7 + st1 {v30.h}[1], [x16], x7 st1 {v30.h}[2], [x17], x7 - str s31, [x15] + st1 {v31.h}[0], [x15] + st1 {v31.h}[1], [x16] st1 {v31.h}[2], [x17] add x0, x0, #6 b WriteEnd Write4: - str d16, [x15] - add x15, x15, x7 - str d17, [x15] - add x15, x15, x7 - str d18, [x15] - add x15, x15, x7 - str d19, [x15] - add x15, x15, x7 - str d20, [x15] - add x15, x15, x7 - str d21, [x15] - add x15, x15, x7 - str d22, [x15] - add x15, x15, x7 - str d23, [x15] - add x15, x15, x7 - str d24, [x15] - add x15, x15, x7 - str d25, [x15] - add x15, x15, x7 - str d26, [x15] - add x15, x15, x7 - str d27, [x15] - add x15, x15, x7 - 
str d28, [x15] - add x15, x15, x7 - str d29, [x15] - add x15, x15, x7 - str d30, [x15] - add x15, x15, x7 - str d31, [x15] + st1 {v16.4h}, [x15], x7 + st1 {v17.4h}, [x15], x7 + st1 {v18.4h}, [x15], x7 + st1 {v19.4h}, [x15], x7 + st1 {v20.4h}, [x15], x7 + st1 {v21.4h}, [x15], x7 + st1 {v22.4h}, [x15], x7 + st1 {v23.4h}, [x15], x7 + st1 {v24.4h}, [x15], x7 + st1 {v25.4h}, [x15], x7 + st1 {v26.4h}, [x15], x7 + st1 {v27.4h}, [x15], x7 + st1 {v28.4h}, [x15], x7 + st1 {v29.4h}, [x15], x7 + st1 {v30.4h}, [x15], x7 + st1 {v31.4h}, [x15] add x0, x0, #8 b WriteEnd Write5: add x17, x15, #8 - str d16, [x15] - add x15, x15, x7 + st1 {v16.4h}, [x15], x7 st1 {v16.h}[4], [x17], x7 - str d17, [x15] - add x15, x15, x7 + st1 {v17.4h}, [x15], x7 st1 {v17.h}[4], [x17], x7 - str d18, [x15] - add x15, x15, x7 + st1 {v18.4h}, [x15], x7 st1 {v18.h}[4], [x17], x7 - str d19, [x15] - add x15, x15, x7 + st1 {v19.4h}, [x15], x7 st1 {v19.h}[4], [x17], x7 - str d20, [x15] - add x15, x15, x7 + st1 {v20.4h}, [x15], x7 st1 {v20.h}[4], [x17], x7 - str d21, [x15] - add x15, x15, x7 + st1 {v21.4h}, [x15], x7 st1 {v21.h}[4], [x17], x7 - str d22, [x15] - add x15, x15, x7 + st1 {v22.4h}, [x15], x7 st1 {v22.h}[4], [x17], x7 - str d23, [x15] - add x15, x15, x7 + st1 {v23.4h}, [x15], x7 st1 {v23.h}[4], [x17], x7 - str d24, [x15] - add x15, x15, x7 + st1 {v24.4h}, [x15], x7 st1 {v24.h}[4], [x17], x7 - str d25, [x15] - add x15, x15, x7 + st1 {v25.4h}, [x15], x7 st1 {v25.h}[4], [x17], x7 - str d26, [x15] - add x15, x15, x7 + st1 {v26.4h}, [x15], x7 st1 {v26.h}[4], [x17], x7 - str d27, [x15] - add x15, x15, x7 + st1 {v27.4h}, [x15], x7 st1 {v27.h}[4], [x17], x7 - str d28, [x15] - add x15, x15, x7 + st1 {v28.4h}, [x15], x7 st1 {v28.h}[4], [x17], x7 - str d29, [x15] - add x15, x15, x7 + st1 {v29.4h}, [x15], x7 st1 {v29.h}[4], [x17], x7 - str d30, [x15] - add x15, x15, x7 + st1 {v30.4h}, [x15], x7 st1 {v30.h}[4], [x17], x7 - str d31, [x15] + st1 {v31.4h}, [x15] st1 {v31.h}[4], [x17] add x0, x0, #10 b WriteEnd 
Write6: add x17, x15, #8 - str d16, [x15] - add x15, x15, x7 + add x16, x15, #10 + st1 {v16.4h}, [x15], x7 ins v0.s[0], v16.s[2] - str s0, [x17] - add x17, x17, x7 - str d17, [x15] - add x15, x15, x7 + st1 {v0.h}[0], [x17], x7 + st1 {v0.h}[1], [x16], x7 + st1 {v17.4h}, [x15], x7 ins v1.s[0], v17.s[2] - str s1, [x17] - add x17, x17, x7 - str d18, [x15] - add x15, x15, x7 + st1 {v1.h}[0], [x17], x7 + st1 {v1.h}[1], [x16], x7 + st1 {v18.4h}, [x15], x7 ins v2.s[0], v18.s[2] - str s2, [x17] - add x17, x17, x7 - str d19, [x15] - add x15, x15, x7 + st1 {v2.h}[0], [x17], x7 + st1 {v2.h}[1], [x16], x7 + st1 {v19.4h}, [x15], x7 ins v3.s[0], v19.s[2] - str s3, [x17] - add x17, x17, x7 - str d20, [x15] - add x15, x15, x7 + st1 {v3.h}[0], [x17], x7 + st1 {v3.h}[1], [x16], x7 + st1 {v20.4h}, [x15], x7 ins v4.s[0], v20.s[2] - str s4, [x17] - add x17, x17, x7 - str d21, [x15] - add x15, x15, x7 + st1 {v4.h}[0], [x17], x7 + st1 {v4.h}[1], [x16], x7 + st1 {v21.4h}, [x15], x7 ins v5.s[0], v21.s[2] - str s5, [x17] - add x17, x17, x7 - str d22, [x15] - add x15, x15, x7 + st1 {v5.h}[0], [x17], x7 + st1 {v5.h}[1], [x16], x7 + st1 {v22.4h}, [x15], x7 ins v6.s[0], v22.s[2] - str s6, [x17] - add x17, x17, x7 - str d23, [x15] - add x15, x15, x7 + st1 {v6.h}[0], [x17], x7 + st1 {v6.h}[1], [x16], x7 + st1 {v23.4h}, [x15], x7 ins v7.s[0], v23.s[2] - str s7, [x17] - add x17, x17, x7 - str d24, [x15] - add x15, x15, x7 + st1 {v7.h}[0], [x17], x7 + st1 {v7.h}[1], [x16], x7 + st1 {v24.4h}, [x15], x7 ins v8.s[0], v24.s[2] - str s8, [x17] - add x17, x17, x7 - str d25, [x15] - add x15, x15, x7 + st1 {v8.h}[0], [x17], x7 + st1 {v8.h}[1], [x16], x7 + st1 {v25.4h}, [x15], x7 ins v9.s[0], v25.s[2] - str s9, [x17] - add x17, x17, x7 - str d26, [x15] - add x15, x15, x7 + st1 {v9.h}[0], [x17], x7 + st1 {v9.h}[1], [x16], x7 + st1 {v26.4h}, [x15], x7 ins v10.s[0], v26.s[2] - str s10, [x17] - add x17, x17, x7 - str d27, [x15] - add x15, x15, x7 + st1 {v10.h}[0], [x17], x7 + st1 {v10.h}[1], [x16], x7 + st1 
{v27.4h}, [x15], x7 ins v11.s[0], v27.s[2] - str s11, [x17] - add x17, x17, x7 - str d28, [x15] - add x15, x15, x7 + st1 {v11.h}[0], [x17], x7 + st1 {v11.h}[1], [x16], x7 + st1 {v28.4h}, [x15], x7 ins v12.s[0], v28.s[2] - str s12, [x17] - add x17, x17, x7 - str d29, [x15] - add x15, x15, x7 + st1 {v12.h}[0], [x17], x7 + st1 {v12.h}[1], [x16], x7 + st1 {v29.4h}, [x15], x7 ins v13.s[0], v29.s[2] - str s13, [x17] - add x17, x17, x7 - str d30, [x15] - add x15, x15, x7 + st1 {v13.h}[0], [x17], x7 + st1 {v13.h}[1], [x16], x7 + st1 {v30.4h}, [x15], x7 ins v14.s[0], v30.s[2] - str s14, [x17] - add x17, x17, x7 - str d31, [x15] + st1 {v14.h}[0], [x17], x7 + st1 {v14.h}[1], [x16], x7 + st1 {v31.4h}, [x15] ins v15.s[0], v31.s[2] - str s15, [x17] + st1 {v15.h}[0], [x17] /* last row: lanes 4-5 of v31 were staged in v15 by the ins above, not v14 */ + st1 {v15.h}[1], [x16] add x0, x0, #12 b WriteEnd Write7: add x17, x15, #8 + add x18, x15, #10 add x16, x15, #12 - str d16, [x15] - add x15, x15, x7 + st1 {v16.4h}, [x15], x7 ins v0.s[0], v16.s[2] - str s0, [x17] - add x17, x17, x7 + st1 {v0.h}[0], [x17], x7 + st1 {v0.h}[1], [x18], x7 st1 {v16.h}[6], [x16], x7 - str d17, [x15] - add x15, x15, x7 + st1 {v17.4h}, [x15], x7 ins v1.s[0], v17.s[2] - str s1, [x17] - add x17, x17, x7 + st1 {v1.h}[0], [x17], x7 + st1 {v1.h}[1], [x18], x7 st1 {v17.h}[6], [x16], x7 - str d18, [x15] - add x15, x15, x7 + st1 {v18.4h}, [x15], x7 ins v2.s[0], v18.s[2] - str s2, [x17] - add x17, x17, x7 + st1 {v2.h}[0], [x17], x7 + st1 {v2.h}[1], [x18], x7 st1 {v18.h}[6], [x16], x7 - str d19, [x15] - add x15, x15, x7 + st1 {v19.4h}, [x15], x7 ins v3.s[0], v19.s[2] - str s3, [x17] - add x17, x17, x7 + st1 {v3.h}[0], [x17], x7 + st1 {v3.h}[1], [x18], x7 st1 {v19.h}[6], [x16], x7 - str d20, [x15] - add x15, x15, x7 + st1 {v20.4h}, [x15], x7 ins v4.s[0], v20.s[2] - str s4, [x17] - add x17, x17, x7 + st1 {v4.h}[0], [x17], x7 + st1 {v4.h}[1], [x18], x7 st1 {v20.h}[6], [x16], x7 - str d21, [x15] - add x15, x15, x7 + st1 {v21.4h}, [x15], x7 ins v5.s[0], v21.s[2] - str s5, [x17] - add x17, x17, x7 + 
st1 {v5.h}[0], [x17], x7 + st1 {v5.h}[1], [x18], x7 st1 {v21.h}[6], [x16], x7 - str d22, [x15] - add x15, x15, x7 + st1 {v22.4h}, [x15], x7 ins v6.s[0], v22.s[2] - str s6, [x17] - add x17, x17, x7 + st1 {v6.h}[0], [x17], x7 + st1 {v6.h}[1], [x18], x7 st1 {v22.h}[6], [x16], x7 - str d23, [x15] - add x15, x15, x7 + st1 {v23.4h}, [x15], x7 ins v7.s[0], v23.s[2] - str s7, [x17] - add x17, x17, x7 + st1 {v7.h}[0], [x17], x7 + st1 {v7.h}[1], [x18], x7 st1 {v23.h}[6], [x16], x7 - str d24, [x15] - add x15, x15, x7 + st1 {v24.4h}, [x15], x7 ins v8.s[0], v24.s[2] - str s8, [x17] - add x17, x17, x7 + st1 {v8.h}[0], [x17], x7 + st1 {v8.h}[1], [x18], x7 st1 {v24.h}[6], [x16], x7 - str d25, [x15] - add x15, x15, x7 + st1 {v25.4h}, [x15], x7 ins v9.s[0], v25.s[2] - str s9, [x17] - add x17, x17, x7 + st1 {v9.h}[0], [x17], x7 + st1 {v9.h}[1], [x18], x7 st1 {v25.h}[6], [x16], x7 - str d26, [x15] - add x15, x15, x7 + st1 {v26.4h}, [x15], x7 ins v10.s[0], v26.s[2] - str s10, [x17] - add x17, x17, x7 + st1 {v10.h}[0], [x17], x7 + st1 {v10.h}[1], [x18], x7 st1 {v26.h}[6], [x16], x7 - str d27, [x15] - add x15, x15, x7 + st1 {v27.4h}, [x15], x7 ins v11.s[0], v27.s[2] - str s11, [x17] - add x17, x17, x7 + st1 {v11.h}[0], [x17], x7 + st1 {v11.h}[1], [x18], x7 st1 {v27.h}[6], [x16], x7 - str d28, [x15] - add x15, x15, x7 + st1 {v28.4h}, [x15], x7 ins v12.s[0], v28.s[2] - str s12, [x17] - add x17, x17, x7 + st1 {v12.h}[0], [x17], x7 + st1 {v12.h}[1], [x18], x7 st1 {v28.h}[6], [x16], x7 - str d29, [x15] - add x15, x15, x7 + st1 {v29.4h}, [x15], x7 ins v13.s[0], v29.s[2] - str s13, [x17] - add x17, x17, x7 + st1 {v13.h}[0], [x17], x7 + st1 {v13.h}[1], [x18], x7 st1 {v29.h}[6], [x16], x7 - str d30, [x15] - add x15, x15, x7 + st1 {v30.4h}, [x15], x7 ins v14.s[0], v30.s[2] - str s14, [x17] - add x17, x17, x7 + st1 {v14.h}[0], [x17], x7 + st1 {v14.h}[1], [x18], x7 st1 {v30.h}[6], [x16], x7 - str d31, [x15] + st1 {v31.4h}, [x15] ins v15.s[0], v31.s[2] - str s15, [x17] + st1 {v15.h}[0], [x17] + st1 
{v15.h}[1], [x18] st1 {v31.h}[6], [x16] add x0, x0, #14 b WriteEnd diff --git a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S index 503a0f6f23..1d2eb479bc 100644 --- a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S +++ b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S @@ -677,400 +677,354 @@ LoopRow: b WriteEnd Write2: add x2, x2, #4 - str s16, [x11] + add x19, x11, #2 + st1 {v16.h}[0], [x11], x8 + st1 {v16.h}[1], [x19], x8 cmp x6, #1 beq WriteEnd - add x11, x11, x8 - str s17, [x11] + st1 {v17.h}[0], [x11], x8 + st1 {v17.h}[1], [x19], x8 cmp x6, #2 beq WriteEnd - add x11, x11, x8 - str s18, [x11] + st1 {v18.h}[0], [x11], x8 + st1 {v18.h}[1], [x19], x8 cmp x6, #3 beq WriteEnd - add x11, x11, x8 - str s19, [x11] + st1 {v19.h}[0], [x11], x8 + st1 {v19.h}[1], [x19], x8 cmp x6, #4 beq WriteEnd - add x11, x11, x8 - str s20, [x11] + st1 {v20.h}[0], [x11], x8 + st1 {v20.h}[1], [x19], x8 cmp x6, #5 beq WriteEnd - add x11, x11, x8 - str s21, [x11] + st1 {v21.h}[0], [x11], x8 + st1 {v21.h}[1], [x19], x8 cmp x6, #6 beq WriteEnd - add x11, x11, x8 - str s22, [x11] + st1 {v22.h}[0], [x11], x8 + st1 {v22.h}[1], [x19], x8 cmp x6, #7 beq WriteEnd - add x11, x11, x8 - str s23, [x11] + st1 {v23.h}[0], [x11], x8 + st1 {v23.h}[1], [x19], x8 cmp x6, #8 beq WriteEnd - add x11, x11, x8 - str s24, [x11] + st1 {v24.h}[0], [x11], x8 + st1 {v24.h}[1], [x19], x8 cmp x6, #9 beq WriteEnd - add x11, x11, x8 - str s25, [x11] + st1 {v25.h}[0], [x11], x8 + st1 {v25.h}[1], [x19], x8 cmp x6, #10 beq WriteEnd - add x11, x11, x8 - str s26, [x11] + st1 {v26.h}[0], [x11], x8 + st1 {v26.h}[1], [x19], x8 cmp x6, #11 beq WriteEnd - add x11, x11, x8 - str s27, [x11] + st1 {v27.h}[0], [x11], x8 + st1 {v27.h}[1], [x19], x8 cmp x6, #12 beq WriteEnd - add x11, x11, x8 - str s28, [x11] + st1 {v28.h}[0], [x11], x8 + st1 {v28.h}[1], [x19], x8 cmp x6, #13 beq WriteEnd - add x11, x11, x8 - str s29, [x11] + st1 {v29.h}[0], [x11], x8 + st1 {v29.h}[1], 
[x19], x8 cmp x6, #14 beq WriteEnd - add x11, x11, x8 - str s30, [x11] + st1 {v30.h}[0], [x11], x8 + st1 {v30.h}[1], [x19], x8 cmp x6, #15 beq WriteEnd - add x11, x11, x8 - str s31, [x11] - add x11, x11, x8 + st1 {v31.h}[0], [x11], x8 + st1 {v31.h}[1], [x19] add x11, x11, #4 b WriteEnd Write3: add x2, x2, #6 add x19, x11, #4 - str s16, [x11] + add x20, x11, #2 + st1 {v16.h}[0], [x11], x8 + st1 {v16.h}[1], [x20], x8 st1 {v16.h}[2], [x19], x8 cmp x6, #1 beq WriteEnd - add x11, x11, x8 - str s17, [x11] + st1 {v17.h}[0], [x11], x8 + st1 {v17.h}[1], [x20], x8 st1 {v17.h}[2], [x19], x8 cmp x6, #2 beq WriteEnd - add x11, x11, x8 - str s18, [x11] + st1 {v18.h}[0], [x11], x8 + st1 {v18.h}[1], [x20], x8 st1 {v18.h}[2], [x19], x8 cmp x6, #3 beq WriteEnd - add x11, x11, x8 - str s19, [x11] + st1 {v19.h}[0], [x11], x8 + st1 {v19.h}[1], [x20], x8 st1 {v19.h}[2], [x19], x8 cmp x6, #4 beq WriteEnd - add x11, x11, x8 - str s20, [x11] + st1 {v20.h}[0], [x11], x8 + st1 {v20.h}[1], [x20], x8 st1 {v20.h}[2], [x19], x8 cmp x6, #5 beq WriteEnd - add x11, x11, x8 - str s21, [x11] + st1 {v21.h}[0], [x11], x8 + st1 {v21.h}[1], [x20], x8 st1 {v21.h}[2], [x19], x8 cmp x6, #6 beq WriteEnd - add x11, x11, x8 - str s22, [x11] + st1 {v22.h}[0], [x11], x8 + st1 {v22.h}[1], [x20], x8 st1 {v22.h}[2], [x19], x8 cmp x6, #7 beq WriteEnd - add x11, x11, x8 - str s23, [x11] + st1 {v23.h}[0], [x11], x8 + st1 {v23.h}[1], [x20], x8 st1 {v23.h}[2], [x19], x8 cmp x6, #8 beq WriteEnd - add x11, x11, x8 - str s24, [x11] + st1 {v24.h}[0], [x11], x8 + st1 {v24.h}[1], [x20], x8 st1 {v24.h}[2], [x19], x8 cmp x6, #9 beq WriteEnd - add x11, x11, x8 - str s25, [x11] + st1 {v25.h}[0], [x11], x8 + st1 {v25.h}[1], [x20], x8 st1 {v25.h}[2], [x19], x8 cmp x6, #10 beq WriteEnd - add x11, x11, x8 - str s26, [x11] + st1 {v26.h}[0], [x11], x8 + st1 {v26.h}[1], [x20], x8 st1 {v26.h}[2], [x19], x8 cmp x6, #11 beq WriteEnd - add x11, x11, x8 - str s27, [x11] + st1 {v27.h}[0], [x11], x8 + st1 {v27.h}[1], [x20], x8 st1 {v27.h}[2], 
[x19], x8 cmp x6, #12 beq WriteEnd - add x11, x11, x8 - str s28, [x11] + st1 {v28.h}[0], [x11], x8 + st1 {v28.h}[1], [x20], x8 st1 {v28.h}[2], [x19], x8 cmp x6, #13 beq WriteEnd - add x11, x11, x8 - str s29, [x11] + st1 {v29.h}[0], [x11], x8 + st1 {v29.h}[1], [x20], x8 st1 {v29.h}[2], [x19], x8 cmp x6, #14 beq WriteEnd - add x11, x11, x8 - str s30, [x11] + st1 {v30.h}[0], [x11], x8 + st1 {v30.h}[1], [x20], x8 st1 {v30.h}[2], [x19], x8 cmp x6, #15 beq WriteEnd - add x11, x11, x8 - str s31, [x11] + st1 {v31.h}[0], [x11], x8 + st1 {v31.h}[1], [x20] st1 {v31.h}[2], [x19] - add x11, x11, x8 add x11, x11, #6 b WriteEnd Write4: add x2, x2, #8 - str d16, [x11] + st1 {v16.4h}, [x11], x8 cmp x6, #1 beq WriteEnd - add x11, x11, x8 - str d17, [x11] + st1 {v17.4h}, [x11], x8 cmp x6, #2 beq WriteEnd - add x11, x11, x8 - str d18, [x11] + st1 {v18.4h}, [x11], x8 cmp x6, #3 beq WriteEnd - add x11, x11, x8 - str d19, [x11] + st1 {v19.4h}, [x11], x8 cmp x6, #4 beq WriteEnd - add x11, x11, x8 - str d20, [x11] + st1 {v20.4h}, [x11], x8 cmp x6, #5 beq WriteEnd - add x11, x11, x8 - str d21, [x11] + st1 {v21.4h}, [x11], x8 cmp x6, #6 beq WriteEnd - add x11, x11, x8 - str d22, [x11] + st1 {v22.4h}, [x11], x8 cmp x6, #7 beq WriteEnd - add x11, x11, x8 - str d23, [x11] + st1 {v23.4h}, [x11], x8 cmp x6, #8 beq WriteEnd - add x11, x11, x8 - str d24, [x11] + st1 {v24.4h}, [x11], x8 cmp x6, #9 beq WriteEnd - add x11, x11, x8 - str d25, [x11] + st1 {v25.4h}, [x11], x8 cmp x6, #10 beq WriteEnd - add x11, x11, x8 - str d26, [x11] + st1 {v26.4h}, [x11], x8 cmp x6, #11 beq WriteEnd - add x11, x11, x8 - str d27, [x11] + st1 {v27.4h}, [x11], x8 cmp x6, #12 beq WriteEnd - add x11, x11, x8 - str d28, [x11] + st1 {v28.4h}, [x11], x8 cmp x6, #13 beq WriteEnd - add x11, x11, x8 - str d29, [x11] + st1 {v29.4h}, [x11], x8 cmp x6, #14 beq WriteEnd - add x11, x11, x8 - str d30, [x11] + st1 {v30.4h}, [x11], x8 cmp x6, #15 beq WriteEnd - add x11, x11, x8 - str d31, [x11] - add x11, x11, x8 + st1 {v31.4h}, [x11], 
x8 add x11, x11, #8 b WriteEnd Write5: add x2, x2, #10 add x19, x11, #8 - str d16, [x11] + st1 {v16.4h}, [x11], x8 st1 {v16.h}[4], [x19], x8 cmp x6, #1 beq WriteEnd - add x11, x11, x8 - str d17, [x11] + st1 {v17.4h}, [x11], x8 st1 {v17.h}[4], [x19], x8 cmp x6, #2 beq WriteEnd - add x11, x11, x8 - str d18, [x11] + st1 {v18.4h}, [x11], x8 st1 {v18.h}[4], [x19], x8 cmp x6, #3 beq WriteEnd - add x11, x11, x8 - str d19, [x11] + st1 {v19.4h}, [x11], x8 st1 {v19.h}[4], [x19], x8 cmp x6, #4 beq WriteEnd - add x11, x11, x8 - str d20, [x11] + st1 {v20.4h}, [x11], x8 st1 {v20.h}[4], [x19], x8 cmp x6, #5 beq WriteEnd - add x11, x11, x8 - str d21, [x11] + st1 {v21.4h}, [x11], x8 st1 {v21.h}[4], [x19], x8 cmp x6, #6 beq WriteEnd - add x11, x11, x8 - str d22, [x11] + st1 {v22.4h}, [x11], x8 st1 {v22.h}[4], [x19], x8 cmp x6, #7 beq WriteEnd - add x11, x11, x8 - str d23, [x11] + st1 {v23.4h}, [x11], x8 st1 {v23.h}[4], [x19], x8 cmp x6, #8 beq WriteEnd - add x11, x11, x8 - str d24, [x11] + st1 {v24.4h}, [x11], x8 st1 {v24.h}[4], [x19], x8 cmp x6, #9 beq WriteEnd - add x11, x11, x8 - str d25, [x11] + st1 {v25.4h}, [x11], x8 st1 {v25.h}[4], [x19], x8 cmp x6, #10 beq WriteEnd - add x11, x11, x8 - str d26, [x11] + st1 {v26.4h}, [x11], x8 st1 {v26.h}[4], [x19], x8 cmp x6, #11 beq WriteEnd - add x11, x11, x8 - str d27, [x11] + st1 {v27.4h}, [x11], x8 st1 {v27.h}[4], [x19], x8 cmp x6, #12 beq WriteEnd - add x11, x11, x8 - str d28, [x11] + st1 {v28.4h}, [x11], x8 st1 {v28.h}[4], [x19], x8 cmp x6, #13 beq WriteEnd - add x11, x11, x8 - str d29, [x11] + st1 {v29.4h}, [x11], x8 st1 {v29.h}[4], [x19], x8 cmp x6, #14 beq WriteEnd - add x11, x11, x8 - str d30, [x11] + st1 {v30.4h}, [x11], x8 st1 {v30.h}[4], [x19], x8 cmp x6, #15 beq WriteEnd - add x11, x11, x8 - str d31, [x11] + st1 {v31.4h}, [x11], x8 st1 {v31.h}[4], [x19] - add x11, x11, x8 add x11, x11, #10 b WriteEnd Write6: add x2, x2, #12 add x19, x11, #8 add x20, x11, #10 - str d16, [x11] + st1 {v16.4h}, [x11], x8 st1 {v16.h}[4], [x19], x8 
st1 {v16.h}[5], [x20], x8 cmp x6, #1 beq WriteEnd - add x11, x11, x8 - str d17, [x11] + st1 {v17.4h}, [x11], x8 st1 {v17.h}[4], [x19], x8 st1 {v17.h}[5], [x20], x8 cmp x6, #2 beq WriteEnd - add x11, x11, x8 - str d18, [x11] + st1 {v18.4h}, [x11], x8 st1 {v18.h}[4], [x19], x8 st1 {v18.h}[5], [x20], x8 cmp x6, #3 beq WriteEnd - add x11, x11, x8 - str d19, [x11] + st1 {v19.4h}, [x11], x8 st1 {v19.h}[4], [x19], x8 st1 {v19.h}[5], [x20], x8 cmp x6, #4 beq WriteEnd - add x11, x11, x8 - str d20, [x11] + st1 {v20.4h}, [x11], x8 st1 {v20.h}[4], [x19], x8 st1 {v20.h}[5], [x20], x8 cmp x6, #5 beq WriteEnd - add x11, x11, x8 - str d21, [x11] + st1 {v21.4h}, [x11], x8 st1 {v21.h}[4], [x19], x8 st1 {v21.h}[5], [x20], x8 cmp x6, #6 beq WriteEnd - add x11, x11, x8 - str d22, [x11] + st1 {v22.4h}, [x11], x8 st1 {v22.h}[4], [x19], x8 st1 {v22.h}[5], [x20], x8 cmp x6, #7 beq WriteEnd - add x11, x11, x8 - str d23, [x11] + st1 {v23.4h}, [x11], x8 st1 {v23.h}[4], [x19], x8 st1 {v23.h}[5], [x20], x8 cmp x6, #8 beq WriteEnd - add x11, x11, x8 - str d24, [x11] + st1 {v24.4h}, [x11], x8 st1 {v24.h}[4], [x19], x8 st1 {v24.h}[5], [x20], x8 cmp x6, #9 beq WriteEnd - add x11, x11, x8 - str d25, [x11] + st1 {v25.4h}, [x11], x8 st1 {v25.h}[4], [x19], x8 st1 {v25.h}[5], [x20], x8 cmp x6, #10 beq WriteEnd - add x11, x11, x8 - str d26, [x11] + st1 {v26.4h}, [x11], x8 st1 {v26.h}[4], [x19], x8 st1 {v26.h}[5], [x20], x8 cmp x6, #11 beq WriteEnd - add x11, x11, x8 - str d27, [x11] + st1 {v27.4h}, [x11], x8 st1 {v27.h}[4], [x19], x8 st1 {v27.h}[5], [x20], x8 cmp x6, #12 beq WriteEnd - add x11, x11, x8 - str d28, [x11] + st1 {v28.4h}, [x11], x8 st1 {v28.h}[4], [x19], x8 st1 {v28.h}[5], [x20], x8 cmp x6, #13 beq WriteEnd - add x11, x11, x8 - str d29, [x11] + st1 {v29.4h}, [x11], x8 st1 {v29.h}[4], [x19], x8 st1 {v29.h}[5], [x20], x8 cmp x6, #14 beq WriteEnd - add x11, x11, x8 - str d30, [x11] + st1 {v30.4h}, [x11], x8 st1 {v30.h}[4], [x19], x8 st1 {v30.h}[5], [x20], x8 cmp x6, #15 beq WriteEnd - add x11, 
x11, x8 - str d31, [x11] + st1 {v31.4h}, [x11], x8 st1 {v31.h}[4], [x19] st1 {v31.h}[5], [x20] - add x11, x11, x8 add x11, x11, #12 b WriteEnd Write7: @@ -1078,116 +1032,100 @@ LoopRow: add x19, x11, #8 add x20, x11, #10 add x10, x11, #12 - str d16, [x11] + st1 {v16.4h}, [x11], x8 st1 {v16.h}[4], [x19], x8 st1 {v16.h}[5], [x20], x8 st1 {v16.h}[6], [x10], x8 cmp x6, #1 beq WriteEnd - add x11, x11, x8 - str d17, [x11] + st1 {v17.4h}, [x11], x8 st1 {v17.h}[4], [x19], x8 st1 {v17.h}[5], [x20], x8 st1 {v17.h}[6], [x10], x8 cmp x6, #2 beq WriteEnd - add x11, x11, x8 - str d18, [x11] + st1 {v18.4h}, [x11], x8 st1 {v18.h}[4], [x19], x8 st1 {v18.h}[5], [x20], x8 st1 {v18.h}[6], [x10], x8 cmp x6, #3 beq WriteEnd - add x11, x11, x8 - str d19, [x11] + st1 {v19.4h}, [x11], x8 st1 {v19.h}[4], [x19], x8 st1 {v19.h}[5], [x20], x8 st1 {v19.h}[6], [x10], x8 cmp x6, #4 beq WriteEnd - add x11, x11, x8 - str d20, [x11] + st1 {v20.4h}, [x11], x8 st1 {v20.h}[4], [x19], x8 st1 {v20.h}[5], [x20], x8 st1 {v20.h}[6], [x10], x8 cmp x6, #5 beq WriteEnd - add x11, x11, x8 - str d21, [x11] + st1 {v21.4h}, [x11], x8 st1 {v21.h}[4], [x19], x8 st1 {v21.h}[5], [x20], x8 st1 {v21.h}[6], [x10], x8 cmp x6, #6 beq WriteEnd - add x11, x11, x8 - str d22, [x11] + st1 {v22.4h}, [x11], x8 st1 {v22.h}[4], [x19], x8 st1 {v22.h}[5], [x20], x8 st1 {v22.h}[6], [x10], x8 cmp x6, #7 beq WriteEnd - add x11, x11, x8 - str d23, [x11] + st1 {v23.4h}, [x11], x8 st1 {v23.h}[4], [x19], x8 st1 {v23.h}[5], [x20], x8 st1 {v23.h}[6], [x10], x8 cmp x6, #8 beq WriteEnd - add x11, x11, x8 - str d24, [x11] + st1 {v24.4h}, [x11], x8 st1 {v24.h}[4], [x19], x8 st1 {v24.h}[5], [x20], x8 st1 {v24.h}[6], [x10], x8 cmp x6, #9 beq WriteEnd - add x11, x11, x8 - str d25, [x11] + st1 {v25.4h}, [x11], x8 st1 {v25.h}[4], [x19], x8 st1 {v25.h}[5], [x20], x8 st1 {v25.h}[6], [x10], x8 cmp x6, #10 beq WriteEnd - add x11, x11, x8 - str d26, [x11] + st1 {v26.4h}, [x11], x8 st1 {v26.h}[4], [x19], x8 st1 {v26.h}[5], [x20], x8 st1 {v26.h}[6], [x10], 
x8 cmp x6, #11 beq WriteEnd - add x11, x11, x8 - str d27, [x11] + st1 {v27.4h}, [x11], x8 st1 {v27.h}[4], [x19], x8 st1 {v27.h}[5], [x20], x8 st1 {v27.h}[6], [x10], x8 cmp x6, #12 beq WriteEnd - add x11, x11, x8 - str d28, [x11] + st1 {v28.4h}, [x11], x8 st1 {v28.h}[4], [x19], x8 st1 {v28.h}[5], [x20], x8 st1 {v28.h}[6], [x10], x8 cmp x6, #13 beq WriteEnd - add x11, x11, x8 - str d29, [x11] + st1 {v29.4h}, [x11], x8 st1 {v29.h}[4], [x19], x8 st1 {v29.h}[5], [x20], x8 st1 {v29.h}[6], [x10], x8 cmp x6, #14 beq WriteEnd - add x11, x11, x8 - str d30, [x11] + st1 {v30.4h}, [x11], x8 st1 {v30.h}[4], [x19], x8 st1 {v30.h}[5], [x20], x8 st1 {v30.h}[6], [x10], x8 cmp x6, #15 beq WriteEnd - add x11, x11, x8 - str d31, [x11] + st1 {v31.4h}, [x11], x8 st1 {v31.h}[4], [x19] st1 {v31.h}[5], [x20] st1 {v31.h}[6], [x10] - add x11, x11, x8 add x11, x11, #14 b WriteEnd WriteC8: diff --git a/mindspore/lite/src/cxx_api/context.cc b/mindspore/lite/src/cxx_api/context.cc index 13e771898f..542c1b5b48 100644 --- a/mindspore/lite/src/cxx_api/context.cc +++ b/mindspore/lite/src/cxx_api/context.cc @@ -30,8 +30,8 @@ constexpr auto kModelOptionKirinNpuFrequency = "mindspore.option.kirin_npu.frequ struct Context::Data { std::vector> device_info_list; - int32_t thread_num; - std::shared_ptr allocator; + int32_t thread_num = 2; + std::shared_ptr allocator = nullptr; }; struct DeviceInfoContext::Data { diff --git a/mindspore/lite/src/tensor.h b/mindspore/lite/src/tensor.h index d14ec20283..67f700123b 100644 --- a/mindspore/lite/src/tensor.h +++ b/mindspore/lite/src/tensor.h @@ -74,7 +74,7 @@ class Tensor : public mindspore::tensor::MSTensor { virtual bool operator==(const Tensor &tensor); - void set_tensor_name(std::string name) override { tensor_name_ = name; } + void set_tensor_name(const std::string &name) override { tensor_name_ = name; } std::string tensor_name() const override { return tensor_name_; }