fix assembly headers and declaration style

4 years ago · 51208e3411
parent f1e1d054bf
commit 51208e3411
93 changed files with 191 additions and 567 deletions
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDw3x3Int8BorderPixel.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDw3x3Int8BorderPixel.S
@ -1,13 +1,8 @@
-#ifdef __arm__
-#ifndef __aarch64__
+#ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
-.global ConvDw3x3Int8BorderPixel
-#ifndef __APPLE__
-.type ConvDw3x3Int8BorderPixel, %function
-#endif

 // void ConvDw3x3Int8BorderPixel(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, size_t height,
 //                               size_t width, size_t in_kh_step, size_t in_kw_step, size_t channel, size_t in_zp, size_t out_zp,
@ -116,4 +111,3 @@ asm_function ConvDw3x3Int8BorderPixel
    vpop {q4-q7}
    pop {r4-r8, r9-r12, pc}
 #endif
-#endif
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Border.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Border.S
@ -3,10 +3,6 @@

 .text
 .align 5
-.global ConvDwFp32Border
-#ifndef __APPLE__
-.type ConvDwFp32Border, %function
-#endif

 // void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
 //                       size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6)
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S
@ -1,13 +1,8 @@
-#ifdef __arm__
-#ifndef __aarch64__
+#ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
-.global ConvDwFp32Center
-#ifndef __APPLE__
-.type ConvDwFp32Center, %function
-#endif

 // void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
 //                      size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
@ -164,4 +159,3 @@ LoopWEnd:
    vpop {q4-q7}
    pop {r0-r8, r10, r11, pc}
 #endif
-#endif
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Row.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Row.S
@ -3,10 +3,6 @@

 .text
 .align 5
-.global ConvDwFp32Row
-#ifndef __APPLE__
-.type ConvDwFp32Row, %function
-#endif

 // void ConvDwFp32Row(float* output_ptr, const float* input_ptr, const float* filter_ptr,
 //                   size_t num_pixels, size_t input_channel, size_t input_step)
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S
@ -1,13 +1,9 @@
-#ifdef __arm__
-#ifndef __aarch64__
+#ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
-.global ConvDwInt8Center
-#ifndef __APPLE__
-.type ConvDwInt8Center, %function
-#endif
+
 // void ConvDwInt8Center(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, int height,
 //                          int width, int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step,
 //                          int in_sw_step, int in_kh_step, int in_kw_step, int8_t *in_zp, int32_t *out_zp,
@ -277,4 +273,3 @@ asm_function ConvDwInt8Center
        vpop {q4-q7}
        pop {r0-r8, r10, r11, pc}
 #endif
-#endif
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4.S
@ -1,13 +1,8 @@
-#ifdef __arm__
-#ifndef __aarch64__
+#ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
-.global ConvDwInt8PostAlign4
-#ifndef __APPLE__
-.type ConvDwInt8PostAlign4, %function
-#endif

 // void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier,
 //                           int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max);
@ -108,4 +103,3 @@ asm_function ConvDwInt8PostAlign4
        bx lr

 #endif
-#endif
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4PerChannel.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4PerChannel.S
@ -1,13 +1,8 @@
-#ifdef __arm__
-#ifndef __aarch64__
+#ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
-.global ConvDwInt8PostAlign4PerChannel
-#ifndef __APPLE__
-.type ConvDwInt8PostAlign4PerChannel, %function
-#endif

 // void ConvDwInt8PostAlign4PerChannel(int8_t *dst, int32_t *buffer, int channel4, int32_t output_zp, int32_t *out_multiplier,
 //                                     int32_t *left_shift, int32_t *right_shift, int32_t acc_min, int32_t acc_max);
@ -111,4 +106,3 @@ asm_function ConvDwInt8PostAlign4PerChannel
        bx lr

 #endif
-#endif
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Row.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Row.S
@ -1,13 +1,8 @@
-#ifdef __arm__
-#ifndef __aarch64__
+#ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
-.global ConvDwInt8Row
-#ifndef __APPLE__
-.type ConvDwInt8Row, %function
-#endif

 // void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
 //                    int output_channel, int input_step, int8_t input_zp)
@ -132,4 +127,3 @@ asm_function ConvDwInt8Row
    vpop {q4-q7}
    pop {r4-r8, r9-r12, pc}
 #endif
-#endif
--- a/mindspore/lite/nnacl/assembly/arm32/DeconvDwFp32Center.S
+++ b/mindspore/lite/nnacl/assembly/arm32/DeconvDwFp32Center.S
@ -1,13 +1,8 @@
-#ifdef __arm__
-#ifndef __aarch64__
+#ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
-.global DeconvDwFp32Center
-#ifndef __APPLE__
-.type DeconvDwFp32Center, %function
-#endif

 // void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
 //                      size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
@ -67,4 +62,3 @@ asm_function DeconvDwFp32Center

    pop {r0-r8, r10, r11, pc}
 #endif
-#endif
--- a/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Center.S
+++ b/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Center.S
@ -1,13 +1,8 @@
-#ifdef __arm__
-#ifndef __aarch64__
+#ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
-.global DeconvDwInt8Center
-#ifndef __APPLE__
-.type DeconvDwInt8Center, %function
-#endif

 // void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width,
 //                         size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
@ -67,4 +62,3 @@ asm_function DeconvDwInt8Center

    pop {r0-r8, r10, r11, pc}
 #endif
-#endif
--- a/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Post.S
+++ b/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Post.S
@ -1,13 +1,8 @@
-#ifdef __arm__
-#ifndef __aarch64__
+#ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
-.global DeconvDwInt8Post
-#ifndef __APPLE__
-.type DeconvDwInt8Post, %function
-#endif

 // void DeconvDwInt8Post(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel, int pixel_nums,
 //                       int out_multiplier, int left_shift, int right_shift, int32_t out_zp, int32_t acc_min,
@ -72,4 +67,3 @@ asm_function DeconvDwInt8Post
        bx lr

 #endif
-#endif
--- a/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt16to32_8x4.S
+++ b/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt16to32_8x4.S
@ -3,10 +3,6 @@

 .text
 .align 5
-.global IndirectGemmInt16to32_8x4
-#ifndef __APPLE__
-.type IndirectGemmInt16to32_8x4, %function
-#endif

 // void IndirectGemmInt16to32_8x4(int *output, short *input, short *weight, size_t ksize, size_t ic8, size_t oc4, size_t offset);
 // r0: output, r1: input, r2: weight, r3: ksize, r4: ic8, r5: oc4, r6: offset
--- a/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S
+++ b/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S
@ -1,13 +1,8 @@
-#ifdef __arm__
-#ifndef __aarch64__
+#ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
-.global IndirectGemmInt8_2x4
-#ifndef __APPLE__
-.type IndirectGemmInt8_2x4, %function
-#endif

 // void IndirectGemmInt8_2x4(int8_t *output, int8_t *input, int8_t *weight, int32_t *bias, size_t ksize, size_t ic4,
 // size_t oc, size_t offset, int32_t *input_sum, size_t act_min, size_t act_max, size_t out_zp, int32_t *out_multiplier,
@ -294,4 +289,3 @@ LoopOcEnd:
    vpop {q4-q7}
    pop {r4-r8, r10, r11, pc}
 #endif
-#endif
--- a/mindspore/lite/nnacl/assembly/arm32/MatVecMulFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatVecMulFp32.S
@ -1,13 +1,8 @@
-#ifdef __arm__
-#ifndef __aarch64__
+#ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
-.global MatVecMulFp32
-#ifndef __APPLE__
-.type MatVecMulFp32, %function
-#endif

 // void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col)
 // r0: a
@ -183,4 +178,3 @@ End:
  sub sp, sp, #52
  pop {r0-r8, r9, r10, r11, pc}
 #endif
-#endif
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S
@ -1,11 +1,8 @@
 #ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"
-    .text
-    .align 5
-    .global MatmulFloatNeon32
-#ifndef __APPLE__
-    .type MatmulFloatNeon32, %function
-#endif
+
+.text
+.align 5

 // void MatmulFloatNeon32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
 //                        int row, int col, size_t stride, size_t writeNhwc, size_t WriteWino)
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S
@ -1,11 +1,8 @@
 #ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"
-    .text
-    .align 5
-    .global MatmulFloatNeon32Opt
-#ifndef __APPLE__
-    .type MatmulFloatNeon32Opt, %function
-#endif
+
+.text
+.align 5

 // void MatmulFloatNeon32Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
 //                        int row, int col, size_t stride, size_t writeMode)
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S
@ -1,11 +1,8 @@
 #ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"
-    .text
-    .align 5
-    .global MatmulFloatNeon32Opt12x4
-#ifndef __APPLE__
-    .type MatmulFloatNeon32Opt12x4, %function
-#endif
+
+.text
+.align 5

 // void MatmulFloatNeon32Opt12x4(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
 //                               int row, int col, size_t stride, size_t writeMode)
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulInt8.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulInt8.S
@ -1,13 +1,8 @@
-#ifdef __arm__
-#ifndef __aarch64__
+#ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
-.global MatmulInt8Neon32
-#ifndef __APPLE__
-.type MatmulInt8Neon32, %function
-#endif

 //void MatmulInt8Neon32(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16, 
 //                      const int *input_sums, const int *weight_bias, int act_min, int act_max, int out_zp,
@ -286,4 +281,3 @@ End1:
  vpop {q4-q7}
  pop {r0-r11, pc}
 #endif
-#endif
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S
@ -1,13 +1,8 @@
-#ifdef __arm__
-#ifndef __aarch64__
+#ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
-.global MatmulInt8Opt
-#ifndef __APPLE__
-.type MatmulInt8Opt, %function
-#endif

 //void MatmulInt8Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16,
 //                         const int *input_sums, const int *weight_bias, int act_min, int act_max, int out_zp,
@ -288,4 +283,3 @@ LoopRowEnd:
    vpop {q4-q7}
    pop {r0-r8, r10, r11, pc}
 #endif
-#endif
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S
@ -3,10 +3,6 @@

 .text
 .align 5
-.global MatrixMultiplyWinograd
-#ifndef __APPLE__
-.type MatrixMultiplyWinograd, %function
-#endif

 // MatrixMultiplyWinograd(float *matrix_a, float *matrix_b, float *matrix_c, int m, int k, int n, int in_channel, int c4_channel)
    // r0: matrix_a, r1: matrix_b, r2: matrix_c, r3: m, r4: k, r5: n, r6: in_channel, r7: c4_channel * 4
--- a/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC4.S
+++ b/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC4.S
@ -1,12 +1,8 @@
+#ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
-//.p2align 5,,15
-.global PostFuncBiasReluC4
-#ifndef __APPLE__
-.type PostFuncBiasReluC4, %function
-#endif

 asm_function PostFuncBiasReluC4
  push {r4-r8, r10, r11, lr}
@ -234,3 +230,4 @@ Loop_C1_3_Write:
 End:
  sub sp, sp, #32
  pop {r4-r8, r10, r11, pc}
+#endif
--- a/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC8.S
+++ b/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC8.S
@ -3,11 +3,6 @@

 .text
 .align 5
-//.p2align 5,,15
-.global PostFuncBiasReluC8
-#ifndef __APPLE__
-.type PostFuncBiasReluC8, %function
-#endif

 //void PostFuncBiasReluC8(float *dst, const float *src, const float *bias, size_t oc8div, size_t oc8mod,
 //                        size_t plane_size, size_t stride, int relu_type);
--- a/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Peroc.S
+++ b/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Peroc.S
@ -1,12 +1,8 @@
+#ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
-.global PreSum4x16Int8Peroc
-#ifndef __APPLE__
-.type PreSum4x16Int8Peroc, %function
-#endif
-

 //void PreSum4x16Int8Peroc(const int8_t *src, int32_t *sum, int32_t *zp, size_t hw4, size_t ic16, int32_t oc_div2,
 //                            size_t oc_res2, size_t stride);
@ -129,3 +125,4 @@ End:
  sub sp, sp, #100
  vpop {q4-q7}
  pop {r4-r11, pc}
+#endif
--- a/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S
+++ b/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S
@ -1,12 +1,8 @@
+#ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
-.global PreSum4x16Int8Pert
-#ifndef __APPLE__
-.type PreSum4x16Int8Pert, %function
-#endif
-

 // void PreSum4x16Int8Pert(const int8_t *src, int32_t *sum, size_t row4, size_t col16, int32_t filter_zp);

@ -80,3 +76,4 @@ End:
  sub sp, sp, #96
  vpop {q4-q7}
  pop {r4-r8, r10, r11, pc}
+#endif
--- a/mindspore/lite/nnacl/assembly/arm32/TiledC4MatmulFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm32/TiledC4MatmulFp32.S
@ -1,11 +1,8 @@
 #ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"
-    .text
-    .align 5
-    .global TiledC4MatmulFp32
-#ifndef __APPLE__
-    .type TiledC4MatmulFp32, %function
-#endif
+
+.text
+.align 5

 asm_function TiledC4MatmulFp32
 //void TiledC4MatmulFp32(float* dst, const float* src, const float* weight, size_t cal_num, size_t ic4, size_t oc4)
--- a/Show More
+++ b/Show More