|
|
|
@ -6,46 +6,6 @@
|
|
|
|
|
.type MatmulFloatNeon64Opt, %function
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
// A: LM [row_8 * depth] col_8_major
|
|
|
|
|
// B: RM [depth * col_8] row_8_major
|
|
|
|
|
// C: A*B [row_8 * col_8] col_8x8_major
|
|
|
|
|
// A * B -> [8 * depth] * [depth * 8] -> [8 * 4] * [4 * 8] or [8 * 1] * [1 * 8]
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
// CommLoopMul: RM 1x8 block
|
|
|
|
|
// /-----------------------------------------\
|
|
|
|
|
// |v2.s[0] ... v2.s[3] v3.s[0] ... v3.s[3]|
|
|
|
|
|
// \-----------------------------------------/
|
|
|
|
|
// LM 8x1 block
|
|
|
|
|
// /---------------------\ /-----------------------------------------\
|
|
|
|
|
// | v0.s[0] | |v16.s[0]...v16.s[3] v17.s[0]...v17.s[3]|
|
|
|
|
|
// | ... | | ... ... |
|
|
|
|
|
// | v0.s[3] | |v22.s[0]...v22.s[3] v23.s[0]...v23.s[3]|
|
|
|
|
|
// | v1.s[0] | |v24.s[0]...v24.s[3] v25.s[0]...v25.s[3]|
|
|
|
|
|
// | ... | | ... ... |
|
|
|
|
|
// | v1.s[3] | |v30.s[0]...v30.s[3] v31.s[0]...v31.s[3]|
|
|
|
|
|
// \---------------------/ \-----------------------------------------/
|
|
|
|
|
// accumulators 8x8 block
|
|
|
|
|
//
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
// OptLoopMul4: RM 4x8 block
|
|
|
|
|
// /--------------------------------------------\
|
|
|
|
|
// |v8.s[0] ... v8.s[3] v9.s[0] ... v9.s[3] |
|
|
|
|
|
// |v10.s[0] ... v10.s[3] v11.s[0] ... v11.s[3]|
|
|
|
|
|
// |v12.s[0] ... v12.s[3] v13.s[0] ... v13.s[3]|
|
|
|
|
|
// |v14.s[0] ... v14.s[3] v15.s[0] ... v15.s[3]|
|
|
|
|
|
// \--------------------------------------------/
|
|
|
|
|
// LM 8x4 block
|
|
|
|
|
// /---------------------------------\ /--------------------------------------------\
|
|
|
|
|
// | v0.s[0] v2.s[0] v4.s[0] v6.s[0] | |v16.s[0]...v16.s[3] v17.s[0]...v17.s[3] |
|
|
|
|
|
// | ... ... ... ... | | ... ... |
|
|
|
|
|
// | v0.s[3] v2.s[3] v4.s[3] v6.s[3] | |v22.s[0]...v22.s[3] v23.s[0]...v23.s[3] |
|
|
|
|
|
// | v1.s[0] v3.s[0] v5.s[0] v7.s[0] | |v24.s[0]...v24.s[3] v25.s[0]...v25.s[3] |
|
|
|
|
|
// | ... ... ... ... | | ... ... |
|
|
|
|
|
// | v1.s[3] v3.s[3] v5.s[3] v7.s[3] | |v30.s[0]...v30.s[3] v31.s[0]...v31.s[3] |
|
|
|
|
|
// \---------------------------------/ \--------------------------------------------/
|
|
|
|
|
// accumulators 8x8 block
|
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
//
|
|
|
|
|
// void MatmulFloatNeon64Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
|
|
|
|
|
// int row, int col, size_t stride, size_t writeNhwc, size_t WriteWino)
|
|
|
|
|
// x0: a
|
|
|
|
|