|
|
|
@ -15,33 +15,7 @@
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "nnacl/int8/matmul_int8.h"
|
|
|
|
|
#include <limits.h>
|
|
|
|
|
#include "nnacl/quantization/fixed_point.h"
|
|
|
|
|
/*
 * Repack a row-major (row x col) int8 matrix into panels that are 8 columns wide.
 *
 * A panel stores every row of 8 consecutive source columns, 8 bytes per row, so
 * consecutive panels start 8 * row bytes apart.  Source element (i, j) is written to
 *   dst_ptr[(j / 8) * 8 * row + i * 8 + (j % 8)].
 *
 * Only positions that correspond to a source element are written; when col is not
 * a multiple of 8 the unused lanes of the last panel are left as they were.
 */
void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
  for (int i = 0; i < row; ++i) {
    for (int j = 0; j < col; ++j) {
      int panel = j / 8; /* which 8-column panel the element belongs to */
      int lane = j % 8;  /* position inside the panel's 8-byte row */
      dst_ptr[panel * 8 * row + i * 8 + lane] = src_ptr[i * col + j];
    }
  }
}
|
|
|
|
|
|
|
|
|
|
void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
|
|
|
|
|
int col16 = UP_ROUND(col, C16NUM);
|
|
|
|
|
for (int r = 0; r < row; r++) {
|
|
|
|
|
int rd4 = r / C4NUM;
|
|
|
|
|
int rm4 = r % C4NUM;
|
|
|
|
|
for (int c = 0; c < col; c++) {
|
|
|
|
|
int cd16 = c / C16NUM;
|
|
|
|
|
int cm16 = c % C16NUM;
|
|
|
|
|
int dst_index = rd4 * col16 * C4NUM + cd16 * C4NUM * C16NUM + rm4 * C16NUM + cm16;
|
|
|
|
|
int src_index = r * col + c;
|
|
|
|
|
dst_ptr[dst_index] = src_ptr[src_index];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
|
|
|
|
|
int col16 = UP_ROUND(col, C16NUM);
|
|
|
|
@ -90,22 +64,7 @@ void MatrixEmptyInt8(int8_t *dst, int row, int col) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void RowMajor2Row4x8MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
|
|
|
|
|
/* Row-major to row16x4-major (block row-major) */
|
|
|
|
|
int col4 = UP_ROUND(col, C4NUM);
|
|
|
|
|
for (int r = 0; r < row; r++) {
|
|
|
|
|
int rd8 = r / C8NUM, rm8 = r % C8NUM;
|
|
|
|
|
for (int c = 0; c < col; c++) {
|
|
|
|
|
int cd4 = c / C4NUM, cm4 = c % C4NUM;
|
|
|
|
|
int src_index = r * col + c;
|
|
|
|
|
int dst_index = rd8 * col4 * C8NUM + cd4 * C4NUM * C8NUM + rm8 * C4NUM + cm4;
|
|
|
|
|
dst_ptr[dst_index] = src_ptr[src_index];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void RowMajor2Row16x4MajorInt8(void *src_ptr, void *dst_ptr, int row, int col) {
|
|
|
|
|
void RowMajor2Row16x4MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
|
|
|
|
|
/* Row-major to row16x4-major (block row-major) */
|
|
|
|
|
int col16 = UP_ROUND(col, C16NUM);
|
|
|
|
|
size_t row_4div = row / C4NUM * C4NUM;
|
|
|
|
@ -185,16 +144,6 @@ void RowMajor2Row16x4MajorInt8(void *src_ptr, void *dst_ptr, int row, int col) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Repack a row-major (row x col) int8 matrix into bands of 8 rows stored column by column.
 *
 * Within a band, the 8 row values of one column are contiguous, and consecutive
 * columns are 8 bytes apart; bands start col * 8 bytes apart.  Source element (i, j)
 * is written to dst_ptr[(i / 8) * col * 8 + j * 8 + (i % 8)].
 *
 * Only positions backed by a source element are written; when row is not a multiple
 * of 8 the unused lanes of the last band are left as they were.
 */
void RowMajor2Col8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
  for (int i = 0; i < row; ++i) {
    int band = i / 8; /* which 8-row band the source row falls in */
    int lane = i % 8; /* position of the row inside that band */
    for (int j = 0; j < col; ++j) {
      int out_pos = band * col * 8 + j * 8 + lane;
      dst_ptr[out_pos] = src_ptr[i * col + j];
    }
  }
}
|
|
|
|
|
|
|
|
|
|
void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16,
|
|
|
|
|
const int *input_sum, const int *bias) {
|
|
|
|
|
/* row4x16-major * row16x4-major => row4x4-major */
|
|
|
|
@ -319,47 +268,6 @@ void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row,
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* row4x16-major * col16x4-major => row4x4-major */
|
|
|
|
|
void MatmulInt8(const int8_t *a, const int8_t *b, int8_t *dst, const int *a_sums, const int *bias, int act_min,
|
|
|
|
|
int act_max, int out_zp, int multiplier, int left_shift, int right_shift, int row, int col, int deep16,
|
|
|
|
|
int stride) {
|
|
|
|
|
int8_t *output = dst;
|
|
|
|
|
for (int r = 0; r < row; r++) {
|
|
|
|
|
for (int c = 0; c < col; c++) {
|
|
|
|
|
int r4div = r / C4NUM;
|
|
|
|
|
int r4mod = r % C4NUM;
|
|
|
|
|
int c4div = c / C4NUM;
|
|
|
|
|
int c4mod = c % C4NUM;
|
|
|
|
|
int value = 0;
|
|
|
|
|
for (int d = 0; d < deep16; d++) {
|
|
|
|
|
int d16div = d / C16NUM;
|
|
|
|
|
int d16mod = d % C16NUM;
|
|
|
|
|
size_t ai = r4div * deep16 * C4NUM + d16div * C4NUM * C16NUM + r4mod * C16NUM + d16mod;
|
|
|
|
|
size_t bi = c4div * deep16 * C4NUM + d16div * C4NUM * C16NUM + c4mod * C16NUM + d16mod;
|
|
|
|
|
value += a[ai] * b[bi];
|
|
|
|
|
}
|
|
|
|
|
value -= a_sums[r];
|
|
|
|
|
value += bias[c];
|
|
|
|
|
value = MultiplyByQuantizedMultiplier(value, multiplier, left_shift, right_shift) + out_zp;
|
|
|
|
|
value = MSMIN(INT8_MAX, value);
|
|
|
|
|
value = MSMAX(INT8_MIN, value);
|
|
|
|
|
output[c] = (int8_t)value;
|
|
|
|
|
}
|
|
|
|
|
output += stride;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Repack a row-major (row x col) int8 matrix into tiles of 4 rows by 16 columns.
 *
 * col_16 is the padded row width used for the destination; col_16 / 16 is the number
 * of tiles per 4-row band.  Each tile occupies 64 bytes, rows of the tile stored one
 * after another (16 bytes each).  Source element (i, j) is written to
 *   64 * ((i / 4) * (col_16 / 16) + j / 16) + (i % 4) * 16 + (j % 16).
 *
 * Only positions backed by a source element are written.
 */
void RowMajor2Row4x16Major(int8_t *src, int row, int col, int8_t *dst, int col_16) {
  const int tile_bytes = sizeof(int8_t) * 16 * 4;
  for (int i = 0; i < row; ++i) {
    for (int j = 0; j < col; ++j) {
      int tile_index = i / 4 * (col_16 / 16) + j / 16; /* linear index of the 4x16 tile */
      int in_tile = i % 4 * 16 + j % 16;               /* offset of the element inside the tile */
      dst[tile_bytes * tile_index + in_tile] = src[i * col + j];
    }
  }
}
|
|
|
|
|
|
|
|
|
|
void RowMajor2Col16x4Major(int8_t *src, int row, int col, int8_t *dst, int row_16) {
|
|
|
|
|
int stride = sizeof(int8_t) * 16 * 4;
|
|
|
|
|
for (int r = 0; r < row; ++r) {
|
|
|
|
@ -405,14 +313,3 @@ void CalcWeightBiasSums(int8_t *weight, int row, int col, int input_zp, int weig
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Unpack a matrix stored as 4x4 tiles back into plain row-major order.
 *
 * Tiles are 16 bytes each and are ordered by column tile first: the tile holding
 * element (i, j) has linear index (j / 4) * (row4 / 4) + i / 4, where row4 is the
 * padded row count used when the tiles were produced.  Inside a tile, element
 * (i % 4, j % 4) sits at offset (i % 4) * 4 + (j % 4).
 *
 * The destination is a dense (row x cow) row-major matrix; cow is the column count.
 */
void Row4x4Major2RowMajor(int8_t *src, int row4, int8_t *dst, int row, int cow) {
  const int tile_bytes = sizeof(int8_t) * 4 * 4;
  for (int i = 0; i < row; ++i) {
    for (int j = 0; j < cow; ++j) {
      int tile_index = j / 4 * (row4 / 4) + i / 4; /* column-tile-major ordering of tiles */
      int in_tile = i % 4 * 4 + j % 4;             /* offset of the element inside the tile */
      dst[i * cow + j] = src[tile_bytes * tile_index + in_tile];
    }
  }
}
|
|
|
|
|