!12117 [MSLITE] MAX_SHAPE_SIZE COMM_SHAPE_SIZE

From: @ling_qiao_min
Reviewed-by: 
Signed-off-by:
pull/12117/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit 20c61ad83e

@ -17,7 +17,7 @@
#include "micro/coder/opcoders/base/conv2d_base_coder.h"
#include <string>
#include <vector>
#include "nnacl/winograd_utils.h"
#include "nnacl/fp32/winograd_utils.h"
#include "nnacl/int8/quantize.h"
#include "micro/coder/log.h"

@ -5,8 +5,10 @@ include_directories(NNACL_DIR)
if(PLATFORM_ARM32 OR PLATFORM_ARM64)
if("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fomit-frame-pointer -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fomit-frame-pointer -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fomit-frame-pointer -fstrict-aliasing \
-ffunction-sections -fdata-sections -ffast-math")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fomit-frame-pointer -fstrict-aliasing \
-ffunction-sections -fdata-sections -ffast-math")
endif()
endif()
if("${X86_64_SIMD}" STREQUAL "avx")
@ -37,13 +39,13 @@ if(PLATFORM_ARM32)
endif()
if("${X86_64_SIMD}" STREQUAL "sse")
file(GLOB ASSEMBLY_SRC ${NNACL_DIR}/x86_64_sse/*.c)
file(GLOB ASSEMBLY_SRC ${NNACL_DIR}/intrinsics/sse/*.c)
set_property(SOURCE ${ASSEMBLY_SRC} PROPERTY LANGUAGE C)
endif()
if("${X86_64_SIMD}" STREQUAL "avx")
file(GLOB ASSEMBLY_SRC ${NNACL_DIR}/x86_64_sse/*.c
${NNACL_DIR}/x86_64_avx/*.c
file(GLOB ASSEMBLY_SRC ${NNACL_DIR}/intrinsics/sse/*.c
${NNACL_DIR}/intrinsics/avx/*.c
${NNACL_DIR}/assembly/avx/*.S)
set_property(SOURCE ${ASSEMBLY_SRC} PROPERTY LANGUAGE C)
endif()

@ -13,10 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/minimal_filtering_generator.h"
#include "nnacl/base/minimal_filtering_generator.h"
#include <string.h>
#include <math.h>
#include "nnacl/winograd_utils.h"
#include "nnacl/fp32/winograd_utils.h"
#include "nnacl/errorcode.h"
void Polynomial(const float *interval, float *m, int degree) {

@ -72,6 +72,7 @@ typedef struct SlidingWindowParam {
int kernel_step_;
} SlidingWindowParam;
#define OUPUT_UNIT 2
#define DECONV_WINOGRAD_DEFAULT_UNIT 3
#define DECONV_WINOGRAD_DEFAULT_TILE 8
#define DECONV_WINOGRAD_BUFFER_COUNT 8

@ -15,7 +15,7 @@
*/
#include "nnacl/fp16/deconv_winograd_fp16.h"
#include "nnacl/minimal_filtering_generator.h"
#include "nnacl/base/minimal_filtering_generator.h"
void DeConvWgInputPackFp16(float16_t *src_ptr, float16_t *dst_ptr, int channel, int stride) {
int ic4div = channel / C4NUM;
@ -111,16 +111,16 @@ void DeConvWgMergeFp16(const float16_t *src, float16_t *dst, size_t src_stride,
}
void DeConvWgCalWgFp16(float16_t *tile_in, float16_t *tile_out, float16_t *weight_buf, float16_t *tmp_buf,
float16_t *at_buf, float16_t *a_mid_buf, float16_t *trans_a_buf, bool *transfered,
float16_t *at_buf, float16_t *a_mid_buf, float16_t *trans_a_buf, bool *transferred,
float16_t *bt_buf, float16_t *b_tmp_buf, int unit_size, int w_start, int h_start,
ConvParameter *conv_param, DeConvParam *deconv_param) {
int winograd_plane = unit_size * unit_size;
if (!transfered[unit_size]) {
if (!transferred[unit_size]) {
WinogradTransLeftFp16(tile_in, at_buf, a_mid_buf, DECONV_WINOGRAD_DEFAULT_UNIT, unit_size,
DECONV_WINOGRAD_DEFAULT_UNIT, deconv_param->ic_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
WinogradTransRightFp16(a_mid_buf, at_buf, trans_a_buf, unit_size, unit_size, DECONV_WINOGRAD_DEFAULT_UNIT,
deconv_param->ic_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
transfered[unit_size] = true;
transferred[unit_size] = true;
}
for (int index = 0; index < winograd_plane; index++) {
@ -311,7 +311,7 @@ void DeconvWgFp16(float16_t *nhwc_input_, float16_t *tile_in, float16_t *tile_ou
}
/* compute */
bool transfered[DECONV_WINOGRAD_BUFFER_COUNT] = {false};
bool transferred[DECONV_WINOGRAD_BUFFER_COUNT] = {false};
for (int i = 0; i < deconv_param->compute_size_; i++) {
DeConvComputeUnit *unit = &deconv_param->compute_units_[i];
if (unit->use_winograd_) {
@ -328,7 +328,7 @@ void DeconvWgFp16(float16_t *nhwc_input_, float16_t *tile_in, float16_t *tile_ou
DECONV_WINOGRAD_DEFAULT_TILE *
deconv_param->oc_up4_;
DeConvWgCalWgFp16(tile_in, tile_out, (float16_t *)unit->weight_, tmp_buf, unit->winograd_.AT_, mid_a, dst_a,
transfered, unit->winograd_.BT_, tmp_b, unit->winograd_.kh_, unit->w_start_, unit->h_start_,
transferred, unit->winograd_.BT_, tmp_b, unit->winograd_.kh_, unit->w_start_, unit->h_start_,
conv_param, deconv_param);
} else {
float16_t *tmp_buf = (float16_t *)unit->tmp_buffer_ + task_id * deconv_param->oc_div4_ * unit->w_size_ *

@ -17,7 +17,7 @@
#include "nnacl/fp32/conv_depthwise_fp32.h"
#include "nnacl/common_func.h"
#include "nnacl/fp32/common_func_fp32.h"
#include "nnacl/winograd_transform.h"
#include "nnacl/fp32/winograd_transform.h"
#ifdef ENABLE_ARM64
#include <arm_neon.h>
#endif

@ -17,7 +17,7 @@
#include "nnacl/fp32/conv_winograd_fp32.h"
#include <string.h>
#include "nnacl/fp32/common_func_fp32.h"
#include "nnacl/winograd_transform.h"
#include "nnacl/fp32/winograd_transform.h"
#include "nnacl/fp32/matmul_fp32.h"
// fp32 conv winograd

@ -24,7 +24,7 @@
#include "nnacl/op_base.h"
#include "nnacl/common_func.h"
#include "nnacl/conv_parameter.h"
#include "nnacl/winograd_utils.h"
#include "nnacl/fp32/winograd_utils.h"
#include "nnacl/fp32/conv_depthwise_fp32.h"
typedef float *TmpBufferAddress;

@ -22,7 +22,7 @@
#include "nnacl/conv_parameter.h"
#include "nnacl/errorcode.h"
#include "nnacl/fp32/common_func_fp32.h"
#include "nnacl/minimal_filtering_generator.h"
#include "nnacl/base/minimal_filtering_generator.h"
#ifdef __cplusplus
extern "C" {

@ -22,7 +22,7 @@
#include "nnacl/conv_parameter.h"
#include "nnacl/errorcode.h"
#include "nnacl/fp32/common_func_fp32.h"
#include "nnacl/minimal_filtering_generator.h"
#include "nnacl/base/minimal_filtering_generator.h"
#ifdef __cplusplus
extern "C" {

@ -14,7 +14,7 @@
* limitations under the License.
*/
#include "nnacl/winograd_transform.h"
#include "nnacl/fp32/winograd_transform.h"
#include "nnacl/op_base.h"
// fp32 conv winograd

@ -22,10 +22,7 @@
#endif
#include <string.h>
#include "nnacl/pack.h"
#include "nnacl/winograd_utils.h"
#include "mindspore/lite/nnacl/int8/fixed_point.h"
#define OUPUT_UNIT 2
#include "nnacl/fp32/winograd_utils.h"
#ifdef __cplusplus
extern "C" {

File diff suppressed because it is too large Load Diff

@ -24,11 +24,10 @@
#include "nnacl/op_base.h"
#include "nnacl/common_func.h"
#include "nnacl/conv_parameter.h"
#include "nnacl/winograd_utils.h"
#include "nnacl/int8/fixed_point.h"
#include "nnacl/int8/quantize.h"
#include "nnacl/matmul_parameter.h"
#include "nnacl/int8/matmul_int8.h"
#include "nnacl/winograd_transform.h"
#include "nnacl/int8/common_func_int8.h"
#ifdef __cplusplus

@ -24,7 +24,6 @@
#include "nnacl/op_base.h"
#include "nnacl/common_func.h"
#include "nnacl/conv_parameter.h"
#include "nnacl/winograd_utils.h"
#include "nnacl/int8/quantize.h"
#include "nnacl/matmul_parameter.h"
#include "nnacl/int8/matmul_int8.h"

@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/x86_64_avx/common_utils.h"
#include "nnacl/intrinsics/avx/common_utils.h"
#ifdef WIN32
#ifdef ENABLE_AVX
#include <stdint.h>

@ -17,6 +17,7 @@
#ifdef ENABLE_SSE
#include <x86intrin.h>
#include "nnacl/fp32/conv_depthwise_fp32.h"
#include "nnacl/intrinsics/sse/sse_common.h"
void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
size_t in_kh_step, size_t in_kw_step, size_t kernel_w_step, size_t relu, size_t relu6) {
@ -123,18 +124,16 @@ void ConvDwFp32Center(float *dst, const float *src, const float *weight, const f
int c2 = DOWN_DIV(width, C2NUM) * C2NUM;
int c1 = 0;
// c4 loop
for (; c1 < c4; c1 += C4NUM) {
const float *src_kh = src_w;
const float *weight_kh = weight;
for (; c1 < c4; c1 += C4NUM, dst_w += C4NUM * block_channel, src_w += C4NUM * in_sw_step) {
const float *src_kh = src_w, *weight_kh = weight;
__m128 dst_w_ma1 = _mm_setzero_ps();
__m128 dst_w_ma2 = _mm_setzero_ps();
__m128 dst_w_ma3 = _mm_setzero_ps();
__m128 dst_w_ma4 = _mm_setzero_ps();
for (int kh = 0; kh < kernel_h; kh++) {
const float *src_kw = src_kh;
const float *weight_kw = weight_kh;
for (int kw = 0; kw < kernel_w; kw++) {
for (int kh = 0; kh < kernel_h; kh++, src_kh += in_kh_step, weight_kh += kernel_w * C4NUM) {
const float *src_kw = src_kh, *weight_kw = weight_kh;
for (int kw = 0; kw < kernel_w; kw++, src_kw += in_kw_step, weight_kw += C4NUM) {
__m128 src_kw_ma1 = _mm_loadu_ps(src_kw);
__m128 weight_kw_ma1 = _mm_loadu_ps(weight_kw);
__m128 tmp_ma1 = _mm_mul_ps(src_kw_ma1, weight_kw_ma1);
@ -154,13 +153,9 @@ void ConvDwFp32Center(float *dst, const float *src, const float *weight, const f
__m128 weight_kw_ma4 = _mm_loadu_ps(weight_kw);
__m128 tmp_ma4 = _mm_mul_ps(src_kw_ma4, weight_kw_ma4);
dst_w_ma4 = _mm_add_ps(dst_w_ma4, tmp_ma4);
src_kw += in_kw_step;
weight_kw += C4NUM;
} // kernel_w loop
src_kh += in_kh_step;
weight_kh += kernel_w * C4NUM;
} // kernel_h loop
// add bias relu
__m128 bias_ma = _mm_loadu_ps(bias);
dst_w_ma1 = _mm_add_ps(dst_w_ma1, bias_ma);
@ -168,39 +163,23 @@ void ConvDwFp32Center(float *dst, const float *src, const float *weight, const f
dst_w_ma3 = _mm_add_ps(dst_w_ma3, bias_ma);
dst_w_ma4 = _mm_add_ps(dst_w_ma4, bias_ma);
__m128 zero_ma = _mm_setzero_ps();
if (relu || relu6) {
dst_w_ma1 = _mm_max_ps(zero_ma, dst_w_ma1);
dst_w_ma2 = _mm_max_ps(zero_ma, dst_w_ma2);
dst_w_ma3 = _mm_max_ps(zero_ma, dst_w_ma3);
dst_w_ma4 = _mm_max_ps(zero_ma, dst_w_ma4);
if (relu6) {
__m128 const_ma = _mm_set_ps(6.0f, 6.0f, 6.0f, 6.0f);
dst_w_ma1 = _mm_min_ps(const_ma, dst_w_ma1);
dst_w_ma2 = _mm_min_ps(const_ma, dst_w_ma2);
dst_w_ma3 = _mm_min_ps(const_ma, dst_w_ma3);
dst_w_ma4 = _mm_min_ps(const_ma, dst_w_ma4);
}
}
ActBlock4(&dst_w_ma1, &dst_w_ma2, &dst_w_ma3, &dst_w_ma4, relu, relu6);
_mm_storeu_ps(dst_w, dst_w_ma1);
_mm_storeu_ps(dst_w + block_channel, dst_w_ma2);
_mm_storeu_ps(dst_w + 2 * block_channel, dst_w_ma3);
_mm_storeu_ps(dst_w + 3 * block_channel, dst_w_ma4);
dst_w += C4NUM * block_channel;
src_w += C4NUM * in_sw_step;
} // dst_width loop
// c2 loop
for (; c1 < c2; c1 += C2NUM) {
const float *src_kh = src_w;
const float *weight_kh = weight;
for (; c1 < c2; c1 += C2NUM, dst_w += C2NUM * block_channel, src_w += C2NUM * in_sw_step) {
const float *src_kh = src_w, *weight_kh = weight;
__m128 dst_w_ma1 = _mm_setzero_ps();
__m128 dst_w_ma2 = _mm_setzero_ps();
for (int kh = 0; kh < kernel_h; kh++) {
const float *src_kw = src_kh;
const float *weight_kw = weight_kh;
for (int kw = 0; kw < kernel_w; kw++) {
for (int kh = 0; kh < kernel_h; kh++, src_kh += in_kh_step, weight_kh += kernel_w * C4NUM) {
const float *src_kw = src_kh, *weight_kw = weight_kh;
for (int kw = 0; kw < kernel_w; kw++, src_kw += in_kw_step, weight_kw += C4NUM) {
__m128 src_kw_ma1 = _mm_loadu_ps(src_kw);
__m128 weight_kw_ma1 = _mm_loadu_ps(weight_kw);
__m128 tmp_ma1 = _mm_mul_ps(src_kw_ma1, weight_kw_ma1);
@ -210,68 +189,38 @@ void ConvDwFp32Center(float *dst, const float *src, const float *weight, const f
__m128 weight_kw_ma2 = _mm_loadu_ps(weight_kw);
__m128 tmp_ma2 = _mm_mul_ps(src_kw_ma2, weight_kw_ma2);
dst_w_ma2 = _mm_add_ps(dst_w_ma2, tmp_ma2);
src_kw += in_kw_step;
weight_kw += C4NUM;
} // kernel_w loop
src_kh += in_kh_step;
weight_kh += kernel_w * C4NUM;
} // kernel_h loop
// add bias relu
__m128 bias_ma = _mm_loadu_ps(bias);
dst_w_ma1 = _mm_add_ps(dst_w_ma1, bias_ma);
dst_w_ma2 = _mm_add_ps(dst_w_ma2, bias_ma);
__m128 zero_ma = _mm_setzero_ps();
if (relu || relu6) {
dst_w_ma1 = _mm_max_ps(zero_ma, dst_w_ma1);
dst_w_ma2 = _mm_max_ps(zero_ma, dst_w_ma2);
if (relu6) {
__m128 const_ma = _mm_set_ps(6.0f, 6.0f, 6.0f, 6.0f);
dst_w_ma1 = _mm_min_ps(const_ma, dst_w_ma1);
dst_w_ma2 = _mm_min_ps(const_ma, dst_w_ma2);
}
}
ActBlock2(&dst_w_ma1, &dst_w_ma2, relu, relu6);
_mm_storeu_ps(dst_w, dst_w_ma1);
_mm_storeu_ps(dst_w + block_channel, dst_w_ma2);
dst_w += C2NUM * block_channel;
src_w += C2NUM * in_sw_step;
}
// remaining
for (; c1 < width; c1++) {
const float *src_kh = src_w;
const float *weight_kh = weight;
for (; c1 < width; c1++, dst_w += block_channel, src_w += in_sw_step) {
const float *src_kh = src_w, *weight_kh = weight;
__m128 dst_w_ma1 = _mm_setzero_ps();
for (int kh = 0; kh < kernel_h; kh++) {
const float *src_kw = src_kh;
const float *weight_kw = weight_kh;
for (int kw = 0; kw < kernel_w; kw++) {
for (int kh = 0; kh < kernel_h; kh++, src_kh += in_kh_step, weight_kh += kernel_w * C4NUM) {
const float *src_kw = src_kh, *weight_kw = weight_kh;
for (int kw = 0; kw < kernel_w; kw++, src_kw += in_kw_step, weight_kw += C4NUM) {
__m128 src_kw_ma1 = _mm_loadu_ps(src_kw);
__m128 weight_kw_ma1 = _mm_loadu_ps(weight_kw);
__m128 tmp_ma1 = _mm_mul_ps(src_kw_ma1, weight_kw_ma1);
dst_w_ma1 = _mm_add_ps(dst_w_ma1, tmp_ma1);
src_kw += in_kw_step;
weight_kw += C4NUM;
} // kernel_w loop
src_kh += in_kh_step;
weight_kh += kernel_w * C4NUM;
} // kernel_h loop
// add bias relu
__m128 bias_ma = _mm_loadu_ps(bias);
dst_w_ma1 = _mm_add_ps(dst_w_ma1, bias_ma);
__m128 zero_ma = _mm_setzero_ps();
if (relu || relu6) {
dst_w_ma1 = _mm_max_ps(zero_ma, dst_w_ma1);
if (relu6) {
__m128 const_ma = _mm_set_ps(6.0f, 6.0f, 6.0f, 6.0f);
dst_w_ma1 = _mm_min_ps(const_ma, dst_w_ma1);
}
}
ActBlock1(&dst_w_ma1, relu, relu6);
_mm_storeu_ps(dst_w, dst_w_ma1);
dst_w += block_channel;
src_w += in_sw_step;
}
dst_h += out_h_step;
src_h += in_sh_step;

File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save