!12734 [ms][lite][cpu] transpose optimize

From: @lzkcode
Reviewed-by: 
Signed-off-by:
pull/12734/MERGE
Committed by mindspore-ci-bot via Gitee
commit 16a9aa37af

@@ -39,7 +39,8 @@ int ConvolutionDepthwiseFP32Coder::InitWeightBias() {
  MS_CHECK_PTR(packed_weight_);
  MS_CHECK_RET_CODE(memset_s(packed_weight_, packed_weight_data_size, 0, packed_weight_data_size),
                    "memset packed weight failed!");
-  PackNCHWToNHWCFp32(origin_weight, packed_weight_, 1, filter_tensor_->Height() * filter_tensor_->Width(), channel);
+  PackNCHWToNHWCFp32(origin_weight, packed_weight_, 1, filter_tensor_->Height() * filter_tensor_->Width(), channel, 0,
+                     0);
  auto channel_size = static_cast<size_t>(channel);
  auto bias_size = static_cast<size_t>(channel_size * sizeof(float));

@@ -127,6 +127,52 @@ void Fp16TransposeDim5(const float16_t *in_data, float16_t *out_data, int *strid
  }
}
void Fp16TransposeDim6(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
const int *output_shape) {
const int stride0 = strides[perm[0]];
const int stride1 = strides[perm[1]];
const int stride2 = strides[perm[2]];
const int stride3 = strides[perm[3]];
const int stride4 = strides[perm[4]];
const int stride5 = strides[perm[5]];
const int out_stride0 = out_strides[0];
const int out_stride1 = out_strides[1];
const int out_stride2 = out_strides[2];
const int out_stride3 = out_strides[3];
const int out_stride4 = out_strides[4];
const int output0 = output_shape[0];
const int output1 = output_shape[1];
const int output2 = output_shape[2];
const int output3 = output_shape[3];
const int output4 = output_shape[4];
const int output5 = output_shape[5];
for (int i = 0; i < output0; ++i) {
int out_stride0_i = i * out_stride0;
int stride0_i = i * stride0;
for (int j = 0; j < output1; ++j) {
int out_stride1_j = j * out_stride1;
int stride1_j = j * stride1;
for (int k = 0; k < output2; ++k) {
int out_stride2_k = k * out_stride2;
int stride2_k = k * stride2;
for (int m = 0; m < output3; ++m) {
int out_stride3_m = m * out_stride3;
int stride3_m = m * stride3;
for (int n = 0; n < output4; ++n) {
int out_stride4_n = n * out_stride4;
int stride4_n = n * stride4;
for (int g = 0; g < output5; ++g) {
out_data[out_stride0_i + out_stride1_j + out_stride2_k + out_stride3_m + out_stride4_n + g] =
in_data[stride0_i + stride1_j + stride2_k + stride3_m + stride4_n + g * stride5];
}
}
}
}
}
}
}
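For concreteness, a worked example (not from the commit): with an input of shape 2x3x4x5x6x7 and perm = {5, 4, 3, 2, 1, 0}, the input strides are {2520, 840, 210, 42, 7, 1} and the output strides are {720, 120, 24, 6, 2, 1}, so stride5 = strides[perm[5]] = 2520; the innermost loop then writes out_data contiguously (offset ... + g) while reading in_data with a 2520-element stride, which is exactly what the six nested loops above compute. The fp32 version added later in this diff follows the same pattern.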
void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *strides, const int *out_strides,
                       const int *perm, const int *output_shape, int dims, int *size, int *position) {
  *(size + dims - 1) = 1;
@@ -190,6 +236,8 @@ int Fp16DoTranspose(const float16_t *in_data, float16_t *out_data, const int *ou
    Fp16TransposeDim4(in_data, out_data, strides, out_strides, perm, output_shape);
  } else if (num_axes == 5) {
    Fp16TransposeDim5(in_data, out_data, strides, out_strides, perm, output_shape);
} else if (num_axes == 6) {
Fp16TransposeDim6(in_data, out_data, strides, out_strides, perm, output_shape);
  } else {
    TransposeDimsFp16(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position);
  }

(File diff suppressed because it is too large.)

@@ -1,5 +1,5 @@
/**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -30,8 +30,9 @@ void PackNHWCToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int
void PackNCHWToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel);
-void PackNHWCToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel);
-void PackNCHWToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel);
+// Note: If not multithreaded, please set task_id = 0 and thread_count = 0;
+void PackNHWCToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count);
+void PackNCHWToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count);
void PackNHWC4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNC4HW4ToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNC4HW4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel);
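A minimal usage sketch of the widened pack signature (illustration only, not part of the commit; the wrapper names and the include path are assumptions). It shows the two call styles implied by the note above and by the kernels later in this diff:

#include "nnacl/pack.h" /* assumed path of the header declaring the Pack* helpers */

/* Hypothetical wrapper: a single-threaded caller passes task_id = 0 and thread_count = 0. */
static void PackSingleThreaded(const float *src, float *dst, int batch, int plane, int channel) {
  PackNCHWToNHWCFp32(src, dst, batch, plane, channel, 0, 0);
}

/* Hypothetical wrapper: a ParallelLaunch worker passes its own task_id plus the total
 * thread count, mirroring TransposeCPUKernel::RunImpl() further down in this PR. */
static void PackFromWorker(const float *src, float *dst, int batch, int plane, int channel,
                           int task_id, int thread_count) {
  PackNCHWToNHWCFp32(src, dst, batch, plane, channel, task_id, thread_count);
}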
@@ -43,6 +44,21 @@ void PackDepthwiseIndirectWeightC8Fp32(const void *src, void *dst, int height, i
void Im2ColPackUnitFp32(const float *input_data, const ConvParameter *conv_param, float *packed_input, int real_cal_num,
                        int block_index);
// Transpose 8X8 Fp32 block data
typedef void (*Transpose8X8Fp32Func)(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride);
#ifdef ENABLE_ARM64
void Transpose8X8Fp32Arm64(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride);
#endif
#ifdef ENABLE_ARM32
void Transpose8X8Fp32Arm32(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride);
#endif
#ifdef ENABLE_AVX
void Transpose8X8Fp32Avx(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride);
#endif
#if defined(ENABLE_SSE) && !defined(ENABLE_AVX)
void Transpose8X8Fp32Sse(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride);
#endif
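The Transpose8X8Fp32Func pointer abstracts the per-platform 8x8 fp32 block kernels declared above. A sketch of how a caller might pick one at compile time (assumption for illustration; the commit itself does not show the dispatch site):

/* Illustration only: compile-time selection of the 8x8 block transpose kernel. */
static Transpose8X8Fp32Func SelectTranspose8X8Fp32(void) {
#if defined(ENABLE_ARM64)
  return Transpose8X8Fp32Arm64;
#elif defined(ENABLE_ARM32)
  return Transpose8X8Fp32Arm32;
#elif defined(ENABLE_AVX)
  return Transpose8X8Fp32Avx;
#elif defined(ENABLE_SSE)
  return Transpose8X8Fp32Sse;
#else
  return NULL; /* caller falls back to a plain element-by-element copy */
#endif
}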
#ifdef __cplusplus
}
#endif

@@ -125,20 +125,73 @@ void TransposeDim5Fp32(const float *in_data, float *out_data, const int *strides
  }
}
-void TransposeDimsFp32(const float *in_data, float *out_data, const int *strides, const int *out_strides,
-                       const int *perm, const int *output_shape, int dims, int *size, int *position) {
-  *(size + dims - 1) = 1;
-  for (int i = dims - 1; i > 0; --i) {
-    *(size + i - 1) = *(size + i) * output_shape[i];
-  }
-  for (size_t idx = 0; idx < (*size) * output_shape[0]; ++idx) {
+void TransposeDim6Fp32(const float *in_data, float *out_data, const int *strides, const int *out_strides,
+                       const int *perm, const int *output_shape) {
+  const int stride0 = strides[perm[0]];
+  const int stride1 = strides[perm[1]];
+  const int stride2 = strides[perm[2]];
+  const int stride3 = strides[perm[3]];
+  const int stride4 = strides[perm[4]];
+  const int stride5 = strides[perm[5]];
+  const int out_stride0 = out_strides[0];
+  const int out_stride1 = out_strides[1];
+  const int out_stride2 = out_strides[2];
+  const int out_stride3 = out_strides[3];
+  const int out_stride4 = out_strides[4];
+  const int output0 = output_shape[0];
+  const int output1 = output_shape[1];
+  const int output2 = output_shape[2];
+  const int output3 = output_shape[3];
+  const int output4 = output_shape[4];
+  const int output5 = output_shape[5];
+  for (int i = 0; i < output0; ++i) {
+    int out_stride0_i = i * out_stride0;
+    int stride0_i = i * stride0;
+    for (int j = 0; j < output1; ++j) {
+      int out_stride1_j = j * out_stride1;
+      int stride1_j = j * stride1;
+      for (int k = 0; k < output2; ++k) {
+        int out_stride2_k = k * out_stride2;
+        int stride2_k = k * stride2;
+        for (int m = 0; m < output3; ++m) {
+          int out_stride3_m = m * out_stride3;
+          int stride3_m = m * stride3;
+          for (int n = 0; n < output4; ++n) {
+            int out_stride4_m = n * out_stride4;
+            int stride4_m = n * stride4;
+            for (int g = 0; g < output5; ++g) {
+              out_data[out_stride0_i + out_stride1_j + out_stride2_k + out_stride3_m + out_stride4_m + g] =
+                in_data[stride0_i + stride1_j + stride2_k + stride3_m + stride4_m + g * stride5];
+            }
+          }
+        }
+      }
+    }
+  }
+}
+void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape, int *size, int *position,
+                       TransposeParameter *transpose_param, int task_id, int thread_num) {
+  int *perm = transpose_param->perm_;
+  int *strides = transpose_param->strides_;
+  int *out_strides = transpose_param->out_strides_;
+  int num_axes = transpose_param->num_axes_;
+  size_t data_size = (*size) * output_shape[0];
+  size_t offset_size = UP_DIV(data_size, thread_num);
+  size_t task_offset = offset_size * task_id;
+  int count = data_size - task_offset;
+  if (count <= 0) {
+    return;
+  }
+  count = MSMIN(offset_size, count);
+  for (size_t idx = task_offset; idx < task_offset + count; ++idx) {
    int pos = idx;
    int output_idx = 0;
    int input_idx = 0;
-    for (int i = 0; i < dims; ++i) {
+    for (int i = 0; i < num_axes; ++i) {
      *(position + i) = pos / *(size + i);
-      int out_stride = i < dims - 1 ? out_strides[i] : 1;
+      int out_stride = i < num_axes - 1 ? out_strides[i] : 1;
      output_idx += (*(position + i) * out_stride);
      input_idx += (*(position + i) * strides[perm[i]]);
      pos -= *(position + i) * (*(size + i));
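Worked example of the per-task split introduced above (assuming the usual nnacl macros, UP_DIV(x, y) being the ceiling division and MSMIN the minimum): with data_size = 1000 output elements and thread_num = 4, offset_size is 250, so task 0 covers idx 0-249, task 1 covers 250-499, task 2 covers 500-749 and task 3 covers 750-999; a task whose offset lands past the end returns immediately, and MSMIN clamps the final task's count. The same range computation as a standalone sketch (hypothetical helper, not in the commit):

#include <stddef.h>

static void TransposeTaskRange(size_t data_size, int thread_num, int task_id,
                               size_t *begin, size_t *end) {
  size_t offset_size = (data_size + thread_num - 1) / thread_num; /* UP_DIV(data_size, thread_num) */
  size_t task_offset = offset_size * (size_t)task_id;
  if (task_offset >= data_size) { /* nothing left for this task */
    *begin = *end = data_size;
    return;
  }
  size_t count = data_size - task_offset;
  if (count > offset_size) { /* MSMIN(offset_size, count) */
    count = offset_size;
  }
  *begin = task_offset;
  *end = task_offset + count;
}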
@@ -147,8 +200,8 @@ void TransposeDimsFp32(const float *in_data, float *out_data, const int *strides
  }
}
-int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *transpose_param,
-                    int *size, int *position) {
+int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape,
+                    TransposeParameter *transpose_param) {
  if (in_data == NULL || out_data == NULL) {
    return NNACL_ERR;
  }
@@ -188,8 +241,10 @@ int DoTransposeFp32(const float *in_data, float *out_data, const int *output_sha
    TransposeDim4Fp32(in_data, out_data, strides, out_strides, perm, output_shape);
  } else if (num_axes == 5) {
    TransposeDim5Fp32(in_data, out_data, strides, out_strides, perm, output_shape);
+  } else if (num_axes == 6) {
+    TransposeDim6Fp32(in_data, out_data, strides, out_strides, perm, output_shape);
  } else {
-    TransposeDimsFp32(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position);
+    return NNACL_ERR;
  }
  return NNACL_OK;
}

@@ -25,9 +25,9 @@
extern "C" {
#endif
-int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *transpose_param,
-                    int *size, int *position);
+int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *param);
+void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape, int *size, int *position,
+                       TransposeParameter *transpose_param, int task_id, int thread_num);
#ifdef __cplusplus
}
#endif

@@ -1,140 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef ENABLE_SSE
#include <x86intrin.h>
#include "nnacl/pack.h"
#include "nnacl/int8/conv_int8.h"
void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int channel) {
int hw8 = plane / C8NUM * C8NUM;
int c8 = channel / C8NUM * C8NUM;
int batch = plane * channel;
for (int n = 0; n < batches; n++) {
const float *src_batch = (const float *)src + n * batch;
float *dst_batch = (float *)dst + n * batch;
int hw = 0;
for (; hw < hw8; hw += C8NUM) {
int c = 0;
for (; c < c8; c += C8NUM) {
const float *src_ptr = src_batch + hw * channel + c;
float *dst_ptr = dst_batch + c * plane + hw;
// 11-14
__m128 v0_ma = _mm_loadu_ps(src_ptr);
__m128 v1_ma = _mm_loadu_ps(src_ptr + channel);
__m128 v2_ma = _mm_loadu_ps(src_ptr + 2 * channel);
__m128 v3_ma = _mm_loadu_ps(src_ptr + 3 * channel);
__m128 v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma);
__m128 v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma);
__m128 v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma);
__m128 v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma);
__m128 v8_ma = _mm_movelh_ps(v4_ma, v6_ma);
__m128 v9_ma = _mm_movehl_ps(v6_ma, v4_ma);
__m128 v10_ma = _mm_movelh_ps(v5_ma, v7_ma);
__m128 v11_ma = _mm_movehl_ps(v7_ma, v5_ma);
_mm_storeu_ps(dst_ptr, v8_ma);
_mm_storeu_ps(dst_ptr + plane, v9_ma);
_mm_storeu_ps(dst_ptr + 2 * plane, v10_ma);
_mm_storeu_ps(dst_ptr + 3 * plane, v11_ma);
// 15-18
v0_ma = _mm_loadu_ps(src_ptr + C4NUM);
v1_ma = _mm_loadu_ps(src_ptr + channel + C4NUM);
v2_ma = _mm_loadu_ps(src_ptr + 2 * channel + C4NUM);
v3_ma = _mm_loadu_ps(src_ptr + 3 * channel + C4NUM);
v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma);
v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma);
v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma);
v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma);
v8_ma = _mm_movelh_ps(v4_ma, v6_ma);
v9_ma = _mm_movehl_ps(v6_ma, v4_ma);
v10_ma = _mm_movelh_ps(v5_ma, v7_ma);
v11_ma = _mm_movehl_ps(v7_ma, v5_ma);
_mm_storeu_ps(dst_ptr + C4NUM * plane, v8_ma);
_mm_storeu_ps(dst_ptr + (C4NUM + 1) * plane, v9_ma);
_mm_storeu_ps(dst_ptr + (C4NUM + 2) * plane, v10_ma);
_mm_storeu_ps(dst_ptr + (C4NUM + 3) * plane, v11_ma);
// 21-24
v0_ma = _mm_loadu_ps(src_ptr + C4NUM * channel);
v1_ma = _mm_loadu_ps(src_ptr + (C4NUM + 1) * channel);
v2_ma = _mm_loadu_ps(src_ptr + (C4NUM + 2) * channel);
v3_ma = _mm_loadu_ps(src_ptr + (C4NUM + 3) * channel);
v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma);
v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma);
v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma);
v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma);
v8_ma = _mm_movelh_ps(v4_ma, v6_ma);
v9_ma = _mm_movehl_ps(v6_ma, v4_ma);
v10_ma = _mm_movelh_ps(v5_ma, v7_ma);
v11_ma = _mm_movehl_ps(v7_ma, v5_ma);
_mm_storeu_ps(dst_ptr + C4NUM, v8_ma);
_mm_storeu_ps(dst_ptr + plane + C4NUM, v9_ma);
_mm_storeu_ps(dst_ptr + 2 * plane + C4NUM, v10_ma);
_mm_storeu_ps(dst_ptr + 3 * plane + C4NUM, v11_ma);
// 25-28
v0_ma = _mm_loadu_ps(src_ptr + C4NUM * channel + C4NUM);
v1_ma = _mm_loadu_ps(src_ptr + (C4NUM + 1) * channel + C4NUM);
v2_ma = _mm_loadu_ps(src_ptr + (C4NUM + 2) * channel + C4NUM);
v3_ma = _mm_loadu_ps(src_ptr + (C4NUM + 3) * channel + C4NUM);
v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma);
v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma);
v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma);
v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma);
v8_ma = _mm_movelh_ps(v4_ma, v6_ma);
v9_ma = _mm_movehl_ps(v6_ma, v4_ma);
v10_ma = _mm_movelh_ps(v5_ma, v7_ma);
v11_ma = _mm_movehl_ps(v7_ma, v5_ma);
_mm_storeu_ps(dst_ptr + C4NUM * plane + C4NUM, v8_ma);
_mm_storeu_ps(dst_ptr + (C4NUM + 1) * plane + C4NUM, v9_ma);
_mm_storeu_ps(dst_ptr + (C4NUM + 2) * plane + C4NUM, v10_ma);
_mm_storeu_ps(dst_ptr + (C4NUM + 3) * plane + C4NUM, v11_ma);
}
for (; c < channel; c++) {
const float *src_ptr = src_batch + hw * channel + c;
float *dst_ptr = dst_batch + c * plane + hw;
for (size_t i = 0; i < C8NUM; i++) {
dst_ptr[i] = src_ptr[i * channel];
}
}
}
for (; hw < plane; hw++) {
const float *src_ptr = src_batch + hw * channel;
float *dst_ptr = dst_batch + hw;
for (size_t i = 0; i < channel; i++) {
dst_ptr[i * plane] = src_ptr[i];
}
}
}
return;
}
#endif

@@ -19,7 +19,7 @@
#include "nnacl/op_base.h"
-#define MAX_TRANSPOSE_DIM_SIZE 5
+#define MAX_TRANSPOSE_DIM_SIZE 6
typedef struct TransposeParameter {
  // primitive parameter

@@ -22,6 +22,7 @@
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_NULL_PTR;
using mindspore::lite::RET_OK;
using mindspore::lite::RET_OP_EXECUTE_FAILURE;
using mindspore::schema::PrimitiveType_Transpose;
@@ -82,31 +83,46 @@ TransposeCPUKernel::~TransposeCPUKernel() {
  }
}
-int TransposeCPUKernel::NhNcTranspose(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param) {
+void TransposeCPUKernel::GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor,
+                                              TransposeParameter *param) {
  auto out_shape = out_tensor->shape();
  if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 2 && param->perm_[2] == 3 &&
      param->perm_[3] == 1) {
+    nhnc_param_[0] = out_shape[0];
+    nhnc_param_[1] = out_shape[1] * out_shape[2];
+    nhnc_param_[2] = out_shape[3];
    if (in_tensor->data_type() == kNumberTypeFloat32) {
-      PackNCHWToNHWCFp32(in_tensor->MutableData(), out_tensor->MutableData(), out_shape[0], out_shape[1] * out_shape[2],
-                         out_shape[3]);
-    } else if (in_tensor->data_type() == kNumberTypeInt8) {
-      PackNCHWToNHWCInt8(in_tensor->MutableData(), out_tensor->MutableData(), out_shape[0], out_shape[1] * out_shape[2],
-                         out_shape[3]);
+      NHNCTransposeFunc_ = PackNCHWToNHWCFp32;
    }
-    return RET_OK;
  }
  if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 &&
      param->perm_[3] == 2) {
+    nhnc_param_[0] = out_shape[0];
+    nhnc_param_[1] = out_shape[2] * out_shape[3];
+    nhnc_param_[2] = out_shape[1];
    if (in_tensor->data_type() == kNumberTypeFloat32) {
-      PackNHWCToNCHWFp32(in_tensor->MutableData(), out_tensor->MutableData(), out_shape[0], out_shape[2] * out_shape[3],
-                         out_shape[1]);
-    } else if (in_tensor->data_type() == kNumberTypeInt8) {
-      PackNHWCToNCHWInt8(in_tensor->MutableData(), out_tensor->MutableData(), out_shape[0], out_shape[2] * out_shape[3],
-                         out_shape[1]);
+      NHNCTransposeFunc_ = PackNHWCToNCHWFp32;
    }
-    return RET_OK;
  }
-  return RET_ERROR;
+}
+int TransposeCPUKernel::RunImpl(int task_id) {
+  if (NHNCTransposeFunc_ != nullptr) {
+    NHNCTransposeFunc_(in_data_, out_data_, nhnc_param_[0], nhnc_param_[1], nhnc_param_[2], task_id, thread_count_);
+  } else {
+    TransposeDimsFp32(in_data_, out_data_, out_shape_, dim_size_, position_ + dims_ * task_id, param_, task_id,
+                      thread_count_);
+  }
+  return RET_OK;
+}
+int TransposeImpl(void *kernel, int task_id) {
+  auto transpose = reinterpret_cast<TransposeCPUKernel *>(kernel);
+  auto ret = transpose->RunImpl(task_id);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "TransposeImpl Run error task_id[" << task_id << "] error_code[" << ret << "]";
+  }
+  return ret;
}
int TransposeCPUKernel::Run() {
@@ -123,8 +139,8 @@ int TransposeCPUKernel::Run() {
  MS_ASSERT(in_data_);
  MS_ASSERT(out_data_);
-  TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_);
-  if (in_tensor->shape().size() != static_cast<size_t>(param->num_axes_)) {
+  param_ = reinterpret_cast<TransposeParameter *>(this->op_parameter_);
+  if (in_tensor->shape().size() != static_cast<size_t>(param_->num_axes_)) {
    memcpy(out_data_, in_data_, in_tensor->ElementsNum() * sizeof(float));
    return RET_OK;
  }
@@ -134,40 +150,48 @@ int TransposeCPUKernel::Run() {
    MS_ASSERT(input_perm->data_c() != nullptr);
    int *perm_data = reinterpret_cast<int *>(input_perm->data_c());
    for (int i = 0; i < input_perm->ElementsNum(); ++i) {
-      param->perm_[i] = perm_data[i];
+      param_->perm_[i] = perm_data[i];
    }
    for (int i = input_perm->ElementsNum(); i < MAX_SHAPE_SIZE; ++i) {
-      param->perm_[i] = 0;
+      param_->perm_[i] = 0;
    }
  }
-  auto ret = NhNcTranspose(in_tensor, out_tensor, param);
-  if (ret == RET_OK) {
+  thread_count_ = op_parameter_->thread_num_;
+  GetNHNCTransposeFunc(in_tensor, out_tensor, param_);
+  if (NHNCTransposeFunc_ != nullptr) {
+    auto ret = ParallelLaunch(this->context_->thread_pool_, TransposeImpl, this, thread_count_);
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "NHNCTransposeFunc_ is error!";
+    }
    return ret;
  }
-  if (in_tensor->data_type() == kNumberTypeInt8) {
-    MS_LOG(ERROR) << "not support now";
-    return RET_ERROR;
-  }
-  int dims = out_tensor->shape().size();
-  if (dims > MAX_TRANSPOSE_DIM_SIZE) {
-    dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
+  MS_ASSERT(out_shape_);
+  dims_ = out_tensor->shape().size();
+  if (dims_ > MAX_TRANSPOSE_DIM_SIZE) {
+    dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims_ * sizeof(int)));
    if (dim_size_ == nullptr) {
      MS_LOG(ERROR) << "Malloc data failed";
-      return RET_ERROR;
+      return RET_NULL_PTR;
    }
-    position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
+    *(dim_size_ + dims_ - 1) = 1;
+    for (int i = dims_ - 1; i > 0; --i) {
+      *(dim_size_ + i - 1) = *(dim_size_ + i) * out_shape_[i];
+    }
+    position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims_ * sizeof(int) * thread_count_));
    if (position_ == nullptr) {
-      MS_LOG(ERROR) << "Malloc data failed";
      context_->allocator->Free(dim_size_);
-      dim_size_ = nullptr;
-      return RET_ERROR;
+      MS_LOG(ERROR) << "Malloc data failed";
+      return RET_NULL_PTR;
    }
  }
-  MS_ASSERT(out_shape_);
-  ret = DoTransposeFp32(in_data_, out_data_, out_shape_, param, dim_size_, position_);
-  if (dims > MAX_TRANSPOSE_DIM_SIZE) {
+  int ret;
+  if (dims_ > MAX_TRANSPOSE_DIM_SIZE) {
+    ret = ParallelLaunch(this->context_->thread_pool_, TransposeImpl, this, thread_count_);
+  } else {
+    ret = DoTransposeFp32(in_data_, out_data_, out_shape_, param_);
+  }
+  if (dims_ > MAX_TRANSPOSE_DIM_SIZE) {
    context_->allocator->Free(dim_size_);
    context_->allocator->Free(position_);
    dim_size_ = nullptr;
@@ -175,13 +199,10 @@ int TransposeCPUKernel::Run() {
  }
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Transpose run failed";
-    return RET_ERROR;
  }
  return ret;
}
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Transpose, LiteKernelCreator<TransposeCPUKernel>)
REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Transpose, LiteKernelCreator<TransposeCPUKernel>)
-REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Transpose, LiteKernelCreator<TransposeCPUKernel>)
} // namespace mindspore::kernel
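One design point in the kernel changes above: position_ is now allocated as dims_ * thread_count_ ints, and RunImpl() hands each task the slice position_ + dims_ * task_id, so parallel tasks decompose output indices into per-dimension positions without racing on a shared scratch buffer. A minimal sketch of that slicing (variable names from the diff; the helper itself is hypothetical):

/* Illustration only: each ParallelLaunch task owns 'dims' ints of scratch. */
static int *TaskPositionSlice(int *position, int dims, int task_id) {
  return position + dims * task_id; /* as passed to TransposeDimsFp32() in RunImpl() */
}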

@@ -25,6 +25,10 @@
#include "src/kernel_registry.h"
namespace mindspore::kernel {
+typedef void (*TransposeFunc)(const void *src, void *dst, int batch, int plane, int channel, int thread_num,
+                              int task_id);
class TransposeCPUKernel : public LiteKernel {
 public:
  explicit TransposeCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
@@ -35,14 +39,20 @@ class TransposeCPUKernel : public LiteKernel {
  int Init() override;
  int ReSize() override;
  int Run() override;
-  int NhNcTranspose(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param);
+  int RunImpl(int task_id);
 protected:
+  void GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param);
  float *in_data_ = nullptr;
  float *out_data_ = nullptr;
  int *out_shape_ = nullptr;
  int *dim_size_ = nullptr;
  int *position_ = nullptr;
+  TransposeParameter *param_ = nullptr;
+  TransposeFunc NHNCTransposeFunc_ = nullptr;
+  int thread_count_ = 0;
+  int nhnc_param_[3];
+  int dims_ = 0;
};
} // namespace mindspore::kernel

@@ -141,6 +141,25 @@ int TransposeInt8CPUKernel::DoTranspose(int task_id) {
  return RET_OK;
}
void TransposeInt8CPUKernel::GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor,
TransposeParameter *param) {
auto out_shape = out_tensor->shape();
if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 2 && param->perm_[2] == 3 &&
param->perm_[3] == 1) {
nhnc_param_[0] = out_shape[0];
nhnc_param_[1] = out_shape[1] * out_shape[2];
nhnc_param_[2] = out_shape[3];
NHNCTransposeFunc_ = PackNCHWToNHWCInt8;
}
if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 &&
param->perm_[3] == 2) {
nhnc_param_[0] = out_shape[0];
nhnc_param_[1] = out_shape[2] * out_shape[3];
nhnc_param_[2] = out_shape[1];
NHNCTransposeFunc_ = PackNHWCToNCHWInt8;
}
}
int TransposeInt8CPUKernel::Run() {
  auto in_tensor = in_tensors_.front();
  auto out_tensor = out_tensors_.front();
@@ -150,7 +169,11 @@ int TransposeInt8CPUKernel::Run() {
  in_ptr_ = reinterpret_cast<int8_t *>(in_tensor->data_c());
  out_ptr_ = reinterpret_cast<int8_t *>(out_tensor->data_c());
GetNHNCTransposeFunc(in_tensor, out_tensor, transpose_param_);
if (NHNCTransposeFunc_ != nullptr) {
NHNCTransposeFunc_(in_ptr_, out_ptr_, nhnc_param_[0], nhnc_param_[1], nhnc_param_[2]);
return RET_OK;
}
  memcpy(in_shape_, in_dims.data(), in_dims.size() * sizeof(int));
  memcpy(out_shape_, out_dims.data(), out_dims.size() * sizeof(int));

@@ -17,12 +17,16 @@
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_TRANSPOSE_INT8_H_
#include <vector>
+#include "nnacl/int8/pack_int8.h"
#include "nnacl/int8/transpose_int8.h"
#include "src/kernel_registry.h"
#include "src/lite_kernel.h"
#include "include/errorcode.h"
namespace mindspore::kernel {
+typedef void (*TransposeFunc)(const void *src, void *dst, int batch, int plane, int channel);
class TransposeInt8CPUKernel : public LiteKernel {
 public:
  TransposeInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
@@ -44,7 +48,9 @@ class TransposeInt8CPUKernel : public LiteKernel {
  void FreeTmpBuf();
 private:
+  void GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param);
  TransposeParameter *transpose_param_;
+  TransposeFunc NHNCTransposeFunc_ = nullptr;
  int8_t *in_ptr_ = nullptr;
  int8_t *out_ptr_ = nullptr;
  int *dim_size_ = nullptr;
@@ -56,6 +62,7 @@ class TransposeInt8CPUKernel : public LiteKernel {
  int num_unit_ = 0;
  int in_shape_[8] = {0};
  int out_shape_[8] = {0};
+  int nhnc_param_[3];
};
} // namespace mindspore::kernel

@@ -47,7 +47,7 @@ int ConvolutionBaseNPUKernel::InitWeightConst(const std::vector<lite::Tensor *>
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
-  PackNHWCToNCHWFp32(nhwc_data, nchw_data, w_shape[0], w_shape[1] * w_shape[2], w_shape[3]);
+  PackNHWCToNCHWFp32(nhwc_data, nchw_data, w_shape[0], w_shape[1] * w_shape[2], w_shape[3], 0, 0);
  std::shared_ptr<ge::Tensor> weight_tensor = std::shared_ptr<ge::Tensor>(new (std::nothrow) ge::Tensor());
  if (weight_tensor == nullptr) {

@@ -335,7 +335,8 @@ int DeConvTestInit1(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens
    0.2642501, 0.29840338, 0.38820496, 0.37829784, 0.105839334, 0.07713295, 0.45629853, 0.9290373, 0.56323594,
    0.59976774, 0.48325357, 0.102543674, 0.35449505, 0.3158472, 0.02927611, 0.44739273, 0.0516185, 0.12340133,
    0.13908496, 0.54970616, 0.74672216, 0.673308, 0.6400629, 0.26790652, 0.98673576}; /* nhwc */
-  PackNCHWToNHWCFp32(in_nchw, in_t->MutableData(), in_t->Batch(), in_t->Width() * in_t->Height(), in_t->Channel());
+  PackNCHWToNHWCFp32(in_nchw, in_t->MutableData(), in_t->Batch(), in_t->Width() * in_t->Height(), in_t->Channel(), 0,
+                     0);
  inputs_->push_back(in_t);
  std::vector<int> weight_dims_nhwc = {2, 3, 3, 6};
@@ -358,7 +359,7 @@ int DeConvTestInit1(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens
    0.06060236, 0.10848369, -0.4512424, 0.023834296, 0.1643943, -0.25290534, 0.066953085, -0.11685201,
    -0.4159784, 0.37839416, -0.11141268, -0.15986018}; /* nhwc */
  PackNCHWToNHWCFp32(weight_nchw, weight_t->MutableData(), weight_t->Batch(), weight_t->Width() * weight_t->Height(),
-                    weight_t->Channel());
+                    weight_t->Channel(), 0, 0);
  inputs_->push_back(weight_t);
  auto *bias_t = new lite::Tensor(kNumberTypeFloat, {6}, schema::Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
@@ -463,7 +464,7 @@ int DeConvTestInit1(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens
    0.8622399, 0.47823763, 0.8856161, 0.6762785, 0.73437214, 0.3766058, 0.764144, 0.60693324,
    0.89371794, 0.92908806, 0.7702812, 0.79492164, 0.58807003, 0.678272, 0.4573259, 0.7444603,
    0.49847388, 0.84439206, 0.51984715, 0.9452883, 0.7511028, 0.81281227};
-  PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel());
+  PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel(), 0, 0);
  conv_param->kernel_h_ = conv_param->kernel_w_ = 3;
  conv_param->stride_h_ = conv_param->stride_w_ = 2;
@@ -531,7 +532,7 @@ int DeConvTestInit2(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens
    -11.827093, -12.340071, -2.6368382, -14.432123, -8.483799, -12.28651, 0.80561405,
    11.332421, -0.43688506, -3.476327, -4.587028, -1.9491882, -3.3619316, -15.831648,
    -10.517606, -9.204161, -0.15148449, 1.5822954, -10.122691, -4.7448387, 3.99177};
-  PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel());
+  PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel(), 0, 0);
  conv_param->kernel_h_ = conv_param->kernel_w_ = 3;
  conv_param->stride_h_ = conv_param->stride_w_ = 2;
@@ -571,7 +572,7 @@ int DeConvTestInit3(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens
    0.26498786, 0.6701024, 0.9744634, 0.49075702, 0.03877404, 0.48646277,
    0.5473929, 0.32438126, 0.87553847, 0.75820315, 0.86666644, 0.4852329};
  PackNCHWToNHWCFp32(in_nchw, reinterpret_cast<float *>(in_t->MutableData()), in_t->Batch(),
-                    in_t->Width() * in_t->Height(), in_t->Channel());
+                    in_t->Width() * in_t->Height(), in_t->Channel(), 0, 0);
  inputs_->push_back(in_t);
  std::vector<int> w_dims_nhwc = {2, 2, 2, 2};
@@ -582,7 +583,7 @@ int DeConvTestInit3(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens
    -0.34362152, 0.7557833, 0.16503833, 0.2418737, -0.26612744, 0.5072577,
    -0.4284475, 0.2215941, 0.9273913, 0.34634787};
  PackNCHWToNHWCFp32(w_nchw, weight_t->MutableData(), weight_t->Batch(), weight_t->Width() * weight_t->Height(),
-                    weight_t->Channel());
+                    weight_t->Channel(), 0, 0);
  inputs_->push_back(weight_t);
  std::vector<int> out_dims_nhwc = {1, 9, 9, 2};
@@ -609,7 +610,7 @@ int DeConvTestInit3(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens
    -0.026721025, 0.0, 0.24602996, 0.38258934, 0.0, 0.38933694, 0.88844025, 0.0, 0.3944222,
    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
    0.6120955, 0.0, 0.46287543, 0.57347727, 0.0, 0.80662024, 0.11515418, 0.0, 0.90454257};
-  PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel());
+  PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel(), 0, 0);
  conv_param->kernel_h_ = conv_param->kernel_w_ = 2;
  conv_param->stride_h_ = conv_param->stride_w_ = 3;
@@ -658,7 +659,7 @@ int DeConvTestInit4(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens
  std::string weight_path = "./deconv/deconv_fp32_nchw_weight1.bin";
  auto weight_nchw = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_path.c_str(), &buffer_size));
  PackNCHWToNHWCFp32(weight_nchw, weight_t->MutableData(), weight_t->Batch(), weight_t->Width() * weight_t->Height(),
-                    weight_t->Channel());
+                    weight_t->Channel(), 0, 0);
  inputs_->push_back(weight_t);
  auto *bias_t = new lite::Tensor(kNumberTypeFloat, {40}, schema::Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
@@ -676,7 +677,7 @@ int DeConvTestInit4(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens
  std::string out_path = "./deconv/deconv_fp32_nchw_output1.bin";
  auto out_nchw = mindspore::lite::ReadFile(out_path.c_str(), &buffer_size);
  *correct = reinterpret_cast<float *>(malloc(buffer_size));
-  PackNCHWToNHWCFp32(out_nchw, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel());
+  PackNCHWToNHWCFp32(out_nchw, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel(), 0, 0);
  conv_param->kernel_h_ = conv_param->kernel_w_ = 3;
  conv_param->stride_h_ = conv_param->stride_w_ = 1;

@@ -63,7 +63,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes4) {
    param->out_strides_[i] = out_strides[i];
  }
-  auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr);
+  auto ret = DoTransposeFp32(in, out, output_shape, param);
  ASSERT_EQ(ret, 0);
  delete param;
  ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001));
@@ -102,7 +102,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes3) {
    param->out_strides_[i] = out_strides[i];
  }
-  auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr);
+  auto ret = DoTransposeFp32(in, out, output_shape, param);
  ASSERT_EQ(ret, 0);
  delete param;
  ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001));
@@ -142,7 +142,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes2) {
    param->out_strides_[i] = out_strides[i];
  }
-  auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr);
+  auto ret = DoTransposeFp32(in, out, output_shape, param);
  ASSERT_EQ(ret, 0);
  delete param;
  ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001));
