!12734 [ms][lite][cpu] transpose optimize

From: @lzkcode
Reviewed-by: 
Signed-off-by:
pull/12734/MERGE
Committed-by: mindspore-ci-bot
commit 16a9aa37af

@@ -39,7 +39,8 @@ int ConvolutionDepthwiseFP32Coder::InitWeightBias() {
MS_CHECK_PTR(packed_weight_);
MS_CHECK_RET_CODE(memset_s(packed_weight_, packed_weight_data_size, 0, packed_weight_data_size),
"memset packed weight failed!");
PackNCHWToNHWCFp32(origin_weight, packed_weight_, 1, filter_tensor_->Height() * filter_tensor_->Width(), channel);
PackNCHWToNHWCFp32(origin_weight, packed_weight_, 1, filter_tensor_->Height() * filter_tensor_->Width(), channel, 0,
0);
auto channel_size = static_cast<size_t>(channel);
auto bias_size = static_cast<size_t>(channel_size * sizeof(float));
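
The packing helpers now take two extra arguments, task_id and thread_count. A minimal sketch of the two calling conventions, inferred from the header note and the TransposeCPUKernel::RunImpl hunk later in this diff (the variables in the second call are whatever the launching kernel provides):

// Single-threaded caller, as in the hunk above: no work splitting.
PackNCHWToNHWCFp32(src, dst, batch, plane, channel, 0, 0);

// Caller inside a thread pool: each task passes its own id plus the total
// thread count so the routine converts only its slice of the data; the
// splitting itself is implemented in the pack source file, not shown here.
PackNCHWToNHWCFp32(src, dst, batch, plane, channel, task_id, thread_count);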

@@ -127,6 +127,52 @@ void Fp16TransposeDim5(const float16_t *in_data, float16_t *out_data, int *strid
}
}
void Fp16TransposeDim6(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
const int *output_shape) {
const int stride0 = strides[perm[0]];
const int stride1 = strides[perm[1]];
const int stride2 = strides[perm[2]];
const int stride3 = strides[perm[3]];
const int stride4 = strides[perm[4]];
const int stride5 = strides[perm[5]];
const int out_stride0 = out_strides[0];
const int out_stride1 = out_strides[1];
const int out_stride2 = out_strides[2];
const int out_stride3 = out_strides[3];
const int out_stride4 = out_strides[4];
const int output0 = output_shape[0];
const int output1 = output_shape[1];
const int output2 = output_shape[2];
const int output3 = output_shape[3];
const int output4 = output_shape[4];
const int output5 = output_shape[5];
for (int i = 0; i < output0; ++i) {
int out_stride0_i = i * out_stride0;
int stride0_i = i * stride0;
for (int j = 0; j < output1; ++j) {
int out_stride1_j = j * out_stride1;
int stride1_j = j * stride1;
for (int k = 0; k < output2; ++k) {
int out_stride2_k = k * out_stride2;
int stride2_k = k * stride2;
for (int m = 0; m < output3; ++m) {
int out_stride3_m = m * out_stride3;
int stride3_m = m * stride3;
for (int n = 0; n < output4; ++n) {
int out_stride4_n = n * out_stride4;
int stride4_n = n * stride4;
for (int g = 0; g < output5; ++g) {
out_data[out_stride0_i + out_stride1_j + out_stride2_k + out_stride3_m + out_stride4_n + g] =
in_data[stride0_i + stride1_j + stride2_k + stride3_m + stride4_n + g * stride5];
}
}
}
}
}
}
}
void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *strides, const int *out_strides,
const int *perm, const int *output_shape, int dims, int *size, int *position) {
*(size + dims - 1) = 1;
@@ -190,6 +236,8 @@ int Fp16DoTranspose(const float16_t *in_data, float16_t *out_data, const int *ou
Fp16TransposeDim4(in_data, out_data, strides, out_strides, perm, output_shape);
} else if (num_axes == 5) {
Fp16TransposeDim5(in_data, out_data, strides, out_strides, perm, output_shape);
} else if (num_axes == 6) {
Fp16TransposeDim6(in_data, out_data, strides, out_strides, perm, output_shape);
} else {
TransposeDimsFp16(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position);
}
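
Fp16TransposeDim6 indexes the input through strides[perm[i]] and the output through out_strides[i]; both arrays are precomputed outside these routines and are not part of this hunk. A minimal sketch of the row-major setup they assume, with a hypothetical helper name:

// Row-major strides: the last axis has stride 1, every earlier axis strides
// over the product of the axis sizes that follow it. Called once for the
// input shape (-> strides) and once for the output shape (-> out_strides).
static void BuildStrides(const int *shape, int num_axes, int *strides) {
  strides[num_axes - 1] = 1;
  for (int i = num_axes - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * shape[i + 1];
  }
}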

File diff suppressed because it is too large.

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -30,8 +30,9 @@ void PackNHWCToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int
void PackNCHWToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNHWCToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNCHWToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel);
// Note: when not called from multiple threads, pass task_id = 0 and thread_count = 0.
void PackNHWCToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count);
void PackNCHWToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count);
void PackNHWC4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNC4HW4ToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int channel);
void PackNC4HW4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel);
@@ -43,6 +44,21 @@ void PackDepthwiseIndirectWeightC8Fp32(const void *src, void *dst, int height, i
void Im2ColPackUnitFp32(const float *input_data, const ConvParameter *conv_param, float *packed_input, int real_cal_num,
int block_index);
// Transpose 8X8 Fp32 block data
typedef void (*Transpose8X8Fp32Func)(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride);
#ifdef ENABLE_ARM64
void Transpose8X8Fp32Arm64(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride);
#endif
#ifdef ENABLE_ARM32
void Transpose8X8Fp32Arm32(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride);
#endif
#ifdef ENABLE_AVX
void Transpose8X8Fp32Avx(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride);
#endif
#if defined(ENABLE_SSE) && !defined(ENABLE_AVX)
void Transpose8X8Fp32Sse(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride);
#endif
#ifdef __cplusplus
}
#endif
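
The Transpose8X8Fp32Func declarations expose one 8x8-block transpose per instruction set; the code that picks between them is in the implementation file, whose diff is not shown here. The sketch below only illustrates how such a dispatch could look — the wrapper name and the scalar fallback are illustrative, and the strides are assumed to count floats per row:

static inline void Transpose8X8Block(const float *src, float *dst, int src_stride, int dst_stride) {
#ifdef ENABLE_ARM64
  Transpose8X8Fp32Arm64(src, dst, src_stride, dst_stride);
#elif defined(ENABLE_ARM32)
  Transpose8X8Fp32Arm32(src, dst, src_stride, dst_stride);
#elif defined(ENABLE_AVX)
  Transpose8X8Fp32Avx(src, dst, src_stride, dst_stride);
#elif defined(ENABLE_SSE)
  Transpose8X8Fp32Sse(src, dst, src_stride, dst_stride);
#else
  // Scalar fallback for targets without any of the flags above.
  for (int r = 0; r < 8; ++r) {
    for (int c = 0; c < 8; ++c) {
      dst[c * dst_stride + r] = src[r * src_stride + c];
    }
  }
#endif
}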

@@ -125,20 +125,73 @@ void TransposeDim5Fp32(const float *in_data, float *out_data, const int *strides
}
}
void TransposeDimsFp32(const float *in_data, float *out_data, const int *strides, const int *out_strides,
const int *perm, const int *output_shape, int dims, int *size, int *position) {
*(size + dims - 1) = 1;
for (int i = dims - 1; i > 0; --i) {
*(size + i - 1) = *(size + i) * output_shape[i];
void TransposeDim6Fp32(const float *in_data, float *out_data, const int *strides, const int *out_strides,
const int *perm, const int *output_shape) {
const int stride0 = strides[perm[0]];
const int stride1 = strides[perm[1]];
const int stride2 = strides[perm[2]];
const int stride3 = strides[perm[3]];
const int stride4 = strides[perm[4]];
const int stride5 = strides[perm[5]];
const int out_stride0 = out_strides[0];
const int out_stride1 = out_strides[1];
const int out_stride2 = out_strides[2];
const int out_stride3 = out_strides[3];
const int out_stride4 = out_strides[4];
const int output0 = output_shape[0];
const int output1 = output_shape[1];
const int output2 = output_shape[2];
const int output3 = output_shape[3];
const int output4 = output_shape[4];
const int output5 = output_shape[5];
for (int i = 0; i < output0; ++i) {
int out_stride0_i = i * out_stride0;
int stride0_i = i * stride0;
for (int j = 0; j < output1; ++j) {
int out_stride1_j = j * out_stride1;
int stride1_j = j * stride1;
for (int k = 0; k < output2; ++k) {
int out_stride2_k = k * out_stride2;
int stride2_k = k * stride2;
for (int m = 0; m < output3; ++m) {
int out_stride3_m = m * out_stride3;
int stride3_m = m * stride3;
for (int n = 0; n < output4; ++n) {
int out_stride4_n = n * out_stride4;
int stride4_n = n * stride4;
for (int g = 0; g < output5; ++g) {
out_data[out_stride0_i + out_stride1_j + out_stride2_k + out_stride3_m + out_stride4_n + g] =
in_data[stride0_i + stride1_j + stride2_k + stride3_m + stride4_n + g * stride5];
}
}
}
}
}
}
}
for (size_t idx = 0; idx < (*size) * output_shape[0]; ++idx) {
void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape, int *size, int *position,
TransposeParameter *transpose_param, int task_id, int thread_num) {
int *perm = transpose_param->perm_;
int *strides = transpose_param->strides_;
int *out_strides = transpose_param->out_strides_;
int num_axes = transpose_param->num_axes_;
size_t data_size = (*size) * output_shape[0];
size_t offset_size = UP_DIV(data_size, thread_num);
size_t task_offset = offset_size * task_id;
int count = data_size - task_offset;
if (count <= 0) {
return;
}
count = MSMIN(offset_size, count);
for (size_t idx = task_offset; idx < task_offset + count; ++idx) {
int pos = idx;
int output_idx = 0;
int input_idx = 0;
for (int i = 0; i < dims; ++i) {
for (int i = 0; i < num_axes; ++i) {
*(position + i) = pos / *(size + i);
int out_stride = i < dims - 1 ? out_strides[i] : 1;
int out_stride = i < num_axes - 1 ? out_strides[i] : 1;
output_idx += (*(position + i) * out_stride);
input_idx += (*(position + i) * strides[perm[i]]);
pos -= *(position + i) * (*(size + i));
@@ -147,8 +200,8 @@ void TransposeDimsFp32(const float *in_data, float *out_data, const int *strides
}
}
int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *transpose_param,
int *size, int *position) {
int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape,
TransposeParameter *transpose_param) {
if (in_data == NULL || out_data == NULL) {
return NNACL_ERR;
}
@@ -188,8 +241,10 @@ int DoTransposeFp32(const float *in_data, float *out_data, const int *output_sha
TransposeDim4Fp32(in_data, out_data, strides, out_strides, perm, output_shape);
} else if (num_axes == 5) {
TransposeDim5Fp32(in_data, out_data, strides, out_strides, perm, output_shape);
} else if (num_axes == 6) {
TransposeDim6Fp32(in_data, out_data, strides, out_strides, perm, output_shape);
} else {
TransposeDimsFp32(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position);
return NNACL_ERR;
}
return NNACL_OK;
}
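
To make the new threading and index math in TransposeDimsFp32 concrete, a small worked example (numbers chosen only for illustration):

// data_size = 10 output elements, thread_num = 4:
//   offset_size = UP_DIV(10, 4) = 3
//   task 0 -> idx 0..2    task 1 -> idx 3..5
//   task 2 -> idx 6..8    task 3 -> idx 9 (count clamped to 1 by MSMIN)
// A task whose offset starts at or beyond data_size returns immediately.
//
// For each linear output index idx, size[i] holds the product of the output
// dims after axis i (with size[num_axes - 1] = 1), so the loop recovers
//   position[i] = (idx / size[i]) % output_shape[i]
// and reads the element from in_data at sum(position[i] * strides[perm[i]]),
// writing it to out_data at sum(position[i] * out_strides[i]).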

@@ -25,9 +25,9 @@
extern "C" {
#endif
int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *transpose_param,
int *size, int *position);
int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *param);
void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape, int *size, int *position,
TransposeParameter *transpose_param, int task_id, int thread_num);
#ifdef __cplusplus
}
#endif

@@ -1,140 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef ENABLE_SSE
#include <x86intrin.h>
#include "nnacl/pack.h"
#include "nnacl/int8/conv_int8.h"
void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int channel) {
int hw8 = plane / C8NUM * C8NUM;
int c8 = channel / C8NUM * C8NUM;
int batch = plane * channel;
for (int n = 0; n < batches; n++) {
const float *src_batch = (const float *)src + n * batch;
float *dst_batch = (float *)dst + n * batch;
int hw = 0;
for (; hw < hw8; hw += C8NUM) {
int c = 0;
for (; c < c8; c += C8NUM) {
const float *src_ptr = src_batch + hw * channel + c;
float *dst_ptr = dst_batch + c * plane + hw;
// 11-14
__m128 v0_ma = _mm_loadu_ps(src_ptr);
__m128 v1_ma = _mm_loadu_ps(src_ptr + channel);
__m128 v2_ma = _mm_loadu_ps(src_ptr + 2 * channel);
__m128 v3_ma = _mm_loadu_ps(src_ptr + 3 * channel);
__m128 v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma);
__m128 v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma);
__m128 v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma);
__m128 v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma);
__m128 v8_ma = _mm_movelh_ps(v4_ma, v6_ma);
__m128 v9_ma = _mm_movehl_ps(v6_ma, v4_ma);
__m128 v10_ma = _mm_movelh_ps(v5_ma, v7_ma);
__m128 v11_ma = _mm_movehl_ps(v7_ma, v5_ma);
_mm_storeu_ps(dst_ptr, v8_ma);
_mm_storeu_ps(dst_ptr + plane, v9_ma);
_mm_storeu_ps(dst_ptr + 2 * plane, v10_ma);
_mm_storeu_ps(dst_ptr + 3 * plane, v11_ma);
// 15-18
v0_ma = _mm_loadu_ps(src_ptr + C4NUM);
v1_ma = _mm_loadu_ps(src_ptr + channel + C4NUM);
v2_ma = _mm_loadu_ps(src_ptr + 2 * channel + C4NUM);
v3_ma = _mm_loadu_ps(src_ptr + 3 * channel + C4NUM);
v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma);
v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma);
v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma);
v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma);
v8_ma = _mm_movelh_ps(v4_ma, v6_ma);
v9_ma = _mm_movehl_ps(v6_ma, v4_ma);
v10_ma = _mm_movelh_ps(v5_ma, v7_ma);
v11_ma = _mm_movehl_ps(v7_ma, v5_ma);
_mm_storeu_ps(dst_ptr + C4NUM * plane, v8_ma);
_mm_storeu_ps(dst_ptr + (C4NUM + 1) * plane, v9_ma);
_mm_storeu_ps(dst_ptr + (C4NUM + 2) * plane, v10_ma);
_mm_storeu_ps(dst_ptr + (C4NUM + 3) * plane, v11_ma);
// 21-24
v0_ma = _mm_loadu_ps(src_ptr + C4NUM * channel);
v1_ma = _mm_loadu_ps(src_ptr + (C4NUM + 1) * channel);
v2_ma = _mm_loadu_ps(src_ptr + (C4NUM + 2) * channel);
v3_ma = _mm_loadu_ps(src_ptr + (C4NUM + 3) * channel);
v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma);
v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma);
v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma);
v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma);
v8_ma = _mm_movelh_ps(v4_ma, v6_ma);
v9_ma = _mm_movehl_ps(v6_ma, v4_ma);
v10_ma = _mm_movelh_ps(v5_ma, v7_ma);
v11_ma = _mm_movehl_ps(v7_ma, v5_ma);
_mm_storeu_ps(dst_ptr + C4NUM, v8_ma);
_mm_storeu_ps(dst_ptr + plane + C4NUM, v9_ma);
_mm_storeu_ps(dst_ptr + 2 * plane + C4NUM, v10_ma);
_mm_storeu_ps(dst_ptr + 3 * plane + C4NUM, v11_ma);
// 25-28
v0_ma = _mm_loadu_ps(src_ptr + C4NUM * channel + C4NUM);
v1_ma = _mm_loadu_ps(src_ptr + (C4NUM + 1) * channel + C4NUM);
v2_ma = _mm_loadu_ps(src_ptr + (C4NUM + 2) * channel + C4NUM);
v3_ma = _mm_loadu_ps(src_ptr + (C4NUM + 3) * channel + C4NUM);
v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma);
v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma);
v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma);
v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma);
v8_ma = _mm_movelh_ps(v4_ma, v6_ma);
v9_ma = _mm_movehl_ps(v6_ma, v4_ma);
v10_ma = _mm_movelh_ps(v5_ma, v7_ma);
v11_ma = _mm_movehl_ps(v7_ma, v5_ma);
_mm_storeu_ps(dst_ptr + C4NUM * plane + C4NUM, v8_ma);
_mm_storeu_ps(dst_ptr + (C4NUM + 1) * plane + C4NUM, v9_ma);
_mm_storeu_ps(dst_ptr + (C4NUM + 2) * plane + C4NUM, v10_ma);
_mm_storeu_ps(dst_ptr + (C4NUM + 3) * plane + C4NUM, v11_ma);
}
for (; c < channel; c++) {
const float *src_ptr = src_batch + hw * channel + c;
float *dst_ptr = dst_batch + c * plane + hw;
for (size_t i = 0; i < C8NUM; i++) {
dst_ptr[i] = src_ptr[i * channel];
}
}
}
for (; hw < plane; hw++) {
const float *src_ptr = src_batch + hw * channel;
float *dst_ptr = dst_batch + hw;
for (size_t i = 0; i < channel; i++) {
dst_ptr[i * plane] = src_ptr[i];
}
}
}
return;
}
#endif

@@ -19,7 +19,7 @@
#include "nnacl/op_base.h"
#define MAX_TRANSPOSE_DIM_SIZE 5
#define MAX_TRANSPOSE_DIM_SIZE 6
typedef struct TransposeParameter {
// primitive parameter
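
For reference while reading the rest of this diff, an abridged view of TransposeParameter reconstructed from the fields the patch touches; field order and array sizes here are illustrative, the authoritative definition is the struct in this header:

typedef struct TransposeParameter {
  OpParameter op_parameter_;         // common primitive header from op_base.h
  int perm_[MAX_SHAPE_SIZE];         // axis permutation; the kernel zero-pads unused entries
  int num_axes_;                     // number of axes actually used
  int strides_[MAX_SHAPE_SIZE];      // row-major strides of the input shape
  int out_strides_[MAX_SHAPE_SIZE];  // row-major strides of the output shape
} TransposeParameter;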

@@ -22,6 +22,7 @@
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_NULL_PTR;
using mindspore::lite::RET_OK;
using mindspore::lite::RET_OP_EXECUTE_FAILURE;
using mindspore::schema::PrimitiveType_Transpose;
@@ -82,31 +83,46 @@ TransposeCPUKernel::~TransposeCPUKernel() {
}
}
int TransposeCPUKernel::NhNcTranspose(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param) {
void TransposeCPUKernel::GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor,
TransposeParameter *param) {
auto out_shape = out_tensor->shape();
if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 2 && param->perm_[2] == 3 &&
param->perm_[3] == 1) {
nhnc_param_[0] = out_shape[0];
nhnc_param_[1] = out_shape[1] * out_shape[2];
nhnc_param_[2] = out_shape[3];
if (in_tensor->data_type() == kNumberTypeFloat32) {
PackNCHWToNHWCFp32(in_tensor->MutableData(), out_tensor->MutableData(), out_shape[0], out_shape[1] * out_shape[2],
out_shape[3]);
} else if (in_tensor->data_type() == kNumberTypeInt8) {
PackNCHWToNHWCInt8(in_tensor->MutableData(), out_tensor->MutableData(), out_shape[0], out_shape[1] * out_shape[2],
out_shape[3]);
NHNCTransposeFunc_ = PackNCHWToNHWCFp32;
}
return RET_OK;
}
if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 &&
param->perm_[3] == 2) {
nhnc_param_[0] = out_shape[0];
nhnc_param_[1] = out_shape[2] * out_shape[3];
nhnc_param_[2] = out_shape[1];
if (in_tensor->data_type() == kNumberTypeFloat32) {
PackNHWCToNCHWFp32(in_tensor->MutableData(), out_tensor->MutableData(), out_shape[0], out_shape[2] * out_shape[3],
out_shape[1]);
} else if (in_tensor->data_type() == kNumberTypeInt8) {
PackNHWCToNCHWInt8(in_tensor->MutableData(), out_tensor->MutableData(), out_shape[0], out_shape[2] * out_shape[3],
out_shape[1]);
NHNCTransposeFunc_ = PackNHWCToNCHWFp32;
}
return RET_OK;
}
return RET_ERROR;
}
int TransposeCPUKernel::RunImpl(int task_id) {
if (NHNCTransposeFunc_ != nullptr) {
NHNCTransposeFunc_(in_data_, out_data_, nhnc_param_[0], nhnc_param_[1], nhnc_param_[2], task_id, thread_count_);
} else {
TransposeDimsFp32(in_data_, out_data_, out_shape_, dim_size_, position_ + dims_ * task_id, param_, task_id,
thread_count_);
}
return RET_OK;
}
int TransposeImpl(void *kernel, int task_id) {
auto transpose = reinterpret_cast<TransposeCPUKernel *>(kernel);
auto ret = transpose->RunImpl(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "TransposeImpl Run error task_id[" << task_id << "] error_code[" << ret << "]";
}
return ret;
}
int TransposeCPUKernel::Run() {
@@ -123,8 +139,8 @@ int TransposeCPUKernel::Run() {
MS_ASSERT(in_data_);
MS_ASSERT(out_data_);
TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_);
if (in_tensor->shape().size() != static_cast<size_t>(param->num_axes_)) {
param_ = reinterpret_cast<TransposeParameter *>(this->op_parameter_);
if (in_tensor->shape().size() != static_cast<size_t>(param_->num_axes_)) {
memcpy(out_data_, in_data_, in_tensor->ElementsNum() * sizeof(float));
return RET_OK;
}
@@ -134,40 +150,48 @@ int TransposeCPUKernel::Run() {
MS_ASSERT(input_perm->data_c() != nullptr);
int *perm_data = reinterpret_cast<int *>(input_perm->data_c());
for (int i = 0; i < input_perm->ElementsNum(); ++i) {
param->perm_[i] = perm_data[i];
param_->perm_[i] = perm_data[i];
}
for (int i = input_perm->ElementsNum(); i < MAX_SHAPE_SIZE; ++i) {
param->perm_[i] = 0;
param_->perm_[i] = 0;
}
}
auto ret = NhNcTranspose(in_tensor, out_tensor, param);
if (ret == RET_OK) {
thread_count_ = op_parameter_->thread_num_;
GetNHNCTransposeFunc(in_tensor, out_tensor, param_);
if (NHNCTransposeFunc_ != nullptr) {
auto ret = ParallelLaunch(this->context_->thread_pool_, TransposeImpl, this, thread_count_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "NHNCTransposeFunc_ is error!";
}
return ret;
}
if (in_tensor->data_type() == kNumberTypeInt8) {
MS_LOG(ERROR) << "not support now";
return RET_ERROR;
}
int dims = out_tensor->shape().size();
if (dims > MAX_TRANSPOSE_DIM_SIZE) {
dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
MS_ASSERT(out_shape_);
dims_ = out_tensor->shape().size();
if (dims_ > MAX_TRANSPOSE_DIM_SIZE) {
dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims_ * sizeof(int)));
if (dim_size_ == nullptr) {
MS_LOG(ERROR) << "Malloc data failed";
return RET_ERROR;
return RET_NULL_PTR;
}
position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
*(dim_size_ + dims_ - 1) = 1;
for (int i = dims_ - 1; i > 0; --i) {
*(dim_size_ + i - 1) = *(dim_size_ + i) * out_shape_[i];
}
position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims_ * sizeof(int) * thread_count_));
if (position_ == nullptr) {
MS_LOG(ERROR) << "Malloc data failed";
context_->allocator->Free(dim_size_);
dim_size_ = nullptr;
return RET_ERROR;
MS_LOG(ERROR) << "Malloc data failed";
return RET_NULL_PTR;
}
}
MS_ASSERT(out_shape_);
ret = DoTransposeFp32(in_data_, out_data_, out_shape_, param, dim_size_, position_);
if (dims > MAX_TRANSPOSE_DIM_SIZE) {
int ret;
if (dims_ > MAX_TRANSPOSE_DIM_SIZE) {
ret = ParallelLaunch(this->context_->thread_pool_, TransposeImpl, this, thread_count_);
} else {
ret = DoTransposeFp32(in_data_, out_data_, out_shape_, param_);
}
if (dims_ > MAX_TRANSPOSE_DIM_SIZE) {
context_->allocator->Free(dim_size_);
context_->allocator->Free(position_);
dim_size_ = nullptr;
@@ -175,13 +199,10 @@ int TransposeCPUKernel::Run() {
}
if (ret != RET_OK) {
MS_LOG(ERROR) << "Transpose run failed";
return RET_ERROR;
}
return ret;
}
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Transpose, LiteKernelCreator<TransposeCPUKernel>)
REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Transpose, LiteKernelCreator<TransposeCPUKernel>)
REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Transpose, LiteKernelCreator<TransposeCPUKernel>)
} // namespace mindspore::kernel
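
Taken together, the restructured Run() has three paths. A condensed sketch with the perm refresh, error handling, and buffer cleanup omitted (see the hunks above for the full logic):

int TransposeCPUKernel::Run() {
  GetNHNCTransposeFunc(in_tensor, out_tensor, param_);  // set only for 4D NHWC<->NCHW perms
  if (NHNCTransposeFunc_ != nullptr) {
    // Fast path: the pack routine is itself data-parallel, so every
    // thread-pool task invokes it with its own task_id via RunImpl.
    return ParallelLaunch(context_->thread_pool_, TransposeImpl, this, thread_count_);
  }
  if (dims_ > MAX_TRANSPOSE_DIM_SIZE) {
    // More than 6 axes: allocate dim_size_ and a per-thread position_ buffer,
    // then let each task transpose its slice of the output elements.
    return ParallelLaunch(context_->thread_pool_, TransposeImpl, this, thread_count_);
  }
  // 2..6 axes: one call into the specialized DimN loops.
  return DoTransposeFp32(in_data_, out_data_, out_shape_, param_);
}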

@@ -25,6 +25,10 @@
#include "src/kernel_registry.h"
namespace mindspore::kernel {
typedef void (*TransposeFunc)(const void *src, void *dst, int batch, int plane, int channel, int task_id,
int thread_count);
class TransposeCPUKernel : public LiteKernel {
public:
explicit TransposeCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
@@ -35,14 +39,20 @@ class TransposeCPUKernel : public LiteKernel {
int Init() override;
int ReSize() override;
int Run() override;
int NhNcTranspose(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param);
int RunImpl(int task_id);
protected:
void GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param);
float *in_data_ = nullptr;
float *out_data_ = nullptr;
int *out_shape_ = nullptr;
int *dim_size_ = nullptr;
int *position_ = nullptr;
TransposeParameter *param_ = nullptr;
TransposeFunc NHNCTransposeFunc_ = nullptr;
int thread_count_ = 0;
int nhnc_param_[3];
int dims_ = 0;
};
} // namespace mindspore::kernel

@@ -141,6 +141,25 @@ int TransposeInt8CPUKernel::DoTranspose(int task_id) {
return RET_OK;
}
void TransposeInt8CPUKernel::GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor,
TransposeParameter *param) {
auto out_shape = out_tensor->shape();
if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 2 && param->perm_[2] == 3 &&
param->perm_[3] == 1) {
nhnc_param_[0] = out_shape[0];
nhnc_param_[1] = out_shape[1] * out_shape[2];
nhnc_param_[2] = out_shape[3];
NHNCTransposeFunc_ = PackNCHWToNHWCInt8;
}
if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 &&
param->perm_[3] == 2) {
nhnc_param_[0] = out_shape[0];
nhnc_param_[1] = out_shape[2] * out_shape[3];
nhnc_param_[2] = out_shape[1];
NHNCTransposeFunc_ = PackNHWCToNCHWInt8;
}
}
int TransposeInt8CPUKernel::Run() {
auto in_tensor = in_tensors_.front();
auto out_tensor = out_tensors_.front();
@@ -150,7 +169,11 @@ int TransposeInt8CPUKernel::Run() {
in_ptr_ = reinterpret_cast<int8_t *>(in_tensor->data_c());
out_ptr_ = reinterpret_cast<int8_t *>(out_tensor->data_c());
GetNHNCTransposeFunc(in_tensor, out_tensor, transpose_param_);
if (NHNCTransposeFunc_ != nullptr) {
NHNCTransposeFunc_(in_ptr_, out_ptr_, nhnc_param_[0], nhnc_param_[1], nhnc_param_[2]);
return RET_OK;
}
memcpy(in_shape_, in_dims.data(), in_dims.size() * sizeof(int));
memcpy(out_shape_, out_dims.data(), out_dims.size() * sizeof(int));

@@ -17,12 +17,16 @@
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_TRANSPOSE_INT8_H_
#include <vector>
#include "nnacl/int8/pack_int8.h"
#include "nnacl/int8/transpose_int8.h"
#include "src/kernel_registry.h"
#include "src/lite_kernel.h"
#include "include/errorcode.h"
namespace mindspore::kernel {
typedef void (*TransposeFunc)(const void *src, void *dst, int batch, int plane, int channel);
class TransposeInt8CPUKernel : public LiteKernel {
public:
TransposeInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
@@ -44,7 +48,9 @@ class TransposeInt8CPUKernel : public LiteKernel {
void FreeTmpBuf();
private:
void GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param);
TransposeParameter *transpose_param_;
TransposeFunc NHNCTransposeFunc_ = nullptr;
int8_t *in_ptr_ = nullptr;
int8_t *out_ptr_ = nullptr;
int *dim_size_ = nullptr;
@@ -56,6 +62,7 @@ class TransposeInt8CPUKernel : public LiteKernel {
int num_unit_ = 0;
int in_shape_[8] = {0};
int out_shape_[8] = {0};
int nhnc_param_[3];
};
} // namespace mindspore::kernel
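
Note the asymmetry with the fp32 kernel: the int8 TransposeFunc typedef above keeps the five-argument pack signature, so its NHNC fast path converts the whole tensor in a single call, while the fp32 fast path runs once per thread-pool task:

// int8 fast path (transpose_int8.cc above): no work splitting.
NHNCTransposeFunc_(in_ptr_, out_ptr_, nhnc_param_[0], nhnc_param_[1], nhnc_param_[2]);

// fp32 fast path (transpose_fp32.cc above): each task passes its id and the
// thread count so the pack routine handles only its share of the data.
NHNCTransposeFunc_(in_data_, out_data_, nhnc_param_[0], nhnc_param_[1], nhnc_param_[2], task_id, thread_count_);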

@@ -47,7 +47,7 @@ int ConvolutionBaseNPUKernel::InitWeightConst(const std::vector<lite::Tensor *>
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
PackNHWCToNCHWFp32(nhwc_data, nchw_data, w_shape[0], w_shape[1] * w_shape[2], w_shape[3]);
PackNHWCToNCHWFp32(nhwc_data, nchw_data, w_shape[0], w_shape[1] * w_shape[2], w_shape[3], 0, 0);
std::shared_ptr<ge::Tensor> weight_tensor = std::shared_ptr<ge::Tensor>(new (std::nothrow) ge::Tensor());
if (weight_tensor == nullptr) {

@@ -335,7 +335,8 @@ int DeConvTestInit1(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens
0.2642501, 0.29840338, 0.38820496, 0.37829784, 0.105839334, 0.07713295, 0.45629853, 0.9290373, 0.56323594,
0.59976774, 0.48325357, 0.102543674, 0.35449505, 0.3158472, 0.02927611, 0.44739273, 0.0516185, 0.12340133,
0.13908496, 0.54970616, 0.74672216, 0.673308, 0.6400629, 0.26790652, 0.98673576}; /* nhwc */
PackNCHWToNHWCFp32(in_nchw, in_t->MutableData(), in_t->Batch(), in_t->Width() * in_t->Height(), in_t->Channel());
PackNCHWToNHWCFp32(in_nchw, in_t->MutableData(), in_t->Batch(), in_t->Width() * in_t->Height(), in_t->Channel(), 0,
0);
inputs_->push_back(in_t);
std::vector<int> weight_dims_nhwc = {2, 3, 3, 6};
@@ -358,7 +359,7 @@ int DeConvTestInit1(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens
0.06060236, 0.10848369, -0.4512424, 0.023834296, 0.1643943, -0.25290534, 0.066953085, -0.11685201,
-0.4159784, 0.37839416, -0.11141268, -0.15986018}; /* nhwc */
PackNCHWToNHWCFp32(weight_nchw, weight_t->MutableData(), weight_t->Batch(), weight_t->Width() * weight_t->Height(),
weight_t->Channel());
weight_t->Channel(), 0, 0);
inputs_->push_back(weight_t);
auto *bias_t = new lite::Tensor(kNumberTypeFloat, {6}, schema::Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
@@ -463,7 +464,7 @@ int DeConvTestInit1(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens
0.8622399, 0.47823763, 0.8856161, 0.6762785, 0.73437214, 0.3766058, 0.764144, 0.60693324,
0.89371794, 0.92908806, 0.7702812, 0.79492164, 0.58807003, 0.678272, 0.4573259, 0.7444603,
0.49847388, 0.84439206, 0.51984715, 0.9452883, 0.7511028, 0.81281227};
PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel());
PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel(), 0, 0);
conv_param->kernel_h_ = conv_param->kernel_w_ = 3;
conv_param->stride_h_ = conv_param->stride_w_ = 2;
@@ -531,7 +532,7 @@ int DeConvTestInit2(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens
-11.827093, -12.340071, -2.6368382, -14.432123, -8.483799, -12.28651, 0.80561405,
11.332421, -0.43688506, -3.476327, -4.587028, -1.9491882, -3.3619316, -15.831648,
-10.517606, -9.204161, -0.15148449, 1.5822954, -10.122691, -4.7448387, 3.99177};
PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel());
PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel(), 0, 0);
conv_param->kernel_h_ = conv_param->kernel_w_ = 3;
conv_param->stride_h_ = conv_param->stride_w_ = 2;
@@ -571,7 +572,7 @@ int DeConvTestInit3(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens
0.26498786, 0.6701024, 0.9744634, 0.49075702, 0.03877404, 0.48646277,
0.5473929, 0.32438126, 0.87553847, 0.75820315, 0.86666644, 0.4852329};
PackNCHWToNHWCFp32(in_nchw, reinterpret_cast<float *>(in_t->MutableData()), in_t->Batch(),
in_t->Width() * in_t->Height(), in_t->Channel());
in_t->Width() * in_t->Height(), in_t->Channel(), 0, 0);
inputs_->push_back(in_t);
std::vector<int> w_dims_nhwc = {2, 2, 2, 2};
@@ -582,7 +583,7 @@ int DeConvTestInit3(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens
-0.34362152, 0.7557833, 0.16503833, 0.2418737, -0.26612744, 0.5072577,
-0.4284475, 0.2215941, 0.9273913, 0.34634787};
PackNCHWToNHWCFp32(w_nchw, weight_t->MutableData(), weight_t->Batch(), weight_t->Width() * weight_t->Height(),
weight_t->Channel());
weight_t->Channel(), 0, 0);
inputs_->push_back(weight_t);
std::vector<int> out_dims_nhwc = {1, 9, 9, 2};
@@ -609,7 +610,7 @@ int DeConvTestInit3(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens
-0.026721025, 0.0, 0.24602996, 0.38258934, 0.0, 0.38933694, 0.88844025, 0.0, 0.3944222,
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.6120955, 0.0, 0.46287543, 0.57347727, 0.0, 0.80662024, 0.11515418, 0.0, 0.90454257};
PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel());
PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel(), 0, 0);
conv_param->kernel_h_ = conv_param->kernel_w_ = 2;
conv_param->stride_h_ = conv_param->stride_w_ = 3;
@@ -658,7 +659,7 @@ int DeConvTestInit4(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens
std::string weight_path = "./deconv/deconv_fp32_nchw_weight1.bin";
auto weight_nchw = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_path.c_str(), &buffer_size));
PackNCHWToNHWCFp32(weight_nchw, weight_t->MutableData(), weight_t->Batch(), weight_t->Width() * weight_t->Height(),
weight_t->Channel());
weight_t->Channel(), 0, 0);
inputs_->push_back(weight_t);
auto *bias_t = new lite::Tensor(kNumberTypeFloat, {40}, schema::Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
@@ -676,7 +677,7 @@ int DeConvTestInit4(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens
std::string out_path = "./deconv/deconv_fp32_nchw_output1.bin";
auto out_nchw = mindspore::lite::ReadFile(out_path.c_str(), &buffer_size);
*correct = reinterpret_cast<float *>(malloc(buffer_size));
PackNCHWToNHWCFp32(out_nchw, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel());
PackNCHWToNHWCFp32(out_nchw, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel(), 0, 0);
conv_param->kernel_h_ = conv_param->kernel_w_ = 3;
conv_param->stride_h_ = conv_param->stride_w_ = 1;

@@ -63,7 +63,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes4) {
param->out_strides_[i] = out_strides[i];
}
auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr);
auto ret = DoTransposeFp32(in, out, output_shape, param);
ASSERT_EQ(ret, 0);
delete param;
ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001));
@@ -102,7 +102,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes3) {
param->out_strides_[i] = out_strides[i];
}
auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr);
auto ret = DoTransposeFp32(in, out, output_shape, param);
ASSERT_EQ(ret, 0);
delete param;
ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001));
@@ -142,7 +142,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes2) {
param->out_strides_[i] = out_strides[i];
}
auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr);
auto ret = DoTransposeFp32(in, out, output_shape, param);
ASSERT_EQ(ret, 0);
delete param;
ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001));
