diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/lite_mat.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/lite_mat.cc index f5904e7151..636dfc9e71 100644 --- a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/lite_mat.cc +++ b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/lite_mat.cc @@ -18,12 +18,9 @@ #include #include #include -#ifdef ENABLE_ANDROID -#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64) -#define USE_NEON +#ifdef ENABLE_NEON #include #endif -#endif namespace mindspore { namespace dataset { @@ -333,7 +330,7 @@ inline void SubtractImpl(const T *src0, const T *src1, T *dst, int64_t total_siz template <> inline void SubtractImpl(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int64_t total_size) { int64_t x = 0; -#ifdef USE_NEON +#ifdef ENABLE_NEON const int64_t step = 32; for (; x <= total_size - step; x += step) { uint8x16_t v_src00 = vld1q_u8(src0 + x); @@ -432,7 +429,7 @@ bool Subtract(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst) { return true; } -#ifdef USE_NEON +#ifdef ENABLE_NEON inline float32x4_t reciprocal_simd(float32x4_t val) { // get an initial estimate of 1/val float32x4_t reciprocal = vrecpeq_f32(val); @@ -461,7 +458,7 @@ inline void DivideImpl(const T *src0, const T *src1, T *dst, int64_t total_size) template <> inline void DivideImpl(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int64_t total_size) { int64_t x = 0; -#ifdef USE_NEON +#ifdef ENABLE_NEON const int64_t step = 16; for (; x <= total_size - step; x += step) { __builtin_prefetch(reinterpret_cast(src0 + x) + 32 * 10); @@ -558,7 +555,7 @@ inline bool CheckDivide(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst bool Divide(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst) { if (!CheckDivide(src_a, src_b, dst)) { - return true; + return false; } if (dst->IsEmpty()) { @@ -606,7 +603,7 @@ inline void MultiplyImpl(const T *src0, const T *src1, T *dst, int64_t total_siz template <> inline void MultiplyImpl(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int64_t total_size) { int64_t x = 0; -#ifdef USE_NEON +#ifdef ENABLE_NEON const int64_t step = 32; for (; x <= total_size - step; x += step) { uint8x16_t v_src00 = vld1q_u8(src0 + x); diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/warp_affine.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/warp_affine.cc index 9467327971..c362ee24d7 100644 --- a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/warp_affine.cc +++ b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/warp_affine.cc @@ -20,13 +20,6 @@ #include "lite_cv/lite_mat.h" #include "lite_cv/image_process.h" -#ifdef ENABLE_ANDROID -#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64) -#define USE_NEON -#include -#endif -#endif - #define BITS 5 #define BITS1 15 #define TAB_SZ (1 << BITS) @@ -125,6 +118,134 @@ static int BorderPolate(int value, int length, PaddBorderType borderType) { return value; } +static void RemapBilinearNotCur1C(int dx, const int16_t *HW, const uint16_t *FHW, const int16_t *wblock, + size_t src_step, const uint8_t *src_ptr, uint8_t *dst_ptr) { + int shx = HW[dx * 2]; + int shy = HW[dx * 2 + 1]; + const int16_t *w_ptr = wblock + FHW[dx] * 4; + const uint8_t *t_src_ptr = src_ptr + shy * src_step + shx; + *dst_ptr = CastToFixed(reinterpret_cast(t_src_ptr[0] * w_ptr[0] + t_src_ptr[1] * w_ptr[1] + + t_src_ptr[src_step] * w_ptr[2] + t_src_ptr[src_step + 1] * w_ptr[3])); +} + +static void RemapBilinearNotCur2C(int dx, const int16_t *HW, const uint16_t *FHW, const int16_t *wblock, + size_t src_step, const uint8_t *src_ptr, uint8_t *dst_ptr) { + int shx = HW[dx * 2]; + int shy = HW[dx * 2 + 1]; + const int16_t *w_ptr = wblock + FHW[dx] * 4; + const uint8_t *t_src_ptr = src_ptr + shy * src_step + shx * 2; + int v0 = t_src_ptr[0] * w_ptr[0] + t_src_ptr[2] * w_ptr[1] + t_src_ptr[src_step] * w_ptr[2] + + t_src_ptr[src_step + 2] * w_ptr[3]; + int v1 = t_src_ptr[1] * w_ptr[0] + t_src_ptr[3] * w_ptr[1] + t_src_ptr[src_step + 1] * w_ptr[2] + + t_src_ptr[src_step + 3] * w_ptr[3]; + dst_ptr[0] = CastToFixed(v0); + dst_ptr[1] = CastToFixed(v1); +} + +static void RemapBilinearNotCur3C(int dx, const int16_t *HW, const uint16_t *FHW, const int16_t *wblock, + size_t src_step, const uint8_t *src_ptr, uint8_t *dst_ptr) { + int shx = HW[dx * 2]; + int shy = HW[dx * 2 + 1]; + const int16_t *w_ptr = wblock + FHW[dx] * 4; + const uint8_t *t_src_ptr = src_ptr + shy * src_step + shx * 3; + int v0 = t_src_ptr[0] * w_ptr[0] + t_src_ptr[3] * w_ptr[1] + t_src_ptr[src_step] * w_ptr[2] + + t_src_ptr[src_step + 3] * w_ptr[3]; + int v1 = t_src_ptr[1] * w_ptr[0] + t_src_ptr[4] * w_ptr[1] + t_src_ptr[src_step + 1] * w_ptr[2] + + t_src_ptr[src_step + 4] * w_ptr[3]; + int v2 = t_src_ptr[2] * w_ptr[0] + t_src_ptr[5] * w_ptr[1] + t_src_ptr[src_step + 2] * w_ptr[2] + + t_src_ptr[src_step + 5] * w_ptr[3]; + dst_ptr[0] = CastToFixed(v0); + dst_ptr[1] = CastToFixed(v1); + dst_ptr[2] = CastToFixed(v2); +} + +static void RemapBilinearNotCur4C(int dx, const int16_t *HW, const uint16_t *FHW, const int16_t *wblock, + size_t src_step, const uint8_t *src_ptr, uint8_t *dst_ptr) { + int shx = HW[dx * 2]; + int shy = HW[dx * 2 + 1]; + const int16_t *w_ptr = wblock + FHW[dx] * 4; + const uint8_t *t_src_ptr = src_ptr + shy * src_step + shx * 4; + int v0 = t_src_ptr[0] * w_ptr[0] + t_src_ptr[4] * w_ptr[1] + t_src_ptr[src_step] * w_ptr[2] + + t_src_ptr[src_step + 4] * w_ptr[3]; + int v1 = t_src_ptr[1] * w_ptr[0] + t_src_ptr[5] * w_ptr[1] + t_src_ptr[src_step + 1] * w_ptr[2] + + t_src_ptr[src_step + 5] * w_ptr[3]; + dst_ptr[0] = CastToFixed(v0); + dst_ptr[1] = CastToFixed(v1); + v0 = t_src_ptr[2] * w_ptr[0] + t_src_ptr[6] * w_ptr[1] + t_src_ptr[src_step + 2] * w_ptr[2] + + t_src_ptr[src_step + 6] * w_ptr[3]; + v1 = t_src_ptr[3] * w_ptr[0] + t_src_ptr[7] * w_ptr[1] + t_src_ptr[src_step + 3] * w_ptr[2] + + t_src_ptr[src_step + 7] * w_ptr[3]; + dst_ptr[2] = CastToFixed(v0); + dst_ptr[3] = CastToFixed(v1); +} + +static void RemapBilinearNotCurMoreC(int dx, const int16_t *HW, const uint16_t *FHW, const int16_t *wblock, + size_t src_step, int cn, const uint8_t *src_ptr, uint8_t *dst_ptr) { + int shx = HW[dx * 2]; + int shy = HW[dx * 2 + 1]; + const int16_t *w_ptr = wblock + FHW[dx] * 4; + const uint8_t *t_src_ptr = src_ptr + shy * src_step + shx * cn; + for (int k = 0; k < cn; k++) { + int v0 = t_src_ptr[k] * w_ptr[0] + t_src_ptr[k + cn] * w_ptr[1] + t_src_ptr[src_step + k] * w_ptr[2] + + t_src_ptr[src_step + k + cn] * w_ptr[3]; + dst_ptr[k] = CastToFixed(v0); + } +} + +static void RemapBilinearCur1C(LiteMat _src, int dx, const int16_t *HW, const uint16_t *FHW, const int16_t *wblock, + size_t src_step, const uint8_t *src_ptr, uint8_t *dst_ptr, PaddBorderType borderType, + const std::vector &borderValue) { + int shx = HW[dx * 2]; + int shy = HW[dx * 2 + 1]; + if (borderType == PADD_BORDER_CONSTANT && (shx >= _src.width_ || shx + 1 < 0 || shy >= _src.height_ || shy + 1 < 0)) { + dst_ptr[0] = borderValue[0]; + } else { + int sv0; + int sv1; + int su0; + int su1; + const int16_t *w_ptr = wblock + FHW[dx] * 4; + + sv0 = BorderPolate(shx, _src.width_, borderType); + sv1 = BorderPolate(shx + 1, _src.width_, borderType); + su0 = BorderPolate(shy, _src.height_, borderType); + su1 = BorderPolate(shy + 1, _src.height_, borderType); + uint8_t v0 = sv0 >= 0 && su0 >= 0 ? src_ptr[su0 * src_step + sv0] : borderValue[0]; + uint8_t v1 = sv1 >= 0 && su0 >= 0 ? src_ptr[su0 * src_step + sv1] : borderValue[0]; + uint8_t v2 = sv0 >= 0 && su1 >= 0 ? src_ptr[su1 * src_step + sv0] : borderValue[0]; + uint8_t v3 = sv1 >= 0 && su1 >= 0 ? src_ptr[su1 * src_step + sv1] : borderValue[0]; + dst_ptr[0] = CastToFixed(reinterpret_cast(v0 * w_ptr[0] + v1 * w_ptr[1] + v2 * w_ptr[2] + v3 * w_ptr[3])); + } +} + +static void RemapBilinearCurMoreC(LiteMat _src, int dx, const int16_t *HW, const uint16_t *FHW, const int16_t *wblock, + size_t src_step, int cn, const uint8_t *src_ptr, uint8_t *dst_ptr, + PaddBorderType borderType, const std::vector &borderValue) { + int shx = HW[dx * 2]; + int shy = HW[dx * 2 + 1]; + if (borderType == PADD_BORDER_CONSTANT && (shx >= _src.width_ || shx + 1 < 0 || shy >= _src.height_ || shy + 1 < 0)) { + for (int k = 0; k < cn; k++) dst_ptr[k] = borderValue[k]; + } else { + int sv0; + int sv1; + int su0; + int su1; + const int16_t *w_ptr = wblock + FHW[dx] * 4; + sv0 = BorderPolate(shx, _src.width_, borderType); + sv1 = BorderPolate(shx + 1, _src.width_, borderType); + su0 = BorderPolate(shy, _src.height_, borderType); + su1 = BorderPolate(shy + 1, _src.height_, borderType); + const uint8_t *v0 = sv0 >= 0 && su0 >= 0 ? src_ptr + su0 * src_step + sv0 * cn : &borderValue[0]; + const uint8_t *v1 = sv1 >= 0 && su0 >= 0 ? src_ptr + su0 * src_step + sv1 * cn : &borderValue[0]; + const uint8_t *v2 = sv0 >= 0 && su1 >= 0 ? src_ptr + su1 * src_step + sv0 * cn : &borderValue[0]; + const uint8_t *v3 = sv1 >= 0 && su1 >= 0 ? src_ptr + su1 * src_step + sv1 * cn : &borderValue[0]; + + for (int k = 0; k < cn; k++) + dst_ptr[k] = + CastToFixed(reinterpret_cast(v0[k] * w_ptr[0] + v1[k] * w_ptr[1] + v2[k] * w_ptr[2] + v3[k] * w_ptr[3])); + } +} + static void RemapBilinear(const LiteMat &_src, LiteMat &_dst, const LiteMat &_hw, const LiteMat &_fhw, // NOLINT const void *_wblock, const PaddBorderType borderType, const std::vector &borderValue) { @@ -159,128 +280,33 @@ static void RemapBilinear(const LiteMat &_src, LiteMat &_dst, const LiteMat &_hw if (cn == 1) { for (; dx < H1; dx++, dst_ptr++) { - int shx = HW[dx * 2]; - int shy = HW[dx * 2 + 1]; - const int16_t *w_ptr = wblock + FHW[dx] * 4; - const uint8_t *t_src_ptr = src_ptr + shy * src_step + shx; - *dst_ptr = - CastToFixed(reinterpret_cast(t_src_ptr[0] * w_ptr[0] + t_src_ptr[1] * w_ptr[1] + - t_src_ptr[src_step] * w_ptr[2] + t_src_ptr[src_step + 1] * w_ptr[3])); + RemapBilinearNotCur1C(dx, HW, FHW, wblock, src_step, src_ptr, dst_ptr); } } else if (cn == 2) { for (; dx < H1; dx++, dst_ptr += 2) { - int shx = HW[dx * 2]; - int shy = HW[dx * 2 + 1]; - const int16_t *w_ptr = wblock + FHW[dx] * 4; - const uint8_t *t_src_ptr = src_ptr + shy * src_step + shx * 2; - int v0 = t_src_ptr[0] * w_ptr[0] + t_src_ptr[2] * w_ptr[1] + t_src_ptr[src_step] * w_ptr[2] + - t_src_ptr[src_step + 2] * w_ptr[3]; - int v1 = t_src_ptr[1] * w_ptr[0] + t_src_ptr[3] * w_ptr[1] + t_src_ptr[src_step + 1] * w_ptr[2] + - t_src_ptr[src_step + 3] * w_ptr[3]; - dst_ptr[0] = CastToFixed(v0); - dst_ptr[1] = CastToFixed(v1); + RemapBilinearNotCur2C(dx, HW, FHW, wblock, src_step, src_ptr, dst_ptr); } } else if (cn == 3) { for (; dx < H1; dx++, dst_ptr += 3) { - int shx = HW[dx * 2]; - int shy = HW[dx * 2 + 1]; - const int16_t *w_ptr = wblock + FHW[dx] * 4; - const uint8_t *t_src_ptr = src_ptr + shy * src_step + shx * 3; - int v0 = t_src_ptr[0] * w_ptr[0] + t_src_ptr[3] * w_ptr[1] + t_src_ptr[src_step] * w_ptr[2] + - t_src_ptr[src_step + 3] * w_ptr[3]; - int v1 = t_src_ptr[1] * w_ptr[0] + t_src_ptr[4] * w_ptr[1] + t_src_ptr[src_step + 1] * w_ptr[2] + - t_src_ptr[src_step + 4] * w_ptr[3]; - int v2 = t_src_ptr[2] * w_ptr[0] + t_src_ptr[5] * w_ptr[1] + t_src_ptr[src_step + 2] * w_ptr[2] + - t_src_ptr[src_step + 5] * w_ptr[3]; - dst_ptr[0] = CastToFixed(v0); - dst_ptr[1] = CastToFixed(v1); - dst_ptr[2] = CastToFixed(v2); + RemapBilinearNotCur3C(dx, HW, FHW, wblock, src_step, src_ptr, dst_ptr); } } else if (cn == 4) { for (; dx < H1; dx++, dst_ptr += 4) { - int shx = HW[dx * 2]; - int shy = HW[dx * 2 + 1]; - const int16_t *w_ptr = wblock + FHW[dx] * 4; - const uint8_t *t_src_ptr = src_ptr + shy * src_step + shx * 4; - int v0 = t_src_ptr[0] * w_ptr[0] + t_src_ptr[4] * w_ptr[1] + t_src_ptr[src_step] * w_ptr[2] + - t_src_ptr[src_step + 4] * w_ptr[3]; - int v1 = t_src_ptr[1] * w_ptr[0] + t_src_ptr[5] * w_ptr[1] + t_src_ptr[src_step + 1] * w_ptr[2] + - t_src_ptr[src_step + 5] * w_ptr[3]; - dst_ptr[0] = CastToFixed(v0); - dst_ptr[1] = CastToFixed(v1); - v0 = t_src_ptr[2] * w_ptr[0] + t_src_ptr[6] * w_ptr[1] + t_src_ptr[src_step + 2] * w_ptr[2] + - t_src_ptr[src_step + 6] * w_ptr[3]; - v1 = t_src_ptr[3] * w_ptr[0] + t_src_ptr[7] * w_ptr[1] + t_src_ptr[src_step + 3] * w_ptr[2] + - t_src_ptr[src_step + 7] * w_ptr[3]; - dst_ptr[2] = CastToFixed(v0); - dst_ptr[3] = CastToFixed(v1); + RemapBilinearNotCur4C(dx, HW, FHW, wblock, src_step, src_ptr, dst_ptr); } } else { for (; dx < H1; dx++, dst_ptr += cn) { - int shx = HW[dx * 2]; - int shy = HW[dx * 2 + 1]; - const int16_t *w_ptr = wblock + FHW[dx] * 4; - const uint8_t *t_src_ptr = src_ptr + shy * src_step + shx * cn; - for (int k = 0; k < cn; k++) { - int v0 = t_src_ptr[k] * w_ptr[0] + t_src_ptr[k + cn] * w_ptr[1] + t_src_ptr[src_step + k] * w_ptr[2] + - t_src_ptr[src_step + k + cn] * w_ptr[3]; - dst_ptr[k] = CastToFixed(v0); - } + RemapBilinearNotCurMoreC(dx, HW, FHW, wblock, src_step, cn, src_ptr, dst_ptr); } } } else { if (cn == 1) { for (; dx < H1; dx++, dst_ptr++) { - int shx = HW[dx * 2]; - int shy = HW[dx * 2 + 1]; - if (borderType == PADD_BORDER_CONSTANT && - (shx >= _src.width_ || shx + 1 < 0 || shy >= _src.height_ || shy + 1 < 0)) { - dst_ptr[0] = borderValue[0]; - } else { - int sv0; - int sv1; - int su0; - int su1; - const int16_t *w_ptr = wblock + FHW[dx] * 4; - - sv0 = BorderPolate(shx, _src.width_, borderType); - sv1 = BorderPolate(shx + 1, _src.width_, borderType); - su0 = BorderPolate(shy, _src.height_, borderType); - su1 = BorderPolate(shy + 1, _src.height_, borderType); - uint8_t v0 = sv0 >= 0 && su0 >= 0 ? src_ptr[su0 * src_step + sv0] : borderValue[0]; - uint8_t v1 = sv1 >= 0 && su0 >= 0 ? src_ptr[su0 * src_step + sv1] : borderValue[0]; - uint8_t v2 = sv0 >= 0 && su1 >= 0 ? src_ptr[su1 * src_step + sv0] : borderValue[0]; - uint8_t v3 = sv1 >= 0 && su1 >= 0 ? src_ptr[su1 * src_step + sv1] : borderValue[0]; - dst_ptr[0] = - CastToFixed(reinterpret_cast(v0 * w_ptr[0] + v1 * w_ptr[1] + v2 * w_ptr[2] + v3 * w_ptr[3])); - } + RemapBilinearCur1C(_src, dx, HW, FHW, wblock, src_step, src_ptr, dst_ptr, borderType, borderValue); } } else { for (; dx < H1; dx++, dst_ptr += cn) { - int shx = HW[dx * 2]; - int shy = HW[dx * 2 + 1]; - if (borderType == PADD_BORDER_CONSTANT && - (shx >= _src.width_ || shx + 1 < 0 || shy >= _src.height_ || shy + 1 < 0)) { - for (int k = 0; k < cn; k++) dst_ptr[k] = borderValue[k]; - } else { - int sv0; - int sv1; - int su0; - int su1; - const int16_t *w_ptr = wblock + FHW[dx] * 4; - sv0 = BorderPolate(shx, _src.width_, borderType); - sv1 = BorderPolate(shx + 1, _src.width_, borderType); - su0 = BorderPolate(shy, _src.height_, borderType); - su1 = BorderPolate(shy + 1, _src.height_, borderType); - const uint8_t *v0 = sv0 >= 0 && su0 >= 0 ? src_ptr + su0 * src_step + sv0 * cn : &borderValue[0]; - const uint8_t *v1 = sv1 >= 0 && su0 >= 0 ? src_ptr + su0 * src_step + sv1 * cn : &borderValue[0]; - const uint8_t *v2 = sv0 >= 0 && su1 >= 0 ? src_ptr + su1 * src_step + sv0 * cn : &borderValue[0]; - const uint8_t *v3 = sv1 >= 0 && su1 >= 0 ? src_ptr + su1 * src_step + sv1 * cn : &borderValue[0]; - - for (int k = 0; k < cn; k++) - dst_ptr[k] = CastToFixed( - reinterpret_cast(v0[k] * w_ptr[0] + v1[k] * w_ptr[1] + v2[k] * w_ptr[2] + v3[k] * w_ptr[3])); - } + RemapBilinearCurMoreC(_src, dx, HW, FHW, wblock, src_step, cn, src_ptr, dst_ptr, borderType, borderValue); } } }