|
|
|
@@ -14,6 +14,7 @@ limitations under the License. */
|
|
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
|
|
#include <string.h>
|
|
|
|
|
#include "neon_util.h"
|
|
|
|
|
|
|
|
|
|
namespace paddle {
|
|
|
|
@@ -474,6 +475,97 @@ struct DepthwiseConvKernel<4, 2> {
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/**
 * @brief Copies a stack of 2-D planes into a larger buffer, surrounding every
 *        plane with a border of zeros.
 *
 * `src` is read as `channels` back-to-back planes of
 * `inputHeight x inputWidth` elements. `dest` is written as `channels`
 * back-to-back planes of
 * `(inputHeight + 2 * paddingHeight) x (inputWidth + 2 * paddingWidth)`
 * elements. Both buffers are walked strictly front to back.
 *
 * The top/bottom borders are cleared with memset and rows are copied with
 * memcpy, so T is treated as raw bytes there.
 *
 * @param src            First element of the unpadded input.
 * @param dest           First element of the padded output; the caller must
 *                       provide room for the padded size described above.
 * @param channels       Number of planes to process.
 * @param inputHeight    Rows per input plane.
 * @param inputWidth     Columns per input plane.
 * @param paddingHeight  Zero rows added above and below each plane.
 * @param paddingWidth   Zero columns added left and right of each row.
 */
template <class T>
struct Padding {
  static void run(const T* src,
                  T* dest,
                  int channels,
                  int inputHeight,
                  int inputWidth,
                  int paddingHeight,
                  int paddingWidth) {
    const int destWidth = inputWidth + 2 * paddingWidth;

    // Element count of one top/bottom border band. Widened to size_t before
    // multiplying so the byte count below is not computed in (overflowable)
    // int arithmetic.
    const size_t bandElems =
        paddingHeight > 0
            ? static_cast<size_t>(destWidth) * static_cast<size_t>(paddingHeight)
            : 0;

    for (int c = 0; c < channels; ++c) {
      // Top border.
      if (bandElems > 0) {
        memset(dest, 0, bandElems * sizeof(T));
        dest += bandElems;
      }

      for (int row = 0; row < inputHeight; ++row) {
        // Left border of this row.
        for (int j = 0; j < paddingWidth; ++j) {
          *dest++ = T(0);
        }

        // Payload of this row.
        memcpy(dest, src, inputWidth * sizeof(T));
        dest += inputWidth;
        src += inputWidth;

        // Right border of this row.
        for (int j = 0; j < paddingWidth; ++j) {
          *dest++ = T(0);
        }
      }

      // Bottom border.
      if (bandElems > 0) {
        memset(dest, 0, bandElems * sizeof(T));
        dest += bandElems;
      }
    }
  }
};
|
|
|
|
|
|
|
|
|
|
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
/**
 * @brief float specialization of Padding, compiled only when NEON is
 *        available; copies row payloads four floats at a time.
 *
 * `src` is read as `channels` back-to-back planes of
 * `inputHeight x inputWidth` floats. `dest` is written as `channels`
 * back-to-back planes of
 * `(inputHeight + 2 * paddingHeight) x (inputWidth + 2 * paddingWidth)`
 * floats, with everything outside the copied payload set to zero.
 *
 * @param src            First element of the unpadded input.
 * @param dest           First element of the padded output; the caller must
 *                       provide room for the padded size described above.
 * @param channels       Number of planes to process.
 * @param inputHeight    Rows per input plane.
 * @param inputWidth     Columns per input plane.
 * @param paddingHeight  Zero rows added above and below each plane.
 * @param paddingWidth   Zero columns added left and right of each row.
 */
template <>
struct Padding<float> {
  static void run(const float* src,
                  float* dest,
                  int channels,
                  int inputHeight,
                  int inputWidth,
                  int paddingHeight,
                  int paddingWidth) {
    const int destWidth = inputWidth + 2 * paddingWidth;

    // Element count of one top/bottom border band. Widened to size_t before
    // multiplying so the byte count below is not computed in (overflowable)
    // int arithmetic.
    const size_t bandElems =
        paddingHeight > 0
            ? static_cast<size_t>(destWidth) * static_cast<size_t>(paddingHeight)
            : 0;

    // Each row payload is split into whole 4-float vectors plus a scalar
    // tail of 0..3 floats; the split is the same for every row.
    const int vecSteps = inputWidth >> 2;
    const int tailLen = inputWidth & 3;

    for (int c = 0; c < channels; ++c) {
      // Top border.
      if (bandElems > 0) {
        memset(dest, 0, bandElems * sizeof(float));
        dest += bandElems;
      }

      for (int row = 0; row < inputHeight; ++row) {
        // Left border of this row.
        for (int j = 0; j < paddingWidth; ++j) {
          *dest++ = 0.0f;
        }

        // Vectorized part of the payload: load 4 floats, store 4 floats.
        for (int s = 0; s < vecSteps; ++s) {
          const float32x4_t lane = vld1q_f32(src);
          vst1q_f32(dest, lane);
          src += 4;
          dest += 4;
        }
        // Scalar tail of the payload.
        for (int r = 0; r < tailLen; ++r) {
          *dest++ = *src++;
        }

        // Right border of this row.
        for (int j = 0; j < paddingWidth; ++j) {
          *dest++ = 0.0f;
        }
      }

      // Bottom border.
      if (bandElems > 0) {
        memset(dest, 0, bandElems * sizeof(float));
        dest += bandElems;
      }
    }
  }
};

#endif
|
|
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
} // namespace neon
|
|
|
|
|