|
|
|
@ -18,134 +18,6 @@ limitations under the License. */
|
|
|
|
|
#include "hl_cnn.h"
|
|
|
|
|
#include "hl_device_functions.cuh"
|
|
|
|
|
|
|
|
|
|
/**
 * Expand image patches into columns (im2col).
 *
 * Launch layout: 2D grid, 1D block. The flat thread index is
 *   (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x
 * and threads whose index is >= n do nothing. No shared memory is used.
 *
 * Each active thread handles one (channel_in, h_out, w_out) triple and
 * writes blockH * blockW values to data_col, one for every (i, j) offset
 * inside the patch. The value for offset (i, j) is stored at
 *   data_col[((channel_in * blockH * blockW + i * blockW + j) * height_col
 *             + h_out) * width_col + w_out]
 * Patch positions that fall outside the image (padding) are written as 0.
 *
 * @param n           number of (channel, h_out, w_out) triples to process.
 * @param height      image height (without padding).
 * @param data_im     input image, indexed as [channel][row][col].
 * @param blockH      patch height.
 * @param blockW      patch width.
 * @param width       image width (without padding).
 * @param strideH     vertical step between patches.
 * @param strideW     horizontal step between patches.
 * @param paddingH    implicit zero padding above/below the image.
 * @param paddingW    implicit zero padding left/right of the image.
 * @param height_col  number of patch positions along the height.
 * @param width_col   number of patch positions along the width.
 * @param data_col    output column buffer.
 */
__global__ void KeFeature2col(size_t n, size_t height, const real* data_im,
                              size_t blockH, size_t blockW, size_t width,
                              size_t strideH, size_t strideW,
                              size_t paddingH, size_t paddingW,
                              size_t height_col, size_t width_col,
                              real* data_col) {
  // Widen before multiplying: blockIdx/gridDim/blockDim are 32-bit unsigned,
  // so the product would otherwise wrap before being stored in a size_t.
  size_t index =
      (static_cast<size_t>(blockIdx.x) * gridDim.y + blockIdx.y) * blockDim.x +
      threadIdx.x;
  if (index >= n) {
    return;
  }

  // Decompose the flat index into (channel_in, h_out, w_out).
  const size_t w_out = index % width_col;
  index /= width_col;
  const size_t h_out = index % height_col;
  const size_t channel_in = index / height_col;

  // First output channel produced from this input channel.
  const size_t channel_out = channel_in * blockH * blockW;
  // Top-left corner of the patch in padded image coordinates.
  const size_t h_in = h_out * strideH;
  const size_t w_in = w_out * strideW;
  // Distance between two consecutive output channels in data_col.
  const size_t col_plane = height_col * width_col;

  data_col += (channel_out * height_col + h_out) * width_col + w_out;
  // Start of the current input channel; offsets are kept in size_t so they
  // are not truncated through an int.
  const real* im_channel = data_im + channel_in * height * width;

  for (size_t i = 0; i < blockH; ++i) {
    // Row in unpadded image coordinates; negative means "inside padding".
    const int row = int(h_in + i) - (int)paddingH;
    const bool row_inside = (row >= 0) && (row < (int)height);
    for (size_t j = 0; j < blockW; ++j) {
      const int col = int(w_in + j) - (int)paddingW;
      if (row_inside && col >= 0 && col < (int)width) {
        *data_col = im_channel[static_cast<size_t>(row) * width +
                               static_cast<size_t>(col)];
      } else {
        *data_col = 0;
      }
      // Next (i, j) offset lives in the next output channel.
      data_col += col_plane;
    }
  }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Host launcher for KeFeature2col: expands image patches into columns.
 *
 * One thread is launched per (channel, outputH, outputW) element, using
 * 1024-thread blocks arranged in a 512 x Y grid on STREAM_DEFAULT.
 *
 * @param dataIm    input image buffer passed to the kernel.
 * @param channels  number of image channels.
 * @param height    image height.
 * @param width     image width.
 * @param blockH    patch height.
 * @param blockW    patch width.
 * @param strideH   vertical stride.
 * @param strideW   horizontal stride.
 * @param paddingH  vertical padding.
 * @param paddingW  horizontal padding.
 * @param outputH   number of patch positions along the height.
 * @param outputW   number of patch positions along the width.
 * @param dataCol   output column buffer passed to the kernel.
 */
void hl_expand_feature2col(const real* dataIm, size_t channels,
                           size_t height, size_t width,
                           size_t blockH, size_t blockW,
                           size_t strideH, size_t strideW,
                           size_t paddingH, size_t paddingW,
                           size_t outputH, size_t outputW,
                           real* dataCol) {
  const size_t numKernels = channels * outputH * outputW;
  // Nothing to expand. Without this guard the grid's Y dimension would be 0,
  // which is an invalid launch configuration.
  if (numKernels == 0) {
    return;
  }

  const size_t kThreadsPerBlock = 1024;
  const size_t kGridX = 512;

  // Ceil-divide the work into blocks, then fold the blocks into a 2D grid
  // with a fixed X extent.
  const size_t blocks = (numKernels + kThreadsPerBlock - 1) / kThreadsPerBlock;
  const size_t gridY = (blocks + kGridX - 1) / kGridX;

  dim3 threads(kThreadsPerBlock, 1);
  dim3 grid(kGridX, gridY);
  KeFeature2col<<< grid, threads, 0, STREAM_DEFAULT >>>
    (numKernels, height, dataIm, blockH, blockW, width,
     strideH, strideW, paddingH, paddingW,
     outputH, outputW, dataCol);
  CHECK_SYNC("hl_expand_feature2col failed");
}
|
|
|
|
|
|
|
|
|
|
/**
 * Accumulate columns back into an image (col2im).
 *
 * Launch layout: 2D grid, 1D block. The flat thread index is
 *   (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x
 * and threads whose index is >= n do nothing. No shared memory is used.
 *
 * Each thread owns one element (c, h, w) of the PADDED image. Threads that
 * land in the padding border write nothing. Every other thread sums all
 * data_col entries that were produced from its pixel and stores
 *   data_im[...] = alpha * sum + beta * data_im[...]
 * Because each destination element is written by exactly one thread, no
 * atomics are required.
 *
 * @param n           number of padded-image elements to process.
 * @param data_col    input column buffer.
 * @param height      PADDED image height; the kernel addresses data_im with
 *                    height - 2 * paddingH rows.
 * @param width       PADDED image width; the kernel addresses data_im with
 *                    width - 2 * paddingW columns.
 * @param channels    number of channels (not read by the kernel body).
 * @param blockH      patch height.
 * @param blockW      patch width.
 * @param strideH     vertical stride.
 * @param strideW     horizontal stride.
 * @param paddingH    vertical padding.
 * @param paddingW    horizontal padding.
 * @param height_col  number of patch positions along the height.
 * @param width_col   number of patch positions along the width.
 * @param data_im     unpadded image buffer, read and updated in place.
 * @param alpha       scale applied to the accumulated column sum.
 * @param beta        scale applied to the existing data_im value.
 */
__global__ void KeCol2Feature(size_t n, const real* data_col, size_t height,
                              size_t width, size_t channels,
                              size_t blockH, size_t blockW,
                              size_t strideH, size_t strideW,
                              size_t paddingH, size_t paddingW,
                              size_t height_col, size_t width_col,
                              real* data_im, real alpha, real beta) {
  // Widen before multiplying: blockIdx/gridDim/blockDim are 32-bit unsigned,
  // so the product would otherwise wrap before being stored in a size_t.
  const size_t index =
      (static_cast<size_t>(blockIdx.x) * gridDim.y + blockIdx.y) * blockDim.x +
      threadIdx.x;
  if (index >= n) {
    return;
  }

  // Coordinates inside the padded image.
  const int w = int(index % width);
  const int h = int((index / width) % height);
  const int c = int(index / (width * height));

  // Coordinates and extents of the real (unpadded) image.
  const int w_im = w - (int)paddingW;
  const int h_im = h - (int)paddingH;
  const size_t im_width = width - 2 * paddingW;
  const size_t im_height = height - 2 * paddingH;

  // Skip elements that lie in the padding border. The sign test comes first
  // so the unsigned comparison only ever sees non-negative values.
  if (w_im < 0 || static_cast<size_t>(w_im) >= im_width ||
      h_im < 0 || static_cast<size_t>(h_im) >= im_height) {
    return;
  }

  const int block_w = (int)blockW;
  const int block_h = (int)blockH;
  const int stride_w = (int)strideW;
  const int stride_h = (int)strideH;

  // Range of patch positions whose window covers (h, w).
  const int w_col_start = (w < block_w) ? 0 : (w - block_w) / stride_w + 1;
  const int w_col_end = min(w / stride_w + 1, (int)width_col);
  const int h_col_start = (h < block_h) ? 0 : (h - block_h) / stride_h + 1;
  const int h_col_end = min(h / stride_h + 1, (int)height_col);

  real val = 0;
  for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
    // Offset of this pixel inside the patch at (h_col, *); non-negative
    // because h_col <= h / strideH.
    const size_t dh = static_cast<size_t>(h - h_col * stride_h);
    for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
      const size_t dw = static_cast<size_t>(w - w_col * stride_w);
      // Column channel: c * blockH * blockW + dh * blockW + dw.
      const size_t c_col = (static_cast<size_t>(c) * blockH + dh) * blockW + dw;
      val += data_col[(c_col * height_col + h_col) * width_col + w_col];
    }
  }

  // Blend the accumulated value into the unpadded image.
  const size_t im_offset =
      (static_cast<size_t>(c) * im_height + static_cast<size_t>(h_im)) *
          im_width +
      static_cast<size_t>(w_im);
  const real tD = data_im[im_offset];
  data_im[im_offset] = alpha * val + beta * tD;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Host launcher for KeCol2Feature: accumulates columns back into an image.
 *
 * One thread is launched per element of the padded image
 * (channels x (height + 2*paddingH) x (width + 2*paddingW)), using
 * 1024-thread blocks arranged in a 512 x Y grid on STREAM_DEFAULT.
 *
 * @param dataCol   input column buffer passed to the kernel.
 * @param channels  number of image channels.
 * @param height    image height (without padding).
 * @param width     image width (without padding).
 * @param blockH    patch height.
 * @param blockW    patch width.
 * @param strideH   vertical stride.
 * @param strideW   horizontal stride.
 * @param paddingH  vertical padding.
 * @param paddingW  horizontal padding.
 * @param outputH   number of patch positions along the height.
 * @param outputW   number of patch positions along the width.
 * @param dataIm    image buffer updated in place by the kernel.
 * @param alpha     scale applied to the accumulated column sum.
 * @param beta      scale applied to the existing dataIm value.
 */
void hl_shrink_col2feature(const real* dataCol, size_t channels,
                           size_t height, size_t width,
                           size_t blockH, size_t blockW,
                           size_t strideH, size_t strideW,
                           size_t paddingH, size_t paddingW,
                           size_t outputH, size_t outputW,
                           real* dataIm, real alpha, real beta) {
  const size_t paddedH = height + 2 * paddingH;
  const size_t paddedW = width + 2 * paddingW;
  const size_t numKernels = channels * paddedH * paddedW;
  // Nothing to accumulate. Without this guard the grid's Y dimension would
  // be 0, which is an invalid launch configuration.
  if (numKernels == 0) {
    return;
  }

  const size_t kThreadsPerBlock = 1024;
  const size_t kGridX = 512;

  // Ceil-divide the work into blocks, then fold the blocks into a 2D grid
  // with a fixed X extent.
  const size_t blocks = (numKernels + kThreadsPerBlock - 1) / kThreadsPerBlock;
  const size_t gridY = (blocks + kGridX - 1) / kGridX;

  dim3 threads(kThreadsPerBlock, 1);
  dim3 grid(kGridX, gridY);

  // To avoid atomic operations, one thread is launched per element of the
  // padded image and each thread sums every column entry that maps to it.
  KeCol2Feature<<< grid, threads, 0, STREAM_DEFAULT >>>
    (numKernels, dataCol, paddedH, paddedW,
     channels, blockH, blockW, strideH, strideW, paddingH, paddingW,
     outputH, outputW, dataIm, alpha, beta);
  CHECK_SYNC("hl_shrink_col2feature failed");
}
|
|
|
|
|
|
|
|
|
|
__global__ void KeMaxPoolForward(const int nthreads, const real* inputData,
|
|
|
|
|
const int channels, const int height,
|
|
|
|
|
const int width,
|
|
|
|
|