You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
465 lines
17 KiB
465 lines
17 KiB
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License. */
|
|
|
|
#include "Im2Col.h"
|
|
#include "hl_device_functions.cuh"
|
|
|
|
namespace paddle {
|
|
|
|
template <class T>
|
|
__global__ void im2col(const T* data_im,
|
|
int numOuts,
|
|
int height,
|
|
int width,
|
|
int blockH,
|
|
int blockW,
|
|
int strideH,
|
|
int strideW,
|
|
int paddingH,
|
|
int paddingW,
|
|
int dilationH,
|
|
int dilationW,
|
|
int height_col,
|
|
int width_col,
|
|
T* data_col) {
|
|
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
|
if (index < numOuts) {
|
|
int w_out = index % width_col;
|
|
index /= width_col;
|
|
int h_out = index % height_col;
|
|
int channel_in = index / height_col;
|
|
int channel_out = channel_in * blockH * blockW;
|
|
int h_in = h_out * strideH;
|
|
int w_in = w_out * strideW;
|
|
|
|
data_col += (channel_out * height_col + h_out) * width_col + w_out;
|
|
for (int i = 0; i < blockH; ++i) {
|
|
for (int j = 0; j < blockW; ++j) {
|
|
int rIdx = int(h_in + i * dilationH);
|
|
int cIdx = int(w_in + j * dilationW);
|
|
if ((rIdx - (int)paddingH) >= (int)height ||
|
|
(rIdx - (int)paddingH) < 0 ||
|
|
(cIdx - (int)paddingW) >= (int)width ||
|
|
(cIdx - (int)paddingW) < 0) {
|
|
*data_col = 0;
|
|
} else {
|
|
rIdx = rIdx + channel_in * height - paddingH;
|
|
cIdx = cIdx - paddingW;
|
|
*data_col = data_im[rIdx * width + cIdx];
|
|
}
|
|
data_col += height_col * width_col;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* imShape = [inputChannels, inputHeight, inputWidth]
|
|
* colShape =
|
|
* [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
|
|
*/
|
|
template <class T>
|
|
class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, T> {
|
|
public:
|
|
void operator()(const T* imData,
|
|
const TensorShape& imShape,
|
|
T* colData,
|
|
const TensorShape& colShape,
|
|
int strideHeight,
|
|
int strideWidth,
|
|
int paddingHeight,
|
|
int paddingWidth,
|
|
int dilationHeight,
|
|
int dilationWidth) {
|
|
int inputChannels = imShape[0];
|
|
int inputHeight = imShape[1];
|
|
int inputWidth = imShape[2];
|
|
int filterHeight = colShape[1];
|
|
int filterWidth = colShape[2];
|
|
int outputHeight = colShape[3];
|
|
int outputWidth = colShape[4];
|
|
|
|
int numKernels = inputChannels * outputHeight * outputWidth;
|
|
int blocks = (numKernels + 1024 - 1) / 1024;
|
|
int blockX = 512;
|
|
int blockY = (blocks + 512 - 1) / 512;
|
|
dim3 threads(1024, 1);
|
|
dim3 grid(blockX, blockY);
|
|
im2col<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
|
|
numKernels,
|
|
inputHeight,
|
|
inputWidth,
|
|
filterHeight,
|
|
filterWidth,
|
|
strideHeight,
|
|
strideWidth,
|
|
paddingHeight,
|
|
paddingWidth,
|
|
dilationHeight,
|
|
dilationWidth,
|
|
outputHeight,
|
|
outputWidth,
|
|
colData);
|
|
CHECK_SYNC("Im2ColFunctor GPU failed");
|
|
}
|
|
};
|
|
|
|
template <class T>
|
|
__global__ void col2im(size_t n,
|
|
const T* data_col,
|
|
size_t height,
|
|
size_t width,
|
|
size_t channels,
|
|
size_t blockH,
|
|
size_t blockW,
|
|
size_t strideH,
|
|
size_t strideW,
|
|
size_t paddingH,
|
|
size_t paddingW,
|
|
size_t dilationH,
|
|
size_t dilationW,
|
|
size_t height_col,
|
|
size_t width_col,
|
|
T* data_im) {
|
|
size_t index =
|
|
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
|
if (index < n) {
|
|
T val = 0;
|
|
int w = int(index % width);
|
|
int h = int((index / width) % height);
|
|
int c = int(index / (width * height));
|
|
int filterH = (blockH - 1) * dilationH + 1;
|
|
int filterW = (blockW - 1) * dilationW + 1;
|
|
|
|
if ((w - (int)paddingW) >= 0 &&
|
|
(w - (int)paddingW) < (width - 2 * paddingW) &&
|
|
(h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) {
|
|
// compute the start and end of the output
|
|
int w_col_start =
|
|
(w < (int)filterW) ? 0 : (w - int(filterW)) / (int)strideW + 1;
|
|
int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col));
|
|
int h_col_start =
|
|
(h < (int)filterH) ? 0 : (h - (int)filterH) / (int)strideH + 1;
|
|
int h_col_end = min(int(h / strideH + 1), int(height_col));
|
|
|
|
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
|
|
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
|
|
// the col location: [c * width * height + h_out, w_out]
|
|
int h_k = (h - h_col * strideH);
|
|
int w_k = (w - w_col * strideW);
|
|
if (h_k % dilationH == 0 && w_k % dilationW == 0) {
|
|
h_k /= dilationH;
|
|
w_k /= dilationW;
|
|
int c_col =
|
|
(((c * blockH + h_k) * blockW + w_k) * height_col + h_col) *
|
|
width_col +
|
|
w_col;
|
|
val += data_col[c_col];
|
|
}
|
|
}
|
|
}
|
|
h -= paddingH;
|
|
w -= paddingW;
|
|
data_im[c * ((width - 2 * paddingW) * (height - 2 * paddingH)) +
|
|
h * (width - 2 * paddingW) + w] += val;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* imShape = [inputChannels, inputHeight, inputWidth]
|
|
* colShape =
|
|
* [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
|
|
*/
|
|
template <class T>
|
|
class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, T> {
|
|
public:
|
|
void operator()(T* imData,
|
|
const TensorShape& imShape,
|
|
const T* colData,
|
|
const TensorShape& colShape,
|
|
int strideHeight,
|
|
int strideWidth,
|
|
int paddingHeight,
|
|
int paddingWidth,
|
|
int dilationHeight,
|
|
int dilationWidth) {
|
|
int inputChannels = imShape[0];
|
|
int inputHeight = imShape[1];
|
|
int inputWidth = imShape[2];
|
|
int filterHeight = colShape[1];
|
|
int filterWidth = colShape[2];
|
|
int outputHeight = colShape[3];
|
|
int outputWidth = colShape[4];
|
|
|
|
size_t numKernels = inputChannels * (inputHeight + 2 * paddingHeight) *
|
|
(inputWidth + 2 * paddingWidth);
|
|
|
|
size_t blocks = (numKernels + 1024 - 1) / 1024;
|
|
size_t blockX = 512;
|
|
size_t blockY = (blocks + 512 - 1) / 512;
|
|
dim3 threads(1024, 1);
|
|
dim3 grid(blockX, blockY);
|
|
|
|
// To avoid involving atomic operations, we will launch one kernel per
|
|
// bottom dimension, and then in the kernel add up the top dimensions.
|
|
col2im<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
|
|
numKernels,
|
|
colData,
|
|
inputHeight + 2 * paddingHeight,
|
|
inputWidth + 2 * paddingWidth,
|
|
inputChannels,
|
|
filterHeight,
|
|
filterWidth,
|
|
strideHeight,
|
|
strideWidth,
|
|
paddingHeight,
|
|
paddingWidth,
|
|
dilationHeight,
|
|
dilationWidth,
|
|
outputHeight,
|
|
outputWidth,
|
|
imData);
|
|
CHECK_SYNC("Col2ImFunctor GPU failed");
|
|
}
|
|
};
|
|
|
|
template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, float>;
|
|
template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, double>;
|
|
template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, float>;
|
|
template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, double>;
|
|
|
|
template <class T>
|
|
__global__ void im2colOCF(const T* imData,
|
|
T* colData,
|
|
int inputChannels,
|
|
int inputHeight,
|
|
int inputWidth,
|
|
int filterHeight,
|
|
int filterWidth,
|
|
int strideHeight,
|
|
int strideWidth,
|
|
int paddingHeight,
|
|
int paddingWidth,
|
|
int dilationHeight,
|
|
int dilationWidth,
|
|
int outputHeight,
|
|
int outputWidth) {
|
|
int swId = blockIdx.x;
|
|
int shId = blockIdx.y;
|
|
for (int channelId = threadIdx.z; channelId < inputChannels;
|
|
channelId += blockDim.z) {
|
|
for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
|
|
for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
|
|
int widthOffset =
|
|
idx * dilationHeight + swId * strideWidth - paddingWidth;
|
|
int heightOffset =
|
|
idy * dilationWidth + shId * strideHeight - paddingHeight;
|
|
int imOffset = widthOffset + heightOffset * inputWidth +
|
|
channelId * inputHeight * inputWidth;
|
|
|
|
int colOffset = idx + idy * filterWidth +
|
|
channelId * filterHeight * filterWidth +
|
|
(shId * outputWidth + swId) *
|
|
(inputChannels * filterHeight * filterWidth);
|
|
|
|
if (heightOffset >= inputHeight || heightOffset < 0 ||
|
|
widthOffset >= inputWidth || widthOffset < 0) {
|
|
colData[colOffset] = T(0);
|
|
} else {
|
|
colData[colOffset] = imData[imOffset];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* imShape = [inputChannels, inputHeight, inputWidth]
|
|
* colShape =
|
|
* [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
|
|
*/
|
|
template <class T>
|
|
class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, T> {
|
|
public:
|
|
void operator()(const T* imData,
|
|
const TensorShape& imShape,
|
|
T* colData,
|
|
const TensorShape& colShape,
|
|
int strideHeight,
|
|
int strideWidth,
|
|
int paddingHeight,
|
|
int paddingWidth,
|
|
int dilationHeight,
|
|
int dilationWidth) {
|
|
int inputChannels = imShape[0];
|
|
int inputHeight = imShape[1];
|
|
int inputWidth = imShape[2];
|
|
int filterHeight = colShape[3];
|
|
int filterWidth = colShape[4];
|
|
int outputHeight = colShape[0];
|
|
int outputWidth = colShape[1];
|
|
|
|
int blockDimX = 0;
|
|
int blockDimY = 0;
|
|
if (filterHeight <= 4 && filterWidth <= 4) {
|
|
blockDimX = 4;
|
|
blockDimY = 4;
|
|
} else if (filterHeight <= 8 && filterWidth <= 8) {
|
|
blockDimX = 8;
|
|
blockDimY = 8;
|
|
} else if (filterHeight <= 16 && filterWidth <= 16) {
|
|
blockDimX = 16;
|
|
blockDimY = 16;
|
|
} else {
|
|
blockDimX = 32;
|
|
blockDimY = 32;
|
|
}
|
|
|
|
int blockDimZ = 1024 / blockDimX / blockDimY;
|
|
dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
|
|
dim3 grid(outputWidth, outputHeight);
|
|
im2colOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
|
|
colData,
|
|
inputChannels,
|
|
inputHeight,
|
|
inputWidth,
|
|
filterHeight,
|
|
filterWidth,
|
|
strideHeight,
|
|
strideWidth,
|
|
paddingHeight,
|
|
paddingWidth,
|
|
dilationHeight,
|
|
dilationWidth,
|
|
outputHeight,
|
|
outputWidth);
|
|
CHECK_SYNC("Im2ColFunctor GPU failed");
|
|
}
|
|
};
|
|
|
|
template <class T>
|
|
__global__ void col2imOCF(T* imData,
|
|
const T* colData,
|
|
int inputChannels,
|
|
int inputHeight,
|
|
int inputWidth,
|
|
int filterHeight,
|
|
int filterWidth,
|
|
int strideHeight,
|
|
int strideWidth,
|
|
int paddingHeight,
|
|
int paddingWidth,
|
|
int dilationHeight,
|
|
int dilationWidth,
|
|
int outputHeight,
|
|
int outputWidth) {
|
|
int swId = blockIdx.x;
|
|
int shId = blockIdx.y;
|
|
for (int channelId = threadIdx.z; channelId < inputChannels;
|
|
channelId += blockDim.z) {
|
|
for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
|
|
for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
|
|
int widthOffset =
|
|
idx * dilationWidth + swId * strideWidth - paddingWidth;
|
|
int heightOffset =
|
|
idy * dilationHeight + shId * strideHeight - paddingHeight;
|
|
int imOffset = widthOffset + heightOffset * inputWidth +
|
|
channelId * inputHeight * inputWidth;
|
|
|
|
int colOffset = idx + idy * filterWidth +
|
|
channelId * filterHeight * filterWidth +
|
|
(shId * outputWidth + swId) *
|
|
(inputChannels * filterHeight * filterWidth);
|
|
|
|
if (heightOffset >= 0 && heightOffset < inputHeight &&
|
|
widthOffset >= 0 && widthOffset < inputWidth) {
|
|
paddle::paddleAtomicAdd(imData + imOffset, colData[colOffset]);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* imShape = [inputChannels, inputHeight, inputWidth]
|
|
* colShape =
|
|
* [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
|
|
*/
|
|
template <class T>
|
|
class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, T> {
|
|
public:
|
|
void operator()(T* imData,
|
|
const TensorShape& imShape,
|
|
const T* colData,
|
|
const TensorShape& colShape,
|
|
int strideHeight,
|
|
int strideWidth,
|
|
int paddingHeight,
|
|
int paddingWidth,
|
|
int dilationHeight,
|
|
int dilationWidth) {
|
|
int inputChannels = imShape[0];
|
|
int inputHeight = imShape[1];
|
|
int inputWidth = imShape[2];
|
|
int filterHeight = colShape[3];
|
|
int filterWidth = colShape[4];
|
|
int outputHeight = colShape[0];
|
|
int outputWidth = colShape[1];
|
|
|
|
int blockDimX = 0;
|
|
int blockDimY = 0;
|
|
if (filterHeight <= 4 && filterWidth <= 4) {
|
|
blockDimX = 4;
|
|
blockDimY = 4;
|
|
} else if (filterHeight <= 8 && filterWidth <= 8) {
|
|
blockDimX = 8;
|
|
blockDimY = 8;
|
|
} else if (filterHeight <= 16 && filterWidth <= 16) {
|
|
blockDimX = 16;
|
|
blockDimY = 16;
|
|
} else {
|
|
blockDimX = 32;
|
|
blockDimY = 32;
|
|
}
|
|
|
|
int blockDimZ = 1024 / blockDimX / blockDimY;
|
|
dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
|
|
dim3 grid(outputWidth, outputHeight);
|
|
col2imOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
|
|
colData,
|
|
inputChannels,
|
|
inputHeight,
|
|
inputWidth,
|
|
filterHeight,
|
|
filterWidth,
|
|
strideHeight,
|
|
strideWidth,
|
|
paddingHeight,
|
|
paddingWidth,
|
|
dilationHeight,
|
|
dilationWidth,
|
|
outputHeight,
|
|
outputWidth);
|
|
CHECK_SYNC("Col2ImFunctor GPU failed");
|
|
}
|
|
};
|
|
|
|
template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, float>;
|
|
template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, double>;
|
|
template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, float>;
|
|
template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, double>;
|
|
|
|
} // namespace paddle
|