|
|
|
@ -25,7 +25,7 @@ void ConvolutionDepthwiseForward(const int nthreads,
|
|
|
|
|
const T* const inputData, const T* const filterData,
|
|
|
|
|
const int batchSize, const int outputChannels, const int outputHeight,
|
|
|
|
|
const int outputWidth,const int inputChannels, const int inputHeight, const int inputWidth,
|
|
|
|
|
const int filterHeight, const int filterWidth, const int strideH,
|
|
|
|
|
const int filterMultiplier, const int filterHeight, const int filterWidth, const int strideH,
|
|
|
|
|
const int strideW, const int paddingH, const int paddingW,
|
|
|
|
|
T* const outputData) {
|
|
|
|
|
|
|
|
|
@ -33,23 +33,25 @@ void ConvolutionDepthwiseForward(const int nthreads,
|
|
|
|
|
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
|
|
|
|
|
|
|
|
|
if(index < nthreads) {
|
|
|
|
|
const int n = index / outputChannels / outputHeight / outputWidth;
|
|
|
|
|
const int c = (index / outputHeight / outputWidth) % outputChannels;
|
|
|
|
|
const int h = (index / outputWidth) % outputHeight;
|
|
|
|
|
const int w = index % outputWidth;
|
|
|
|
|
const T* weight = filterData + c * filterHeight * filterWidth;
|
|
|
|
|
const int batch = index / outputChannels / outputHeight / outputWidth;
|
|
|
|
|
const int c_out = (index / outputHeight / outputWidth) % outputChannels;
|
|
|
|
|
const int h_out = (index / outputWidth) % outputHeight;
|
|
|
|
|
const int w_out = index % outputWidth;
|
|
|
|
|
|
|
|
|
|
const int c_in = c_out / filterMultiplier;
|
|
|
|
|
const T* weight = filterData + c_out * filterHeight * filterWidth;
|
|
|
|
|
T value = 0;
|
|
|
|
|
const int h_in_start = -paddingH + h * strideH;
|
|
|
|
|
const int w_in_start = -paddingW + w * strideW;
|
|
|
|
|
const int h_in_end = -paddingH + h * strideH + filterHeight - 1;
|
|
|
|
|
const int w_in_end = -paddingW + w * strideW + filterWidth - 1;
|
|
|
|
|
const int h_in_start = -paddingH + h_out * strideH;
|
|
|
|
|
const int w_in_start = -paddingW + w_out * strideW;
|
|
|
|
|
const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1;
|
|
|
|
|
const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1;
|
|
|
|
|
if ((h_in_start >= 0) && (h_in_end < inputHeight)
|
|
|
|
|
&&(w_in_start >= 0) && (w_in_end < inputWidth)) {
|
|
|
|
|
for (int kh = 0; kh < filterHeight; ++kh) {
|
|
|
|
|
for (int kw = 0; kw < filterWidth; ++kw) {
|
|
|
|
|
const int h_in = -paddingH + h * strideH + kh;
|
|
|
|
|
const int w_in = -paddingW + w * strideW + kw;
|
|
|
|
|
const int offset = ((n * inputChannels + c) * inputHeight + h_in)
|
|
|
|
|
const int h_in = -paddingH + h_out * strideH + kh;
|
|
|
|
|
const int w_in = -paddingW + w_out * strideW + kw;
|
|
|
|
|
const int offset = ((batch * inputChannels + c_in) * inputHeight + h_in)
|
|
|
|
|
* inputWidth + w_in;
|
|
|
|
|
value += (*weight) * inputData[offset];
|
|
|
|
|
++weight;
|
|
|
|
@ -58,11 +60,11 @@ void ConvolutionDepthwiseForward(const int nthreads,
|
|
|
|
|
}else{
|
|
|
|
|
for (int kh = 0; kh < filterHeight; ++kh) {
|
|
|
|
|
for (int kw = 0; kw < filterWidth; ++kw) {
|
|
|
|
|
const int h_in = -paddingH + h * strideH + kh;
|
|
|
|
|
const int w_in = -paddingW + w * strideW + kw;
|
|
|
|
|
const int h_in = -paddingH + h_out * strideH + kh;
|
|
|
|
|
const int w_in = -paddingW + w_out * strideW + kw;
|
|
|
|
|
if ((h_in >= 0) && (h_in < inputHeight)
|
|
|
|
|
&& (w_in >= 0) && (w_in < inputWidth)) {
|
|
|
|
|
const int offset = ((n * outputChannels + c) * inputHeight + h_in)
|
|
|
|
|
const int offset = ((batch * inputChannels + c_in) * inputHeight + h_in)
|
|
|
|
|
* inputWidth + w_in;
|
|
|
|
|
value += (*weight) * inputData[offset];
|
|
|
|
|
}
|
|
|
|
@ -81,38 +83,42 @@ void ConvolutionDepthwiseInputBackward(const int nthreads,
|
|
|
|
|
const T* const top_diff, const T* const weight_data,
|
|
|
|
|
const int num, const int outputChannels, const int outputHeight,
|
|
|
|
|
const int outputWidth,const int inputChannels, const int inputHeight, const int inputWidth,
|
|
|
|
|
const int filterHeight, const int filterWidth, const int strideH,
|
|
|
|
|
const int filterMultiplier, const int filterHeight, const int filterWidth, const int strideH,
|
|
|
|
|
const int strideW, const int paddingH, const int paddingW,
|
|
|
|
|
T* const bottom_diff) {
|
|
|
|
|
int index =
|
|
|
|
|
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
|
|
|
|
if(index < nthreads) {
|
|
|
|
|
const int n = index / inputChannels / inputHeight / inputWidth;
|
|
|
|
|
const int c = (index / inputHeight / inputWidth) % inputChannels;
|
|
|
|
|
const int h = (index / inputWidth) % inputHeight;
|
|
|
|
|
const int w = index % inputWidth;
|
|
|
|
|
const T* weight = weight_data + c * filterHeight * filterWidth;
|
|
|
|
|
const int batch = index / inputChannels / inputHeight / inputWidth;
|
|
|
|
|
const int c_in = (index / inputHeight / inputWidth) % inputChannels;
|
|
|
|
|
const int h_in = (index / inputWidth) % inputHeight;
|
|
|
|
|
const int w_in = index % inputWidth;
|
|
|
|
|
const int c_out_start = c_in * filterMultiplier;
|
|
|
|
|
T value = 0;
|
|
|
|
|
for (int kh = 0; kh < filterHeight; ++kh) {
|
|
|
|
|
for (int kw = 0; kw < filterWidth; ++kw) {
|
|
|
|
|
const int h_out_s = h + paddingH - kh;
|
|
|
|
|
const int w_out_s = w + paddingW - kw;
|
|
|
|
|
if (((h_out_s % strideH) == 0) && ((w_out_s % strideW) == 0)) {
|
|
|
|
|
const int h_out = h_out_s / strideH;
|
|
|
|
|
const int w_out = w_out_s / strideW;
|
|
|
|
|
// TODO(zhaolong) : the 'if' affect the effectiveness, it needs to optimize
|
|
|
|
|
if ((h_out >= 0) && (h_out < outputHeight)
|
|
|
|
|
&& (w_out >= 0) && (w_out < outputWidth)) {
|
|
|
|
|
const int offset = ((n * outputChannels + c) * outputHeight + h_out)
|
|
|
|
|
* outputWidth + w_out;
|
|
|
|
|
value += (*weight) * top_diff[offset];
|
|
|
|
|
}
|
|
|
|
|
for(int c_out = c_out_start; c_out < c_out_start + filterMultiplier; c_out ++){
|
|
|
|
|
//weight bixu c_out
|
|
|
|
|
const T* weight = weight_data + c_out * filterHeight * filterWidth;
|
|
|
|
|
for (int kh = 0; kh < filterHeight; ++kh) {
|
|
|
|
|
for (int kw = 0; kw < filterWidth; ++kw) {
|
|
|
|
|
const int h_out_s = h_in + paddingH - kh;
|
|
|
|
|
const int w_out_s = w_in + paddingW - kw;
|
|
|
|
|
if (((h_out_s % strideH) == 0) && ((w_out_s % strideW) == 0)) {
|
|
|
|
|
const int h_out = h_out_s / strideH;
|
|
|
|
|
const int w_out = w_out_s / strideW;
|
|
|
|
|
// TODO(zhaolong) : the 'if' affect the effectiveness, it needs to optimize
|
|
|
|
|
if ((h_out >= 0) && (h_out < outputHeight)
|
|
|
|
|
&& (w_out >= 0) && (w_out < outputWidth)) {
|
|
|
|
|
const int offset = ((batch * outputChannels + c_out) * outputHeight + h_out)
|
|
|
|
|
* outputWidth + w_out;
|
|
|
|
|
value += (*weight) * top_diff[offset];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
++weight;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
++weight;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
bottom_diff[index] += value;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
|
|
|
|
@ -122,26 +128,27 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
|
|
|
|
|
const T* const top_diff, const T* const inputData,
|
|
|
|
|
const int num, const int outputChannels, const int outputHeight,
|
|
|
|
|
const int outputWidth, const int inputChannels, const int inputHeight, const int inputWidth,
|
|
|
|
|
const int filterHeight, const int filterWidth, const int strideH,
|
|
|
|
|
const int filterMultiplier, const int filterHeight, const int filterWidth, const int strideH,
|
|
|
|
|
const int strideW, const int paddingH, const int paddingW,
|
|
|
|
|
T* const buffer_data) {
|
|
|
|
|
int index =
|
|
|
|
|
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
|
|
|
|
if (index < nthreads) {
|
|
|
|
|
const int h = (index / outputWidth) % outputHeight;
|
|
|
|
|
const int w = index % outputWidth;
|
|
|
|
|
const int h_out = (index / outputWidth) % outputHeight;
|
|
|
|
|
const int w_out = index % outputWidth;
|
|
|
|
|
const int kh = (index / filterWidth / outputHeight / outputWidth)
|
|
|
|
|
% filterHeight;
|
|
|
|
|
const int kw = (index / outputHeight / outputWidth) % filterWidth;
|
|
|
|
|
const int h_in = -paddingH + h * strideH + kh;
|
|
|
|
|
const int w_in = -paddingW + w * strideW + kw;
|
|
|
|
|
const int h_in = -paddingH + h_out * strideH + kh;
|
|
|
|
|
const int w_in = -paddingW + w_out * strideW + kw;
|
|
|
|
|
if ((h_in >= 0) && (h_in < inputHeight)
|
|
|
|
|
&& (w_in >= 0) && (w_in < inputWidth)) {
|
|
|
|
|
const int c = index / filterHeight / filterWidth / outputHeight / outputWidth;
|
|
|
|
|
const int n = num_i;
|
|
|
|
|
const int top_offset = ((n * outputChannels + c) * outputHeight + h)
|
|
|
|
|
* outputWidth + w;
|
|
|
|
|
const int bottom_offset = ((n * inputChannels + c) * inputHeight + h_in)
|
|
|
|
|
const int c_out = index / filterHeight / filterWidth / outputHeight / outputWidth;
|
|
|
|
|
const int c_in = c_out / filterMultiplier;
|
|
|
|
|
const int batch = num_i;
|
|
|
|
|
const int top_offset = ((batch * outputChannels + c_out) * outputHeight + h_out)
|
|
|
|
|
* outputWidth + w_out;
|
|
|
|
|
const int bottom_offset = ((batch * inputChannels + c_in) * inputHeight + h_in)
|
|
|
|
|
* inputWidth + w_in;
|
|
|
|
|
buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset];
|
|
|
|
|
} else {
|
|
|
|
@ -162,6 +169,7 @@ public:
|
|
|
|
|
int inputChannels,
|
|
|
|
|
int inputHeight,
|
|
|
|
|
int inputWidth,
|
|
|
|
|
int filterMultiplier,
|
|
|
|
|
int filterHeight,
|
|
|
|
|
int filterWidth,
|
|
|
|
|
int strideH,
|
|
|
|
@ -190,6 +198,7 @@ public:
|
|
|
|
|
inputChannels,
|
|
|
|
|
inputHeight,
|
|
|
|
|
inputWidth,
|
|
|
|
|
filterMultiplier,
|
|
|
|
|
filterHeight,
|
|
|
|
|
filterWidth,
|
|
|
|
|
strideH,
|
|
|
|
@ -212,6 +221,7 @@ public:
|
|
|
|
|
int inputChannels,
|
|
|
|
|
int inputHeight,
|
|
|
|
|
int inputWidth,
|
|
|
|
|
int filterMultiplier,
|
|
|
|
|
int filterHeight,
|
|
|
|
|
int filterWidth,
|
|
|
|
|
int strideH,
|
|
|
|
@ -242,6 +252,7 @@ public:
|
|
|
|
|
inputChannels,
|
|
|
|
|
inputHeight,
|
|
|
|
|
inputWidth,
|
|
|
|
|
filterMultiplier,
|
|
|
|
|
filterHeight,
|
|
|
|
|
filterWidth,
|
|
|
|
|
strideH,
|
|
|
|
@ -264,6 +275,7 @@ public:
|
|
|
|
|
int inputChannels,
|
|
|
|
|
int inputHeight,
|
|
|
|
|
int inputWidth,
|
|
|
|
|
int filterMultiplier,
|
|
|
|
|
int filterHeight,
|
|
|
|
|
int filterWidth,
|
|
|
|
|
int strideH,
|
|
|
|
@ -273,14 +285,14 @@ public:
|
|
|
|
|
T* colData,
|
|
|
|
|
T* filterGrad){
|
|
|
|
|
|
|
|
|
|
int colDataSize = inputChannels * filterHeight * filterWidth * outputHeight * outputWidth;
|
|
|
|
|
int colDataSize = outputChannels * filterHeight * filterWidth * outputHeight * outputWidth;
|
|
|
|
|
|
|
|
|
|
size_t blocks = (colDataSize + 1024 -1) / 1024;
|
|
|
|
|
size_t blockX = 512;
|
|
|
|
|
size_t blockY = (blocks+512-1)/512;
|
|
|
|
|
dim3 threads(1024, 1);
|
|
|
|
|
dim3 grid(blockX, blockY);
|
|
|
|
|
BaseMatrix filterGradMatrix(inputChannels * filterHeight * filterWidth, 1, filterGrad, false, true);
|
|
|
|
|
BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth, 1, filterGrad, false, true);
|
|
|
|
|
|
|
|
|
|
for(int i = 0; i < batchSize; i++) {
|
|
|
|
|
ConvolutionDepthwiseFilterBackward<T>
|
|
|
|
@ -296,6 +308,7 @@ public:
|
|
|
|
|
inputChannels,
|
|
|
|
|
inputHeight,
|
|
|
|
|
inputWidth,
|
|
|
|
|
filterMultiplier,
|
|
|
|
|
filterHeight,
|
|
|
|
|
filterWidth,
|
|
|
|
|
strideH,
|
|
|
|
@ -304,8 +317,8 @@ public:
|
|
|
|
|
paddingW,
|
|
|
|
|
colData
|
|
|
|
|
);
|
|
|
|
|
int M = colDataSize / outputHeight / outputWidth;
|
|
|
|
|
int K = outputHeight * outputWidth;
|
|
|
|
|
int M = colDataSize / K;
|
|
|
|
|
|
|
|
|
|
BaseMatrix colMatrix(M, K, colData, false, true);
|
|
|
|
|
filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0);
|
|
|
|
|