modify the format

cblas_new
xzl 8 years ago
parent 44927bf70a
commit dbb658805e

@ -99,8 +99,7 @@ public:
ConvFunctionBase::init(config);
}
virtual void check(const BufferArgs& inputs,
const BufferArgs& outputs) override {
void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape();
@ -162,8 +161,7 @@ public:
ConvFunctionBase::init(config);
}
virtual void check(const BufferArgs& inputs,
const BufferArgs& outputs) override {
void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
const TensorShape& output = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& input = outputs[0].shape();
@ -225,8 +223,7 @@ public:
ConvFunctionBase::init(config);
}
virtual void check(const BufferArgs& inputs,
const BufferArgs& outputs) override {
void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
const TensorShape& output = inputs[0].shape();
const TensorShape& input = inputs[1].shape();
const TensorShape& filter = outputs[0].shape();

@ -24,10 +24,10 @@ __global__
void ConvolutionDepthwiseForward(const int nthreads,
const T* const inputData, const T* const filterData,
const int batchSize, const int outputChannels, const int outputHeight,
const int outputWidth,const int inputChannels, const int inputHeight, const int inputWidth,
const int filterMultiplier, const int filterHeight, const int filterWidth, const int strideH,
const int strideW, const int paddingH, const int paddingW,
T* const outputData) {
const int outputWidth, const int inputChannels, const int inputHeight,
const int inputWidth, const int filterMultiplier, const int filterHeight,
const int filterWidth, const int strideH, const int strideW,
const int paddingH, const int paddingW, T* const outputData) {
int index =
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
@ -51,8 +51,8 @@ void ConvolutionDepthwiseForward(const int nthreads,
for (int kw = 0; kw < filterWidth; ++kw) {
const int h_in = -paddingH + h_out * strideH + kh;
const int w_in = -paddingW + w_out * strideW + kw;
const int offset = ((batch * inputChannels + c_in) * inputHeight + h_in)
* inputWidth + w_in;
const int offset = ((batch * inputChannels + c_in)
* inputHeight + h_in) * inputWidth + w_in;
value += (*weight) * inputData[offset];
++weight;
}
@ -64,8 +64,8 @@ void ConvolutionDepthwiseForward(const int nthreads,
const int w_in = -paddingW + w_out * strideW + kw;
if ((h_in >= 0) && (h_in < inputHeight)
&& (w_in >= 0) && (w_in < inputWidth)) {
const int offset = ((batch * inputChannels + c_in) * inputHeight + h_in)
* inputWidth + w_in;
const int offset = ((batch * inputChannels + c_in)
* inputHeight + h_in) * inputWidth + w_in;
value += (*weight) * inputData[offset];
}
++weight;
@ -82,10 +82,10 @@ __global__
void ConvolutionDepthwiseInputBackward(const int nthreads,
const T* const top_diff, const T* const weight_data,
const int num, const int outputChannels, const int outputHeight,
const int outputWidth,const int inputChannels, const int inputHeight, const int inputWidth,
const int filterMultiplier, const int filterHeight, const int filterWidth, const int strideH,
const int strideW, const int paddingH, const int paddingW,
T* const bottom_diff) {
const int outputWidth, const int inputChannels, const int inputHeight,
const int inputWidth, const int filterMultiplier, const int filterHeight,
const int filterWidth, const int strideH, const int strideW,
const int paddingH, const int paddingW, T* const bottom_diff) {
int index =
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) {
@ -95,8 +95,8 @@ void ConvolutionDepthwiseInputBackward(const int nthreads,
const int w_in = index % inputWidth;
const int c_out_start = c_in * filterMultiplier;
T value = 0;
for(int c_out = c_out_start; c_out < c_out_start + filterMultiplier; c_out ++){
//weight bixu c_out
for (int c_out = c_out_start;
c_out < c_out_start + filterMultiplier; c_out ++) {
const T* weight = weight_data + c_out * filterHeight * filterWidth;
for (int kh = 0; kh < filterHeight; ++kh) {
for (int kw = 0; kw < filterWidth; ++kw) {
@ -105,11 +105,12 @@ void ConvolutionDepthwiseInputBackward(const int nthreads,
if (((h_out_s % strideH) == 0) && ((w_out_s % strideW) == 0)) {
const int h_out = h_out_s / strideH;
const int w_out = w_out_s / strideW;
// TODO(zhaolong) : the 'if' affect the effectiveness, it needs to optimize
// TODO(zhaolong): this 'if' hurts performance;
// it needs to be optimized
if ((h_out >= 0) && (h_out < outputHeight)
&& (w_out >= 0) && (w_out < outputWidth)) {
const int offset = ((batch * outputChannels + c_out) * outputHeight + h_out)
* outputWidth + w_out;
const int offset = ((batch * outputChannels + c_out)
* outputHeight + h_out) * outputWidth + w_out;
value += (*weight) * top_diff[offset];
}
}
@ -127,10 +128,10 @@ __global__
void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
const T* const top_diff, const T* const inputData,
const int num, const int outputChannels, const int outputHeight,
const int outputWidth, const int inputChannels, const int inputHeight, const int inputWidth,
const int filterMultiplier, const int filterHeight, const int filterWidth, const int strideH,
const int strideW, const int paddingH, const int paddingW,
T* const buffer_data) {
const int outputWidth, const int inputChannels, const int inputHeight,
const int inputWidth, const int filterMultiplier, const int filterHeight,
const int filterWidth, const int strideH, const int strideW,
const int paddingH, const int paddingW, T* const buffer_data) {
int index =
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) {
@ -143,13 +144,14 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
const int w_in = -paddingW + w_out * strideW + kw;
if ((h_in >= 0) && (h_in < inputHeight)
&& (w_in >= 0) && (w_in < inputWidth)) {
const int c_out = index / filterHeight / filterWidth / outputHeight / outputWidth;
const int c_out = index /
(filterHeight * filterWidth * outputHeight * outputWidth);
const int c_in = c_out / filterMultiplier;
const int batch = num_i;
const int top_offset = ((batch * outputChannels + c_out) * outputHeight + h_out)
* outputWidth + w_out;
const int bottom_offset = ((batch * inputChannels + c_in) * inputHeight + h_in)
* inputWidth + w_in;
const int top_offset = ((batch * outputChannels + c_out) *
outputHeight + h_out) * outputWidth + w_out;
const int bottom_offset = ((batch * inputChannels + c_in)
* inputHeight + h_in) * inputWidth + w_in;
buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset];
} else {
buffer_data[index] = 0;
@ -177,7 +179,6 @@ public:
int paddingH,
int paddingW,
T* outputData){
int outputSize = batchSize * outputChannels * outputHeight * outputWidth;
size_t blocks = (outputSize + 1024 -1) / 1024;
@ -229,7 +230,6 @@ public:
int paddingH,
int paddingW,
T* inputGrad){
int inputSize = batchSize * inputChannels * inputHeight * inputWidth;
size_t blocks = (inputSize + 1024 -1) / 1024;
@ -284,15 +284,16 @@ public:
int paddingW,
T* colData,
T* filterGrad){
int colDataSize = outputChannels * filterHeight * filterWidth * outputHeight * outputWidth;
int colDataSize = outputChannels * filterHeight * filterWidth
* outputHeight * outputWidth;
size_t blocks = (colDataSize + 1024 -1) / 1024;
size_t blockX = 512;
size_t blockY = (blocks+512-1)/512;
dim3 threads(1024, 1);
dim3 grid(blockX, blockY);
BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth, 1, filterGrad, false, true);
BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth,
1, filterGrad, false, true);
for (int i = 0; i < batchSize; i++) {
ConvolutionDepthwiseFilterBackward<T>
@ -315,8 +316,7 @@ public:
strideW,
paddingH,
paddingW,
colData
);
colData);
int K = outputHeight * outputWidth;
int M = colDataSize / K;

Loading…
Cancel
Save