Merge pull request #3228 from gangliao/clang-format

ClangFormat for proto and cuda
gangliao 8 years ago committed by GitHub
commit 75185d821a

@@ -24,7 +24,7 @@
         description: Format files with ClangFormat.
         entry: clang-format -i
         language: system
-        files: \.(c|cc|cxx|cpp|h|hpp|hxx)$
+        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
     sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
     hooks:
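
In effect, the hook now formats CUDA (.cu) and protobuf (.proto) sources in addition to the C/C++ files it already covered. As a hedged sketch of how the hook is exercised locally (assuming clang-format and pre-commit are installed; the file path is illustrative):

    # Format one file in place, exactly what the hook's entry does:
    clang-format -i paddle/cuda/src/hl_batch_transpose.cu

    # Or run every configured hook over the whole tree:
    pre-commit run --all-files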

@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_batch_transpose.h"
 #include "hl_base.h"
+#include "hl_batch_transpose.h"

 const int TILE_DIM = 64;
 const int BLOCK_ROWS = 16;

 // No bank-conflict transpose for a batch of data.
-__global__ void batchTransposeNoBankConflicts(real* odata,
-                                              const real* idata,
-                                              int numSamples, int width,
-                                              int height) {
+__global__ void batchTransposeNoBankConflicts(
+    real* odata, const real* idata, int numSamples, int width, int height) {
   __shared__ float tile[TILE_DIM][TILE_DIM + 1];

   const int x = blockIdx.x * TILE_DIM + threadIdx.x;

@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata,
                newX] = tile[threadIdx.x][j];
 }

-void batchTranspose(const real* input, real* output, int width, int height,
-                    int batchSize) {
+void batchTranspose(
+    const real* input, real* output, int width, int height, int batchSize) {
   dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
   dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);

-  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
-      (output, input, batchSize, width, height);
+  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+      output, input, batchSize, width, height);

   CHECK_SYNC("batchTranspose failed!");
 }
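
The reflowed signature and launch above are what clang-format emits at an 80-column limit with parameter bin-packing disabled: a parameter list that no longer fits on one line is either broken after the opening parenthesis or given one parameter per line, and `<<<...>>>(` stays attached to the kernel name. A minimal .clang-format sketch that reproduces this layout (an assumption for illustration; the repository's actual config may differ):

    BasedOnStyle: Google
    ColumnLimit: 80
    BinPackParameters: false
    BinPackArguments: false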

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <cmath>
 #include <stdlib.h>
-#include "hl_cuda.h"
-#include "hl_time.h"
+#include <cmath>
 #include "hl_base.h"
+#include "hl_cuda.h"
 #include "hl_perturbation_util.cuh"
+#include "hl_time.h"

 #define _USE_MATH_DEFINES

@@ -30,10 +29,16 @@ limitations under the License. */
  * centerX, centerY: translation.
  * sourceX, sourceY: output coordinates in the original image.
  */
-__device__ void getTranformCoord(int x, int y, real theta, real scale,
-                                 real tgtCenter, real imgCenter,
-                                 real centerR, real centerC,
-                                 int* sourceX, int* sourceY) {
+__device__ void getTranformCoord(int x,
+                                 int y,
+                                 real theta,
+                                 real scale,
+                                 real tgtCenter,
+                                 real imgCenter,
+                                 real centerR,
+                                 real centerC,
+                                 int* sourceX,
+                                 int* sourceY) {
   real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};

   // compute coornidates in the rotated and scaled image

@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale,
  * created by Wei Xu (genome), converted by Jiang Wang
  */
-__global__ void kSamplingPatches(const real* imgs, real* targets,
-                                 int imgSize, int tgtSize, const int channels,
-                                 int samplingRate, const real* thetas,
-                                 const real* scales, const int* centerRs,
-                                 const int* centerCs, const real padValue,
+__global__ void kSamplingPatches(const real* imgs,
+                                 real* targets,
+                                 int imgSize,
+                                 int tgtSize,
+                                 const int channels,
+                                 int samplingRate,
+                                 const real* thetas,
+                                 const real* scales,
+                                 const int* centerRs,
+                                 const int* centerCs,
+                                 const real padValue,
                                  const int numImages) {
   const int caseIdx = blockIdx.x * 4 + threadIdx.x;
   const int pxIdx = blockIdx.y * 128 + threadIdx.y;

@@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
   const int pxY = pxIdx / tgtSize;

   int srcPxX, srcPxY;
-  getTranformCoord(pxX, pxY, thetas[imgIdx], scales[imgIdx], tgtCenter,
-                   imgCenter, centerCs[caseIdx], centerRs[caseIdx], &srcPxX,
+  getTranformCoord(pxX,
+                   pxY,
+                   thetas[imgIdx],
+                   scales[imgIdx],
+                   tgtCenter,
+                   imgCenter,
+                   centerCs[caseIdx],
+                   centerRs[caseIdx],
+                   &srcPxX,
                    &srcPxY);

   imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;

@@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
  *
  * created by Wei Xu
  */
-void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
-                                int*& gpuCenterR, int*& gpuCenterC,
-                                int numImages, int imgSize, real rotateAngle,
-                                real scaleRatio, int samplingRate,
+void hl_generate_disturb_params(real*& gpuAngle,
+                                real*& gpuScaleRatio,
+                                int*& gpuCenterR,
+                                int*& gpuCenterC,
+                                int numImages,
+                                int imgSize,
+                                real rotateAngle,
+                                real scaleRatio,
+                                int samplingRate,
                                 bool isTrain) {
   // The number of output samples.
   int numPatches = numImages * samplingRate;

@@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
   for (int i = 0; i < numImages; i++) {
     r_angle[i] =
         (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0)  // NOLINT
-                                        - 0.5);
+                                        -
+                                        0.5);
     s_ratio[i] =
         1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio;  // NOLINT
   }

@@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
     int pxY =
         (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0));  // NOLINT

-    const real H[4] = {cos(-r_angle[i]), -sin(-r_angle[i]),
-                       sin(-r_angle[i]), cos(-r_angle[i])};
+    const real H[4] = {cos(-r_angle[i]),
+                       -sin(-r_angle[i]),
+                       sin(-r_angle[i]),
+                       cos(-r_angle[i])};
     real x = pxX - imgCenter;
     real y = pxY - imgCenter;
     real xx = H[0] * x + H[1] * y;

@@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
   delete[] center_c;
 }

-void hl_conv_random_disturb_with_params(const real* images, int imgSize,
-                                        int tgtSize, int channels,
-                                        int numImages, int samplingRate,
+void hl_conv_random_disturb_with_params(const real* images,
+                                        int imgSize,
+                                        int tgtSize,
+                                        int channels,
+                                        int numImages,
+                                        int samplingRate,
                                         const real* gpuRotationAngle,
                                         const real* gpuScaleRatio,
                                         const int* gpuCenterR,

@@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize,
   dim3 threadsPerBlock(4, 128);
   dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));

-  kSamplingPatches <<<numBlocks, threadsPerBlock>>>
-      (images, target, imgSize, tgtSize, channels, samplingRate,
-       gpuRotationAngle, gpuScaleRatio, gpuCenterR, gpuCenterC,
-       paddingValue, numImages);
+  kSamplingPatches<<<numBlocks, threadsPerBlock>>>(images,
+                                                   target,
+                                                   imgSize,
+                                                   tgtSize,
+                                                   channels,
+                                                   samplingRate,
+                                                   gpuRotationAngle,
+                                                   gpuScaleRatio,
+                                                   gpuCenterR,
+                                                   gpuCenterC,
+                                                   paddingValue,
+                                                   numImages);

   hl_device_synchronize();
 }

-void hl_conv_random_disturb(const real* images, int imgSize,
-                            int tgtSize, int channels, int numImages,
-                            real scaleRatio, real rotateAngle,
-                            int samplingRate, real* gpu_r_angle,
-                            real* gpu_s_ratio, int* gpu_center_r,
-                            int* gpu_center_c, int paddingValue,
-                            bool isTrain, real* targets) {
+void hl_conv_random_disturb(const real* images,
+                            int imgSize,
+                            int tgtSize,
+                            int channels,
+                            int numImages,
+                            real scaleRatio,
+                            real rotateAngle,
+                            int samplingRate,
+                            real* gpu_r_angle,
+                            real* gpu_s_ratio,
+                            int* gpu_center_r,
+                            int* gpu_center_c,
+                            int paddingValue,
+                            bool isTrain,
+                            real* targets) {
   // generate the random disturbance sequence and the sampling locations
-  hl_generate_disturb_params(gpu_r_angle, gpu_s_ratio, gpu_center_r,
-                             gpu_center_c, numImages, imgSize, rotateAngle,
-                             scaleRatio, samplingRate, isTrain);
-
-  hl_conv_random_disturb_with_params(
-      images, imgSize, tgtSize, channels, numImages,
-      samplingRate, gpu_r_angle, gpu_s_ratio,
-      gpu_center_r, gpu_center_r, paddingValue,
-      targets);
+  hl_generate_disturb_params(gpu_r_angle,
+                             gpu_s_ratio,
+                             gpu_center_r,
+                             gpu_center_c,
+                             numImages,
+                             imgSize,
+                             rotateAngle,
+                             scaleRatio,
+                             samplingRate,
+                             isTrain);
+
+  hl_conv_random_disturb_with_params(images,
+                                     imgSize,
+                                     tgtSize,
+                                     channels,
+                                     numImages,
+                                     samplingRate,
+                                     gpu_r_angle,
+                                     gpu_s_ratio,
+                                     gpu_center_r,
+                                     gpu_center_r,
+                                     paddingValue,
+                                     targets);
 }

@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

 #include "hl_base.h"
-#include "hl_device_functions.cuh"
 #include "hl_cuda.h"
+#include "hl_device_functions.cuh"
 #include "paddle/utils/Logging.h"

-template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
-__global__ void KeMatrixAddRows(real* output, int ldo,
-                                real* table, int ldt,
+template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
+__global__ void KeMatrixAddRows(real* output,
+                                int ldo,
+                                real* table,
+                                int ldt,
                                 int* ids,
                                 int numSamples,
                                 int tableSize,

@@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
   while (idy < numSamples) {
     int tableId = ids[idy];
     if ((0 <= tableId) && (tableId < tableSize)) {
-      real *out = output + idy * ldo;
-      real *tab = table + tableId * ldt;
+      real* out = output + idy * ldo;
+      real* tab = table + tableId * ldt;
       for (int i = idx; i < dim; i += blockDimX) {
         if (AddRow) {
           paddle::paddleAtomicAdd(&tab[i], out[i]);

@@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
   }
 }

-void hl_matrix_select_rows(real* output, int ldo,
-                           real* table, int ldt,
+void hl_matrix_select_rows(real* output,
+                           int ldo,
+                           real* table,
+                           int ldt,
                            int* ids,
                            int numSamples,
                            int tableSize,

@@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo,
   dim3 threads(128, 8);
   dim3 grid(8, 1);
-  KeMatrixAddRows<128, 8, 8, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (output, ldo, table, ldt, ids, numSamples, tableSize, dim);
+  KeMatrixAddRows<128, 8, 8, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, ldo, table, ldt, ids, numSamples, tableSize, dim);

   CHECK_SYNC("hl_matrix_select_rows failed");
 }

-void hl_matrix_add_to_rows(real* table, int ldt,
-                           real* input, int ldi,
+void hl_matrix_add_to_rows(real* table,
+                           int ldt,
+                           real* input,
+                           int ldi,
                            int* ids,
                            int numSamples,
                            int tableSize,

@@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt,
   dim3 threads(128, 8);
   dim3 grid(8, 1);
-  KeMatrixAddRows<128, 8, 8, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (input, ldi, table, ldt, ids, numSamples, tableSize, dim);
+  KeMatrixAddRows<128, 8, 8, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      input, ldi, table, ldt, ids, numSamples, tableSize, dim);

   CHECK_SYNC("hl_matrix_add_to_rows failed");
 }

-template<class T, int blockDimX, int gridDimX>
-__global__ void KeVectorSelect(T* dst, int sized,
-                               const T* src, int sizes,
-                               const int* ids, int sizei) {
+template <class T, int blockDimX, int gridDimX>
+__global__ void KeVectorSelect(
+    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
   int idx = threadIdx.x + blockDimX * blockIdx.x;
   while (idx < sizei) {
     int index = ids[idx];

@@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized,
 }

 template <class T>
-void hl_vector_select_from(T* dst, int sized,
-                           const T* src, int sizes,
-                           const int* ids, int sizei) {
+void hl_vector_select_from(
+    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
   CHECK_NOTNULL(dst);
   CHECK_NOTNULL(src);
   CHECK_NOTNULL(ids);

@@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized,
   dim3 threads(512, 1);
   dim3 grid(8, 1);
-  KeVectorSelect<T, 512, 8><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (dst, sized, src, sizes, ids, sizei);
+  KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      dst, sized, src, sizes, ids, sizei);
   CHECK_SYNC("hl_vector_select_from failed");
 }

-template
-void hl_vector_select_from(real* dst, int sized,
-                           const real* src, int sizes,
-                           const int* ids, int sizei);
-template
-void hl_vector_select_from(int* dst, int sized,
-                           const int* src, int sizes,
-                           const int* ids, int sizei);
+template void hl_vector_select_from(real* dst,
+                                    int sized,
+                                    const real* src,
+                                    int sizes,
+                                    const int* ids,
+                                    int sizei);
+template void hl_vector_select_from(
+    int* dst, int sized, const int* src, int sizes, const int* ids, int sizei);

File diff suppressed because it is too large

@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-syntax="proto2";
+syntax = "proto2";
 package paddle.framework;

 // Attribute Type for paddle's Op.
 // Op contains many attributes. Each type of attributes could be different.
 // The AttrType will be shared between AttrDesc and AttrProto.
 enum AttrType {
-    INT = 0;
-    FLOAT = 1;
-    STRING = 2;
-    INTS = 3;
-    FLOATS = 4;
-    STRINGS = 5;
+  INT = 0;
+  FLOAT = 1;
+  STRING = 2;
+  INTS = 3;
+  FLOATS = 4;
+  STRINGS = 5;
 }

@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-syntax="proto2";
+syntax = "proto2";
 package paddle.framework;

 import "attribute.proto";

@@ -22,14 +22,14 @@ import "attribute.proto";
 //
 // e.g, for scale=3.0: name=scala, type=AttrType.FLOAT, value=3.0
 message AttrDesc {
-    required string name = 1;
-    required AttrType type = 2;
-    optional int32 i = 3;
-    optional float f = 4;
-    optional string s = 5;
-    repeated int32 ints = 6;
-    repeated float floats = 7;
-    repeated string strings = 8;
+  required string name = 1;
+  required AttrType type = 2;
+  optional int32 i = 3;
+  optional float f = 4;
+  optional string s = 5;
+  repeated int32 ints = 6;
+  repeated float floats = 7;
+  repeated string strings = 8;
 };

 // Protocol Message to describe an Operator.

@@ -42,15 +42,15 @@ message AttrDesc {
 // 3rd-party language can build this proto message and call
 // AddOp(const OpDesc& op_desc) of Paddle core to create an Operator.
 message OpDesc {
-    // input names of this Operator.
-    repeated string inputs = 1;
-
-    // output names of this Operator.
-    repeated string outputs = 2;
-
-    // type of this Operator, such as "add", "sub", "fc".
-    required string type = 3;
-
-    // Attributes of this Operator. e.g., scale=3.0 in cosine op.
-    repeated AttrDesc attrs = 4;
+  // input names of this Operator.
+  repeated string inputs = 1;
+
+  // output names of this Operator.
+  repeated string outputs = 2;
+
+  // type of this Operator, such as "add", "sub", "fc".
+  required string type = 3;
+
+  // Attributes of this Operator. e.g., scale=3.0 in cosine op.
+  repeated AttrDesc attrs = 4;
 };

@@ -15,10 +15,11 @@ limitations under the License. */
 // Protocol Message for 3rd-party language binding.
 //
 // Paddle Python package will use `OpProto` to generate op creation methods.
-// The op creation methods take user's input and generate `OpDesc` proto message,
+// The op creation methods take user's input and generate `OpDesc` proto
+// message,
 // then pass `OpDesc` to C++ side and create Op pointer.
 //
-syntax="proto2";
+syntax = "proto2";
 package paddle.framework;

 import "attribute.proto";

@@ -26,89 +27,90 @@ import "attribute.proto";
 // Attribute protocol message for 3rd-party language binding.
 // It will store the Op support what attribute and what type.
 message AttrProto {
-    // Supported attribute name. e.g. `scale` for cosine op.
-    required string name = 1;
-
-    // Supported attribute type.
-    required AttrType type = 2;
-
-    // Supported attribute comments. It helps 3rd-party language generate doc-string.
-    required string comment = 3;
-
-    // If that attribute is generated, it means the Paddle third language
-    // binding has responsibility to fill that attribute. End-User should
-    // not set that attribute.
-    optional bool generated = 4 [default=false];
+  // Supported attribute name. e.g. `scale` for cosine op.
+  required string name = 1;
+
+  // Supported attribute type.
+  required AttrType type = 2;
+
+  // Supported attribute comments. It helps 3rd-party language generate
+  // doc-string.
+  required string comment = 3;
+
+  // If that attribute is generated, it means the Paddle third language
+  // binding has responsibility to fill that attribute. End-User should
+  // not set that attribute.
+  optional bool generated = 4 [ default = false ];
 }

 // Input or output message for 3rd-party language binding.
 // It contains parameter name and its comments.
 message VarProto {
-    // Input or output name in that op creation function.
-    // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names.
-    required string name = 1;
-
-    // The comment for that input. It helps 3rd-party language generate doc-string.
-    required string comment = 2;
-
-    // Is that input/output could be a list or not.
-    // If so, that Op should write a attributed named `input_format` or
-    // `output_format`.
-    //
-    // e.g.
-    // If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W`
-    // could be multiple, so the multiple of `X` and `W` is True, and OpDesc
-    // will hold a attribute of them.
-    //
-    // The Op desc of same fc could be
-    // {
-    //     "type": "fc",
-    //     "input": ["X1", "X2", "W1", "W2", "b"],
-    //     "output": "fc.out",
-    //     "attrs" : {
-    //         "input_format": [0, 2, 4, 5]
-    //     }
-    // }
-    //
-    optional bool multiple = 3 [default=false];
-
-    // It marks that output is a temporary output. That output is not used by
-    // user, but used by other op internally as input. If other op is not use
-    // that output, it could be optimized early.
-    //
-    // Attribute temporary_index will be set in OpDesc if there is some
-    // outputs are temporary.
-    //
-    // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"],
-    // attrs = {
-    //     "temporary_index": [1]
-    // }
-    optional bool temporary = 4 [default=false];
-
-    // The gradient of operator can be ignored immediately
-    // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2
-    // can be ignored for the future optimized on graph.
-    optional bool ignore_gradient = 6;
+  // Input or output name in that op creation function.
+  // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names.
+  required string name = 1;
+
+  // The comment for that input. It helps 3rd-party language generate
+  // doc-string.
+  required string comment = 2;
+
+  // Is that input/output could be a list or not.
+  // If so, that Op should write a attributed named `input_format` or
+  // `output_format`.
+  //
+  // e.g.
+  // If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W`
+  // could be multiple, so the multiple of `X` and `W` is True, and OpDesc
+  // will hold a attribute of them.
+  //
+  // The Op desc of same fc could be
+  // {
+  //   "type": "fc",
+  //   "input": ["X1", "X2", "W1", "W2", "b"],
+  //   "output": "fc.out",
+  //   "attrs" : {
+  //     "input_format": [0, 2, 4, 5]
+  //   }
+  // }
+  //
+  optional bool multiple = 3 [ default = false ];
+
+  // It marks that output is a temporary output. That output is not used by
+  // user, but used by other op internally as input. If other op is not use
+  // that output, it could be optimized early.
+  //
+  // Attribute temporary_index will be set in OpDesc if there is some
+  // outputs are temporary.
+  //
+  // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"],
+  // attrs = {
+  //   "temporary_index": [1]
+  // }
+  optional bool temporary = 4 [ default = false ];
+
+  // The gradient of operator can be ignored immediately
+  // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2
+  // can be ignored for the future optimized on graph.
+  optional bool ignore_gradient = 6;
 }

 // Op protocol message for 3rd-party language binding.
 // It contains all information for generating op creation method.
 message OpProto {
-    // The input information to generate op creation method.
-    repeated VarProto inputs = 1;
-
-    // The output information to generate op creation method.
-    repeated VarProto outputs = 2;
-
-    // The attribute information to generate op creation method.
-    repeated AttrProto attrs = 3;
-
-    // The comments for that Op. It helps 3rd-party language generate
-    // doc-string. The whole documentation of that Op is generated by comment,
-    // inputs, outputs, attrs together.
-    required string comment = 4;
-
-    // The type of that Op.
-    required string type = 5;
+  // The input information to generate op creation method.
+  repeated VarProto inputs = 1;
+
+  // The output information to generate op creation method.
+  repeated VarProto outputs = 2;
+
+  // The attribute information to generate op creation method.
+  repeated AttrProto attrs = 3;
+
+  // The comments for that Op. It helps 3rd-party language generate
+  // doc-string. The whole documentation of that Op is generated by comment,
+  // inputs, outputs, attrs together.
+  required string comment = 4;
+
+  // The type of that Op.
+  required string type = 5;
 }
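
The two-space indent and the `[ default = false ]` option spacing in the proto hunks above come from clang-format's built-in Protocol Buffers support, which the widened `files:` pattern in the pre-commit hook now triggers. A hedged sketch of a per-language .clang-format section (illustrative; not taken from this PR):

    ---
    Language: Proto
    BasedOnStyle: Google
    IndentWidth: 2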

@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_base.h"
 #include "ContextProjectionOp.h"
+#include "hl_base.h"

 namespace paddle {

@@ -30,7 +30,7 @@ __global__ void KeContextProjectionForward(const real* input,
   int block_size = blockDim.x;
   int sequenceId = blockIdx.x;
   int seq_start = sequence[sequenceId];
-  int seq_end = sequence[sequenceId+1];
+  int seq_end = sequence[sequenceId + 1];
   real value = 0;

   int instances = seq_end - seq_start + context_length - 1;

@@ -49,8 +49,9 @@ __global__ void KeContextProjectionForward(const real* input,
     } else if ((i + context_start) >= (seq_end - seq_start)) {
       if (padding) {
         value =
             weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
-                   input_dim + idx];
+                       input_dim +
+                   idx];
       } else {
         continue;
       }

@@ -61,7 +62,7 @@ __global__ void KeContextProjectionForward(const real* input,
     int outx = (i - context_length) < 0 ? i : (context_length - 1);
     int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
     real* output_r =
-      output + outy * input_dim * context_length + outx * input_dim;
+        output + outy * input_dim * context_length + outx * input_dim;
     for (int j = outy; j < seq_end - seq_start; j++) {
       output_r[idx] += value;
       if (j - outy == outx) break;

@@ -108,13 +109,25 @@ void hl_context_projection_forward(const real* input,
   dim3 grid(blocks_x, blocks_y);

   if (weight) {
-    KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
-        (input, sequence, weight, output, input_dim,
-         context_length, context_start, begin_pad);
+    KeContextProjectionForward<true><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        input,
+        sequence,
+        weight,
+        output,
+        input_dim,
+        context_length,
+        context_start,
+        begin_pad);
   } else {
-    KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
-        (input, sequence, weight, output, input_dim,
-         context_length, context_start, begin_pad);
+    KeContextProjectionForward<false><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        input,
+        sequence,
+        weight,
+        output,
+        input_dim,
+        context_length,
+        context_start,
+        begin_pad);
   }
   CHECK_SYNC("hl_context_projection_forward failed");
 }

@@ -148,7 +161,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
   int block_size = blockDim.x;
   int sequenceId = blockIdx.x;
   int seq_start = sequence[sequenceId];
-  int seq_end = sequence[sequenceId+1];
+  int seq_end = sequence[sequenceId + 1];
   real value = 0;

   int instances = seq_end - seq_start + context_length - 1;

@@ -170,7 +183,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
     int outx = (i - context_length) < 0 ? i : (context_length - 1);
     int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
     real* output_r =
-      out + outy * input_dim * context_length + outx * input_dim;
+        out + outy * input_dim * context_length + outx * input_dim;
     for (int j = outy; j < seq_end - seq_start; j++) {
       value += output_r[idx];
       if (j - outy == outx) break;

@@ -211,8 +224,8 @@ void hl_context_projection_backward_data(const real* out_grad,
   int blocks_y = 1;
   dim3 threads(block_size, 1);
   dim3 grid(blocks_x, blocks_y);
-  KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
-      (out_grad, sequence, input_grad, input_dim, context_length, context_start);
+  KeContextProjectionBackwardData<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      out_grad, sequence, input_grad, input_dim, context_length, context_start);
   CHECK_SYNC("hl_context_projection_backward_data failed");
 }

@@ -231,7 +244,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                       context_start);
 }

-template<int THREADS_X, int THREADS_Y>
+template <int THREADS_X, int THREADS_Y>
 __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
                                                   const int* sequence,
                                                   real* w_grad,

@@ -254,17 +267,17 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
   if (weight_idx < w_dim) {
     for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
       int seq_start = sequence[seqId];
-      int seq_end = sequence[seqId+1];
-      output_r = const_cast<real*>(out_grad)
-                 + seq_start * w_dim * context_length;
+      int seq_end = sequence[seqId + 1];
+      output_r =
+          const_cast<real*>(out_grad) + seq_start * w_dim * context_length;

       if (context_start < 0) {
         if (padId + context_start < 0) {
           instanceId = padId;
         } else {
           // begin_pad > 0;
-          instanceId = (padId - begin_pad) +
-                       (seq_end - seq_start) - context_start;
+          instanceId =
+              (padId - begin_pad) + (seq_end - seq_start) - context_start;
         }
       } else {
         if (padId + (seq_end - seq_start) < context_start) {

@@ -275,10 +288,11 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
         }
       }

-      int outx = (instanceId - context_length) < 0 ?
-                 instanceId : (context_length - 1);
-      int outy = (instanceId - context_length) < 0 ?
-                 0 : (instanceId - (context_length - 1));
+      int outx =
+          (instanceId - context_length) < 0 ? instanceId : (context_length - 1);
+      int outy = (instanceId - context_length) < 0
+                     ? 0
+                     : (instanceId - (context_length - 1));
       output_r += outy * w_dim * context_length + outx * w_dim;
       for (int j = outy; j < seq_end - seq_start; j++) {
         value += output_r[weight_idx];

@@ -290,7 +304,7 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
     }
     __syncthreads();

-    for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {
+    for (int stride = THREADS_Y / 2; stride > 0; stride = stride / 2) {
       if (idy < stride) {
         sum_s[idy][idx] += sum_s[idy + stride][idx];
       }

@@ -339,22 +353,27 @@ void hl_context_projection_backward_weight(const real* out_grad,
   dim3 threads(threads_x, threads_y);
   dim3 grid(blocks_x, 1);

-  KeContextProjectionBackwardWeight<32, 32>
-      <<< grid, threads, 0, STREAM_DEFAULT >>>
-      (out_grad, sequence, w_grad, num_sequences, w_dim,
-       context_length, context_start, begin_pad);
+  KeContextProjectionBackwardWeight<32,
+                                    32><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      out_grad,
+      sequence,
+      w_grad,
+      num_sequences,
+      w_dim,
+      context_length,
+      context_start,
+      begin_pad);
   CHECK_SYNC("hl_context_projection_backward_weight failed");
 }

 template <>
-void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
-    const GpuMatrix& out_grad,
-    GpuMatrix& w_grad,
-    const GpuIVector& seq_vec,
-    size_t context_length,
-    int context_start,
-    size_t total_pad,
-    size_t begin_pad) {
+void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
+                                                      GpuMatrix& w_grad,
+                                                      const GpuIVector& seq_vec,
+                                                      size_t context_length,
+                                                      int context_start,
+                                                      size_t total_pad,
+                                                      size_t begin_pad) {
   hl_context_projection_backward_weight(out_grad.getData(),
                                         seq_vec.getData(),
                                         w_grad.getData(),

@@ -376,23 +395,18 @@ void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                 size_t begin_pad,
                                                 bool is_padding,
                                                 size_t total_pad) {
   if (in_grad) {
     ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
-        out_grad,
-        in_grad,
-        sequence,
-        context_length,
-        context_start);
+        out_grad, in_grad, sequence, context_length, context_start);
   }
   if (is_padding && w_grad) {
-    ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
-        out_grad,
-        w_grad,
-        sequence,
-        context_length,
-        context_start,
-        total_pad,
-        begin_pad);
+    ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(out_grad,
+                                                     w_grad,
+                                                     sequence,
+                                                     context_length,
+                                                     context_start,
+                                                     total_pad,
+                                                     begin_pad);
   }
 }

@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "CosSimOp.h"
 #include "hl_base.h"
 #include "hl_device_functions.cuh"
-#include "CosSimOp.h"

 namespace paddle {

-template<int block_size>
+template <int block_size>
 __global__ void KeCosSim(real* output,
                          const real* input1,
                          const real* input2,

@@ -78,8 +78,8 @@ void hlCossim(real* output,
   dim3 threads(block_size, 1);
   dim3 grid(1, input1_height);
-  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
-      (output, input1, input2, width, input1_height, input2_height, scale);
+  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, input1, input2, width, input1_height, input2_height, scale);
   CHECK_SYNC("hlCossim failed");
 }

@@ -99,7 +99,7 @@ void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix& out_mat,
   hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale);
 }

-template<int block_size>
+template <int block_size>
 __global__ void KeCosSimDerivative(const real* grad,
                                    const real* output,
                                    const real* prev_out_x,

@@ -148,14 +148,13 @@ __global__ void KeCosSimDerivative(const real* grad,
   if (xy[0] == 0) {
     real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
     for (int index = tid; index < width; index += block_size) {
-      prev_grad_x[index] +=
-          scale * grad[ty] * prev_out_y[index] * reciprocal;
+      prev_grad_x[index] += scale * grad[ty] * prev_out_y[index] * reciprocal;
       if (input2_height > 1) {
-        prev_grad_y[index] +=
-            scale * grad[ty] * prev_out_x[index] * reciprocal;
+        prev_grad_y[index] += scale * grad[ty] * prev_out_x[index] * reciprocal;
       } else {
-        paddle::paddleAtomicAdd(prev_grad_y + index,
-            scale * grad[ty] * prev_out_x[index] * reciprocal);
+        paddle::paddleAtomicAdd(
+            prev_grad_y + index,
+            scale * grad[ty] * prev_out_x[index] * reciprocal);
       }
     }
   } else {

@@ -163,17 +162,18 @@ __global__ void KeCosSimDerivative(const real* grad,
     real reciprocalSquareSumX = 1.0 / xx[0];
     real reciprocalSquareSumY = 1.0 / yy[0];
     for (int index = tid; index < width; index += block_size) {
-      prev_grad_x[index] += output[ty] * grad[ty] *
-          (prev_out_y[index] * reciprocalXY -
-           prev_out_x[index] * reciprocalSquareSumX);
+      prev_grad_x[index] +=
+          output[ty] * grad[ty] * (prev_out_y[index] * reciprocalXY -
+                                   prev_out_x[index] * reciprocalSquareSumX);
       if (input2_height > 1) {
-        prev_grad_y[index] += output[ty] * grad[ty] *
-            (prev_out_x[index] * reciprocalXY -
-             prev_out_y[index] * reciprocalSquareSumY);
+        prev_grad_y[index] +=
+            output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
+                                     prev_out_y[index] * reciprocalSquareSumY);
       } else {
-        paddle::paddleAtomicAdd(prev_grad_y + index, output[ty] * grad[ty] *
-            (prev_out_x[index] * reciprocalXY -
-             prev_out_y[index] * reciprocalSquareSumY));
+        paddle::paddleAtomicAdd(
+            prev_grad_y + index,
+            output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
+                                     prev_out_y[index] * reciprocalSquareSumY));
       }
     }
   }

@@ -198,9 +198,17 @@ void hlCossimDerivative(const real* grad,
   const int block_size = 256;
   dim3 threads(block_size, 1);
   dim3 grid(1, input1_height);
-  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
-      (grad, output, prev_out_x, prev_out_y, prev_grad_x, prev_grad_y, width,
-       input1_height, input2_height, scale);
+  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad,
+      output,
+      prev_out_x,
+      prev_out_y,
+      prev_grad_x,
+      prev_grad_y,
+      width,
+      input1_height,
+      input2_height,
+      scale);
   CHECK_SYNC("hlCossimDerivate failed");
 }

@@ -214,9 +222,9 @@ void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                      real scale) {
   CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
         in2_val.getData() && in1_grad.getData() && in2_grad.getData());
-  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_
-        && in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
-        << "Matrix types are not equally GPU";
+  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ &&
+        in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
+      << "Matrix types are not equally GPU";

   size_t dim = in1_val.getWidth();
   const real* grad = out_grad.getData();

@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_base.h"
 #include "CropOp.h"
+#include "hl_base.h"

 namespace paddle {

-__global__ void KeCrop(real* outputs, const real* inputs,
-                       int inC, int inH, int inW,
-                       int cropC, int cropH, int cropW,
-                       int outC, int outH, int outW, int nthreads) {
+__global__ void KeCrop(real* outputs,
+                       const real* inputs,
+                       int inC,
+                       int inH,
+                       int inW,
+                       int cropC,
+                       int cropH,
+                       int cropW,
+                       int outC,
+                       int outH,
+                       int outW,
+                       int nthreads) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < nthreads) {
     const int w = idx % outW;

@@ -35,12 +43,12 @@ __global__ void KeCrop(real* outputs, const real* inputs,
 template <>
 void Crop<DEVICE_TYPE_GPU>(real* outputs,
                            const real* inputs,
                            const TensorShape inShape,
                            const TensorShape outShape,
                            const FuncConfig& conf) {
   std::vector<uint32_t> crop_corner =
       conf.get<std::vector<uint32_t>>("crop_corner");
   int cropC = crop_corner[1];
   int cropH = crop_corner[2];
   int cropW = crop_corner[3];

@@ -58,16 +66,33 @@ void Crop<DEVICE_TYPE_GPU>(real* outputs,
   int blockSize = 1024;
   int gridSize = (nth + blockSize - 1) / blockSize;
-  KeCrop<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (outputs, inputs, inC, inH, inW, cropC, cropH, cropW,
-       outC, outH, outW, nth);
+  KeCrop<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(outputs,
+                                                     inputs,
+                                                     inC,
+                                                     inH,
+                                                     inW,
+                                                     cropC,
+                                                     cropH,
+                                                     cropW,
+                                                     outC,
+                                                     outH,
+                                                     outW,
+                                                     nth);
   CHECK_SYNC("Crop");
 }

-__global__ void KeCropDiff(const real* inGrad, real* outGrad,
-                           int inC, int inH, int inW,
-                           int cropC, int cropH, int cropW,
-                           int outC, int outH, int outW, int nthreads) {
+__global__ void KeCropDiff(const real* inGrad,
+                           real* outGrad,
+                           int inC,
+                           int inH,
+                           int inW,
+                           int cropC,
+                           int cropH,
+                           int cropW,
+                           int outC,
+                           int outH,
+                           int outW,
+                           int nthreads) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < nthreads) {
     const int w = idx % inW;

@@ -84,12 +109,12 @@ __global__ void KeCropDiff(const real* inGrad, real* outGrad,
 template <>
 void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
                                real* outGrad,
                                const TensorShape inShape,
                                const TensorShape outShape,
                                const FuncConfig& conf) {
   std::vector<uint32_t> crop_corner =
       conf.get<std::vector<uint32_t>>("crop_corner");
   int cropC = crop_corner[1];
   int cropH = crop_corner[2];
   int cropW = crop_corner[3];

@@ -107,9 +132,18 @@ void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
   int blockSize = 1024;
   int gridSize = (nth + blockSize - 1) / blockSize;
-  KeCropDiff <<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (inGrad, outGrad, inC, inH, inW, cropC, cropH, cropW,
-       outC, outH, outW, nth);
+  KeCropDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(inGrad,
+                                                         outGrad,
+                                                         inC,
+                                                         inH,
+                                                         inW,
+                                                         cropC,
+                                                         cropH,
+                                                         cropW,
+                                                         outC,
+                                                         outH,
+                                                         outW,
+                                                         nth);
   CHECK_SYNC("CropGrad");
 }

@@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_base.h"
 #include "CrossMapNormalOp.h"
+#include "hl_base.h"

 namespace paddle {

-__global__ void KeCMRNormFillScale(size_t imageSize, const real* in,
-                                   real* scale, size_t channels,
-                                   size_t height, size_t width, size_t size,
+__global__ void KeCMRNormFillScale(size_t imageSize,
+                                   const real* in,
+                                   real* scale,
+                                   size_t channels,
+                                   size_t height,
+                                   size_t width,
+                                   size_t size,
                                    real alpha) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < imageSize) {

@@ -51,8 +55,10 @@ __global__ void KeCMRNormFillScale(size_t imageSize, const real* in,
   }
 }

-__global__ void KeCMRNormOutput(size_t inputSize, const real* in,
-                                const real* scale, real negative_beta,
+__global__ void KeCMRNormOutput(size_t inputSize,
+                                const real* in,
+                                const real* scale,
+                                real negative_beta,
                                 real* out) {
   const int index = threadIdx.x + blockIdx.x * blockDim.x;
   if (index < inputSize) {

@@ -74,24 +80,30 @@ void CrossMapNormal<DEVICE_TYPE_GPU>(real* outputs,
   size_t imageSize = numSamples * height * width;
   int blockSize = 1024;
   int gridSize = (imageSize + 1024 - 1) / 1024;
-  KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (imageSize, inputs, denoms, channels, height, width, size, scale);
+  KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      imageSize, inputs, denoms, channels, height, width, size, scale);

-  size_t inputSize = numSamples * height * width *channels;
+  size_t inputSize = numSamples * height * width * channels;
   blockSize = 1024;
   gridSize = (inputSize + 1024 - 1) / 1024;
-  KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (inputSize, inputs, denoms, -pow, outputs);
+  KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      inputSize, inputs, denoms, -pow, outputs);

   CHECK_SYNC("CrossMapNormal");
 }

-__global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
-                              const real* top_data, const real* scale,
-                              const real* top_diff, size_t channels,
-                              size_t height, size_t width, size_t size,
-                              real negative_beta, real cache_ratio,
-                              real* bottom_diff ) {
+__global__ void KeCMRNormDiff(size_t imageSize,
+                              const real* bottom_data,
+                              const real* top_data,
+                              const real* scale,
+                              const real* top_diff,
+                              size_t channels,
+                              size_t height,
+                              size_t width,
+                              size_t size,
+                              real negative_beta,
+                              real cache_ratio,
+                              real* bottom_diff) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < imageSize) {
     const int w = idx % width;

@@ -113,17 +125,17 @@ __global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
     while (index < channels + post_pad) {
       if (index < channels) {
         accum += top_diff[index * step] * top_data[index * step] /
-            scale[index * step];
+                 scale[index * step];
       }
       if (index >= size) {
         accum -= top_diff[(index - size) * step] *
-            top_data[(index - size) * step] / scale[(index - size) * step];
+                 top_data[(index - size) * step] / scale[(index - size) * step];
       }
       if (index >= post_pad) {
         bottom_diff[(index - post_pad) * step] +=
             top_diff[(index - post_pad) * step] *
-            pow(scale[(index - post_pad) * step], negative_beta) - cache_ratio *
-            bottom_data[(index - post_pad) * step] * accum;
+                pow(scale[(index - post_pad) * step], negative_beta) -
+            cache_ratio * bottom_data[(index - post_pad) * step] * accum;
       }
       ++index;
     }

@@ -147,9 +159,18 @@ void CrossMapNormalGrad<DEVICE_TYPE_GPU>(real* inputsGrad,
   int blockSize = 1024;
   int gridSize = (imageSize + 1024 - 1) / 1024;
-  KeCMRNormDiff <<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (imageSize, inputsValue, outputsValue, denoms, outputsGrad, channels,
-       height, width, size, -pow, 2.0f * pow * scale, inputsGrad);
+  KeCMRNormDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(imageSize,
+                                                            inputsValue,
+                                                            outputsValue,
+                                                            denoms,
+                                                            outputsGrad,
+                                                            channels,
+                                                            height,
+                                                            width,
+                                                            size,
+                                                            -pow,
+                                                            2.0f * pow * scale,
+                                                            inputsGrad);
   CHECK_SYNC("CrossMapNormalGrad");
 }

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_base.h"
 #include "MulOp.h"
+#include "hl_base.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"

@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_base.h"
 #include "PadOp.h"
+#include "hl_base.h"

 namespace paddle {

-__global__ void KePad(real* outputs, const real* inputs,
-                      int inC, int inH, int inW,
-                      int padc, int padh, int padw,
-                      int outC, int outH, int outW, int nthreads) {
+__global__ void KePad(real* outputs,
+                      const real* inputs,
+                      int inC,
+                      int inH,
+                      int inW,
+                      int padc,
+                      int padh,
+                      int padw,
+                      int outC,
+                      int outH,
+                      int outW,
+                      int nthreads) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < nthreads) {
     const int w = idx % inW;

@@ -50,16 +58,33 @@ void Pad<DEVICE_TYPE_GPU>(real* outputs,
   int outC = inC + cstart + cend;
   int outH = inH + hstart + hend;
   int outW = inW + wstart + wend;
-  KePad<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (outputs, inputs, inC, inH, inW, cstart, hstart, wstart,
-       outC, outH, outW, nth);
+  KePad<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(outputs,
+                                                    inputs,
+                                                    inC,
+                                                    inH,
+                                                    inW,
+                                                    cstart,
+                                                    hstart,
+                                                    wstart,
+                                                    outC,
+                                                    outH,
+                                                    outW,
+                                                    nth);
   CHECK_SYNC("Pad");
 }

-__global__ void KePadDiff(real* inGrad, const real* outGrad,
-                          int inC, int inH, int inW,
-                          int padc, int padh, int padw,
-                          int outC, int outH, int outW, int nthreads) {
+__global__ void KePadDiff(real* inGrad,
+                          const real* outGrad,
+                          int inC,
+                          int inH,
+                          int inW,
+                          int padc,
+                          int padh,
+                          int padw,
+                          int outC,
+                          int outH,
+                          int outW,
+                          int nthreads) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < nthreads) {
     const int w = idx % inW;

@@ -89,9 +114,18 @@ void PadGrad<DEVICE_TYPE_GPU>(real* inGrad,
   int outC = inC + cstart + cend;
   int outH = inH + hstart + hend;
   int outW = inW + wstart + wend;
-  KePadDiff <<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (inGrad, outGrad, inC, inH, inW, cstart, hstart, wstart,
-       outC, outH, outW, nth);
+  KePadDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(inGrad,
+                                                        outGrad,
+                                                        inC,
+                                                        inH,
+                                                        inW,
+                                                        cstart,
+                                                        hstart,
+                                                        wstart,
+                                                        outC,
+                                                        outH,
+                                                        outW,
+                                                        nth);
   CHECK_SYNC("PadGrad");
 }

File diff suppressed because it is too large

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

 #include "GruCompute.h"
-
 #include "hl_recurrent_apply.cuh"

@@ -31,8 +30,10 @@ void GruCompute::forward<1>(hl_gru_value value, int frameSize, int batchSize) {
 }

 template <>
-void GruCompute::backward<1>(hl_gru_value value, hl_gru_grad grad,
-                             int frameSize, int batchSize) {
+void GruCompute::backward<1>(hl_gru_value value,
+                             hl_gru_grad grad,
+                             int frameSize,
+                             int batchSize) {
   hl_gpu_gru_backward(hppl::backward::gru_stateGrad(),
                       hppl::backward::gru_resetGrad(),
                       value,

@@ -12,41 +12,62 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

 #include "LstmCompute.h"
 #include "hl_recurrent_apply.cuh"

 namespace paddle {

 template <>
-void LstmCompute::forwardBatch<1>(hl_lstm_value value, int frameSize,
-                                  int batchSize) {
-  hl_gpu_lstm_forward(hppl::forward::lstm(), value, frameSize,
-                      batchSize, activeNode_, activeGate_,
+void LstmCompute::forwardBatch<1>(hl_lstm_value value,
+                                  int frameSize,
+                                  int batchSize) {
+  hl_gpu_lstm_forward(hppl::forward::lstm(),
+                      value,
+                      frameSize,
+                      batchSize,
+                      activeNode_,
+                      activeGate_,
                       activeState_);
 }

 template <>
-void LstmCompute::backwardBatch<1>(hl_lstm_value value, hl_lstm_grad grad,
-                                   int frameSize, int batchSize) {
-  hl_gpu_lstm_backward(hppl::backward::lstm(), value, grad,
-                       frameSize, batchSize, activeNode_,
-                       activeGate_, activeState_);
+void LstmCompute::backwardBatch<1>(hl_lstm_value value,
+                                   hl_lstm_grad grad,
+                                   int frameSize,
+                                   int batchSize) {
+  hl_gpu_lstm_backward(hppl::backward::lstm(),
+                       value,
+                       grad,
+                       frameSize,
+                       batchSize,
+                       activeNode_,
+                       activeGate_,
+                       activeState_);
 }

 template <>
 void LstmCompute::forwardOneSequence<1>(hl_lstm_value value, int frameSize) {
-  hl_gpu_lstm_forward(hppl::forward::lstm(), value,
-                      frameSize, /* batchSize */ 1,
-                      activeNode_, activeGate_, activeState_);
+  hl_gpu_lstm_forward(hppl::forward::lstm(),
+                      value,
+                      frameSize,
+                      /* batchSize */ 1,
+                      activeNode_,
+                      activeGate_,
+                      activeState_);
 }

 template <>
-void LstmCompute::backwardOneSequence<1>(hl_lstm_value value, hl_lstm_grad grad,
+void LstmCompute::backwardOneSequence<1>(hl_lstm_value value,
+                                         hl_lstm_grad grad,
                                          int frameSize) {
-  hl_gpu_lstm_backward(hppl::backward::lstm(), value, grad,
-                       frameSize, /* batchSize */ 1,
-                       activeNode_, activeGate_, activeState_);
+  hl_gpu_lstm_backward(hppl::backward::lstm(),
+                       value,
+                       grad,
+                       frameSize,
+                       /* batchSize */ 1,
+                       activeNode_,
+                       activeGate_,
+                       activeState_);
 }

 }  // namespace paddle

Some files were not shown because too many files have changed in this diff
