From a3a8a0900df325809920c2ffcbe3e5afdaa27dbe Mon Sep 17 00:00:00 2001 From: caoying03 Date: Wed, 20 Sep 2017 23:19:19 +0800 Subject: [PATCH 1/5] optimize cross entropy kernel by using reduce. --- paddle/operators/cross_entropy_op.cu | 36 ++++++++++++++----- .../framework/tests/test_cross_entropy_op.py | 27 +++++++------- 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index ab6ad0e062..4b38af7dc3 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -32,16 +32,33 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int* label, } } -template +template __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, const int N, const int D) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; - i += blockDim.x * gridDim.x) { - T sum = static_cast(0); - for (int j = 0; j < D; j++) { - sum += label[i * D + j] * tolerable_value(log(X[i * D + j])); + int tid = threadIdx.x; + __shared__ T d_sum[blockSize]; + int next_idx = blockIdx.x * D + tid; + + d_sum[tid] = 0; + int cur_idx = tid; + while (cur_idx < D) { + d_sum[tid] += tolerable_value(std::log(X[next_idx])) * label[next_idx]; + next_idx += blockSize; + cur_idx += blockSize; + } + __syncthreads(); + + for (int stride = blockSize >> 1; stride > 0; stride >>= 1) { + __syncthreads(); + if (tid < stride) { + next_idx = tid + stride; + d_sum[tid] += d_sum[next_idx]; } - Y[i] = -sum; + } + __syncthreads(); + + if (tid == 0) { + Y[blockIdx.x] = -d_sum[0]; } } @@ -104,8 +121,9 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel { // base on ExecutionContext. if (ctx.Attr("soft_label") == 1) { auto* label_data = ctx.Input("Label")->data(); - SoftCrossEntropyKernel<<>>(y_data, x_data, label_data, n, - d); + grid = d; + SoftCrossEntropyKernel<<>>(y_data, x_data, + label_data, n, d); } else { auto* label_data = ctx.Input("Label")->data(); CrossEntropyKernel<<>>(y_data, x_data, label_data, n, d); diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py index 0206ca064b..5cccfc6ade 100644 --- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py @@ -19,7 +19,7 @@ class TestCrossEntropyOp1(OpTest): dtype="float32") self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} - self.attrs = {'soft_label': 0} + self.attrs = {"soft_label": 0} def test_check_output(self): self.check_output() @@ -34,8 +34,8 @@ class TestCrossEntropyOp2(OpTest): def setUp(self): self.op_type = "cross_entropy" - batch_size = 10 - class_num = 5 + batch_size = 13 + class_num = 37 X = np.random.uniform(0.1, 1.0, [batch_size, class_num]).astype("float32") label = np.random.uniform(0.1, 1.0, @@ -43,15 +43,16 @@ class TestCrossEntropyOp2(OpTest): label /= label.sum(axis=1, keepdims=True) cross_entropy = (-label * np.log(X)).sum( axis=1, keepdims=True).astype("float32") - self.inputs = {'X': X, 'Label': label} - self.outputs = {'Y': cross_entropy} - self.attrs = {'soft_label': 1} + + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": 1} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y') + self.check_grad(["X"], "Y", max_relative_error=0.05) class TestCrossEntropyOp3(OpTest): @@ -61,8 +62,8 @@ class TestCrossEntropyOp3(OpTest): def setUp(self): 
self.op_type = "cross_entropy" - batch_size = 30 - class_num = 10 + batch_size = 13 + class_num = 37 X = np.random.uniform(0.1, 1.0, [batch_size, class_num]).astype("float32") label_index = np.random.randint( @@ -74,15 +75,15 @@ class TestCrossEntropyOp3(OpTest): dtype="float32") cross_entropy2 = (-label * np.log(X)).sum( axis=1, keepdims=True).astype("float32") - self.inputs = {'X': X, 'Label': label} - self.outputs = {'Y': cross_entropy} - self.attrs = {'soft_label': 1} + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": 1} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y') + self.check_grad(["X"], "Y", max_relative_error=0.05) if __name__ == "__main__": From 141b8dbce63618a967dbe163c7393efde26614ae Mon Sep 17 00:00:00 2001 From: caoying03 Date: Thu, 21 Sep 2017 08:20:59 +0800 Subject: [PATCH 2/5] update the backward kernel. --- paddle/operators/cross_entropy_op.cu | 36 +++++++------- paddle/operators/cross_entropy_op.h | 70 ++++++++++++++++------------ 2 files changed, 59 insertions(+), 47 deletions(-) diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 4b38af7dc3..a21bc73351 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -28,27 +28,27 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int* label, for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { PADDLE_ASSERT(label[i] >= 0 && label[i] < D); - Y[i] = -tolerable_value(log(X[i * D + label[i]])); + Y[i] = -TolerableValue()(log(X[i * D + label[i]])); } } -template +template __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, const int N, const int D) { int tid = threadIdx.x; - __shared__ T d_sum[blockSize]; + __shared__ T d_sum[BlockSize]; int next_idx = blockIdx.x * D + tid; d_sum[tid] = 0; int cur_idx = tid; while (cur_idx < D) { - d_sum[tid] += tolerable_value(std::log(X[next_idx])) * label[next_idx]; - next_idx += blockSize; - cur_idx += blockSize; + d_sum[tid] += TolerableValue()(std::log(X[next_idx])) * label[next_idx]; + next_idx += BlockSize; + cur_idx += BlockSize; } __syncthreads(); - for (int stride = blockSize >> 1; stride > 0; stride >>= 1) { + for (int stride = BlockSize >> 1; stride > 0; stride >>= 1) { __syncthreads(); if (tid < stride) { next_idx = tid + stride; @@ -88,13 +88,12 @@ template __global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X, const T* label, const int N, const int D) { - // TOOD(qingqing): optimize for this kernel - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; - i += blockDim.x * gridDim.x) { - for (int j = 0; j < D; ++j) { - int idx = i * D + j; - dX[idx] = -label[idx] * dY[i] / X[idx]; - } + int row_ids = blockIdx.x * blockDim.x + threadIdx.x; + int col_ids = blockIdx.y * blockDim.y + threadIdx.y; + int ids = row_ids * D + col_ids; + + if (ids < N * D) { + dX[ids] = -label[ids] * dY[row_ids] / X[ids]; } } @@ -103,7 +102,7 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "This kernel only runs on GPU device."); auto x = ctx.Input("X"); auto y = ctx.Output("Y"); @@ -136,7 +135,7 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const 
override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "This kernel only runs on GPU device."); auto x = ctx.Input("X"); auto dx = ctx.Output(framework::GradVarName("X")); @@ -156,6 +155,11 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { // TODO(qingqing): launch kernel on specified stream // base on ExecutionContext. if (ctx.Attr("soft_label") == 1) { + int block_x = 32; + int block_y = 32; + dim3 block(block_x, block_y); + dim3 grid((n + block_x - 1) / block_x, (d + block_y - 1) / block_y); + auto* label_data = label->data(); SoftCrossEntropyGradientKernel<<>>( dx_data, dy_data, x_data, label_data, n, d); diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h index 1b4b23ac20..4bbd05a1bb 100644 --- a/paddle/operators/cross_entropy_op.h +++ b/paddle/operators/cross_entropy_op.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" #include "paddle/platform/hostdevice.h" @@ -20,19 +21,25 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; template -HOSTDEVICE T tolerable_value(const T x) { - PADDLE_ASSERT(std::is_floating_point::value); - const T kApproInf = 1e20; - if (x == INFINITY) { - return kApproInf; - } - if (x == -INFINITY) { - return -kApproInf; +struct TolerableValue { + HOSTDEVICE T operator()(const T& x) const { + PADDLE_ASSERT(std::is_floating_point::value); + const T kApproInf = 1e20; + + if (x == INFINITY) { + return kApproInf; + } + if (x == -INFINITY) { + return -kApproInf; + } + return x; } - return x; -} +}; template class CrossEntropyOpKernel : public framework::OpKernel { @@ -40,33 +47,34 @@ class CrossEntropyOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); - - auto x = ctx.Input("X"); - auto y = ctx.Output("Y"); - - auto* x_data = x->data(); + const Tensor* x = ctx.Input("X"); + const Tensor* labels = ctx.Input("Label"); + Tensor* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); - auto* y_data = y->data(); - - int batch_size = x->dims()[0]; - int class_num = x->dims()[1]; + const int batch_size = x->dims()[0]; if (ctx.Attr("soft_label") == 1) { - auto* label_data = ctx.Input("Label")->data(); - int index = 0; - for (int i = 0; i < batch_size; ++i) { - T sum = static_cast(0); - for (int j = 0; j < class_num; ++j) { - sum += label_data[index] * tolerable_value(std::log(x_data[index])); - y_data[i] = -sum; - index++; - } - } + auto prob = EigenMatrix::From(*x); + auto lbl_mat = EigenMatrix::From(*labels); + auto loss = EigenMatrix::From(*y); + + // loss.device(ctx.GetEigenDevice()) = + // prob.log().unaryExpr(TolerableValue()); + + loss.device(ctx.GetEigenDevice()) = + -((lbl_mat * prob.log()) + .sum(Eigen::DSizes(1)) + .reshape(Eigen::DSizes(batch_size, 1))); } else { - auto* label_data = ctx.Input("Label")->data(); + const int class_num = x->dims()[1]; + + const T* x_data = x->data(); + T* y_data = y->data(); + + const int* label_data = labels->data(); for (int i = 0; i < batch_size; ++i) { int index = i * class_num + label_data[i]; - y_data[i] = -tolerable_value(std::log(x_data[index])); + y_data[i] = -TolerableValue()(std::log(x_data[index])); } } } From 
6735585b0f919f12ec91180b0b70757998b107da Mon Sep 17 00:00:00 2001 From: caoying03 Date: Fri, 22 Sep 2017 20:42:02 +0800 Subject: [PATCH 3/5] fix cpu kernel with soft labels. --- paddle/operators/accuracy_op.cu | 8 +- paddle/operators/cross_entropy_op.cc | 72 ++++++---- paddle/operators/cross_entropy_op.cu | 128 ++++++++++++------ paddle/operators/cross_entropy_op.h | 13 +- paddle/operators/lookup_table_op.cu | 11 +- paddle/operators/top_k_op.cu | 10 +- .../framework/tests/test_cross_entropy_op.py | 49 +++++-- 7 files changed, 194 insertions(+), 97 deletions(-) diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu index 0a6a0fd15c..75e8a98903 100644 --- a/paddle/operators/accuracy_op.cu +++ b/paddle/operators/accuracy_op.cu @@ -69,8 +69,12 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { return; } - AccuracyCudaKernel<<<1, PADDLE_CUDA_NUM_THREADS>>>( - num_samples, infer_width, inference_data, label_data, accuracy_data); + AccuracyCudaKernel<<< + 1, PADDLE_CUDA_NUM_THREADS, 0, + reinterpret_cast( + ctx.device_context()) + .stream()>>>(num_samples, infer_width, inference_data, label_data, + accuracy_data); } }; diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index b11dc1472d..80f7b69c14 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -23,27 +23,28 @@ class CrossEntropyOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should be not null."); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"), - "Input(Label) must not be null."); - PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"), "Output(Y) must not be null."); + "Input(Label) should be not null."); + PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"), + "Output(Y) should be not null."); auto x = ctx.Input("X"); auto label = ctx.Input("Label"); - PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2."); + PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank should be 2."); PADDLE_ENFORCE_EQ(label->dims().size(), 2, - "Input(Label)'s rank must be 2."); + "Input(Label)'s rank should be 2."); PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0], - "The 1st dimension of Input(X) and Input(Label) must " + "The 1st dimension of Input(X) and Input(Label) should " "be equal."); if (ctx.Attr("soft_label")) { PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1], - "If Attr(soft_label) == true, The 2nd dimension of " - "Input(X) and Input(Label) must be equal."); + "If Attr(soft_label) == true, the 2nd dimension of " + "Input(X) and Input(Label) should be equal."); } else { PADDLE_ENFORCE_EQ(label->dims()[1], 1, - "If Attr(soft_label) == false, The 2nd dimension of " - "Input(Label) must be 1."); + "If Attr(soft_label) == false, the 2nd dimension of " + "Input(Label) should be 1."); } ctx.Output("Y")->Resize({x->dims()[0], 1}); @@ -57,35 +58,36 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should be not null."); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"), - "Input(Label) must not be null."); + "Input(Label) should be not null."); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")), 
- "Input(Y@GRAD) must not be null."); + "Input(Y@GRAD) shoudl be not null."); auto x = ctx.Input("X"); auto label = ctx.Input("Label"); auto dy = ctx.Input(framework::GradVarName("Y")); - PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2."); - PADDLE_ENFORCE_EQ(dy->dims().size(), 2, "Input(Y@Grad)'s rank must be 2."); + PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(dy->dims().size(), 2, + "Input(Y@Grad)'s rank should be 2."); PADDLE_ENFORCE_EQ(label->dims().size(), 2, - "Input(Label)'s rank must be 2."); + "Input(Label)'s rank should be 2."); PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0], - "The 1st dimension of Input(X) and Input(Label) must " + "The 1st dimension of Input(X) and Input(Label) should " "be equal."); PADDLE_ENFORCE_EQ(x->dims()[0], dy->dims()[0], - "The 1st dimension of Input(X) and Input(Y@Grad) must " + "The 1st dimension of Input(X) and Input(Y@Grad) should " "be equal."); PADDLE_ENFORCE_EQ(dy->dims()[1], 1, - "The 2nd dimension of Input(Y@Grad) must be 1."); + "The 2nd dimension of Input(Y@Grad) should be 1."); if (ctx.Attr("soft_label")) { PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1], - "If Attr(soft_label) == true, The 2nd dimension of " - "Input(X) and Input(Label) must be equal."); + "When Attr(soft_label) == true, the 2nd dimension of " + "Input(X) and Input(Label) should be equal."); } else { PADDLE_ENFORCE_EQ(label->dims()[1], 1, - "If Attr(soft_label) == false, The 2nd dimension of " - "Input(Label) must be 1."); + "When Attr(soft_label) == false, the 2nd dimension of " + "Input(Label) should be 1."); } auto dx = ctx.Output(framework::GradVarName("X")); @@ -98,12 +100,26 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { CrossEntropyOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "The first input of CrossEntropyOp"); - AddInput("Label", "The second input of CrossEntropyOp"); - AddOutput("Y", "The output of CrossEntropyOp"); - AddAttr("soft_label", "Is soft label. Default zero.") + AddInput("X", + "(Tensor, default Tensor), a 2-D tensor with shape N x D, " + "where N is the batch size and D is the number of classes. " + "This input is a probability computed by the previous operator, " + "which is almost always the result of a softmax operator."); + AddInput("Label", + "(Tensor, default Tensor), the ground truth which is " + "a 1-D or 2-D tensor. " + "When soft_label is set to 0, `Label` is a Tensor with shape " + "[N x 1]. " + "When soft_label is set to 1, `Label` is a Tensor " + "with shape [N x K]."); + AddOutput("Y", + "(Tensor, default Tensor), a 1-D tensor " + "with shape [N x 1]. The cross entropy loss."); + AddAttr( + "soft_label", + "(bool, default false), a flag to indicate whether to interpretate " + "the given labels as soft labels.") .SetDefault(false); - AddComment(R"DOC( CrossEntropy Operator. 
diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index d837f49126..3f34a2d52d 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -32,37 +32,71 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int* label, } } +template +__device__ __forceinline__ T sum_single_warp(T val) { + val += __shfl_down(val, 16); + val += __shfl_down(val, 8); + val += __shfl_down(val, 4); + val += __shfl_down(val, 2); + val += __shfl_down(val, 1); + return val; +} + +// This kernel is called when the class number is less than or equal to 512. +template +__global__ void SoftCrossEntropyKernel1(T* Y, const T* X, const T* label, + const int class_num) { + int tid = threadIdx.x; + extern __shared__ T d_sum[]; + d_sum[tid] = 0; + + int cur_idx = tid; + int next_idx = blockIdx.x * class_num + tid; + while (cur_idx < class_num) { + d_sum[tid] += TolerableValue()(std::log(X[next_idx])) * label[next_idx]; + next_idx += blockDim.x; + cur_idx += blockDim.x; + } + __syncthreads(); + + for (unsigned int stride = blockDim.x >> 1; stride >= 32; stride >>= 1) { + if (tid < stride) d_sum[tid] += d_sum[tid + stride]; + __syncthreads(); + } + + T val = d_sum[tid]; + val = sum_single_warp(val); + if (tid == 0) Y[blockIdx.x] = -val; +} + +// This kernel is called when the class number is larger than 512. template -__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, - const int N, const int D) { +__global__ void SoftCrossEntropyKernel2(T* Y, const T* X, const T* label, + const int class_num) { int tid = threadIdx.x; __shared__ T d_sum[BlockSize]; - int next_idx = blockIdx.x * D + tid; + int next_idx = blockIdx.x * class_num + tid; d_sum[tid] = 0; int cur_idx = tid; - while (cur_idx < D) { + while (cur_idx < class_num) { d_sum[tid] += TolerableValue()(std::log(X[next_idx])) * label[next_idx]; next_idx += BlockSize; cur_idx += BlockSize; } __syncthreads(); - for (int stride = BlockSize >> 1; stride > 0; stride >>= 1) { + for (unsigned int stride = BlockSize >> 1; stride >= 32; stride >>= 1) { + if (tid < stride) d_sum[tid] += d_sum[tid + stride]; __syncthreads(); - if (tid < stride) { - next_idx = tid + stride; - d_sum[tid] += d_sum[next_idx]; - } } - __syncthreads(); - if (tid == 0) { - Y[blockIdx.x] = -d_sum[0]; - } + T val = d_sum[tid]; + val = sum_single_warp(val); + if (tid == 0) Y[blockIdx.x] = -val; } -// TODO(qingqing): make zero setting an common function. +// TODO(qingqing): make zero setting a common function. template __global__ void zero(T* X, const int N) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; @@ -88,11 +122,9 @@ template __global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X, const T* label, const int N, const int D) { - int row_ids = blockIdx.x * blockDim.x + threadIdx.x; - int col_ids = blockIdx.y * blockDim.y + threadIdx.y; - int ids = row_ids * D + col_ids; - + int ids = blockIdx.x * blockDim.x + threadIdx.x; if (ids < N * D) { + int row_ids = ids / D; dX[ids] = -label[ids] * dY[row_ids] / X[ids]; } } @@ -112,20 +144,34 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel { y->mutable_data(ctx.GetPlace()); auto* y_data = y->data(); - int n = x->dims()[0]; - int d = x->dims()[1]; + int batch_size = x->dims()[0]; + int class_num = x->dims()[1]; int block = 512; - int grid = (n + block - 1) / block; - // TODO(qingqing) launch kernel on specified stream - // base on ExecutionContext. 
+ if (ctx.Attr("soft_label")) { auto* label_data = ctx.Input("Label")->data(); - grid = d; - SoftCrossEntropyKernel<<>>(y_data, x_data, - label_data, n, d); + if (class_num > 512) { + SoftCrossEntropyKernel2< + T, 512><<( + ctx.device_context()) + .stream()>>>(y_data, x_data, label_data, class_num); + } else { + int block_size = pow(2, int(std::log2(class_num))); + SoftCrossEntropyKernel1< + T><<( + ctx.device_context()) + .stream()>>>(y_data, x_data, label_data, class_num); + } } else { auto* label_data = ctx.Input("Label")->data(); - CrossEntropyKernel<<>>(y_data, x_data, label_data, n, d); + int grid = (batch_size + block - 1) / block; + CrossEntropyKernel<<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(y_data, x_data, label_data, + batch_size, class_num); } } }; @@ -148,25 +194,27 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { int n = x->dims()[0]; int d = x->dims()[1]; + int block = 512; int grid = (n * d + block - 1) / block; - zero<<>>(dx_data, n * d); - grid = (n + block - 1) / block; - // TODO(qingqing): launch kernel on specified stream - // base on ExecutionContext. + zero<<( + ctx.device_context()) + .stream()>>>(dx_data, n * d); if (ctx.Attr("soft_label")) { - int block_x = 32; - int block_y = 32; - dim3 block(block_x, block_y); - dim3 grid((n + block_x - 1) / block_x, (d + block_y - 1) / block_y); - auto* label_data = label->data(); - SoftCrossEntropyGradientKernel<<>>( - dx_data, dy_data, x_data, label_data, n, d); + SoftCrossEntropyGradientKernel<<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(dx_data, dy_data, x_data, label_data, + n, d); } else { auto* label_data = label->data(); - CrossEntropyGradientKernel<<>>(dx_data, dy_data, x_data, - label_data, n, d); + CrossEntropyGradientKernel<<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(dx_data, dy_data, x_data, label_data, + n, d); } } }; diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h index cfb00fe647..6b3f8c95be 100644 --- a/paddle/operators/cross_entropy_op.h +++ b/paddle/operators/cross_entropy_op.h @@ -31,12 +31,8 @@ struct TolerableValue { PADDLE_ASSERT(std::is_floating_point::value); const T kApproInf = 1e20; - if (x == INFINITY) { - return kApproInf; - } - if (x == -INFINITY) { - return -kApproInf; - } + if (x == INFINITY) return kApproInf; + if (x == -INFINITY) return -kApproInf; return x; } }; @@ -58,11 +54,8 @@ class CrossEntropyOpKernel : public framework::OpKernel { auto lbl_mat = EigenMatrix::From(*labels); auto loss = EigenMatrix::From(*y); - // loss.device(ctx.GetEigenDevice()) = - // prob.log().unaryExpr(TolerableValue()); - loss.device(ctx.GetEigenDevice()) = - -((lbl_mat * prob.log()) + -((lbl_mat * prob.log().unaryExpr(TolerableValue())) .sum(Eigen::DSizes(1)) .reshape(Eigen::DSizes(batch_size, 1))); } else { diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu index 7083440467..62f63b4f3c 100644 --- a/paddle/operators/lookup_table_op.cu +++ b/paddle/operators/lookup_table_op.cu @@ -77,7 +77,10 @@ class LookupTableCUDAKernel : public framework::OpKernel { dim3 threads(128, 8); dim3 grids(8, 1); - LookupTable<<>>(output, table, ids, N, K, D); + LookupTable<<< + grids, threads, 0, reinterpret_cast( + context.device_context()) + .stream()>>>(output, table, ids, N, K, D); } }; @@ -102,8 +105,10 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { dim3 threads(128, 8); dim3 grids(8, 1); - 
LookupTableGrad<<>>(d_table, d_output, ids, N, - K, D); + LookupTableGrad<<< + grids, threads, 0, reinterpret_cast( + context.device_context()) + .stream()>>>(d_table, d_output, ids, N, K, D); } }; diff --git a/paddle/operators/top_k_op.cu b/paddle/operators/top_k_op.cu index afe4d149c5..53fe505b77 100644 --- a/paddle/operators/top_k_op.cu +++ b/paddle/operators/top_k_op.cu @@ -301,14 +301,16 @@ class TopkOpCUDAKernel : public framework::OpKernel { // NOTE: pass lds and dim same to input width. // NOTE: old matrix implementation of stride is different to eigen. - // TODO(typhoonzero): launch kernel on specified stream. // TODO(typhoonzero): refine this kernel. dim3 threads(256, 1); dim3 grid(input_height, 1); - KeMatrixTopK<<>>( - output_data, output->dims()[1], indices_data, input_data, input_width, - input_width, int(k)); + KeMatrixTopK<<< + grid, threads, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(output_data, output->dims()[1], + indices_data, input_data, + input_width, input_width, int(k)); } }; diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py index 8fdf622970..1715a4db34 100644 --- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py @@ -19,7 +19,7 @@ class TestCrossEntropyOp1(OpTest): dtype="float32") self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} - self.attrs = {'soft_label': False} + self.attrs = {"soft_label": False} def test_check_output(self): self.check_output() @@ -34,7 +34,8 @@ class TestCrossEntropyOp2(OpTest): def setUp(self): self.op_type = "cross_entropy" - batch_size = 13 + batch_size = 5 + # this setting tests threads in more than one wrap. class_num = 37 X = np.random.uniform(0.1, 1.0, [batch_size, class_num]).astype("float32") @@ -43,9 +44,9 @@ class TestCrossEntropyOp2(OpTest): label /= label.sum(axis=1, keepdims=True) cross_entropy = (-label * np.log(X)).sum( axis=1, keepdims=True).astype("float32") - self.inputs = {'X': X, 'Label': label} - self.outputs = {'Y': cross_entropy} - self.attrs = {'soft_label': True} + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": True} def test_check_output(self): self.check_output() @@ -61,8 +62,9 @@ class TestCrossEntropyOp3(OpTest): def setUp(self): self.op_type = "cross_entropy" - batch_size = 13 - class_num = 37 + batch_size = 5 + # this setting tests all threads in one wrap. + class_num = 17 X = np.random.uniform(0.1, 1.0, [batch_size, class_num]).astype("float32") label_index = np.random.randint( @@ -74,9 +76,36 @@ class TestCrossEntropyOp3(OpTest): dtype="float32") cross_entropy2 = (-label * np.log(X)).sum( axis=1, keepdims=True).astype("float32") - self.inputs = {'X': X, 'Label': label} - self.outputs = {'Y': cross_entropy} - self.attrs = {'soft_label': True} + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": True} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Y", max_relative_error=0.05) + + +class TestCrossEntropyOp4(OpTest): + """Test soft-label cross-entropy. + This unittest tests the gpu kernel for layer size excesses 512. 
+ """ + + def setUp(self): + self.op_type = "cross_entropy" + batch_size = 2 + class_num = 517 + X = np.random.uniform(0.1, 1.0, + [batch_size, class_num]).astype("float32") + label = np.random.uniform(0.1, 1.0, + [batch_size, class_num]).astype("float32") + label /= label.sum(axis=1, keepdims=True) + cross_entropy = (-label * np.log(X)).sum( + axis=1, keepdims=True).astype("float32") + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": True} def test_check_output(self): self.check_output() From 201c2bcf202fb3af49f1d65bdfe3bea8b4cda331 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Sat, 23 Sep 2017 16:55:40 +0800 Subject: [PATCH 4/5] delete redundant codes. --- paddle/operators/cross_entropy_op.cu | 55 ++++--------------- .../framework/tests/test_cross_entropy_op.py | 40 +++----------- 2 files changed, 18 insertions(+), 77 deletions(-) diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 3f34a2d52d..283021eae5 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -42,10 +42,9 @@ __device__ __forceinline__ T sum_single_warp(T val) { return val; } -// This kernel is called when the class number is less than or equal to 512. template -__global__ void SoftCrossEntropyKernel1(T* Y, const T* X, const T* label, - const int class_num) { +__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, + const int class_num) { int tid = threadIdx.x; extern __shared__ T d_sum[]; d_sum[tid] = 0; @@ -69,33 +68,6 @@ __global__ void SoftCrossEntropyKernel1(T* Y, const T* X, const T* label, if (tid == 0) Y[blockIdx.x] = -val; } -// This kernel is called when the class number is larger than 512. -template -__global__ void SoftCrossEntropyKernel2(T* Y, const T* X, const T* label, - const int class_num) { - int tid = threadIdx.x; - __shared__ T d_sum[BlockSize]; - int next_idx = blockIdx.x * class_num + tid; - - d_sum[tid] = 0; - int cur_idx = tid; - while (cur_idx < class_num) { - d_sum[tid] += TolerableValue()(std::log(X[next_idx])) * label[next_idx]; - next_idx += BlockSize; - cur_idx += BlockSize; - } - __syncthreads(); - - for (unsigned int stride = BlockSize >> 1; stride >= 32; stride >>= 1) { - if (tid < stride) d_sum[tid] += d_sum[tid + stride]; - __syncthreads(); - } - - T val = d_sum[tid]; - val = sum_single_warp(val); - if (tid == 0) Y[blockIdx.x] = -val; -} - // TODO(qingqing): make zero setting a common function. template __global__ void zero(T* X, const int N) { @@ -146,26 +118,19 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel { int batch_size = x->dims()[0]; int class_num = x->dims()[1]; - int block = 512; if (ctx.Attr("soft_label")) { auto* label_data = ctx.Input("Label")->data(); - if (class_num > 512) { - SoftCrossEntropyKernel2< - T, 512><<( - ctx.device_context()) - .stream()>>>(y_data, x_data, label_data, class_num); - } else { - int block_size = pow(2, int(std::log2(class_num))); - SoftCrossEntropyKernel1< - T><<( - ctx.device_context()) - .stream()>>>(y_data, x_data, label_data, class_num); - } + int block = class_num > 512 ? 
512 : pow(2, int(std::log2(class_num))); + + SoftCrossEntropyKernel< + T><<( + ctx.device_context()) + .stream()>>>(y_data, x_data, label_data, class_num); } else { auto* label_data = ctx.Input("Label")->data(); + int block = 512; int grid = (batch_size + block - 1) / block; CrossEntropyKernel<<< grid, block, 0, reinterpret_cast( diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py index 1715a4db34..52032f3a69 100644 --- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py @@ -4,19 +4,21 @@ from op_test import OpTest class TestCrossEntropyOp1(OpTest): - """Test standard cross-entropy, with index representation of labels. + """Test cross-entropy with discrete one-hot labels. """ def setUp(self): self.op_type = "cross_entropy" batch_size = 30 class_num = 10 + X = np.random.uniform(0.1, 1.0, [batch_size, class_num]).astype("float32") label = np.random.randint(0, class_num, (batch_size, 1), dtype="int32") cross_entropy = np.asmatrix( [[-np.log(X[i][label[i][0]])] for i in range(X.shape[0])], dtype="float32") + self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} self.attrs = {"soft_label": False} @@ -29,14 +31,14 @@ class TestCrossEntropyOp1(OpTest): class TestCrossEntropyOp2(OpTest): - """Test soft-label cross-entropy, with vecterized soft labels. + """Test cross-entropy with vectorized soft labels. """ def setUp(self): self.op_type = "cross_entropy" batch_size = 5 - # this setting tests threads in more than one wrap. class_num = 37 + X = np.random.uniform(0.1, 1.0, [batch_size, class_num]).astype("float32") label = np.random.uniform(0.1, 1.0, @@ -44,6 +46,7 @@ class TestCrossEntropyOp2(OpTest): label /= label.sum(axis=1, keepdims=True) cross_entropy = (-label * np.log(X)).sum( axis=1, keepdims=True).astype("float32") + self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} self.attrs = {"soft_label": True} @@ -56,15 +59,14 @@ class TestCrossEntropyOp2(OpTest): class TestCrossEntropyOp3(OpTest): - """Test one-hot cross-entropy, with vecterized one-hot representation of - labels. + """Test cross-entropy with vectorized one-hot representation of labels. """ def setUp(self): self.op_type = "cross_entropy" batch_size = 5 - # this setting tests all threads in one wrap. class_num = 17 + X = np.random.uniform(0.1, 1.0, [batch_size, class_num]).astype("float32") label_index = np.random.randint( @@ -76,33 +78,7 @@ class TestCrossEntropyOp3(OpTest): dtype="float32") cross_entropy2 = (-label * np.log(X)).sum( axis=1, keepdims=True).astype("float32") - self.inputs = {"X": X, "Label": label} - self.outputs = {"Y": cross_entropy} - self.attrs = {"soft_label": True} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Y", max_relative_error=0.05) - - -class TestCrossEntropyOp4(OpTest): - """Test soft-label cross-entropy. - This unittest tests the gpu kernel for layer size excesses 512. 
- """ - def setUp(self): - self.op_type = "cross_entropy" - batch_size = 2 - class_num = 517 - X = np.random.uniform(0.1, 1.0, - [batch_size, class_num]).astype("float32") - label = np.random.uniform(0.1, 1.0, - [batch_size, class_num]).astype("float32") - label /= label.sum(axis=1, keepdims=True) - cross_entropy = (-label * np.log(X)).sum( - axis=1, keepdims=True).astype("float32") self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} self.attrs = {"soft_label": True} From 000d75116f43a11e5b745820575a8c07c709150f Mon Sep 17 00:00:00 2001 From: caoying03 Date: Mon, 25 Sep 2017 10:37:25 +0800 Subject: [PATCH 5/5] fix backward op. --- paddle/operators/cross_entropy_op.cc | 37 +++++++------ paddle/operators/cross_entropy_op.cu | 52 +++++++++--------- paddle/operators/cross_entropy_op.h | 53 +++++++++---------- .../framework/tests/test_cross_entropy_op.py | 7 +-- 4 files changed, 76 insertions(+), 73 deletions(-) diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index 80f7b69c14..2e16201e74 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -37,13 +37,13 @@ class CrossEntropyOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0], "The 1st dimension of Input(X) and Input(Label) should " "be equal."); - if (ctx.Attr("soft_label")) { + if (ctx.Attr("softLabel")) { PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1], - "If Attr(soft_label) == true, the 2nd dimension of " + "If Attr(softLabel) == true, the 2nd dimension of " "Input(X) and Input(Label) should be equal."); } else { PADDLE_ENFORCE_EQ(label->dims()[1], 1, - "If Attr(soft_label) == false, the 2nd dimension of " + "If Attr(softLabel) == false, the 2nd dimension of " "Input(Label) should be 1."); } @@ -63,6 +63,8 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { "Input(Label) should be not null."); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")), "Input(Y@GRAD) shoudl be not null."); + PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); auto x = ctx.Input("X"); auto label = ctx.Input("Label"); @@ -80,13 +82,13 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { "be equal."); PADDLE_ENFORCE_EQ(dy->dims()[1], 1, "The 2nd dimension of Input(Y@Grad) should be 1."); - if (ctx.Attr("soft_label")) { + if (ctx.Attr("softLabel")) { PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1], - "When Attr(soft_label) == true, the 2nd dimension of " + "When Attr(softLabel) == true, the 2nd dimension of " "Input(X) and Input(Label) should be equal."); } else { PADDLE_ENFORCE_EQ(label->dims()[1], 1, - "When Attr(soft_label) == false, the 2nd dimension of " + "When Attr(softLabel) == false, the 2nd dimension of " "Input(Label) should be 1."); } @@ -105,18 +107,19 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { "where N is the batch size and D is the number of classes. " "This input is a probability computed by the previous operator, " "which is almost always the result of a softmax operator."); - AddInput("Label", - "(Tensor, default Tensor), the ground truth which is " - "a 1-D or 2-D tensor. " - "When soft_label is set to 0, `Label` is a Tensor with shape " - "[N x 1]. " - "When soft_label is set to 1, `Label` is a Tensor " - "with shape [N x K]."); + AddInput( + "Label", + "(Tensor, default Tensor), the ground truth which is " + "a 2-D tensor. 
" + "When softLabel is set to false, `Label` is a Tensor with shape " + "[N x 1]. " + "When softLabel is set to true, `Label` is a Tensor " + "with shape [N x K]."); AddOutput("Y", - "(Tensor, default Tensor), a 1-D tensor " + "(Tensor, default Tensor), a 2-D tensor " "with shape [N x 1]. The cross entropy loss."); AddAttr( - "soft_label", + "softLabel", "(bool, default false), a flag to indicate whether to interpretate " "the given labels as soft labels.") .SetDefault(false); @@ -126,12 +129,12 @@ CrossEntropy Operator. It supports both standard cross-entropy and soft-label cross-entropy loss computation. 1) One-hot cross-entropy: - soft_label = False, Label[i, 0] indicates the class index for sample i: + softLabel = false, Label[i, 0] indicates the class index for sample i: Y[i] = -log(X[i, Label[i]]) 2) Soft-label cross-entropy: - soft_label = True, Label[i, j] indicates the soft label of class j + softLabel = true, Label[i, j] indicates the soft label of class j for sample i: Y[i] = \sum_j{-Label[i, j] * log(X[i, j])} diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 283021eae5..18e44d77c9 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -70,7 +70,7 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, // TODO(qingqing): make zero setting a common function. template -__global__ void zero(T* X, const int N) { +__global__ void Zero(T* X, const int N) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { X[i] = 0.0; @@ -108,18 +108,17 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "This kernel only runs on GPU device."); - auto x = ctx.Input("X"); - auto y = ctx.Output("Y"); - auto label = ctx.Input("Label"); + const Tensor* x = ctx.Input("X"); + const Tensor* label = ctx.Input("Label"); + Tensor* y = ctx.Output("Y"); - auto* x_data = x->data(); - y->mutable_data(ctx.GetPlace()); - auto* y_data = y->data(); + const T* x_data = x->data(); + T* y_data = y->mutable_data(ctx.GetPlace()); int batch_size = x->dims()[0]; int class_num = x->dims()[1]; - if (ctx.Attr("soft_label")) { + if (ctx.Attr("softLabel")) { auto* label_data = ctx.Input("Label")->data(); int block = class_num > 512 ? 
512 : pow(2, int(std::log2(class_num))); @@ -148,38 +147,41 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "This kernel only runs on GPU device."); - auto x = ctx.Input("X"); - auto dx = ctx.Output(framework::GradVarName("X")); - auto dy = ctx.Input(framework::GradVarName("Y")); - auto label = ctx.Input("Label"); + const Tensor* x = ctx.Input("X"); + const Tensor* label = ctx.Input("Label"); + Tensor* dx = ctx.Output(framework::GradVarName("X")); - auto* dx_data = dx->mutable_data(ctx.GetPlace()); - auto* dy_data = dy->data(); - auto* x_data = x->data(); + const T* dy_data = + ctx.Input(framework::GradVarName("Y"))->data(); + T* dx_data = dx->mutable_data(ctx.GetPlace()); + const T* x_data = x->data(); - int n = x->dims()[0]; - int d = x->dims()[1]; + int batch_size = x->dims()[0]; + int class_num = x->dims()[1]; int block = 512; - int grid = (n * d + block - 1) / block; - zero<<( - ctx.device_context()) - .stream()>>>(dx_data, n * d); - if (ctx.Attr("soft_label")) { + int grid = (batch_size * class_num + block - 1) / block; + + if (ctx.Attr("softLabel")) { auto* label_data = label->data(); SoftCrossEntropyGradientKernel<<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(dx_data, dy_data, x_data, label_data, - n, d); + batch_size, class_num); } else { + Zero<<( + ctx.device_context()) + .stream()>>>(dx_data, batch_size * class_num); + auto* label_data = label->data(); + grid = (batch_size + block - 1) / block; CrossEntropyGradientKernel<<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(dx_data, dy_data, x_data, label_data, - n, d); + batch_size, class_num); } } }; diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h index 6b3f8c95be..255b2e9f5e 100644 --- a/paddle/operators/cross_entropy_op.h +++ b/paddle/operators/cross_entropy_op.h @@ -42,14 +42,14 @@ class CrossEntropyOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); + "This kernel only runs on CPU."); const Tensor* x = ctx.Input("X"); const Tensor* labels = ctx.Input("Label"); Tensor* y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); + T* y_data = y->mutable_data(ctx.GetPlace()); const int batch_size = x->dims()[0]; - if (ctx.Attr("soft_label")) { + if (ctx.Attr("softLabel")) { auto prob = EigenMatrix::From(*x); auto lbl_mat = EigenMatrix::From(*labels); auto loss = EigenMatrix::From(*y); @@ -60,9 +60,7 @@ class CrossEntropyOpKernel : public framework::OpKernel { .reshape(Eigen::DSizes(batch_size, 1))); } else { const int class_num = x->dims()[1]; - const T* x_data = x->data(); - T* y_data = y->data(); const int* label_data = labels->data(); for (int i = 0; i < batch_size; ++i) { @@ -78,33 +76,32 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); - - auto x = ctx.Input("X"); - auto dx = ctx.Output(framework::GradVarName("X")); - auto dy = ctx.Input(framework::GradVarName("Y")); - auto label = ctx.Input("Label"); - - auto* dx_data = dx->mutable_data(ctx.GetPlace()); - auto* dy_data = dy->data(); - auto* x_data = x->data(); + "This kernel only runs on CPU."); + const Tensor* x = ctx.Input("X"); + const Tensor* dy = 
ctx.Input(framework::GradVarName("Y")); + const Tensor* label = ctx.Input("Label"); + Tensor* dx = ctx.Output(framework::GradVarName("X")); + T* dx_data = dx->mutable_data(ctx.GetPlace()); - int batch_size = x->dims()[0]; int class_num = x->dims()[1]; - - // TODO(qingqing): make zero setting an common function. - if (ctx.Attr("soft_label")) { - auto* label_data = ctx.Input("Label")->data(); - int index = 0; - for (int i = 0; i < batch_size; ++i) { - for (int j = 0; j < class_num; ++j) { - dx_data[index] = -label_data[index] * dy_data[i] / x_data[index]; - index++; - } - } + if (ctx.Attr("softLabel")) { + auto x_mat = EigenMatrix::From(*x); + auto dy_mat = EigenMatrix::From(*dy); + auto lbl_mat = EigenMatrix::From(*label); + auto dx_mat = EigenMatrix::From(*dx); + + dx_mat.device(ctx.GetEigenDevice()) = + -(lbl_mat * dy_mat.broadcast(Eigen::DSizes(1, class_num)) / + x_mat); } else { - auto* label_data = label->data(); + int batch_size = x->dims()[0]; + const T* dy_data = dy->data(); + const T* x_data = x->data(); + const int* label_data = label->data(); + + // TODO(qingqing): make zero setting a common function. memset(dx_data, 0, sizeof(T) * batch_size * class_num); + for (int i = 0; i < batch_size; ++i) { PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num); int index = i * class_num + label_data[i]; diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py index 52032f3a69..1de514dff4 100644 --- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py @@ -21,7 +21,7 @@ class TestCrossEntropyOp1(OpTest): self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} - self.attrs = {"soft_label": False} + self.attrs = {"softLabel": False} def test_check_output(self): self.check_output() @@ -49,7 +49,7 @@ class TestCrossEntropyOp2(OpTest): self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} - self.attrs = {"soft_label": True} + self.attrs = {"softLabel": True} def test_check_output(self): self.check_output() @@ -73,6 +73,7 @@ class TestCrossEntropyOp3(OpTest): 0, class_num, (batch_size), dtype="int32") label = np.zeros(X.shape) label[np.arange(batch_size), label_index] = 1 + cross_entropy = np.asmatrix( [[-np.log(X[i][label_index[i]])] for i in range(X.shape[0])], dtype="float32") @@ -81,7 +82,7 @@ class TestCrossEntropyOp3(OpTest): self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} - self.attrs = {"soft_label": True} + self.attrs = {"softLabel": True} def test_check_output(self): self.check_output()
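A minimal NumPy sketch of the soft-label cross-entropy math that the CUDA and Eigen kernels in this series implement and that test_cross_entropy_op.py verifies: the forward loss is Y[i] = -sum_j Label[i, j] * log(X[i, j]), and the gradient is dX[i, j] = -Label[i, j] * dY[i] / X[i, j], as in SoftCrossEntropyGradientKernel. The helper names soft_cross_entropy and soft_cross_entropy_grad below are illustrative only and are not part of the Paddle API.

# Illustrative NumPy reference for the soft-label case (softLabel == true).
import numpy as np

def soft_cross_entropy(x, label):
    # Y[i] = -sum_j label[i, j] * log(x[i, j]); output shape is [N, 1].
    return (-label * np.log(x)).sum(axis=1, keepdims=True)

def soft_cross_entropy_grad(x, label, dy):
    # dX[i, j] = -label[i, j] * dY[i] / x[i, j]; dy has shape [N, 1] and
    # broadcasts over the class dimension, matching SoftCrossEntropyGradientKernel.
    return -label * dy / x

if __name__ == "__main__":
    np.random.seed(0)
    batch_size, class_num = 5, 37
    x = np.random.uniform(0.1, 1.0, (batch_size, class_num)).astype("float32")
    label = np.random.uniform(0.1, 1.0,
                              (batch_size, class_num)).astype("float32")
    label /= label.sum(axis=1, keepdims=True)
    y = soft_cross_entropy(x, label)
    dx = soft_cross_entropy_grad(x, label, np.ones_like(y))
    print(y.shape, dx.shape)  # (5, 1) (5, 37)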