From a3a8a0900df325809920c2ffcbe3e5afdaa27dbe Mon Sep 17 00:00:00 2001 From: caoying03 Date: Wed, 20 Sep 2017 23:19:19 +0800 Subject: [PATCH 1/5] optimize cross entropy kernel by using reduce. --- paddle/operators/cross_entropy_op.cu | 36 ++++++++++++++----- .../framework/tests/test_cross_entropy_op.py | 27 +++++++------- 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index ab6ad0e062..4b38af7dc3 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -32,16 +32,33 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int* label, } } -template +template __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, const int N, const int D) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; - i += blockDim.x * gridDim.x) { - T sum = static_cast(0); - for (int j = 0; j < D; j++) { - sum += label[i * D + j] * tolerable_value(log(X[i * D + j])); + int tid = threadIdx.x; + __shared__ T d_sum[blockSize]; + int next_idx = blockIdx.x * D + tid; + + d_sum[tid] = 0; + int cur_idx = tid; + while (cur_idx < D) { + d_sum[tid] += tolerable_value(std::log(X[next_idx])) * label[next_idx]; + next_idx += blockSize; + cur_idx += blockSize; + } + __syncthreads(); + + for (int stride = blockSize >> 1; stride > 0; stride >>= 1) { + __syncthreads(); + if (tid < stride) { + next_idx = tid + stride; + d_sum[tid] += d_sum[next_idx]; } - Y[i] = -sum; + } + __syncthreads(); + + if (tid == 0) { + Y[blockIdx.x] = -d_sum[0]; } } @@ -104,8 +121,9 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel { // base on ExecutionContext. if (ctx.Attr("soft_label") == 1) { auto* label_data = ctx.Input("Label")->data(); - SoftCrossEntropyKernel<<>>(y_data, x_data, label_data, n, - d); + grid = d; + SoftCrossEntropyKernel<<>>(y_data, x_data, + label_data, n, d); } else { auto* label_data = ctx.Input("Label")->data(); CrossEntropyKernel<<>>(y_data, x_data, label_data, n, d); diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py index 0206ca064b..5cccfc6ade 100644 --- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py @@ -19,7 +19,7 @@ class TestCrossEntropyOp1(OpTest): dtype="float32") self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} - self.attrs = {'soft_label': 0} + self.attrs = {"soft_label": 0} def test_check_output(self): self.check_output() @@ -34,8 +34,8 @@ class TestCrossEntropyOp2(OpTest): def setUp(self): self.op_type = "cross_entropy" - batch_size = 10 - class_num = 5 + batch_size = 13 + class_num = 37 X = np.random.uniform(0.1, 1.0, [batch_size, class_num]).astype("float32") label = np.random.uniform(0.1, 1.0, @@ -43,15 +43,16 @@ class TestCrossEntropyOp2(OpTest): label /= label.sum(axis=1, keepdims=True) cross_entropy = (-label * np.log(X)).sum( axis=1, keepdims=True).astype("float32") - self.inputs = {'X': X, 'Label': label} - self.outputs = {'Y': cross_entropy} - self.attrs = {'soft_label': 1} + + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": 1} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y') + self.check_grad(["X"], "Y", max_relative_error=0.05) class TestCrossEntropyOp3(OpTest): @@ -61,8 +62,8 @@ class TestCrossEntropyOp3(OpTest): def setUp(self): 
self.op_type = "cross_entropy" - batch_size = 30 - class_num = 10 + batch_size = 13 + class_num = 37 X = np.random.uniform(0.1, 1.0, [batch_size, class_num]).astype("float32") label_index = np.random.randint( @@ -74,15 +75,15 @@ class TestCrossEntropyOp3(OpTest): dtype="float32") cross_entropy2 = (-label * np.log(X)).sum( axis=1, keepdims=True).astype("float32") - self.inputs = {'X': X, 'Label': label} - self.outputs = {'Y': cross_entropy} - self.attrs = {'soft_label': 1} + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": 1} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Y') + self.check_grad(["X"], "Y", max_relative_error=0.05) if __name__ == "__main__": From 141b8dbce63618a967dbe163c7393efde26614ae Mon Sep 17 00:00:00 2001 From: caoying03 Date: Thu, 21 Sep 2017 08:20:59 +0800 Subject: [PATCH 2/5] update the backward kernel. --- paddle/operators/cross_entropy_op.cu | 36 +++++++------- paddle/operators/cross_entropy_op.h | 70 ++++++++++++++++------------ 2 files changed, 59 insertions(+), 47 deletions(-) diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 4b38af7dc3..a21bc73351 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -28,27 +28,27 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int* label, for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { PADDLE_ASSERT(label[i] >= 0 && label[i] < D); - Y[i] = -tolerable_value(log(X[i * D + label[i]])); + Y[i] = -TolerableValue()(log(X[i * D + label[i]])); } } -template +template __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, const int N, const int D) { int tid = threadIdx.x; - __shared__ T d_sum[blockSize]; + __shared__ T d_sum[BlockSize]; int next_idx = blockIdx.x * D + tid; d_sum[tid] = 0; int cur_idx = tid; while (cur_idx < D) { - d_sum[tid] += tolerable_value(std::log(X[next_idx])) * label[next_idx]; - next_idx += blockSize; - cur_idx += blockSize; + d_sum[tid] += TolerableValue()(std::log(X[next_idx])) * label[next_idx]; + next_idx += BlockSize; + cur_idx += BlockSize; } __syncthreads(); - for (int stride = blockSize >> 1; stride > 0; stride >>= 1) { + for (int stride = BlockSize >> 1; stride > 0; stride >>= 1) { __syncthreads(); if (tid < stride) { next_idx = tid + stride; @@ -88,13 +88,12 @@ template __global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X, const T* label, const int N, const int D) { - // TOOD(qingqing): optimize for this kernel - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; - i += blockDim.x * gridDim.x) { - for (int j = 0; j < D; ++j) { - int idx = i * D + j; - dX[idx] = -label[idx] * dY[i] / X[idx]; - } + int row_ids = blockIdx.x * blockDim.x + threadIdx.x; + int col_ids = blockIdx.y * blockDim.y + threadIdx.y; + int ids = row_ids * D + col_ids; + + if (ids < N * D) { + dX[ids] = -label[ids] * dY[row_ids] / X[ids]; } } @@ -103,7 +102,7 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "This kernel only runs on GPU device."); auto x = ctx.Input("X"); auto y = ctx.Output("Y"); @@ -136,7 +135,7 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const 
override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "This kernel only runs on GPU device."); auto x = ctx.Input("X"); auto dx = ctx.Output(framework::GradVarName("X")); @@ -156,6 +155,11 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { // TODO(qingqing): launch kernel on specified stream // base on ExecutionContext. if (ctx.Attr("soft_label") == 1) { + int block_x = 32; + int block_y = 32; + dim3 block(block_x, block_y); + dim3 grid((n + block_x - 1) / block_x, (d + block_y - 1) / block_y); + auto* label_data = label->data(); SoftCrossEntropyGradientKernel<<>>( dx_data, dy_data, x_data, label_data, n, d); diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h index 1b4b23ac20..4bbd05a1bb 100644 --- a/paddle/operators/cross_entropy_op.h +++ b/paddle/operators/cross_entropy_op.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" #include "paddle/platform/hostdevice.h" @@ -20,19 +21,25 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; template -HOSTDEVICE T tolerable_value(const T x) { - PADDLE_ASSERT(std::is_floating_point::value); - const T kApproInf = 1e20; - if (x == INFINITY) { - return kApproInf; - } - if (x == -INFINITY) { - return -kApproInf; +struct TolerableValue { + HOSTDEVICE T operator()(const T& x) const { + PADDLE_ASSERT(std::is_floating_point::value); + const T kApproInf = 1e20; + + if (x == INFINITY) { + return kApproInf; + } + if (x == -INFINITY) { + return -kApproInf; + } + return x; } - return x; -} +}; template class CrossEntropyOpKernel : public framework::OpKernel { @@ -40,33 +47,34 @@ class CrossEntropyOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); - - auto x = ctx.Input("X"); - auto y = ctx.Output("Y"); - - auto* x_data = x->data(); + const Tensor* x = ctx.Input("X"); + const Tensor* labels = ctx.Input("Label"); + Tensor* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); - auto* y_data = y->data(); - - int batch_size = x->dims()[0]; - int class_num = x->dims()[1]; + const int batch_size = x->dims()[0]; if (ctx.Attr("soft_label") == 1) { - auto* label_data = ctx.Input("Label")->data(); - int index = 0; - for (int i = 0; i < batch_size; ++i) { - T sum = static_cast(0); - for (int j = 0; j < class_num; ++j) { - sum += label_data[index] * tolerable_value(std::log(x_data[index])); - y_data[i] = -sum; - index++; - } - } + auto prob = EigenMatrix::From(*x); + auto lbl_mat = EigenMatrix::From(*labels); + auto loss = EigenMatrix::From(*y); + + // loss.device(ctx.GetEigenDevice()) = + // prob.log().unaryExpr(TolerableValue()); + + loss.device(ctx.GetEigenDevice()) = + -((lbl_mat * prob.log()) + .sum(Eigen::DSizes(1)) + .reshape(Eigen::DSizes(batch_size, 1))); } else { - auto* label_data = ctx.Input("Label")->data(); + const int class_num = x->dims()[1]; + + const T* x_data = x->data(); + T* y_data = y->data(); + + const int* label_data = labels->data(); for (int i = 0; i < batch_size; ++i) { int index = i * class_num + label_data[i]; - y_data[i] = -tolerable_value(std::log(x_data[index])); + y_data[i] = -TolerableValue()(std::log(x_data[index])); } } } From 
6735585b0f919f12ec91180b0b70757998b107da Mon Sep 17 00:00:00 2001 From: caoying03 Date: Fri, 22 Sep 2017 20:42:02 +0800 Subject: [PATCH 3/5] fix cpu kernel with soft labels. --- paddle/operators/accuracy_op.cu | 8 +- paddle/operators/cross_entropy_op.cc | 72 ++++++---- paddle/operators/cross_entropy_op.cu | 128 ++++++++++++------ paddle/operators/cross_entropy_op.h | 13 +- paddle/operators/lookup_table_op.cu | 11 +- paddle/operators/top_k_op.cu | 10 +- .../framework/tests/test_cross_entropy_op.py | 49 +++++-- 7 files changed, 194 insertions(+), 97 deletions(-) diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu index 0a6a0fd15c..75e8a98903 100644 --- a/paddle/operators/accuracy_op.cu +++ b/paddle/operators/accuracy_op.cu @@ -69,8 +69,12 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { return; } - AccuracyCudaKernel<<<1, PADDLE_CUDA_NUM_THREADS>>>( - num_samples, infer_width, inference_data, label_data, accuracy_data); + AccuracyCudaKernel<<< + 1, PADDLE_CUDA_NUM_THREADS, 0, + reinterpret_cast( + ctx.device_context()) + .stream()>>>(num_samples, infer_width, inference_data, label_data, + accuracy_data); } }; diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index b11dc1472d..80f7b69c14 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -23,27 +23,28 @@ class CrossEntropyOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should be not null."); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"), - "Input(Label) must not be null."); - PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"), "Output(Y) must not be null."); + "Input(Label) should be not null."); + PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"), + "Output(Y) should be not null."); auto x = ctx.Input("X"); auto label = ctx.Input("Label"); - PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2."); + PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank should be 2."); PADDLE_ENFORCE_EQ(label->dims().size(), 2, - "Input(Label)'s rank must be 2."); + "Input(Label)'s rank should be 2."); PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0], - "The 1st dimension of Input(X) and Input(Label) must " + "The 1st dimension of Input(X) and Input(Label) should " "be equal."); if (ctx.Attr("soft_label")) { PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1], - "If Attr(soft_label) == true, The 2nd dimension of " - "Input(X) and Input(Label) must be equal."); + "If Attr(soft_label) == true, the 2nd dimension of " + "Input(X) and Input(Label) should be equal."); } else { PADDLE_ENFORCE_EQ(label->dims()[1], 1, - "If Attr(soft_label) == false, The 2nd dimension of " - "Input(Label) must be 1."); + "If Attr(soft_label) == false, the 2nd dimension of " + "Input(Label) should be 1."); } ctx.Output("Y")->Resize({x->dims()[0], 1}); @@ -57,35 +58,36 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should be not null."); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"), - "Input(Label) must not be null."); + "Input(Label) should be not null."); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")), 
- "Input(Y@GRAD) must not be null."); + "Input(Y@GRAD) shoudl be not null."); auto x = ctx.Input("X"); auto label = ctx.Input("Label"); auto dy = ctx.Input(framework::GradVarName("Y")); - PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2."); - PADDLE_ENFORCE_EQ(dy->dims().size(), 2, "Input(Y@Grad)'s rank must be 2."); + PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(dy->dims().size(), 2, + "Input(Y@Grad)'s rank should be 2."); PADDLE_ENFORCE_EQ(label->dims().size(), 2, - "Input(Label)'s rank must be 2."); + "Input(Label)'s rank should be 2."); PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0], - "The 1st dimension of Input(X) and Input(Label) must " + "The 1st dimension of Input(X) and Input(Label) should " "be equal."); PADDLE_ENFORCE_EQ(x->dims()[0], dy->dims()[0], - "The 1st dimension of Input(X) and Input(Y@Grad) must " + "The 1st dimension of Input(X) and Input(Y@Grad) should " "be equal."); PADDLE_ENFORCE_EQ(dy->dims()[1], 1, - "The 2nd dimension of Input(Y@Grad) must be 1."); + "The 2nd dimension of Input(Y@Grad) should be 1."); if (ctx.Attr("soft_label")) { PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1], - "If Attr(soft_label) == true, The 2nd dimension of " - "Input(X) and Input(Label) must be equal."); + "When Attr(soft_label) == true, the 2nd dimension of " + "Input(X) and Input(Label) should be equal."); } else { PADDLE_ENFORCE_EQ(label->dims()[1], 1, - "If Attr(soft_label) == false, The 2nd dimension of " - "Input(Label) must be 1."); + "When Attr(soft_label) == false, the 2nd dimension of " + "Input(Label) should be 1."); } auto dx = ctx.Output(framework::GradVarName("X")); @@ -98,12 +100,26 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { CrossEntropyOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "The first input of CrossEntropyOp"); - AddInput("Label", "The second input of CrossEntropyOp"); - AddOutput("Y", "The output of CrossEntropyOp"); - AddAttr("soft_label", "Is soft label. Default zero.") + AddInput("X", + "(Tensor, default Tensor), a 2-D tensor with shape N x D, " + "where N is the batch size and D is the number of classes. " + "This input is a probability computed by the previous operator, " + "which is almost always the result of a softmax operator."); + AddInput("Label", + "(Tensor, default Tensor), the ground truth which is " + "a 1-D or 2-D tensor. " + "When soft_label is set to 0, `Label` is a Tensor with shape " + "[N x 1]. " + "When soft_label is set to 1, `Label` is a Tensor " + "with shape [N x K]."); + AddOutput("Y", + "(Tensor, default Tensor), a 1-D tensor " + "with shape [N x 1]. The cross entropy loss."); + AddAttr( + "soft_label", + "(bool, default false), a flag to indicate whether to interpretate " + "the given labels as soft labels.") .SetDefault(false); - AddComment(R"DOC( CrossEntropy Operator. 
diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index d837f49126..3f34a2d52d 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -32,37 +32,71 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int* label, } } +template +__device__ __forceinline__ T sum_single_warp(T val) { + val += __shfl_down(val, 16); + val += __shfl_down(val, 8); + val += __shfl_down(val, 4); + val += __shfl_down(val, 2); + val += __shfl_down(val, 1); + return val; +} + +// This kernel is called when the class number is less than or equal to 512. +template +__global__ void SoftCrossEntropyKernel1(T* Y, const T* X, const T* label, + const int class_num) { + int tid = threadIdx.x; + extern __shared__ T d_sum[]; + d_sum[tid] = 0; + + int cur_idx = tid; + int next_idx = blockIdx.x * class_num + tid; + while (cur_idx < class_num) { + d_sum[tid] += TolerableValue()(std::log(X[next_idx])) * label[next_idx]; + next_idx += blockDim.x; + cur_idx += blockDim.x; + } + __syncthreads(); + + for (unsigned int stride = blockDim.x >> 1; stride >= 32; stride >>= 1) { + if (tid < stride) d_sum[tid] += d_sum[tid + stride]; + __syncthreads(); + } + + T val = d_sum[tid]; + val = sum_single_warp(val); + if (tid == 0) Y[blockIdx.x] = -val; +} + +// This kernel is called when the class number is larger than 512. template -__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, - const int N, const int D) { +__global__ void SoftCrossEntropyKernel2(T* Y, const T* X, const T* label, + const int class_num) { int tid = threadIdx.x; __shared__ T d_sum[BlockSize]; - int next_idx = blockIdx.x * D + tid; + int next_idx = blockIdx.x * class_num + tid; d_sum[tid] = 0; int cur_idx = tid; - while (cur_idx < D) { + while (cur_idx < class_num) { d_sum[tid] += TolerableValue()(std::log(X[next_idx])) * label[next_idx]; next_idx += BlockSize; cur_idx += BlockSize; } __syncthreads(); - for (int stride = BlockSize >> 1; stride > 0; stride >>= 1) { + for (unsigned int stride = BlockSize >> 1; stride >= 32; stride >>= 1) { + if (tid < stride) d_sum[tid] += d_sum[tid + stride]; __syncthreads(); - if (tid < stride) { - next_idx = tid + stride; - d_sum[tid] += d_sum[next_idx]; - } } - __syncthreads(); - if (tid == 0) { - Y[blockIdx.x] = -d_sum[0]; - } + T val = d_sum[tid]; + val = sum_single_warp(val); + if (tid == 0) Y[blockIdx.x] = -val; } -// TODO(qingqing): make zero setting an common function. +// TODO(qingqing): make zero setting a common function. template __global__ void zero(T* X, const int N) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; @@ -88,11 +122,9 @@ template __global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X, const T* label, const int N, const int D) { - int row_ids = blockIdx.x * blockDim.x + threadIdx.x; - int col_ids = blockIdx.y * blockDim.y + threadIdx.y; - int ids = row_ids * D + col_ids; - + int ids = blockIdx.x * blockDim.x + threadIdx.x; if (ids < N * D) { + int row_ids = ids / D; dX[ids] = -label[ids] * dY[row_ids] / X[ids]; } } @@ -112,20 +144,34 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel { y->mutable_data(ctx.GetPlace()); auto* y_data = y->data(); - int n = x->dims()[0]; - int d = x->dims()[1]; + int batch_size = x->dims()[0]; + int class_num = x->dims()[1]; int block = 512; - int grid = (n + block - 1) / block; - // TODO(qingqing) launch kernel on specified stream - // base on ExecutionContext. 
+ if (ctx.Attr("soft_label")) { auto* label_data = ctx.Input("Label")->data(); - grid = d; - SoftCrossEntropyKernel<<>>(y_data, x_data, - label_data, n, d); + if (class_num > 512) { + SoftCrossEntropyKernel2< + T, 512><<( + ctx.device_context()) + .stream()>>>(y_data, x_data, label_data, class_num); + } else { + int block_size = pow(2, int(std::log2(class_num))); + SoftCrossEntropyKernel1< + T><<( + ctx.device_context()) + .stream()>>>(y_data, x_data, label_data, class_num); + } } else { auto* label_data = ctx.Input("Label")->data(); - CrossEntropyKernel<<>>(y_data, x_data, label_data, n, d); + int grid = (batch_size + block - 1) / block; + CrossEntropyKernel<<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(y_data, x_data, label_data, + batch_size, class_num); } } }; @@ -148,25 +194,27 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { int n = x->dims()[0]; int d = x->dims()[1]; + int block = 512; int grid = (n * d + block - 1) / block; - zero<<>>(dx_data, n * d); - grid = (n + block - 1) / block; - // TODO(qingqing): launch kernel on specified stream - // base on ExecutionContext. + zero<<( + ctx.device_context()) + .stream()>>>(dx_data, n * d); if (ctx.Attr("soft_label")) { - int block_x = 32; - int block_y = 32; - dim3 block(block_x, block_y); - dim3 grid((n + block_x - 1) / block_x, (d + block_y - 1) / block_y); - auto* label_data = label->data(); - SoftCrossEntropyGradientKernel<<>>( - dx_data, dy_data, x_data, label_data, n, d); + SoftCrossEntropyGradientKernel<<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(dx_data, dy_data, x_data, label_data, + n, d); } else { auto* label_data = label->data(); - CrossEntropyGradientKernel<<>>(dx_data, dy_data, x_data, - label_data, n, d); + CrossEntropyGradientKernel<<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(dx_data, dy_data, x_data, label_data, + n, d); } } }; diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h index cfb00fe647..6b3f8c95be 100644 --- a/paddle/operators/cross_entropy_op.h +++ b/paddle/operators/cross_entropy_op.h @@ -31,12 +31,8 @@ struct TolerableValue { PADDLE_ASSERT(std::is_floating_point::value); const T kApproInf = 1e20; - if (x == INFINITY) { - return kApproInf; - } - if (x == -INFINITY) { - return -kApproInf; - } + if (x == INFINITY) return kApproInf; + if (x == -INFINITY) return -kApproInf; return x; } }; @@ -58,11 +54,8 @@ class CrossEntropyOpKernel : public framework::OpKernel { auto lbl_mat = EigenMatrix::From(*labels); auto loss = EigenMatrix::From(*y); - // loss.device(ctx.GetEigenDevice()) = - // prob.log().unaryExpr(TolerableValue()); - loss.device(ctx.GetEigenDevice()) = - -((lbl_mat * prob.log()) + -((lbl_mat * prob.log().unaryExpr(TolerableValue())) .sum(Eigen::DSizes(1)) .reshape(Eigen::DSizes(batch_size, 1))); } else { diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu index 7083440467..62f63b4f3c 100644 --- a/paddle/operators/lookup_table_op.cu +++ b/paddle/operators/lookup_table_op.cu @@ -77,7 +77,10 @@ class LookupTableCUDAKernel : public framework::OpKernel { dim3 threads(128, 8); dim3 grids(8, 1); - LookupTable<<>>(output, table, ids, N, K, D); + LookupTable<<< + grids, threads, 0, reinterpret_cast( + context.device_context()) + .stream()>>>(output, table, ids, N, K, D); } }; @@ -102,8 +105,10 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { dim3 threads(128, 8); dim3 grids(8, 1); - 
LookupTableGrad<<>>(d_table, d_output, ids, N, - K, D); + LookupTableGrad<<< + grids, threads, 0, reinterpret_cast( + context.device_context()) + .stream()>>>(d_table, d_output, ids, N, K, D); } }; diff --git a/paddle/operators/top_k_op.cu b/paddle/operators/top_k_op.cu index afe4d149c5..53fe505b77 100644 --- a/paddle/operators/top_k_op.cu +++ b/paddle/operators/top_k_op.cu @@ -301,14 +301,16 @@ class TopkOpCUDAKernel : public framework::OpKernel { // NOTE: pass lds and dim same to input width. // NOTE: old matrix implementation of stride is different to eigen. - // TODO(typhoonzero): launch kernel on specified stream. // TODO(typhoonzero): refine this kernel. dim3 threads(256, 1); dim3 grid(input_height, 1); - KeMatrixTopK<<>>( - output_data, output->dims()[1], indices_data, input_data, input_width, - input_width, int(k)); + KeMatrixTopK<<< + grid, threads, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(output_data, output->dims()[1], + indices_data, input_data, + input_width, input_width, int(k)); } }; diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py index 8fdf622970..1715a4db34 100644 --- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py @@ -19,7 +19,7 @@ class TestCrossEntropyOp1(OpTest): dtype="float32") self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} - self.attrs = {'soft_label': False} + self.attrs = {"soft_label": False} def test_check_output(self): self.check_output() @@ -34,7 +34,8 @@ class TestCrossEntropyOp2(OpTest): def setUp(self): self.op_type = "cross_entropy" - batch_size = 13 + batch_size = 5 + # this setting tests threads in more than one wrap. class_num = 37 X = np.random.uniform(0.1, 1.0, [batch_size, class_num]).astype("float32") @@ -43,9 +44,9 @@ class TestCrossEntropyOp2(OpTest): label /= label.sum(axis=1, keepdims=True) cross_entropy = (-label * np.log(X)).sum( axis=1, keepdims=True).astype("float32") - self.inputs = {'X': X, 'Label': label} - self.outputs = {'Y': cross_entropy} - self.attrs = {'soft_label': True} + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": True} def test_check_output(self): self.check_output() @@ -61,8 +62,9 @@ class TestCrossEntropyOp3(OpTest): def setUp(self): self.op_type = "cross_entropy" - batch_size = 13 - class_num = 37 + batch_size = 5 + # this setting tests all threads in one wrap. + class_num = 17 X = np.random.uniform(0.1, 1.0, [batch_size, class_num]).astype("float32") label_index = np.random.randint( @@ -74,9 +76,36 @@ class TestCrossEntropyOp3(OpTest): dtype="float32") cross_entropy2 = (-label * np.log(X)).sum( axis=1, keepdims=True).astype("float32") - self.inputs = {'X': X, 'Label': label} - self.outputs = {'Y': cross_entropy} - self.attrs = {'soft_label': True} + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": True} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Y", max_relative_error=0.05) + + +class TestCrossEntropyOp4(OpTest): + """Test soft-label cross-entropy. + This unittest tests the gpu kernel for layer size excesses 512. 
+ """ + + def setUp(self): + self.op_type = "cross_entropy" + batch_size = 2 + class_num = 517 + X = np.random.uniform(0.1, 1.0, + [batch_size, class_num]).astype("float32") + label = np.random.uniform(0.1, 1.0, + [batch_size, class_num]).astype("float32") + label /= label.sum(axis=1, keepdims=True) + cross_entropy = (-label * np.log(X)).sum( + axis=1, keepdims=True).astype("float32") + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": True} def test_check_output(self): self.check_output() From 201c2bcf202fb3af49f1d65bdfe3bea8b4cda331 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Sat, 23 Sep 2017 16:55:40 +0800 Subject: [PATCH 4/5] delete redundant codes. --- paddle/operators/cross_entropy_op.cu | 55 ++++--------------- .../framework/tests/test_cross_entropy_op.py | 40 +++----------- 2 files changed, 18 insertions(+), 77 deletions(-) diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 3f34a2d52d..283021eae5 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -42,10 +42,9 @@ __device__ __forceinline__ T sum_single_warp(T val) { return val; } -// This kernel is called when the class number is less than or equal to 512. template -__global__ void SoftCrossEntropyKernel1(T* Y, const T* X, const T* label, - const int class_num) { +__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, + const int class_num) { int tid = threadIdx.x; extern __shared__ T d_sum[]; d_sum[tid] = 0; @@ -69,33 +68,6 @@ __global__ void SoftCrossEntropyKernel1(T* Y, const T* X, const T* label, if (tid == 0) Y[blockIdx.x] = -val; } -// This kernel is called when the class number is larger than 512. -template -__global__ void SoftCrossEntropyKernel2(T* Y, const T* X, const T* label, - const int class_num) { - int tid = threadIdx.x; - __shared__ T d_sum[BlockSize]; - int next_idx = blockIdx.x * class_num + tid; - - d_sum[tid] = 0; - int cur_idx = tid; - while (cur_idx < class_num) { - d_sum[tid] += TolerableValue()(std::log(X[next_idx])) * label[next_idx]; - next_idx += BlockSize; - cur_idx += BlockSize; - } - __syncthreads(); - - for (unsigned int stride = BlockSize >> 1; stride >= 32; stride >>= 1) { - if (tid < stride) d_sum[tid] += d_sum[tid + stride]; - __syncthreads(); - } - - T val = d_sum[tid]; - val = sum_single_warp(val); - if (tid == 0) Y[blockIdx.x] = -val; -} - // TODO(qingqing): make zero setting a common function. template __global__ void zero(T* X, const int N) { @@ -146,26 +118,19 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel { int batch_size = x->dims()[0]; int class_num = x->dims()[1]; - int block = 512; if (ctx.Attr("soft_label")) { auto* label_data = ctx.Input("Label")->data(); - if (class_num > 512) { - SoftCrossEntropyKernel2< - T, 512><<( - ctx.device_context()) - .stream()>>>(y_data, x_data, label_data, class_num); - } else { - int block_size = pow(2, int(std::log2(class_num))); - SoftCrossEntropyKernel1< - T><<( - ctx.device_context()) - .stream()>>>(y_data, x_data, label_data, class_num); - } + int block = class_num > 512 ? 
512 : pow(2, int(std::log2(class_num))); + + SoftCrossEntropyKernel< + T><<( + ctx.device_context()) + .stream()>>>(y_data, x_data, label_data, class_num); } else { auto* label_data = ctx.Input("Label")->data(); + int block = 512; int grid = (batch_size + block - 1) / block; CrossEntropyKernel<<< grid, block, 0, reinterpret_cast( diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py index 1715a4db34..52032f3a69 100644 --- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py @@ -4,19 +4,21 @@ from op_test import OpTest class TestCrossEntropyOp1(OpTest): - """Test standard cross-entropy, with index representation of labels. + """Test cross-entropy with discrete one-hot labels. """ def setUp(self): self.op_type = "cross_entropy" batch_size = 30 class_num = 10 + X = np.random.uniform(0.1, 1.0, [batch_size, class_num]).astype("float32") label = np.random.randint(0, class_num, (batch_size, 1), dtype="int32") cross_entropy = np.asmatrix( [[-np.log(X[i][label[i][0]])] for i in range(X.shape[0])], dtype="float32") + self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} self.attrs = {"soft_label": False} @@ -29,14 +31,14 @@ class TestCrossEntropyOp1(OpTest): class TestCrossEntropyOp2(OpTest): - """Test soft-label cross-entropy, with vecterized soft labels. + """Test cross-entropy with vectorized soft labels. """ def setUp(self): self.op_type = "cross_entropy" batch_size = 5 - # this setting tests threads in more than one wrap. class_num = 37 + X = np.random.uniform(0.1, 1.0, [batch_size, class_num]).astype("float32") label = np.random.uniform(0.1, 1.0, @@ -44,6 +46,7 @@ class TestCrossEntropyOp2(OpTest): label /= label.sum(axis=1, keepdims=True) cross_entropy = (-label * np.log(X)).sum( axis=1, keepdims=True).astype("float32") + self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} self.attrs = {"soft_label": True} @@ -56,15 +59,14 @@ class TestCrossEntropyOp2(OpTest): class TestCrossEntropyOp3(OpTest): - """Test one-hot cross-entropy, with vecterized one-hot representation of - labels. + """Test cross-entropy with vectorized one-hot representation of labels. """ def setUp(self): self.op_type = "cross_entropy" batch_size = 5 - # this setting tests all threads in one wrap. class_num = 17 + X = np.random.uniform(0.1, 1.0, [batch_size, class_num]).astype("float32") label_index = np.random.randint( @@ -76,33 +78,7 @@ class TestCrossEntropyOp3(OpTest): dtype="float32") cross_entropy2 = (-label * np.log(X)).sum( axis=1, keepdims=True).astype("float32") - self.inputs = {"X": X, "Label": label} - self.outputs = {"Y": cross_entropy} - self.attrs = {"soft_label": True} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Y", max_relative_error=0.05) - - -class TestCrossEntropyOp4(OpTest): - """Test soft-label cross-entropy. - This unittest tests the gpu kernel for layer size excesses 512. 
- """ - def setUp(self): - self.op_type = "cross_entropy" - batch_size = 2 - class_num = 517 - X = np.random.uniform(0.1, 1.0, - [batch_size, class_num]).astype("float32") - label = np.random.uniform(0.1, 1.0, - [batch_size, class_num]).astype("float32") - label /= label.sum(axis=1, keepdims=True) - cross_entropy = (-label * np.log(X)).sum( - axis=1, keepdims=True).astype("float32") self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} self.attrs = {"soft_label": True} From 000d75116f43a11e5b745820575a8c07c709150f Mon Sep 17 00:00:00 2001 From: caoying03 Date: Mon, 25 Sep 2017 10:37:25 +0800 Subject: [PATCH 5/5] fix backward op. --- paddle/operators/cross_entropy_op.cc | 37 +++++++------ paddle/operators/cross_entropy_op.cu | 52 +++++++++--------- paddle/operators/cross_entropy_op.h | 53 +++++++++---------- .../framework/tests/test_cross_entropy_op.py | 7 +-- 4 files changed, 76 insertions(+), 73 deletions(-) diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index 80f7b69c14..2e16201e74 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -37,13 +37,13 @@ class CrossEntropyOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0], "The 1st dimension of Input(X) and Input(Label) should " "be equal."); - if (ctx.Attr("soft_label")) { + if (ctx.Attr("softLabel")) { PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1], - "If Attr(soft_label) == true, the 2nd dimension of " + "If Attr(softLabel) == true, the 2nd dimension of " "Input(X) and Input(Label) should be equal."); } else { PADDLE_ENFORCE_EQ(label->dims()[1], 1, - "If Attr(soft_label) == false, the 2nd dimension of " + "If Attr(softLabel) == false, the 2nd dimension of " "Input(Label) should be 1."); } @@ -63,6 +63,8 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { "Input(Label) should be not null."); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")), "Input(Y@GRAD) shoudl be not null."); + PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); auto x = ctx.Input("X"); auto label = ctx.Input("Label"); @@ -80,13 +82,13 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { "be equal."); PADDLE_ENFORCE_EQ(dy->dims()[1], 1, "The 2nd dimension of Input(Y@Grad) should be 1."); - if (ctx.Attr("soft_label")) { + if (ctx.Attr("softLabel")) { PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1], - "When Attr(soft_label) == true, the 2nd dimension of " + "When Attr(softLabel) == true, the 2nd dimension of " "Input(X) and Input(Label) should be equal."); } else { PADDLE_ENFORCE_EQ(label->dims()[1], 1, - "When Attr(soft_label) == false, the 2nd dimension of " + "When Attr(softLabel) == false, the 2nd dimension of " "Input(Label) should be 1."); } @@ -105,18 +107,19 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { "where N is the batch size and D is the number of classes. " "This input is a probability computed by the previous operator, " "which is almost always the result of a softmax operator."); - AddInput("Label", - "(Tensor, default Tensor), the ground truth which is " - "a 1-D or 2-D tensor. " - "When soft_label is set to 0, `Label` is a Tensor with shape " - "[N x 1]. " - "When soft_label is set to 1, `Label` is a Tensor " - "with shape [N x K]."); + AddInput( + "Label", + "(Tensor, default Tensor), the ground truth which is " + "a 2-D tensor. 
" + "When softLabel is set to false, `Label` is a Tensor with shape " + "[N x 1]. " + "When softLabel is set to true, `Label` is a Tensor " + "with shape [N x K]."); AddOutput("Y", - "(Tensor, default Tensor), a 1-D tensor " + "(Tensor, default Tensor), a 2-D tensor " "with shape [N x 1]. The cross entropy loss."); AddAttr( - "soft_label", + "softLabel", "(bool, default false), a flag to indicate whether to interpretate " "the given labels as soft labels.") .SetDefault(false); @@ -126,12 +129,12 @@ CrossEntropy Operator. It supports both standard cross-entropy and soft-label cross-entropy loss computation. 1) One-hot cross-entropy: - soft_label = False, Label[i, 0] indicates the class index for sample i: + softLabel = false, Label[i, 0] indicates the class index for sample i: Y[i] = -log(X[i, Label[i]]) 2) Soft-label cross-entropy: - soft_label = True, Label[i, j] indicates the soft label of class j + softLabel = true, Label[i, j] indicates the soft label of class j for sample i: Y[i] = \sum_j{-Label[i, j] * log(X[i, j])} diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 283021eae5..18e44d77c9 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -70,7 +70,7 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, // TODO(qingqing): make zero setting a common function. template -__global__ void zero(T* X, const int N) { +__global__ void Zero(T* X, const int N) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { X[i] = 0.0; @@ -108,18 +108,17 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "This kernel only runs on GPU device."); - auto x = ctx.Input("X"); - auto y = ctx.Output("Y"); - auto label = ctx.Input("Label"); + const Tensor* x = ctx.Input("X"); + const Tensor* label = ctx.Input("Label"); + Tensor* y = ctx.Output("Y"); - auto* x_data = x->data(); - y->mutable_data(ctx.GetPlace()); - auto* y_data = y->data(); + const T* x_data = x->data(); + T* y_data = y->mutable_data(ctx.GetPlace()); int batch_size = x->dims()[0]; int class_num = x->dims()[1]; - if (ctx.Attr("soft_label")) { + if (ctx.Attr("softLabel")) { auto* label_data = ctx.Input("Label")->data(); int block = class_num > 512 ? 
512 : pow(2, int(std::log2(class_num))); @@ -148,38 +147,41 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "This kernel only runs on GPU device."); - auto x = ctx.Input("X"); - auto dx = ctx.Output(framework::GradVarName("X")); - auto dy = ctx.Input(framework::GradVarName("Y")); - auto label = ctx.Input("Label"); + const Tensor* x = ctx.Input("X"); + const Tensor* label = ctx.Input("Label"); + Tensor* dx = ctx.Output(framework::GradVarName("X")); - auto* dx_data = dx->mutable_data(ctx.GetPlace()); - auto* dy_data = dy->data(); - auto* x_data = x->data(); + const T* dy_data = + ctx.Input(framework::GradVarName("Y"))->data(); + T* dx_data = dx->mutable_data(ctx.GetPlace()); + const T* x_data = x->data(); - int n = x->dims()[0]; - int d = x->dims()[1]; + int batch_size = x->dims()[0]; + int class_num = x->dims()[1]; int block = 512; - int grid = (n * d + block - 1) / block; - zero<<( - ctx.device_context()) - .stream()>>>(dx_data, n * d); - if (ctx.Attr("soft_label")) { + int grid = (batch_size * class_num + block - 1) / block; + + if (ctx.Attr("softLabel")) { auto* label_data = label->data(); SoftCrossEntropyGradientKernel<<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(dx_data, dy_data, x_data, label_data, - n, d); + batch_size, class_num); } else { + Zero<<( + ctx.device_context()) + .stream()>>>(dx_data, batch_size * class_num); + auto* label_data = label->data(); + grid = (batch_size + block - 1) / block; CrossEntropyGradientKernel<<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(dx_data, dy_data, x_data, label_data, - n, d); + batch_size, class_num); } } }; diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h index 6b3f8c95be..255b2e9f5e 100644 --- a/paddle/operators/cross_entropy_op.h +++ b/paddle/operators/cross_entropy_op.h @@ -42,14 +42,14 @@ class CrossEntropyOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); + "This kernel only runs on CPU."); const Tensor* x = ctx.Input("X"); const Tensor* labels = ctx.Input("Label"); Tensor* y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); + T* y_data = y->mutable_data(ctx.GetPlace()); const int batch_size = x->dims()[0]; - if (ctx.Attr("soft_label")) { + if (ctx.Attr("softLabel")) { auto prob = EigenMatrix::From(*x); auto lbl_mat = EigenMatrix::From(*labels); auto loss = EigenMatrix::From(*y); @@ -60,9 +60,7 @@ class CrossEntropyOpKernel : public framework::OpKernel { .reshape(Eigen::DSizes(batch_size, 1))); } else { const int class_num = x->dims()[1]; - const T* x_data = x->data(); - T* y_data = y->data(); const int* label_data = labels->data(); for (int i = 0; i < batch_size; ++i) { @@ -78,33 +76,32 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); - - auto x = ctx.Input("X"); - auto dx = ctx.Output(framework::GradVarName("X")); - auto dy = ctx.Input(framework::GradVarName("Y")); - auto label = ctx.Input("Label"); - - auto* dx_data = dx->mutable_data(ctx.GetPlace()); - auto* dy_data = dy->data(); - auto* x_data = x->data(); + "This kernel only runs on CPU."); + const Tensor* x = ctx.Input("X"); + const Tensor* dy = 
ctx.Input(framework::GradVarName("Y")); + const Tensor* label = ctx.Input("Label"); + Tensor* dx = ctx.Output(framework::GradVarName("X")); + T* dx_data = dx->mutable_data(ctx.GetPlace()); - int batch_size = x->dims()[0]; int class_num = x->dims()[1]; - - // TODO(qingqing): make zero setting an common function. - if (ctx.Attr("soft_label")) { - auto* label_data = ctx.Input("Label")->data(); - int index = 0; - for (int i = 0; i < batch_size; ++i) { - for (int j = 0; j < class_num; ++j) { - dx_data[index] = -label_data[index] * dy_data[i] / x_data[index]; - index++; - } - } + if (ctx.Attr("softLabel")) { + auto x_mat = EigenMatrix::From(*x); + auto dy_mat = EigenMatrix::From(*dy); + auto lbl_mat = EigenMatrix::From(*label); + auto dx_mat = EigenMatrix::From(*dx); + + dx_mat.device(ctx.GetEigenDevice()) = + -(lbl_mat * dy_mat.broadcast(Eigen::DSizes(1, class_num)) / + x_mat); } else { - auto* label_data = label->data(); + int batch_size = x->dims()[0]; + const T* dy_data = dy->data(); + const T* x_data = x->data(); + const int* label_data = label->data(); + + // TODO(qingqing): make zero setting a common function. memset(dx_data, 0, sizeof(T) * batch_size * class_num); + for (int i = 0; i < batch_size; ++i) { PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num); int index = i * class_num + label_data[i]; diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py index 52032f3a69..1de514dff4 100644 --- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py @@ -21,7 +21,7 @@ class TestCrossEntropyOp1(OpTest): self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} - self.attrs = {"soft_label": False} + self.attrs = {"softLabel": False} def test_check_output(self): self.check_output() @@ -49,7 +49,7 @@ class TestCrossEntropyOp2(OpTest): self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} - self.attrs = {"soft_label": True} + self.attrs = {"softLabel": True} def test_check_output(self): self.check_output() @@ -73,6 +73,7 @@ class TestCrossEntropyOp3(OpTest): 0, class_num, (batch_size), dtype="int32") label = np.zeros(X.shape) label[np.arange(batch_size), label_index] = 1 + cross_entropy = np.asmatrix( [[-np.log(X[i][label_index[i]])] for i in range(X.shape[0])], dtype="float32") @@ -81,7 +82,7 @@ class TestCrossEntropyOp3(OpTest): self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} - self.attrs = {"soft_label": True} + self.attrs = {"softLabel": True} def test_check_output(self): self.check_output()
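A minimal NumPy sketch of the soft-label cross-entropy math that the CUDA and Eigen kernels in this series implement and that test_cross_entropy_op.py verifies: the forward loss is Y[i] = -sum_j Label[i, j] * log(X[i, j]), and the gradient is dX[i, j] = -Label[i, j] * dY[i] / X[i, j], as in SoftCrossEntropyGradientKernel. The helper names soft_cross_entropy and soft_cross_entropy_grad below are illustrative only and are not part of the Paddle API.

# Illustrative NumPy reference for the soft-label case (softLabel == true).
import numpy as np

def soft_cross_entropy(x, label):
    # Y[i] = -sum_j label[i, j] * log(x[i, j]); output shape is [N, 1].
    return (-label * np.log(x)).sum(axis=1, keepdims=True)

def soft_cross_entropy_grad(x, label, dy):
    # dX[i, j] = -label[i, j] * dY[i] / x[i, j]; dy has shape [N, 1] and
    # broadcasts over the class dimension, matching SoftCrossEntropyGradientKernel.
    return -label * dy / x

if __name__ == "__main__":
    np.random.seed(0)
    batch_size, class_num = 5, 37
    x = np.random.uniform(0.1, 1.0, (batch_size, class_num)).astype("float32")
    label = np.random.uniform(0.1, 1.0,
                              (batch_size, class_num)).astype("float32")
    label /= label.sum(axis=1, keepdims=True)
    y = soft_cross_entropy(x, label)
    dx = soft_cross_entropy_grad(x, label, np.ones_like(y))
    print(y.shape, dx.shape)  # (5, 1) (5, 37)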