Paddle/paddle/operators/cross_entropy_op.cu

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/cross_entropy_op.h"

namespace paddle {
namespace operators {

namespace {

// Backward kernel for cross entropy with integer (hard) class labels.
//
// For every row i in [0, N) exactly one element is written:
//   dX[i * D + label[i]] = -dY[i] / X[i * D + label[i]]
// All other elements of dX are left as they were.
//
// Params:
//   dX    - output gradient buffer, indexed as a row-major N x D array.
//   dY    - upstream gradient, one value per row.
//   X     - forward input, indexed the same way as dX.
//   label - one class index per row. It is NOT range-checked here, so the
//           caller must guarantee 0 <= label[i] < D.
//   N, D  - number of rows and number of columns.
//
// Rows are visited with a stride of blockDim.x * gridDim.x, so any 1-D launch
// configuration with at least one block covers all N rows.
template <typename T>
__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
                                           const int64_t* label, const int N,
                                           const int D) {
  // Index arithmetic is done in 64 bits: i * D overflows a 32-bit int as soon
  // as the tensor holds more than INT_MAX elements.
  const int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
  const int64_t start =
      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  for (int64_t i = start; i < N; i += stride) {
    const int64_t idx = i * D + label[i];
    dX[idx] = -dY[i] / X[idx];
  }
}

// Backward kernel for cross entropy with soft (per-class weight) labels.
//
// For every flat element index ids in [0, N * D):
//   dX[ids] = -label[ids] * dY[ids / D] / X[ids]
//
// Params:
//   dX    - output gradient buffer with N * D elements.
//   dY    - upstream gradient, one value per row (ids / D selects the row).
//   X     - forward input, same indexing as dX.
//   label - label weights, same indexing as dX.
//   N, D  - number of rows and number of columns.
//
// Elements are visited with a stride of blockDim.x * gridDim.x. A launch with
// ceil(N * D / blockDim.x) blocks therefore does one element per thread, and
// smaller grids remain correct because each thread loops over the remainder.
template <typename T>
__global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
                                               const T* label, const int N,
                                               const int D) {
  // 64-bit arithmetic: N * D and the flat index can exceed INT_MAX.
  const int64_t total = static_cast<int64_t>(N) * D;
  const int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
  const int64_t start =
      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  for (int64_t ids = start; ids < total; ids += stride) {
    const int64_t row_ids = ids / D;
    dX[ids] = -label[ids] * dY[row_ids] / X[ids];
  }
}
}  // namespace

// Forward op kernel for cross entropy on CUDA devices.
//
// Reads the "X" and "Label" inputs and the "soft_label" attribute, allocates
// the "Y" output on the execution place, and hands the computation to
// math::CrossEntropyFunctor.
template <typename T>
class CrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "This kernel only runs on GPU device.");

    const Tensor* input = ctx.Input<Tensor>("X");
    const Tensor* target = ctx.Input<Tensor>("Label");

    // The output buffer must exist before the functor writes into it.
    Tensor* loss = ctx.Output<Tensor>("Y");
    loss->mutable_data<T>(ctx.GetPlace());

    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    const bool use_soft_label = ctx.Attr<bool>("soft_label");

    math::CrossEntropyFunctor<platform::CUDADeviceContext, T> cross_entropy;
    cross_entropy(dev_ctx, loss, input, target, use_soft_label);
  }
};

// Backward op kernel for cross entropy on CUDA devices.
//
// Reads "X", "Label" and the gradient of "Y", and writes the gradient of "X".
// The "soft_label" attribute selects the label representation:
//   - true:  "Label" holds T values, one per element of X;
//            SoftCrossEntropyGradientKernel computes every element of dX.
//   - false: "Label" holds int64_t class indices, one per row;
//            dX is first filled with 0 and CrossEntropyGradientKernel then
//            writes the single labelled element of each row.
// Both kernels are launched on the stream of the CUDA device context.
template <typename T>
class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "This kernel only runs on GPU device.");

    const Tensor* x = ctx.Input<Tensor>("X");
    const Tensor* label = ctx.Input<Tensor>("Label");
    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));

    const T* dy_data =
        ctx.Input<Tensor>(framework::GradVarName("Y"))->data<T>();
    // Allocate dX once and keep the returned pointer.
    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
    const T* x_data = x->data<T>();

    const int64_t batch_size = x->dims()[0];
    const int64_t class_num = x->dims()[1];

    // Nothing to compute for an empty input. Launching with zero blocks would
    // be an invalid launch configuration, so return before any launch.
    if (batch_size <= 0 || class_num <= 0) {
      return;
    }

    // The kernels take their extents as int; make the narrowing explicit.
    const int n = static_cast<int>(batch_size);
    const int d = static_cast<int>(class_num);
    const int block = 512;

    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    auto stream = dev_ctx.stream();

    if (ctx.Attr<bool>("soft_label")) {
      // One thread per element of dX.
      const int grid =
          static_cast<int>((batch_size * class_num + block - 1) / block);
      auto* label_data = label->data<T>();
      SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
          dx_data, dy_data, x_data, label_data, n, d);
    } else {
      // The hard-label kernel writes only one entry per row, so every other
      // entry of dX has to be set to 0 beforehand.
      math::SetConstant<platform::CUDADeviceContext, T> functor;
      functor(dev_ctx, dx, 0);
      // One thread per row.
      const int grid = static_cast<int>((batch_size + block - 1) / block);
      auto* label_data = label->data<int64_t>();
      CrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
          dx_data, dy_data, x_data, label_data, n, d);
    }
  }
};

}  // namespace operators
}  // namespace paddle

// Short alias used by the registration macros below.
namespace ops = paddle::operators;
// Forward op "cross_entropy": float and double instantiations of
// CrossEntropyOpCUDAKernel.
REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel<float>,
                        ops::CrossEntropyOpCUDAKernel<double>);
// Backward op "cross_entropy_grad": float and double instantiations of
// CrossEntropyGradientOpCUDAKernel.
REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
                        ops::CrossEntropyGradientOpCUDAKernel<float>,
                        ops::CrossEntropyGradientOpCUDAKernel<double>);
Add cpplint for .h and cuda .cu 8 years ago			`/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.`

unify the indentation of license 7 years ago			`Licensed under the Apache License, Version 2.0 (the "License");`
			`you may not use this file except in compliance with the License.`
			`You may obtain a copy of the License at`
Add cpplint for .h and cuda .cu 8 years ago
unify the indentation of license 7 years ago			`http://www.apache.org/licenses/LICENSE-2.0`
Add cpplint for .h and cuda .cu 8 years ago
unify the indentation of license 7 years ago			`Unless required by applicable law or agreed to in writing, software`
			`distributed under the License is distributed on an "AS IS" BASIS,`
			`WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`See the License for the specific language governing permissions and`
			`limitations under the License. */`
Add cpplint for .h and cuda .cu 8 years ago
Use soft_label attribute for cross-entropy. 8 years ago			`#include "paddle/operators/cross_entropy_op.h"`
Implement GPU kernel for cross entropy operator. 8 years ago
			`namespace paddle {`
			`namespace operators {`

cross entropy as a functor to avoid duplicated codes. 7 years ago			`namespace {`
Implement GPU kernel for cross entropy operator. 8 years ago
			`template <typename T>`
			`__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,`
support sparse output for lookup table grad op (#5145) * add sparse support for sum op * typo fix * fix gpu build error * fix unittest error * typo fix * infer var type and shape in op_test * follow comments * fix build error * bypass some unittests depend on NetOp * support sparse output for lookup table grad op * refine codes * fix gpu build error * fix lookup table grad gpu kernel * fix ci * fix ci * fix ci * fix bug in lookup_table_grad op * fix bug in test_word2vec * register double kernel for some operators * set is_sparse=True in test_word2vec * fix lookup table grad op CUDA kernel bug * disable test_modified_huber_loss_op temporarily * disable test_lstm_unit_op temporarily 7 years ago			`const int64_t* label, const int N,`
Implement GPU kernel for cross entropy operator. 8 years ago			`const int D) {`
			`for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;`
			`i += blockDim.x * gridDim.x) {`
			`int idx = i * D + label[i];`
			`dX[idx] = -dY[i] / X[idx];`
			`}`
			`}`

			`template <typename T>`
Add soft-label support for cross-entropy operator. 8 years ago			`__global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,`
			`const T* label, const int N,`
			`const int D) {`
fix cpu kernel with soft labels. 7 years ago			`int ids = blockIdx.x * blockDim.x + threadIdx.x;`
update the backward kernel. 8 years ago			`if (ids < N * D) {`
fix cpu kernel with soft labels. 7 years ago			`int row_ids = ids / D;`
update the backward kernel. 8 years ago			`dX[ids] = -label[ids] * dY[row_ids] / X[ids];`
Add soft-label support for cross-entropy operator. 8 years ago			`}`
			`}`
cross entropy as a functor to avoid duplicated codes. 7 years ago			`} // namespace`
Add soft-label support for cross-entropy operator. 8 years ago
			`template <typename T>`
Add Skeleton of Double support 7 years ago			`class CrossEntropyOpCUDAKernel : public framework::OpKernel<T> {`
Implement GPU kernel for cross entropy operator. 8 years ago			`public:`
			`void Compute(const framework::ExecutionContext& ctx) const override {`
			`PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),`
update the backward kernel. 8 years ago			`"This kernel only runs on GPU device.");`
fix backward op. 7 years ago			`const Tensor* x = ctx.Input<Tensor>("X");`
			`const Tensor* label = ctx.Input<Tensor>("Label");`
			`Tensor* y = ctx.Output<Tensor>("Y");`
cross entropy as a functor to avoid duplicated codes. 7 years ago			`y->mutable_data<T>(ctx.GetPlace());`
Add soft-label support for cross-entropy operator. 8 years ago
Refine device context (#6433) There are mainly following fixes: - take `DeviceContext` as the template parameter of math functors and OpKernel instead of `Place` - remove `eigen_device` interface in base class `DeviceContext` - remove `GetEigenDevice` interface in `ExecutionContext` and base class `DeviceContext` - remove unused `platform::EigenDeviceConverter` - rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL` - rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP` 7 years ago			`math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(`
			`ctx.template device_context<platform::CUDADeviceContext>(), y, x, label,`
			`ctx.Attr<bool>("soft_label"));`
Implement GPU kernel for cross entropy operator. 8 years ago			`}`
			`};`

			`template <typename T>`
Add Skeleton of Double support 7 years ago			`class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {`
Implement GPU kernel for cross entropy operator. 8 years ago			`public:`
			`void Compute(const framework::ExecutionContext& ctx) const override {`
			`PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),`
update the backward kernel. 8 years ago			`"This kernel only runs on GPU device.");`
Implement GPU kernel for cross entropy operator. 8 years ago
fix backward op. 7 years ago			`const Tensor* x = ctx.Input<Tensor>("X");`
			`const Tensor* label = ctx.Input<Tensor>("Label");`
			`Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));`
cross entropy as a functor to avoid duplicated codes. 7 years ago			`dx->mutable_data<T>(ctx.GetPlace());`
Implement GPU kernel for cross entropy operator. 8 years ago
fix backward op. 7 years ago			`const T* dy_data =`
			`ctx.Input<Tensor>(framework::GradVarName("Y"))->data<T>();`
			`T* dx_data = dx->mutable_data<T>(ctx.GetPlace());`
			`const T* x_data = x->data<T>();`
Implement GPU kernel for cross entropy operator. 8 years ago
support sparse output for lookup table grad op (#5145) * add sparse support for sum op * typo fix * fix gpu build error * fix unittest error * typo fix * infer var type and shape in op_test * follow comments * fix build error * bypass some unittests depend on NetOp * support sparse output for lookup table grad op * refine codes * fix gpu build error * fix lookup table grad gpu kernel * fix ci * fix ci * fix ci * fix bug in lookup_table_grad op * fix bug in test_word2vec * register double kernel for some operators * set is_sparse=True in test_word2vec * fix lookup table grad op CUDA kernel bug * disable test_modified_huber_loss_op temporarily * disable test_lstm_unit_op temporarily 7 years ago			`int64_t batch_size = x->dims()[0];`
			`int64_t class_num = x->dims()[1];`
fix cpu kernel with soft labels. 7 years ago
Implement GPU kernel for cross entropy operator. 8 years ago			`int block = 512;`
fix backward op. 7 years ago			`int grid = (batch_size * class_num + block - 1) / block;`
Refine device context (#6433) There are mainly following fixes: - take `DeviceContext` as the template parameter of math functors and OpKernel instead of `Place` - remove `eigen_device` interface in base class `DeviceContext` - remove `GetEigenDevice` interface in `ExecutionContext` and base class `DeviceContext` - remove unused `platform::EigenDeviceConverter` - rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL` - rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP` 7 years ago
			`auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();`
			`auto stream = dev_ctx.stream();`
fix backward op. 7 years ago
Change Name convention of operator attributes (#4807) * Change dataType to data_type Follow PEP8 * Change name_convention to fit PEP8 7 years ago			`if (ctx.Attr<bool>("soft_label")) {`
Add soft-label support for cross-entropy operator. 8 years ago			`auto* label_data = label->data<T>();`
refine get cuda context 7 years ago			`SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(`
			`dx_data, dy_data, x_data, label_data, batch_size, class_num);`
Add soft-label support for cross-entropy operator. 8 years ago			`} else {`
Refine device context (#6433) There are mainly following fixes: - take `DeviceContext` as the template parameter of math functors and OpKernel instead of `Place` - remove `eigen_device` interface in base class `DeviceContext` - remove `GetEigenDevice` interface in `ExecutionContext` and base class `DeviceContext` - remove unused `platform::EigenDeviceConverter` - rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL` - rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP` 7 years ago			`math::SetConstant<platform::CUDADeviceContext, T> functor;`
			`functor(dev_ctx, dx, 0);`
support sparse output for lookup table grad op (#5145) * add sparse support for sum op * typo fix * fix gpu build error * fix unittest error * typo fix * infer var type and shape in op_test * follow comments * fix build error * bypass some unittests depend on NetOp * support sparse output for lookup table grad op * refine codes * fix gpu build error * fix lookup table grad gpu kernel * fix ci * fix ci * fix ci * fix bug in lookup_table_grad op * fix bug in test_word2vec * register double kernel for some operators * set is_sparse=True in test_word2vec * fix lookup table grad op CUDA kernel bug * disable test_modified_huber_loss_op temporarily * disable test_lstm_unit_op temporarily 7 years ago			`auto* label_data = label->data<int64_t>();`
fix backward op. 7 years ago			`grid = (batch_size + block - 1) / block;`
refine get cuda context 7 years ago			`CrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(`
			`dx_data, dy_data, x_data, label_data, batch_size, class_num);`
Add soft-label support for cross-entropy operator. 8 years ago			`}`
Implement GPU kernel for cross entropy operator. 8 years ago			`}`
			`};`

			`} // namespace operators`
			`} // namespace paddle`
add cross-entropy-op (#2965) * add cross-entropy-op * add infershape and compute * implement Infershape and compute of onehotcrossentropy op 8 years ago
"net op alias" 8 years ago			`namespace ops = paddle::operators;`
Refine device context (#6433) There are mainly following fixes: - take `DeviceContext` as the template parameter of math functors and OpKernel instead of `Place` - remove `eigen_device` interface in base class `DeviceContext` - remove `GetEigenDevice` interface in `ExecutionContext` and base class `DeviceContext` - remove unused `platform::EigenDeviceConverter` - rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL` - rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP` 7 years ago			`REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel<float>,`
			`ops::CrossEntropyOpCUDAKernel<double>);`
			`REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,`
			`ops::CrossEntropyGradientOpCUDAKernel<float>,`
			`ops::CrossEntropyGradientOpCUDAKernel<double>);`