Paddle/paddle/fluid/operators/clip_op.h

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace operators {
using framework::Tensor;
using platform::Transform;
#ifdef __NVCC__
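// CUDA kernel: one thread per element, each applying the unary clip functor
// to its input element (compiled only when built with the CUDA compiler).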
template <typename T, typename UnaryOperation>
__global__ void ClipCudaKernel(const T* input, T* out, int num,
                               UnaryOperation op) {
  int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx < num) {
    out[idx] = op(input[idx]);
  }
}
#endif
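// Clamps a single value into the closed interval [min_, max_]; HOSTDEVICE so
// it can run in both the CPU Transform path and the CUDA kernel above.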
template <typename T>
class ClipFunctor {
 public:
  explicit ClipFunctor(const T min, const T max) : min_(min), max_(max) {}
  HOSTDEVICE T operator()(const T& x) const {
    return x < min_ ? min_ : x > max_ ? max_ : x;
  }

 private:
  T min_;
  T max_;
};
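// Gradient functor: x is the upstream gradient element and y is the matching
// forward input; the gradient passes through only where y stayed strictly
// inside (min_, max_), and is zero where the value was clipped.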
template <typename T>
class ClipGradFunctor {
 public:
  explicit ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {}
  HOSTDEVICE T operator()(const T& x, const T& y) const {
    return (y > min_ && y < max_) ? x : 0;
  }

 private:
  T min_;
  T max_;
};
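// Forward kernel: clamps every element of X into [min, max]. The range comes
// from the "min"/"max" attributes and is overridden by the optional "Min"/"Max"
// tensor inputs when they are provided.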
template <typename DeviceContext, typename T>
class ClipKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto max = static_cast<T>(context.Attr<float>("max"));
    Tensor max_cpu;
    if (context.HasInput("Max")) {
      auto* max_t = context.Input<Tensor>("Max");
      auto* max_data = max_t->data<T>();
      if (platform::is_gpu_place(max_t->place())) {
        TensorCopySync(*max_t, platform::CPUPlace(), &max_cpu);
        max_data = max_cpu.data<T>();
      }
      max = max_data[0];
    }
    max = static_cast<T>(max);

    auto min = context.Attr<float>("min");
    Tensor min_cpu;
    if (context.HasInput("Min")) {
      auto* min_t = context.Input<Tensor>("Min");
      auto* min_data = min_t->data<T>();
      if (platform::is_gpu_place(min_t->place())) {
        TensorCopySync(*min_t, platform::CPUPlace(), &min_cpu);
        min_data = min_cpu.data<T>();
      }
      min = min_data[0];
    }
    min = static_cast<T>(min);

    PADDLE_ENFORCE_LE(min, max,
                      platform::errors::InvalidArgument(
                          "max should be greater than or equal to min. "
                          "But received min = %f, max = %f",
                          min, max));
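    // "X" may be a dense LoDTensor or a SelectedRows; both paths clamp each
    // element with ClipFunctor.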
    auto* x_var = context.InputVar("X");
    if (x_var->IsType<framework::LoDTensor>()) {
      auto* x = context.Input<framework::LoDTensor>("X");
      auto* out = context.Output<framework::LoDTensor>("Out");
      T* out_data = out->mutable_data<T>(context.GetPlace());
      const T* x_data = x->data<T>();
      int64_t numel = x->numel();
      if (platform::is_gpu_place(context.GetPlace())) {
#ifdef __NVCC__
        int threads = 256;
        int blocks = (numel + threads - 1) / threads;
        ClipCudaKernel<T, ClipFunctor<T>><<<
            blocks, threads, 0,
            context.template device_context<platform::CUDADeviceContext>()
                .stream()>>>(x_data, out_data, numel,
                             ClipFunctor<T>(min, max));
#endif
      } else {
        Transform<DeviceContext> trans;
        trans(context.template device_context<DeviceContext>(), x_data,
              x_data + numel, out_data, ClipFunctor<T>(min, max));
      }
    } else if (x_var->IsType<framework::SelectedRows>()) {
      auto* x = context.Input<framework::SelectedRows>("X");
      auto* out = context.Output<framework::SelectedRows>("Out");
      PADDLE_ENFORCE_NE(x, out,
                        platform::errors::InvalidArgument(
                            "Inplace clip is not allowed "
                            "when x is SelectedRows"));
      math::scatter::MergeAdd<DeviceContext, T> merge_func;
      merge_func(context.template device_context<DeviceContext>(), *x, out);
      auto* out_tensor = out->mutable_value();
      auto* out_data = out_tensor->data<T>();
      int64_t numel = out_tensor->numel();
      Transform<DeviceContext> trans;
      trans(context.template device_context<DeviceContext>(), out_data,
            out_data + numel, out_data, ClipFunctor<T>(min, max));
    } else {
      PADDLE_THROW(platform::errors::Unavailable(
          "ClipOp only supports LoDTensor and SelectedRows."));
    }
  }
};
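// Backward kernel: dX = dOut where the forward input lies strictly inside
// (min, max), and 0 where it was clipped.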
template <typename DeviceContext, typename T>
class ClipGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto max = static_cast<T>(context.Attr<float>("max"));
    Tensor max_cpu;
    if (context.HasInput("Max")) {
      auto* max_t = context.Input<Tensor>("Max");
      auto* max_data = max_t->data<T>();
      if (platform::is_gpu_place(max_t->place())) {
        TensorCopySync(*max_t, platform::CPUPlace(), &max_cpu);
        max_data = max_cpu.data<T>();
      }
      max = max_data[0];
    }
    max = static_cast<T>(max);

    auto min = context.Attr<float>("min");
    Tensor min_cpu;
    if (context.HasInput("Min")) {
      auto* min_t = context.Input<Tensor>("Min");
      auto* min_data = min_t->data<T>();
      if (platform::is_gpu_place(min_t->place())) {
        TensorCopySync(*min_t, platform::CPUPlace(), &min_cpu);
        min_data = min_cpu.data<T>();
      }
      min = min_data[0];
    }
    min = static_cast<T>(min);

    auto* d_out =
        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
    auto* d_x =
        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
    if (d_x != nullptr) {
      auto* x = context.Input<framework::LoDTensor>("X");
      int64_t numel = d_out->numel();
      auto* d_x_data = d_x->mutable_data<T>(context.GetPlace());
      const T* d_out_data = d_out->data<T>();
      const T* x_data = x->data<T>();
      Transform<DeviceContext> trans;
      trans(context.template device_context<DeviceContext>(), d_out_data,
            d_out_data + numel, x_data, d_x_data,
            ClipGradFunctor<T>(min, max));
    }
  }
};
} // namespace operators
} // namespace paddle