Adding Hard Sigmoid Activation (#4771)

* Adding Hard Sigmoid Activation * Adding a comment for slope to be only positive * Fixing grammatical mistake in comment
8 years ago · 3b954e1ddc
parent 1c1f73b46d
commit 3b954e1ddc
3 changed files with 119 additions and 23 deletions
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@ -338,6 +338,38 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
  }
 };

+// Proto/attribute declaration for the hard_sigmoid operator: one input "X",
+// one output "Y", and two attributes of type AttrType ("slope", "offset").
+template <typename AttrType>
+class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  HardSigmoidOpMaker(framework::OpProto *proto,
+                     framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of HardSigmoid operator");
+    AddOutput("Y", "Output of HardSigmoid operator");
+    // The formula below uses the attribute names exactly as registered
+    // ("slope" and "offset") so the doc matches what callers must set.
+    AddComment(R"DOC(
+Hard Sigmoid activation operator.
+
+Segment-wise linear approximation of sigmoid[1].
+This is much faster than sigmoid.
+
+hard_sigmoid = max(0, min(1, slope * x + offset))
+
+The slope should be positive. The offset can be either positive or negative.
+The default slope and offset are set from [1].
+It is recommended to use the defaults for this activation.
+
+References:
+  [1] Noisy Activation Functions
+      (https://arxiv.org/abs/1603.00391)
+
+    )DOC");
+    // Defaults: slope = 0.2, offset = 0.5.
+    AddAttr<AttrType>("slope", "Slope for linear approximation of sigmoid")
+        .SetDefault(static_cast<AttrType>(0.2));
+    AddAttr<AttrType>("offset", "Offset for linear approximation of sigmoid")
+        .SetDefault(static_cast<AttrType>(0.5));
+  }
+};
+
+
 }  // namespace operators
 }  // namespace paddle

@ -413,6 +445,9 @@ REGISTER_OP(thresholded_relu, ops::ActivationOp,
            ops::ThresholdedReluOpMaker<float>, thresholded_relu_grad,
            ops::ActivationOpGrad);

+// Registers hard_sigmoid using ops::ActivationOp with the proto from
+// ops::HardSigmoidOpMaker<float>, alongside hard_sigmoid_grad using
+// ops::ActivationOpGrad. NOTE(review): presumably this pairs the forward op
+// with its gradient op, as the thresholded_relu registration does -- confirm
+// against the REGISTER_OP macro definition.
+REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker<float>,
+            hard_sigmoid_grad, ops::ActivationOpGrad);
+
 #define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)        \
  REGISTER_OP_CPU_KERNEL(                                                      \
      act_type,                                                                \
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@ -616,30 +616,63 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
  }
 };

+// Forward functor for hard_sigmoid: element-wise
+//   y = min(1, max(0, slope * x + offset)).
+template <typename T>
+struct HardSigmoidFunctor : public BaseActivationFunctor<T> {
+  float slope;
+  float offset;
+
+  // Maps each attribute name to the member that stores its value.
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"slope", &slope}, {"offset", &offset}};
+  }
+
+  // Writes the clipped linear transform of `x` into `y` on device `d`.
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    const T lower_bound = static_cast<T>(0);
+    const T upper_bound = static_cast<T>(1);
+    auto linear = x * static_cast<T>(slope) + static_cast<T>(offset);
+    y.device(d) = linear.cwiseMax(lower_bound).cwiseMin(upper_bound);
+  }
+};
+
+// Backward functor for hard_sigmoid: dx = dy * slope where the forward
+// output lies strictly between 0 and 1, and 0 elsewhere.
+template <typename T>
+struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  float slope;
+  float offset;
+
+  // Maps each attribute name to the member that stores its value.
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"slope", &slope}, {"offset", &offset}};
+  }
+
+  // `x` is accepted but unused; the mask is derived from the output `y`.
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    const T zero = static_cast<T>(0);
+    const T one = static_cast<T>(1);
+    const T gain = static_cast<T>(slope);
+    // (y > 0) * (y < 1) is 1 only where the output was not clipped.
+    dx.device(d) = dy * ((y > zero) * (y < one)).template cast<T>() * gain;
+  }
+};
+
+
 }  // namespace operators
 }  // namespace paddle

-#define FOR_EACH_KERNEL_FUNCTOR(__macro)                          \
-  __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor);           \
-  __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor);  \
-  __macro(exp, ExpFunctor, ExpGradFunctor);                       \
-  __macro(relu, ReluFunctor, ReluGradFunctor);                    \
-  __macro(tanh, TanhFunctor, TanhGradFunctor);                    \
-  __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor);  \
-  __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                    \
-  __macro(abs, AbsFunctor, AbsGradFunctor);                       \
-  __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);  \
-  __macro(log, LogFunctor, LogGradFunctor);                       \
-  __macro(square, SquareFunctor, SquareGradFunctor);              \
-  __macro(brelu, BReluFunctor, BReluGradFunctor);                 \
-  __macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor);       \
-  __macro(pow, PowFunctor, PowGradFunctor);                       \
-  __macro(stanh, STanhFunctor, STanhGradFunctor);                 \
-  __macro(softplus, SoftplusFunctor, SoftplusGradFunctor);        \
-  __macro(softsign, SoftsignFunctor, SoftsignGradFunctor);        \
-  __macro(relu6, Relu6Functor, Relu6GradFunctor);                 \
-  __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor);    \
-  __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \
-  __macro(elu, ELUFunctor, ELUGradFunctor);                       \
-  __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor); \
+// Invokes `__macro(op_name, Functor, GradFunctor)` once for every activation
+// listed below. Adding a new activation kernel means adding one line here;
+// the continuation backslashes are column-aligned to the longest entry.
+#define FOR_EACH_KERNEL_FUNCTOR(__macro)                             \
+  __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor);              \
+  __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor);     \
+  __macro(exp, ExpFunctor, ExpGradFunctor);                          \
+  __macro(relu, ReluFunctor, ReluGradFunctor);                       \
+  __macro(tanh, TanhFunctor, TanhGradFunctor);                       \
+  __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor);     \
+  __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                       \
+  __macro(abs, AbsFunctor, AbsGradFunctor);                          \
+  __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);     \
+  __macro(log, LogFunctor, LogGradFunctor);                          \
+  __macro(square, SquareFunctor, SquareGradFunctor);                 \
+  __macro(brelu, BReluFunctor, BReluGradFunctor);                    \
+  __macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor);          \
+  __macro(pow, PowFunctor, PowGradFunctor);                          \
+  __macro(stanh, STanhFunctor, STanhGradFunctor);                    \
+  __macro(softplus, SoftplusFunctor, SoftplusGradFunctor);           \
+  __macro(softsign, SoftsignFunctor, SoftsignGradFunctor);           \
+  __macro(relu6, Relu6Functor, Relu6GradFunctor);                    \
+  __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor);       \
+  __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor);    \
+  __macro(elu, ELUFunctor, ELUGradFunctor);                          \
+  __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor);    \
+  __macro(hard_sigmoid, HardSigmoidFunctor, HardSigmoidGradFunctor); \
  __macro(thresholded_relu, ThresholdedReluFunctor, ThresholdedReluGradFunctor);
--- a/python/paddle/v2/framework/tests/test_activation_op.py
+++ b/python/paddle/v2/framework/tests/test_activation_op.py
@ -384,5 +384,33 @@ class TestThresholdedRelu(OpTest):
        self.check_grad(['X'], 'Y', max_relative_error=self.relative_error)


+class TestHardSigmoid(OpTest):
+    """Checks hard_sigmoid output and gradient against a NumPy reference.
+
+    The reference is max(0, min(1, slope * X + offset)) with slope = 0.2 and
+    offset = 0.5.
+    """
+
+    def setUp(self):
+        self.op_type = "hard_sigmoid"
+        self.relative_error = 0.002
+
+        X = np.random.uniform(-5, 5, [2, 2]).astype("float32")
+        slope = 0.2
+        offset = 0.5
+        # Inputs at which slope * x + offset reaches 0 and 1 respectively,
+        # i.e. the two kinks of the piecewise-linear function.
+        lower_threshold = -offset / slope
+        upper_threshold = (1 - offset) / slope
+
+        # Same reason as TestAbs: move samples that fall too close to a kink
+        # into the linear region, where the function is differentiable.
+        X[np.abs(X - lower_threshold) < self.relative_error] = \
+            lower_threshold + 0.2
+        X[np.abs(X - upper_threshold) < self.relative_error] = \
+            upper_threshold - 0.2
+
+        # Publish the inputs only after the adjustment so the test does not
+        # rely on in-place aliasing of X.
+        self.inputs = {'X': X}
+
+        temp = X * slope + offset
+        self.outputs = {'Y': np.maximum(0.0, np.minimum(1.0, temp))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        # Reuse the tolerance defined in setUp instead of repeating 0.002.
+        self.check_grad(['X'], 'Y', max_relative_error=self.relative_error)
+
+
+
 if __name__ == "__main__":
    unittest.main()