@@ -42,13 +42,13 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
     int64_t code_length = math::FindLastSet(num_classes - 1);
     int64_t batch_size = in->dims()[0];
     framework::Tensor sum;
-    math::SetConstant<DeviceContext, T> zero;
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     auto* pre_out_data = pre_out->mutable_data<T>(
         framework::make_ddim({batch_size, code_length}), ctx.GetPlace());
     auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
     // Not all class(leaf) nodes' path lengths equal code_length, thus init as
     // 0s can avoid out of path's loss.
+    math::SetConstant<DeviceContext, T> zero;
     zero(dev_ctx, pre_out, static_cast<T>(0.0));
     auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
     math::RowwiseSum<DeviceContext, T> row_sum;
@@ -72,6 +72,10 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
     // use softrelu to calculate cross entropy
     pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log();
     row_sum(dev_ctx, *pre_out, &sum);
+    // TODO(guosheng): Subtract the out of path's loss, since not all
+    // class(leaf) nodes' path lengths equal code_length. But it won't break the
+    // gradient check since both have the out of path's loss and will cancel out
+    // each other.
     out_mat.device(place) = sum_mat + out_mat;
   }
 };
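
Note on the forward hunks above: "softrelu" is the softplus function, softplus(z) = log(1 + exp(z)), applied element-wise to the clipped pre-activations of the nodes on each label's path before the row-wise sum, and code_length = FindLastSet(num_classes - 1) appears to be the number of bits needed to address num_classes leaves, i.e. the longest such path. Since PreOut is zero-initialized, entries past the end of a shorter path still contribute softplus(0) = log 2 to the row sum; that is the out-of-path loss the new TODO refers to. The gradient hunk below never recomputes the pre-activation: it recovers the sigmoid straight from the stored softplus value via sigmoid(z) = 1 - exp(-softplus(z)). A minimal standalone sketch of that identity (illustrative C++ only, not part of the patch):

#include <cmath>
#include <cstdio>

int main() {
  double z = 0.7;                                     // pre-activation of one path node (arbitrary value)
  double softplus = std::log(1.0 + std::exp(z));      // what the forward kernel stores in PreOut
  double sigmoid = 1.0 / (1.0 + std::exp(-z));        // d softplus(z) / d z
  double recovered = 1.0 - 1.0 / std::exp(softplus);  // what the backward kernel derives from PreOut
  std::printf("%f %f\n", sigmoid, recovered);         // prints the same value twice
  return 0;
}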
@@ -90,33 +94,38 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
     auto* pre_out = ctx.Input<framework::Tensor>("PreOut");
     auto* out_grad =
         ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    framework::Tensor pre_out_grad;
+
+    pre_out_grad.mutable_data<T>(pre_out->dims(), ctx.GetPlace());
+    in_grad->mutable_data<T>(ctx.GetPlace());
+    w_grad->mutable_data<T>(ctx.GetPlace());
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     math::SetConstant<DeviceContext, T> zero;
     zero(dev_ctx, in_grad, static_cast<T>(0.0));
     zero(dev_ctx, w_grad, static_cast<T>(0.0));
+
     size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
-    int64_t code_length = math::FindLastSet(num_classes - 1);
-    int64_t batch_size = in->dims()[0];
-    framework::Tensor pre_out_grad;
-    pre_out_grad.mutable_data<T>(
-        framework::make_ddim({batch_size, code_length}), ctx.GetPlace());
+    math::MatrixBitCodeFunctor<T> bit_code(num_classes, label->data<int64_t>());
+
     auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
     auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
     auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
-    math::MatrixBitCodeFunctor<T> bit_code(num_classes, label->data<int64_t>());
-    Eigen::array<int, 2> bcast({{1, static_cast<int>(pre_out_grad.dims()[1])}});
     auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
-    pre_out_grad_mat = out_grad_mat.broadcast(bcast);
+    Eigen::array<int, 2> bcast({{1, static_cast<int>(pre_out_grad.dims()[1])}});
+
+    // softrelu derivative
+    pre_out_grad_mat.device(place) =
+        static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat.exp();
+    bit_code.Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
     pre_out_grad_mat.device(place) =
-        pre_out_grad_mat *
-        (static_cast<T>(1.0) -
-         static_cast<T>(1.0) / pre_out_mat.exp());  // softrelu derivative
-    bit_code.Sub(&pre_out_grad);
+        pre_out_grad_mat * out_grad_mat.broadcast(bcast);
+    // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
+    // be consistent with the clipping in forward.
     if (bias_grad) {
+      bias_grad->mutable_data<T>(ctx.GetPlace());
+      zero(dev_ctx, bias_grad, static_cast<T>(0.0));
       bit_code.AddGrad(pre_out_grad, bias_grad);
     }
-    in_grad->mutable_data<T>(ctx.GetPlace());
-    w_grad->mutable_data<T>(ctx.GetPlace());
     bit_code.MulGradWeight(pre_out_grad, w_grad, *in);
     bit_code.MulGradError(pre_out_grad, *w, in_grad);
   }
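
Note on the gradient hunk above: PreOut holds p = softplus(z) from the forward pass, so the rewritten derivative line computes 1 - 1/exp(p) = 1 - exp(-softplus(z)) = sigmoid(z), the derivative of softplus with respect to the pre-activation z. As I read bit_code.Sub, it then subtracts the node's path bit, giving the usual sigmoid(z) - bit form of the hierarchical-sigmoid gradient, which is finally scaled by the Out gradient broadcast across each row; this is the same identity sketched after the forward kernel. As the remaining TODO notes, the subgradient of the forward clipping is not applied here.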