/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <iostream>
#include <memory>
#include <set>
#include <vector>

#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/clip_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/matrix_bit_code.h"
#include "paddle/fluid/platform/transform.h"

namespace paddle {
namespace operators {

template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
using platform::Transform;

// Collect the distinct non-negative node ids appearing in the custom path
// table; these are exactly the rows of W touched by this batch. Defined
// inline since this header is included from multiple translation units.
inline std::vector<int64_t> cal_rows(const framework::LoDTensor* path) {
  std::set<int64_t> tmp;
  size_t num_rows = static_cast<size_t>(path->dims()[0]);
  size_t num_cols = static_cast<size_t>(path->dims()[1]);
  const int64_t* path_data = path->data<int64_t>();
  for (size_t i = 0; i < num_rows; i++) {
    for (size_t j = 0; j < num_cols; j++) {
      int64_t temp = path_data[i * num_cols + j];
      // Negative entries pad paths shorter than the table width; skip them.
      if (temp >= 0) {
        tmp.insert(temp);
      }
    }
  }
  // std::set iterates in sorted order, so the result is sorted and unique.
  return std::vector<int64_t>(tmp.begin(), tmp.end());
}
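
// In effect, the forward pass below computes, for each sample i with path
// nodes j:
//   pre_out(i, j) = clip(W_j * x_i + b_j)
//   out(i) = sum_j softrelu(pre_out(i, j)) - sum_j code(i, j) * pre_out(i, j)
// which sums the binary logistic losses softrelu(x) - d * x (bit label d)
// over the internal tree nodes on the path from the root to the label.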
template <typename DeviceContext, typename T>
class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<framework::LoDTensor>("X");
    auto* w = ctx.Input<framework::LoDTensor>("W");
    auto* path = ctx.Input<framework::LoDTensor>("PTable");
    auto* code = ctx.Input<framework::LoDTensor>("PCode");
    auto* label = ctx.Input<framework::LoDTensor>("Label");
    auto* bias = ctx.Input<framework::LoDTensor>("Bias");
    auto* out = ctx.Output<framework::LoDTensor>("Out");
    auto* pre_out = ctx.Output<framework::LoDTensor>("PreOut");
    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
    // A non-null path table means a user-defined tree is used in place of
    // the default complete binary tree.
    bool is_custom = (path != nullptr);
    int64_t code_length =
        path ? path->dims()[1] : math::FindLastSet(num_classes - 1);
    int64_t batch_size = in->dims()[0];
    framework::LoDTensor sum;
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    auto* pre_out_data = pre_out->mutable_data<T>(
        framework::make_ddim({batch_size, code_length}), ctx.GetPlace());
    auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
    // Not all leaf nodes' path lengths equal code_length, so initialize
    // pre_out with zeros to keep out-of-path positions from adding loss.
    math::SetConstant<DeviceContext, T> zero;
    zero(dev_ctx, pre_out, static_cast<T>(0.0));
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    math::RowwiseSum<DeviceContext, T> row_sum;

    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
    if (!is_custom) {
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(
          num_classes, label->data<int64_t>()));
    } else {
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(
          path, code, label->data<int64_t>()));
    }

    std::vector<int64_t> sum_dims({batch_size, 1UL});
    sum.mutable_data<T>(framework::make_ddim(sum_dims), ctx.GetPlace());
    auto sum_mat = EigenMatrix<T>::From(sum);
    out->mutable_data<T>(ctx.GetPlace());
    auto out_mat = framework::EigenVector<T>::Flatten(*out);
    if (bias) {
      bit_code->Add(pre_out, *bias);
    }
    bit_code->Mul(pre_out, *w, *in);
    // Clip pre_out to [-40, 40] to avoid overflow in the exp() below.
    Transform<DeviceContext> trans;
    trans(ctx.template device_context<DeviceContext>(), pre_out_data,
          pre_out_data + pre_out->numel(), pre_out_data,
          ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
    bit_code->Sum(*pre_out, out, static_cast<T>(-1));
    // Use softrelu, log(1 + exp(x)), to calculate the cross entropy.
    pre_out_mat.device(place) =
        (static_cast<T>(1.0) + pre_out_mat.exp()).log();
    row_sum(dev_ctx, *pre_out, &sum);
    // TODO(guosheng): Subtract the out-of-path loss, since not all leaf
    // nodes' path lengths equal code_length. It won't break the gradient
    // check, since the forward and backward passes both carry the
    // out-of-path loss and it cancels out.
    out_mat.device(place) = sum_mat + out_mat;
  }
};
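
// Backward sketch: PreOut saved from the forward pass holds
// y = softrelu(pre_out), so 1 - exp(-y) recovers sigmoid(pre_out);
// subtracting the code bit gives dL/dpre_out = sigmoid(pre_out) - code,
// which, scaled by the upstream gradient of Out, is propagated to Bias, W
// and X through the same bit-code mapping used in the forward pass.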
template <typename DeviceContext, typename T>
class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<framework::LoDTensor>("X");
    auto* w = ctx.Input<framework::LoDTensor>("W");
    auto* path = ctx.Input<framework::LoDTensor>("PTable");
    auto* code = ctx.Input<framework::LoDTensor>("PCode");
    auto* in_grad =
        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
    bool is_sparse = ctx.Attr<bool>("is_sparse");
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    math::SetConstant<DeviceContext, T> zero;
    auto* bias_grad =
        ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
    auto* label = ctx.Input<framework::LoDTensor>("Label");
    auto* pre_out = ctx.Input<framework::LoDTensor>("PreOut");
    auto* out_grad =
        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
    framework::LoDTensor pre_out_grad;

    pre_out_grad.mutable_data<T>(pre_out->dims(), ctx.GetPlace());
    in_grad->mutable_data<T>(ctx.GetPlace());
    zero(dev_ctx, in_grad, static_cast<T>(0.0));

    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
    bool is_custom = (path != nullptr);

    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
    if (!is_custom) {
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(
          num_classes, label->data<int64_t>()));
    } else {
      bit_code.reset(new math::MatrixBitCodeFunctor<T>(
          path, code, label->data<int64_t>()));
    }

    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
    auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
    auto out_grad_mat = EigenMatrix<T>::From(*out_grad);

    // Broadcast the per-sample gradient of Out across the code_length axis.
    Eigen::array<int, 2> bcast({{1, static_cast<int>(pre_out_grad.dims()[1])}});

    // Softrelu derivative: pre_out holds y = softrelu(x), and
    // dy/dx = sigmoid(x) = 1 - exp(-y).
    pre_out_grad_mat.device(place) =
        static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat.exp();
    bit_code->Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
    pre_out_grad_mat.device(place) =
        pre_out_grad_mat * out_grad_mat.broadcast(bcast);
    // TODO(guosheng): multiply pre_out_grad by the subgradient of clipping
    // to be consistent with the clipping in the forward pass.
    if (bias_grad) {
      bias_grad->mutable_data<T>(ctx.GetPlace());
      zero(dev_ctx, bias_grad, static_cast<T>(0.0));
      bit_code->AddGrad(pre_out_grad, bias_grad);
    }
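
    // When is_sparse is set, only the weight rows referenced by the custom
    // path table receive gradients, so W@GRAD is emitted as SelectedRows
    // holding just those rows (gathered by cal_rows above).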
    if (!is_sparse) {
      auto* w_grad =
          ctx.Output<framework::LoDTensor>(framework::GradVarName("W"));
      w_grad->mutable_data<T>(ctx.GetPlace());
      zero(dev_ctx, w_grad, static_cast<T>(0.0));
      bit_code->MulGradWeight(pre_out_grad, w_grad, *in);
    } else {
      framework::Vector<int64_t> real_rows = cal_rows(path);
      auto* w_grad =
          ctx.Output<framework::SelectedRows>(framework::GradVarName("W"));
      w_grad->set_rows(real_rows);
      // Build the ids -> rows index map.
      w_grad->SyncIndex();
      w_grad->set_height(w->dims()[0]);
      auto* w_grad_value = w_grad->mutable_value();
      framework::DDim temp_dim(w->dims());
      set(temp_dim, 0, real_rows.size());

      w_grad_value->mutable_data<T>(temp_dim, ctx.GetPlace());
      zero(dev_ctx, w_grad_value, static_cast<T>(0.0));
      bit_code->MulGradWeight(pre_out_grad, w_grad, *in);
    }
    bit_code->MulGradError(pre_out_grad, *w, in_grad);
  }
};
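
// For reference (not part of this header): the matching kernel registrations
// live in the op's .cc/.cu files and typically look like
//   REGISTER_OP_CPU_KERNEL(
//       hierarchical_sigmoid,
//       ops::HierarchicalSigmoidOpKernel<platform::CPUDeviceContext, float>,
//       ops::HierarchicalSigmoidOpKernel<platform::CPUDeviceContext, double>);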

}  // namespace operators
}  // namespace paddle