Paddle/paddle/operators/math/sequence2batch.h

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <algorithm>
#include <vector>

#include "paddle/framework/eigen.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h"

namespace paddle {
namespace operators {
namespace math {

// Namespace-local shorthand for framework::EigenMatrix. The storage order
// defaults to Eigen::RowMajor and the index type to Eigen::DenseIndex, so
// callers in this namespace can write EigenMatrix<T> without repeating them.
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;

// Functor that copies rows between two tensors, using an index array to
// select which rows are read (gather) or written (scatter).
//
// Only the call operator is declared here; no definition is visible in this
// part of the header.
// NOTE(review): presumably defined per DeviceContext in the accompanying
// source files -- confirm.
template <typename DeviceContext, typename T>
class CopyMatrixRowsFunctor {
 public:
  // If is_src_index is true,
  // copy the indexed rows of input src to the output dst.
  // If is_src_index is false,
  // copy the input src to the indexed rows of output dst.
  // The indexed rows are based on the input index.
  //
  // context:      device context handed through by the caller.
  // src:          tensor the rows are read from.
  // index:        array of row indices; the callers in this header pass one
  //               entry per row of the tensor being rearranged.
  // dst:          tensor the rows are written to.
  // is_src_index: selects whether index addresses rows of src (true) or
  //               rows of dst (false), as described above.
  void operator()(const DeviceContext& context, const framework::Tensor& src,
                  const size_t* index, framework::Tensor& dst,
                  bool is_src_index);
};

// Rearranges the rows of a one-level LoDTensor (a set of variable-length
// sequences stored back to back) into a step-major "batch" layout: all rows
// for step 0 of every sequence come first, then all rows for step 1, and so
// on, with the sequences visited from longest to shortest.
template <typename DeviceContext, typename T>
class LoDTensor2BatchFunctor {
  // Describes one input sequence: the row at which it starts, how many rows
  // it has, and its position in the original (unsorted) input.
  // example:  sequences = {s0, s1, s2}
  //           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
  //           after sorting by descending length:
  //           seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
  //
  struct SeqInfo {
    SeqInfo(int start, int length, int seq_idx)
        : start(start), length(length), seq_idx(seq_idx) {}
    int start;
    int length;
    int seq_idx;
  };

 public:
  // context:          device context forwarded to CopyMatrixRowsFunctor.
  // lod_tensor:       input rows; when is_cal_batch_lod is true its LoD must
  //                   have exactly one level describing at least one
  //                   sequence.
  // batch:            output tensor receiving the rearranged rows. When
  //                   is_cal_batch_lod is true its LoD is overwritten with
  //                   three levels (see below); when false its existing LoD
  //                   is read instead.
  // is_cal_batch_lod: true  -> compute the batch LoD from lod_tensor, store
  //                            it on batch, then copy the rows.
  //                   false -> reuse level 1 of batch's existing LoD as the
  //                            row index and only copy the rows.
  // is_reverse:       when true each sequence is traversed from its last row
  //                   to its first while the index is built. Only used when
  //                   is_cal_batch_lod is true.
  void operator()(const DeviceContext& context,
                  const framework::LoDTensor& lod_tensor,
                  framework::LoDTensor& batch, bool is_cal_batch_lod,
                  bool is_reverse = false) const {
    if (!is_cal_batch_lod) {
      // Reuse the row index stored on batch by an earlier call.
      auto lods = batch.lod();
      PADDLE_ENFORCE_GT(lods.size(), 2UL);
      PADDLE_ENFORCE_EQ(lods[1].size(),
                        static_cast<size_t>(lod_tensor.dims()[0]));
      CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
      to_batch(context, lod_tensor, lods[1].data(), batch, true);
      return;
    }

    auto lods = lod_tensor.lod();
    // Validate the number of levels BEFORE touching lods[0]; indexing an
    // empty LoD would be undefined behaviour.
    PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
    const auto& lod = lods[0];
    // A level with fewer than two offsets describes no sequence. Without this
    // guard `lod.size() - 1` could wrap around and seq_info[0] below would
    // read from an empty vector.
    PADDLE_ENFORCE_GT(lod.size(), 1UL,
                      "The input LoD must describe at least one sequence.");

    const size_t num_seqs = lod.size() - 1;
    std::vector<SeqInfo> seq_info;
    seq_info.reserve(num_seqs);
    for (size_t seq_id = 0; seq_id < num_seqs; ++seq_id) {
      const int start = static_cast<int>(lod[seq_id]);
      const int length = static_cast<int>(lod[seq_id + 1]) - start;
      seq_info.emplace_back(start, length, static_cast<int>(seq_id));
    }

    // Longest sequence first.
    std::sort(seq_info.begin(), seq_info.end(),
              [](const SeqInfo& a, const SeqInfo& b) {
                return a.length > b.length;
              });

    // Calculate the start position of each batch.
    // example:  sequences = {s0, s1, s2}
    //           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
    //           num_batch = 5,
    //           batchIndex = {b0, b1, b2, b3, b4}
    //           b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
    //           batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
    //              batch_start_positions[0] = 0
    //              batch_start_positions[1] = len(b0)
    //              batch_start_positions[2] = len(b0) + len(b1)
    //              batch_start_positions[3] = len(b0) + len(b1) + len(b2)
    //              ...
    //           seq2batch_idx[12] = {4, 0, 9,
    //                                5, 1, 10,
    //                                6, 2, 11,
    //                                7, 3,
    //                                8}
    //           seq_order = {1, 0, 2}, the sort order.
    //               where 1 is the second sequence,
    //                     0 is the first sequence,
    //                     2 is the third sequence.
    // The num_batch is the number of batches (steps) after rearranging the
    // input LodTensor. It is also the maximum length of input sequence.

    paddle::framework::LoD batch_lods;
    batch_lods.emplace_back(std::vector<size_t>{0});
    batch_lods.emplace_back(std::vector<size_t>{0});
    batch_lods.emplace_back(std::vector<size_t>{0});

    // batch_lods[0] is the start positions for batch LoDTensor
    const int num_batch = seq_info[0].length;
    batch_lods[0].resize(static_cast<size_t>(num_batch + 1));
    // batch_lods[1] is the raw index in the input LoDTensor
    batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
    // batch_lods[2] is the sort order for the input LoDTensor.
    batch_lods[2].resize(seq_info.size());

    size_t* batch_starts = batch_lods[0].data();
    size_t* seq2batch_idx = batch_lods[1].data();
    batch_starts[0] = 0;
    for (int n = 0; n < num_batch; n++) {
      size_t batch_id = batch_starts[n];
      for (const SeqInfo& info : seq_info) {
        // seq_info is sorted by descending length, so the first sequence
        // that has no step n ends this batch.
        if (n >= info.length) {
          break;
        }
        const int row = is_reverse ? info.start + info.length - 1 - n
                                   : info.start + n;
        seq2batch_idx[batch_id] = static_cast<size_t>(row);
        batch_id++;
      }
      batch_starts[n + 1] = batch_id;
    }

    size_t* seq_order = batch_lods[2].data();
    for (size_t i = 0; i < seq_info.size(); ++i) {
      seq_order[i] = static_cast<size_t>(seq_info[i].seq_idx);
    }
    batch.set_lod(batch_lods);

    // Gather: row i of batch is row seq2batch_idx[i] of lod_tensor.
    CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
    to_batch(context, lod_tensor, seq2batch_idx, batch, true);
  }
};

// Writes the rows of a batch-ordered tensor back to the positions recorded in
// level 1 of the batch's LoD.
template <typename DeviceContext, typename T>
class Batch2LoDTensorFunctor {
 public:
  // context:    device context forwarded to CopyMatrixRowsFunctor.
  // batch:      source rows; its LoD must have more than two levels, and
  //             level 1 must hold one index per row of lod_tensor.
  // lod_tensor: destination; row in_lod[1][i] receives row i of batch.
  void operator()(const DeviceContext& context,
                  const framework::LoDTensor& batch,
                  framework::LoDTensor& lod_tensor) const {
    auto in_lod = batch.lod();

    // The arguments of the two checks are kept verbatim, since the enforce
    // macros may embed the checked expressions in their failure message.
    PADDLE_ENFORCE_GT(in_lod.size(), 2UL);
    PADDLE_ENFORCE_EQ(in_lod[1].size(),
                      static_cast<size_t>(lod_tensor.dims()[0]));

    // Scatter: the index addresses rows of the destination, not the source.
    CopyMatrixRowsFunctor<DeviceContext, T> scatter_rows;
    scatter_rows(context, batch, in_lod[1].data(), lod_tensor,
                 /*is_src_index=*/false);
  }
};

}  // namespace math
}  // namespace operators
}  // namespace paddle
Add LSTM Operators. 7 years ago			`/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.`

			`Licensed under the Apache License, Version 2.0 (the "License");`
			`you may not use this file except in compliance with the License.`
			`You may obtain a copy of the License at`

			`http://www.apache.org/licenses/LICENSE-2.0`

			`Unless required by applicable law or agreed to in writing, software`
			`distributed under the License is distributed on an "AS IS" BASIS,`
			`WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`See the License for the specific language governing permissions and`
			`limitations under the License. */`

LSTM Operator forward implementation. 7 years ago			`#pragma once`
Use G++ to compile some cu operators. 7 years ago			`#include "paddle/framework/eigen.h"`
LSTM Operator forward implementation. 7 years ago			`#include "paddle/framework/lod_tensor.h"`
			`#include "paddle/framework/tensor.h"`
			`#include "paddle/platform/device_context.h"`

Add LSTM Operators. 7 years ago			`namespace paddle {`
			`namespace operators {`
			`namespace math {`

Use G++ to compile some cu operators. 7 years ago			`template <typename T, int MajorType = Eigen::RowMajor,`
			`typename IndexType = Eigen::DenseIndex>`
			`using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;`

Refine device context (#6433) There are mainly following fixes: - take `DeviceContext` as the template parameter of math functors and OpKernel instead of `Place` - remove `eigen_device` interface in base class `DeviceContext` - remove `GetEigenDevice` interface in `ExecutionContext` and base class `DeviceContext` - remove unused `platform::EigenDeviceConverter` - rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL` - rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP` 7 years ago			`template <typename DeviceContext, typename T>`
Add lstm implementation. 7 years ago			`class CopyMatrixRowsFunctor {`
			`public:`
			`// If is_src_index is true,`
			`// copy the indexed rows of input src to the output dst.`
			`// If is_src_index is false,`
			`// copy the input src to the indexed rows of output dst.`
			`// The indexed rows are based on the input index.`
Refine device context (#6433) There are mainly following fixes: - take `DeviceContext` as the template parameter of math functors and OpKernel instead of `Place` - remove `eigen_device` interface in base class `DeviceContext` - remove `GetEigenDevice` interface in `ExecutionContext` and base class `DeviceContext` - remove unused `platform::EigenDeviceConverter` - rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL` - rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP` 7 years ago			`void operator()(const DeviceContext& context, const framework::Tensor& src,`
			`const size_t* index, framework::Tensor& dst,`
			`bool is_src_index);`
Add lstm implementation. 7 years ago			`};`

Refine device context (#6433) There are mainly following fixes: - take `DeviceContext` as the template parameter of math functors and OpKernel instead of `Place` - remove `eigen_device` interface in base class `DeviceContext` - remove `GetEigenDevice` interface in `ExecutionContext` and base class `DeviceContext` - remove unused `platform::EigenDeviceConverter` - rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL` - rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP` 7 years ago			`template <typename DeviceContext, typename T>`
Add LSTM Operators. 7 years ago			`class LoDTensor2BatchFunctor {`
Several Enhancement 7 years ago			`// Calculate the length of each sequence and`
			`// sort sequence index by the length.`
			`// example: sequences = {s0, s1, s2}`
			`// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2`
			`// seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}`
			`//`
			`struct SeqInfo {`
			`SeqInfo(int start, int length, int seq_idx)`
			`: start(start), length(length), seq_idx(seq_idx) {}`
			`int start;`
			`int length;`
			`int seq_idx;`
			`};`

Add LSTM Operators. 7 years ago			`public:`
Refine device context (#6433) There are mainly following fixes: - take `DeviceContext` as the template parameter of math functors and OpKernel instead of `Place` - remove `eigen_device` interface in base class `DeviceContext` - remove `GetEigenDevice` interface in `ExecutionContext` and base class `DeviceContext` - remove unused `platform::EigenDeviceConverter` - rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL` - rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP` 7 years ago			`void operator()(const DeviceContext& context,`
Add LSTM Operators. 7 years ago			`const framework::LoDTensor& lod_tensor,`
Add LSTM backward implenmentation. 7 years ago			`framework::LoDTensor& batch, bool is_cal_batch_lod,`
			`bool is_reverse = false) const {`
			`if (!is_cal_batch_lod) {`
			`auto lods = batch.lod();`
Enhance unit testing. 1. user can disable peephole connections. 2. not calculate some gradients. 7 years ago			`PADDLE_ENFORCE_GT(lods.size(), 2UL);`
fix compiling warning. 7 years ago			`PADDLE_ENFORCE_EQ(lods[1].size(),`
			`static_cast<size_t>(lod_tensor.dims()[0]));`
Refine device context (#6433) There are mainly following fixes: - take `DeviceContext` as the template parameter of math functors and OpKernel instead of `Place` - remove `eigen_device` interface in base class `DeviceContext` - remove `GetEigenDevice` interface in `ExecutionContext` and base class `DeviceContext` - remove unused `platform::EigenDeviceConverter` - rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL` - rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP` 7 years ago			`CopyMatrixRowsFunctor<DeviceContext, T> to_batch;`
Add LSTM backward implenmentation. 7 years ago			`to_batch(context, lod_tensor, lods[1].data(), batch, true);`
			`return;`
			`}`

LSTM Operator forward implementation. 7 years ago			`auto lods = lod_tensor.lod();`
Add LSTM Operators. 7 years ago			`auto lod = lods[0];`
Enable initial hidden state and cell state in LSTM Operator. 7 years ago			`PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");`
Add LSTM Operators. 7 years ago
			`std::vector<SeqInfo> seq_info;`
Add unit testing for forwad implementation. 7 years ago			`for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {`
Add LSTM Operators. 7 years ago			`int length = lod[seq_id + 1] - lod[seq_id];`
			`seq_info.emplace_back(lod[seq_id], length, seq_id);`
			`}`

			`std::sort(seq_info.begin(), seq_info.end(),`
			`[](SeqInfo a, SeqInfo b) { return a.length > b.length; });`

Enable initial hidden state and cell state in LSTM Operator. 7 years ago			`// Calculate the start position of each batch.`
Add LSTM Operators. 7 years ago			`// example: sequences = {s0, s1, s2}`
			`// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2`
			`// num_batch = 5,`
			`// batchIndex = {b0, b1, b2, b3, b4}`
			`// b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1`
			`// batch_start_positions[6] = {0, 3, 6, 9, 11, 12}`
Several Enhancement 7 years ago			`// batch_start_positions[0] = len(b0)`
			`// batch_start_positions[1] = len(b0) + len(b1)`
			`// batch_start_positions[2] = len(b0) + len(b1) + len(b2)`
			`// ...`
Add LSTM Operators. 7 years ago			`// seq2batch_idx[12] = {4, 0, 9,`
			`// 5, 1, 10,`
			`// 6, 2, 11,`
			`// 7, 3,`
			`// 8}`
Enable initial hidden state and cell state in LSTM Operator. 7 years ago			`// seq_order = {1, 0, 2}, the sort order.`
			`// where 1 is the second sequence,`
			`// 0 is the first sequence,`
			`// 2 is the third sequence.`
			`// The num_batch represents batch size after rearranging the`
Add LSTM Operators. 7 years ago			`// input LodTensor. It is also the maximum length of input sequence.`
Add unit testing for forwad implementation. 7 years ago
			`paddle::framework::LoD batch_lods;`
Several Enhancement 7 years ago			`batch_lods.emplace_back(std::vector<size_t>{0});`
			`batch_lods.emplace_back(std::vector<size_t>{0});`
Enable initial hidden state and cell state in LSTM Operator. 7 years ago			`batch_lods.emplace_back(std::vector<size_t>{0});`
Add unit testing for forwad implementation. 7 years ago
Add LSTM Operators. 7 years ago			`// batch_lods[0] is the start positions for batch LoDTensor`
Several Enhancement 7 years ago			`int num_batch = seq_info[0].length;`
			`batch_lods[0].resize(static_cast<size_t>(num_batch + 1));`
Add LSTM Operators. 7 years ago			`// batch_lods[1] is the raw index in the input LoDTensor`
Enhance unit testing. 1. user can disable peephole connections. 2. not calculate some gradients. 7 years ago			`batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));`
Enable initial hidden state and cell state in LSTM Operator. 7 years ago			`// batch_lods[2] is the sort order for the input LoDTensor.`
			`batch_lods[2].resize(seq_info.size());`
Add LSTM Operators. 7 years ago
LSTM Operator forward implementation. 7 years ago			`size_t* batch_starts = batch_lods[0].data();`
			`size_t* seq2batch_idx = batch_lods[1].data();`
Add LSTM Operators. 7 years ago			`batch_starts[0] = 0;`
fix compiling warning. 7 years ago			`for (int n = 0; n < num_batch; n++) {`
Several Enhancement 7 years ago			`auto batch_id = static_cast<int>(batch_starts[n]);`
Add LSTM Operators. 7 years ago			`for (size_t i = 0; i < seq_info.size(); ++i) {`
fix compiling warning. 7 years ago			`int seq_len = seq_info[i].length;`
Add LSTM Operators. 7 years ago			`int start = seq_info[i].start;`
			`if (n < seq_len) {`
update to the develop branch. 7 years ago			`seq2batch_idx[batch_id] =`
			`is_reverse ? start + seq_len - 1 - n : start + n;`
Add LSTM Operators. 7 years ago			`batch_id++;`
			`} else {`
			`break;`
			`}`
			`}`
Several Enhancement 7 years ago			`batch_starts[n + 1] = static_cast<size_t>(batch_id);`
Add LSTM Operators. 7 years ago			`}`
Enable initial hidden state and cell state in LSTM Operator. 7 years ago			`size_t* seq_order = batch_lods[2].data();`
			`for (size_t i = 0; i < seq_info.size(); ++i) {`
			`seq_order[i] = seq_info[i].seq_idx;`
			`}`
Add unit testing for forwad implementation. 7 years ago			`batch.set_lod(batch_lods);`
Add lstm implementation. 7 years ago
Refine device context (#6433) There are mainly following fixes: - take `DeviceContext` as the template parameter of math functors and OpKernel instead of `Place` - remove `eigen_device` interface in base class `DeviceContext` - remove `GetEigenDevice` interface in `ExecutionContext` and base class `DeviceContext` - remove unused `platform::EigenDeviceConverter` - rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL` - rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP` 7 years ago			`CopyMatrixRowsFunctor<DeviceContext, T> to_batch;`
LSTM Operator forward implementation. 7 years ago			`to_batch(context, lod_tensor, seq2batch_idx, batch, true);`
Add LSTM Operators. 7 years ago			`}`
Add lstm implementation. 7 years ago			`};`
Add LSTM Operators. 7 years ago
Refine device context (#6433) There are mainly following fixes: - take `DeviceContext` as the template parameter of math functors and OpKernel instead of `Place` - remove `eigen_device` interface in base class `DeviceContext` - remove `GetEigenDevice` interface in `ExecutionContext` and base class `DeviceContext` - remove unused `platform::EigenDeviceConverter` - rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL` - rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP` 7 years ago			`template <typename DeviceContext, typename T>`
LSTM Operator forward implementation. 7 years ago			`class Batch2LoDTensorFunctor {`
Add LSTM Operators. 7 years ago			`public:`
Refine device context (#6433) There are mainly following fixes: - take `DeviceContext` as the template parameter of math functors and OpKernel instead of `Place` - remove `eigen_device` interface in base class `DeviceContext` - remove `GetEigenDevice` interface in `ExecutionContext` and base class `DeviceContext` - remove unused `platform::EigenDeviceConverter` - rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL` - rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP` 7 years ago			`void operator()(const DeviceContext& context,`
Add LSTM Operators. 7 years ago			`const framework::LoDTensor& batch,`
LSTM Operator forward implementation. 7 years ago			`framework::LoDTensor& lod_tensor) const {`
			`auto in_lod = batch.lod();`
Enhance unit testing. 1. user can disable peephole connections. 2. not calculate some gradients. 7 years ago			`PADDLE_ENFORCE_GT(in_lod.size(), 2UL);`
Add gradient check unit testing and fix bug. 7 years ago			`PADDLE_ENFORCE_EQ(in_lod[1].size(),`
			`static_cast<size_t>(lod_tensor.dims()[0]));`
Refine device context (#6433) There are mainly following fixes: - take `DeviceContext` as the template parameter of math functors and OpKernel instead of `Place` - remove `eigen_device` interface in base class `DeviceContext` - remove `GetEigenDevice` interface in `ExecutionContext` and base class `DeviceContext` - remove unused `platform::EigenDeviceConverter` - rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL` - rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP` 7 years ago			`CopyMatrixRowsFunctor<DeviceContext, T> to_seq;`
Add unit testing for forwad implementation. 7 years ago			`size_t* index = in_lod[1].data();`
LSTM Operator forward implementation. 7 years ago			`to_seq(context, batch, index, lod_tensor, false);`
			`}`
Add lstm implementation. 7 years ago			`};`
Add LSTM Operators. 7 years ago
			`} // namespace math`
			`} // namespace operators`
			`} // namespace paddle`