@@ -24,7 +24,7 @@ from paddle.fluid import Program, program_guard
 import paddle
 import paddle.nn.functional as F
 
-CUDA_BLOCK_SIZE = 512
+CUDA_BLOCK_SIZE = 32
 
 
 class CTCForward(object):
@@ -41,8 +41,8 @@ class CTCForward(object):
         self.num_classes = num_classes
         self.batch_size = batch_size
 
-        self.loss = np.zeros([self.batch_size, 1], dtype="float32")
-        self.gradient = np.zeros(self.softmax.shape, dtype="float32")
+        self.loss = np.zeros([self.batch_size, 1], dtype=softmax.dtype)
+        self.gradient = np.zeros(self.softmax.shape, dtype=softmax.dtype)
 
         # float64
         self.EXP_MAX = sys.float_info.max
@@ -112,13 +112,15 @@ class CTCForward(object):
         # calculate the forward and backward variables,
         # reference Chapter 7.3 of "Alex Graves, Supervised Sequence
         # Labelling with Recurrent Neural Networks"
-        log_acts = np.zeros([total_times, self.num_classes], dtype="float32")
+        log_acts = np.zeros(
+            [total_times, self.num_classes], dtype=softmax_a_sequence.dtype)
         for i in range(total_times):
             for j in range(self.num_classes):
                 log_acts[i, j] = self.safe_log(softmax_a_sequence[i, j])
 
         # calculate the forward variables
-        forward_vars = np.zeros([total_times, total_segments], dtype="float32")
+        forward_vars = np.zeros(
+            [total_times, total_segments], dtype=softmax_a_sequence.dtype)
         for i in range(total_times):
             for j in range(total_segments):
                 forward_vars[i, j] = self.LOG_ZERO
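The reference implementation works in log space throughout: per-frame probabilities are mapped with safe_log and the forward variables start at LOG_ZERO, so products of small probabilities cannot underflow in either float32 or float64. A minimal sketch of helpers with the same roles as the safe_log/LOG_ZERO members used above (the bodies here are illustrative assumptions, not code from this diff):

    import numpy as np

    LOG_ZERO = -np.finfo(np.float64).max  # sentinel standing in for log(0)

    def safe_log(x):
        # Map non-positive inputs to the sentinel instead of returning -inf.
        if x <= 0:
            return LOG_ZERO
        return np.log(x)

    def log_add(a, b):
        # log(exp(a) + exp(b)) without overflow: factor out the larger term.
        if a < b:
            a, b = b, a
        return a + np.log1p(np.exp(b - a))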
@@ -219,7 +221,7 @@ class TestWarpCTCOp(OpTest):
                                       self.logits_lod[0][i])
         self.gradient = np.zeros(
             [max_sequence_length, self.batch_size, self.num_classes],
-            dtype="float32")
+            dtype=logits.dtype)
 
         self.inputs = {
             "Logits": (logits, self.logits_lod),
@ -287,7 +289,7 @@ class TestWarpCTCOpWithPadding(OpTest):
# reshape logits to T*N*S
new_logits = np . zeros (
[ max_sequence_length , self . batch_size , self . num_classes ] ,
dtype = " float32 " )
dtype = logits . dtype )
cur = 0
for batch_id in range ( self . batch_size ) :
@@ -312,7 +314,7 @@ class TestWarpCTCOpWithPadding(OpTest):
 
         self.gradient = np.zeros(
             [max_sequence_length, self.batch_size, self.num_classes],
-            dtype="float32")
+            dtype=logits.dtype)
 
         self.inputs = {
             "Logits": new_logits,
@@ -347,6 +349,90 @@ class TestWarpCTCOpWithPaddingCase1(TestWarpCTCOpWithPadding):
         self.norm_by_times = False
 
 
+class TestWarpCTCOpFp64(OpTest):
+    def config(self):
+        self.batch_size = 4
+        self.num_classes = 8
+        self.logits_lod = [[4, 1, 5, 5]]
+        self.labels_lod = [[3, 1, 4, 2]]
+        self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
+        self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
+        self.blank = self.num_classes - 1
+        self.norm_by_times = False
+
+    def setUp(self):
+        self.op_type = "warpctc"
+        self.config()
+
+        logits = np.random.uniform(
+            0.1, 1.0,
+            [sum(self.logits_length), self.num_classes]).astype("float64")
+        softmax = np.apply_along_axis(stable_softmax, 1, logits)
+        # labels should not be blank
+        labels = np.random.randint(
+            0,
+            self.num_classes - 1, [sum(self.labels_length), 1],
+            dtype="int32")
+
+        ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
+                         self.num_classes, self.batch_size, self.blank,
+                         self.norm_by_times)
+        loss = ctc.forward()
+
+        max_sequence_length = 0
+        for i in range(self.batch_size):
+            max_sequence_length = max(max_sequence_length,
+                                      self.logits_length[i])
+        # reshape logits to T*N*S
+        new_logits = np.zeros(
+            [max_sequence_length, self.batch_size, self.num_classes],
+            dtype=logits.dtype)
+
+        cur = 0
+        for batch_id in range(self.batch_size):
+            for i in range(self.logits_length[batch_id]):
+                for j in range(self.num_classes):
+                    new_logits[i, batch_id, j] = logits[cur + i, j]
+            cur = cur + self.logits_length[batch_id]
+
+        # reshape labels to N*S
+        max_target_seq_length = 0
+        for i in range(self.batch_size):
+            max_target_seq_length = max(max_target_seq_length,
+                                        self.labels_length[i])
+        new_labels = np.zeros(
+            [self.batch_size, max_target_seq_length], dtype="int32")
+
+        cur = 0
+        for batch_id in range(self.batch_size):
+            for i in range(self.labels_length[batch_id]):
+                new_labels[batch_id, i] = labels[cur + i]
+            cur = cur + self.labels_length[batch_id]
+
+        self.gradient = np.zeros(
+            [max_sequence_length, self.batch_size, self.num_classes],
+            dtype=logits.dtype)
+
+        self.inputs = {
+            "Logits": new_logits,
+            "Label": new_labels,
+            "LogitsLength": self.logits_length,
+            "LabelLength": self.labels_length
+        }
+        self.outputs = {"Loss": loss}
+        self.attrs = {
+            "blank": self.blank,
+            "norm_by_times": self.norm_by_times,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.outputs['WarpCTCGrad'] = self.gradient
+        self.check_grad(["Logits"], "Loss")
+
+
 class TestWarpCTCOpError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
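TestWarpCTCOpFp64 draws uniform float64 logits and normalizes them with stable_softmax before running the NumPy CTC reference. Paddle's test utilities provide that helper; a sketch assuming the conventional max-shifted form (illustrative, not code from this diff):

    import numpy as np

    def stable_softmax(x):
        # Subtract the max before exp() so it cannot overflow; the shift
        # cancels out after normalization.
        shiftx = x - np.max(x)
        exps = np.exp(shiftx)
        return exps / np.sum(exps)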
@@ -359,7 +445,7 @@ class TestWarpCTCOpError(unittest.TestCase):
                 name='labels_length', shape=[None], dtype='int64')
 
             def test_logits_Variable():
-                logits_data = np.random.rand(5, 16, 6).astype("float32")
+                logits_data = np.random.rand(5, 16, 6).astype(logits.dtype)
                 fluid.layers.warpctc(
                     input=logits_data,
                     label=label,
@@ -398,6 +484,21 @@ class TestWarpCTCOpError(unittest.TestCase):
 
             self.assertRaises(TypeError, test_label_len_Variable)
 
+    def test_dygraph_errors(self):
+        def test_dygraph_with_lod():
+            logits = np.random.uniform(0.1, 1.0, [20, 15]).astype("float32")
+            # labels should not be blank
+            labels = np.random.randint(0, 15 - 1, [15, 1], dtype="int32")
+            softmax = paddle.to_variable(logits)
+            labels = paddle.to_variable(labels)
+
+            fluid.layers.warpctc(input=softmax, label=labels)
+
+        paddle.disable_static()
+        self.assertRaises(ValueError, test_dygraph_with_lod)
+        paddle.enable_static()
+
 
 class TestCTCLossAPICase(unittest.TestCase):
     def test_functinal_api(self):
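TestCTCLossAPICase exercises the functional API imported at the top as F. A minimal usage sketch, assuming the 2.0-era signature ctc_loss(log_probs, labels, input_lengths, label_lengths, blank=0, reduction='mean'); the shapes are made up, log_probs is laid out T x N x C, and blank is the last class index to match the convention in the tests above:

    import numpy as np
    import paddle
    import paddle.nn.functional as F

    paddle.disable_static()
    # Two sequences of length 5, four classes with index 3 as blank.
    log_probs = paddle.to_variable(
        np.random.rand(5, 2, 4).astype("float64"))
    labels = paddle.to_variable(
        np.random.randint(0, 3, [2, 3]).astype("int32"))
    input_lengths = paddle.to_variable(np.array([5, 5]).astype("int64"))
    label_lengths = paddle.to_variable(np.array([3, 2]).astype("int64"))
    loss = F.ctc_loss(
        log_probs, labels, input_lengths, label_lengths,
        blank=3, reduction='mean')
    paddle.enable_static()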