diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index c5d73e64b9..0b570181d3 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -161,7 +161,7 @@ paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, ke paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '39fbc5437be389f6c0c769f82fc1fba2')) paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', '558d13133596209190df9a624264f28f')) paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '78cf3a7323d1a7697658242e13f63759')) -paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2bc3a59efa9d52b628a6255422d9f0e8')) +paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'input_length', 'padding_value', 'name'], varargs=None, keywords=None, defaults=(None, 0, None)), ('document', '9abb7bb8d267e017620a39a146dc47ea')) paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens', 'input_length', 'label_length'], varargs=None, keywords=None, defaults=(True, None, None, None)), ('document', '77cbfb28cd2fc589f589c7013c5086cd')) paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', 'c1df110ea65998984f564c5c10abc54a')) paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', '3720b4a386585094435993deb028b592')) diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc index 9467c517e2..4abe9509e6 100644 --- a/paddle/fluid/operators/ctc_align_op.cc +++ b/paddle/fluid/operators/ctc_align_op.cc @@ -22,15 +22,18 @@ class CTCAlignOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Input"), - "Input of CTCAlignOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Output"), - "Output of CTCAlignOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, + "Input of CTCAlignOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Output"), true, + "Output of CTCAlignOp should not be null."); auto input_dims = ctx->GetInputDim("Input"); // TODO(wanghaoshuang): it is tricky to set the wrong dimension here. ctx->SetOutputDim("Output", input_dims); + if (ctx->HasInput("InputLength")) { + ctx->SetOutputDim("OutputLength", {input_dims[0], 1}); + } } protected: @@ -47,7 +50,17 @@ class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Input", "2-D Tensor or LodTensor with shape " "[Lp, 1], where Lp is the sum of all input sequences' length."); + AddInput("InputLength", + "2-D Tensor with shape [batch_size, 1], " + " When Input is padding mode, InputLength is length of every " + "sequence in Input.") + .AsDispensable(); AddOutput("Output", "(Tensor, default: Tensor), The align result."); + AddOutput("OutputLength", + "2-D Tensor with shape [batch_size, 1], " + "When Input is padding mode, OutputLength is length of every " + "sequence in Output.") + .AsDispensable(); AddAttr("blank", "(int, default: 0), the blank label setted in Connectionist " "Temporal Classification (CTC) op.") @@ -83,7 +96,10 @@ Then: or Given: Input.data = [[0, 1, 2, 2, 0, 4], [0, 4, 5, 0, 6, 0], - [0, 7, 7, 7, 0, 0]] + [0, 7, 7, 7, 0, 0]] + InputLength.data = [[6], + [5], + [4]], Input.dims = {3, 6}, Input.Lod = [] And: @@ -94,7 +110,10 @@ And: Then: Output.data = [[1, 2, 4, 0, 0, 0], [4, 5, 6, 0, 0, 0], - [7, 0, 0, 0, 0, 0]] + [7, 0, 0, 0, 0, 0]], + OutputLength.data = [[3], + [3], + [1]], Output.dims = {3, 6}, Output.Lod = [] )DOC"); diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu index fa1f218d78..44a7c16f96 100644 --- a/paddle/fluid/operators/ctc_align_op.cu +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -43,17 +43,15 @@ __global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens, } template -__global__ void PaddingMergeAndDelCudaKernel(const int64_t num_token, - const T* tokens, const int blank, - const int merge_repeated, - const int padding_value, - const int64_t batch_size, - T* output) { +__global__ void PaddingMergeAndDelCudaKernel( + const int64_t num_token, const T* tokens, const T* tokens_length, + const int blank, const int merge_repeated, const int padding_value, + const int64_t batch_size, T* output, T* output_length) { int ind = blockIdx.x * blockDim.x + threadIdx.x; if (ind >= batch_size) return; int output_idx = ind * num_token; T prev_token = -1; - for (int i = ind * num_token; i < ind * num_token + num_token; i++) { + for (int i = ind * num_token; i < ind * num_token + tokens_length[ind]; i++) { if ((unsigned)tokens[i] != blank && !(merge_repeated && tokens[i] == prev_token)) { output[output_idx] = tokens[i]; @@ -61,6 +59,7 @@ __global__ void PaddingMergeAndDelCudaKernel(const int64_t num_token, } prev_token = tokens[i]; } + output_length[ind] = output_idx - ind * num_token; for (int i = output_idx; i < ind * num_token + num_token; i++) { output[i] = padding_value; } @@ -86,10 +85,15 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel { auto input_dims = input->dims(); T* output_data = output->mutable_data({input_dims[0], input_dims[1]}, ctx.GetPlace()); + auto* input_length = ctx.Input("InputLength"); + const T* input_length_data = input_length->data(); + auto* output_length = ctx.Output("OutputLength"); + T* output_length_data = + output_length->mutable_data({input_dims[0], 1}, ctx.GetPlace()); PaddingMergeAndDelCudaKernel< T><<<32, (input_dims[0] + 32 - 1) / 32, 0, stream>>>( - input_dims[1], tokens, blank, merge_repeated, padding_value, - input_dims[0], output_data); + input_dims[1], tokens, input_length_data, blank, merge_repeated, + padding_value, input_dims[0], output_data, output_length_data); } else { const size_t level = 0; auto input_lod = framework::ToAbsOffset(input->lod()); diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h index 0ea770a389..ccf91471ab 100644 --- a/paddle/fluid/operators/ctc_align_op.h +++ b/paddle/fluid/operators/ctc_align_op.h @@ -41,11 +41,17 @@ class CTCAlignKernel : public framework::OpKernel { if (input->lod().empty()) { size_t padding_value = static_cast(ctx.Attr("padding_value")); + auto* input_length = ctx.Input("InputLength"); + const T* input_length_data = input_length->data(); + + auto* output_length = ctx.Output("OutputLength"); + T* output_length_data = output_length->mutable_data(ctx.GetPlace()); + for (size_t batch_id = 0; batch_id < (unsigned)input_dims[0]; batch_id++) { T prev_token = -1; size_t output_idx = 0; - for (size_t i = 0; i < (unsigned)input_dims[1]; i++) { + for (size_t i = 0; i < (unsigned)input_length_data[batch_id]; i++) { size_t input_ind = batch_id * input_dims[1] + i; if ((unsigned)input_data[input_ind] != blank && !(merge_repeated && input_data[input_ind] == prev_token)) { @@ -55,6 +61,7 @@ class CTCAlignKernel : public framework::OpKernel { } prev_token = input_data[input_ind]; } + output_length_data[batch_id] = output_idx; for (size_t j = output_idx; j < (unsigned)input_dims[1]; j++) output_data[batch_id * input_dims[1] + j] = padding_value; } diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index c68ecba971..9f2a4f5dac 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -5851,7 +5851,11 @@ def edit_distance(input, return edit_distance_out, sequence_num -def ctc_greedy_decoder(input, blank, name=None): +def ctc_greedy_decoder(input, + blank, + input_length=None, + padding_value=0, + name=None): """ This op is used to decode sequences by greedy policy by below steps: @@ -5865,6 +5869,7 @@ def ctc_greedy_decoder(input, blank, name=None): .. code-block:: text Given: + for lod mode: input.data = [[0.6, 0.1, 0.3, 0.1], [0.3, 0.2, 0.4, 0.1], @@ -5893,45 +5898,106 @@ def ctc_greedy_decoder(input, blank, name=None): output.lod = [[2, 1]] + for padding mode: + + input.data = [[[0.6, 0.1, 0.3, 0.1], + [0.3, 0.2, 0.4, 0.1], + [0.1, 0.5, 0.1, 0.3], + [0.5, 0.1, 0.3, 0.1]], + + [[0.5, 0.1, 0.3, 0.1], + [0.2, 0.2, 0.2, 0.4], + [0.2, 0.2, 0.1, 0.5], + [0.5, 0.1, 0.3, 0.1]]] + + input_length.data = [[4], [4]] + input.shape = [2, 4, 4] + + step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get: + [[0], [2], [1], [0]], for input.data[4:8] is [[0], [3], [3], [0]], shape is [2,4,1] + step2: Change the argmax result to use padding mode, then argmax result is + [[0, 2, 1, 0], [0, 3, 3, 0]], shape is [2, 4], lod is [], input_length is [[4], [4]] + step3: Apply ctc_align to padding argmax result, padding_value is 0 + + Finally: + output.data = [[2, 1, 0, 0], + [3, 0, 0, 0]] + output_length.data = [[2], [1]] + + + Args: input(Variable): (LoDTensor), the probabilities of - variable-length sequences, which is a 2-D Tensor with - LoD information. It's shape is [Lp, num_classes + 1], + variable-length sequences. When in lod mode, it is a 2-D Tensor with + LoD information. It's shape is [Lp, num_classes + 1] where Lp is the sum of all input sequences' length and - num_classes is the true number of classes. (not - including the blank label). + num_classes is the true number of classes. When in padding mode, + it is a 3-D Tensor with padding, It's shape is [batch_size, N, num_classes + 1]. + (not including the blank label). blank(int): the blank label index of Connectionist Temporal Classification (CTC) loss, which is in thehalf-opened interval [0, num_classes + 1). - name (str): The name of this layer. It is optional. + input_length(Variable, optional): (LoDTensor), shape is [batch_size, 1], when in lod mode, input_length + is None. + padding_value(int): padding value. + name (str, optional): The name of this layer. It is optional. Returns: - Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. \ + output(Variable): For lod mode, CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. \ 'Lp' is the sum if all output sequences' length. If all the sequences \ in result were empty, the result LoDTensor will be [-1] with \ - LoD [[]] and dims [1, 1]. + LoD [[]] and dims [1, 1]. For padding mode, CTC greedy decode result is a 2-D tensor \ + with shape [batch_size, N], output length's shape is [batch_size, 1] which is length \ + of every sequence in output. + output_length(Variable, optional): length of each sequence of output for padding mode. Examples: .. code-block:: python + # for lod mode import paddle.fluid as fluid x = fluid.layers.data(name='x', shape=[8], dtype='float32') cost = fluid.layers.ctc_greedy_decoder(input=x, blank=0) + + # for padding mode + x_pad = fluid.layers.data(name='x_pad', shape=[4,8], dtype='float32') + x_pad_len = fluid.layers.data(name='x_pad_len', shape=[1], dtype='int64') + out, out_len = fluid.layers.ctc_greedy_decoder(input=x_pad, blank=0, + input_length=x_pad_len) + """ helper = LayerHelper("ctc_greedy_decoder", **locals()) _, topk_indices = topk(input, k=1) # ctc align op ctc_out = helper.create_variable_for_type_inference(dtype="int64") - helper.append_op( - type="ctc_align", - inputs={"Input": [topk_indices]}, - outputs={"Output": [ctc_out]}, - attrs={"merge_repeated": True, - "blank": blank}) - return ctc_out + + if input_length is None: + helper.append_op( + type="ctc_align", + inputs={"Input": [topk_indices]}, + outputs={"Output": [ctc_out]}, + attrs={"merge_repeated": True, + "blank": blank}) + return ctc_out + else: + ctc_out_len = helper.create_variable_for_type_inference(dtype="int64") + ctc_input = squeeze(topk_indices, [2]) + + helper.append_op( + type="ctc_align", + inputs={"Input": [ctc_input], + "InputLength": [input_length]}, + outputs={"Output": [ctc_out], + "OutputLength": [ctc_out_len]}, + attrs={ + "merge_repeated": True, + "blank": blank, + "padding_value": padding_value + }) + return ctc_out, ctc_out_len def warpctc(input, diff --git a/python/paddle/fluid/tests/unittests/test_ctc_align.py b/python/paddle/fluid/tests/unittests/test_ctc_align.py index 042057ffec..2078ff8ef1 100644 --- a/python/paddle/fluid/tests/unittests/test_ctc_align.py +++ b/python/paddle/fluid/tests/unittests/test_ctc_align.py @@ -19,10 +19,11 @@ import unittest import numpy as np from op_test import OpTest from test_softmax_op import stable_softmax +import paddle.fluid as fluid -def CTCAlign(input, lod, blank, merge_repeated, padding=0): - if lod is not None and len(lod) > 0: +def CTCAlign(input, lod, blank, merge_repeated, padding=0, input_length=None): + if input_length is None: lod0 = lod[0] result = [] cur_offset = 0 @@ -38,23 +39,28 @@ def CTCAlign(input, lod, blank, merge_repeated, padding=0): result = np.array(result).reshape([len(result), 1]).astype("int32") if len(result) == 0: result = np.array([-1]) + return result else: result = [[] for i in range(len(input))] + output_length = [] for i in range(len(input)): prev_token = -1 - for j in range(len(input[i])): + for j in range(input_length[i][0]): token = input[i][j] if (token != blank) and not (merge_repeated and token == prev_token): result[i].append(token) prev_token = token start = len(result[i]) + output_length.append([start]) for j in range(start, len(input[i])): result[i].append(padding) result = np.array(result).reshape( [len(input), len(input[0])]).astype("int32") + output_length = np.array(output_length).reshape( + [len(input), 1]).astype("int32") - return result + return result, output_length class TestCTCAlignOp(OpTest): @@ -114,13 +120,18 @@ class TestCTCAlignPaddingOp(OpTest): self.input = np.array([[0, 2, 4, 4, 0, 6, 3, 6, 6, 0, 0], [1, 1, 3, 0, 0, 4, 5, 6, 0, 0, 0]]).reshape( [2, 11]).astype("int32") + self.input_length = np.array([[9], [8]]).reshape([2, 1]).astype("int32") def setUp(self): self.config() - output = CTCAlign(self.input, self.input_lod, self.blank, - self.merge_repeated, self.padding_value) - self.inputs = {"Input": (self.input, self.input_lod), } - self.outputs = {"Output": output} + output, output_length = CTCAlign(self.input, self.input_lod, self.blank, + self.merge_repeated, + self.padding_value, self.input_length) + self.inputs = { + "Input": (self.input, self.input_lod), + "InputLength": self.input_length + } + self.outputs = {"Output": output, "OutputLength": output_length} self.attrs = { "blank": self.blank, "merge_repeated": self.merge_repeated, @@ -129,7 +140,6 @@ class TestCTCAlignPaddingOp(OpTest): def test_check_output(self): self.check_output() - pass class TestCTCAlignOpCase3(TestCTCAlignPaddingOp): @@ -142,6 +152,8 @@ class TestCTCAlignOpCase3(TestCTCAlignPaddingOp): self.input = np.array([[0, 1, 2, 2, 0, 4], [0, 4, 5, 0, 6, 0], [0, 7, 7, 7, 0, 0]]).reshape( [3, 6]).astype("int32") + self.input_length = np.array([[6], [5], + [4]]).reshape([3, 1]).astype("int32") class TestCTCAlignOpCase4(TestCTCAlignPaddingOp): @@ -158,6 +170,8 @@ class TestCTCAlignOpCase4(TestCTCAlignPaddingOp): self.input = np.array([[0, 1, 2, 2, 0, 4], [0, 4, 5, 0, 6, 0], [0, 7, 7, 7, 0, 0]]).reshape( [3, 6]).astype("int32") + self.input_length = np.array([[6], [5], + [4]]).reshape([3, 1]).astype("int32") class TestCTCAlignOpCase5(TestCTCAlignPaddingOp): @@ -170,6 +184,37 @@ class TestCTCAlignOpCase5(TestCTCAlignPaddingOp): self.input = np.array([[0, 1, 2, 2, 0, 4], [0, 4, 5, 0, 6, 0], [0, 7, 1, 7, 0, 0]]).reshape( [3, 6]).astype("int32") + self.input_length = np.array([[6], [5], + [4]]).reshape([3, 1]).astype("int32") + + +class TestCTCAlignOpApi(unittest.TestCase): + def test_api(self): + x = fluid.layers.data('x', shape=[4], dtype='float32') + y = fluid.layers.ctc_greedy_decoder(x, blank=0) + + x_pad = fluid.layers.data('x_pad', shape=[4, 4], dtype='float32') + x_pad_len = fluid.layers.data('x_pad_len', shape=[1], dtype='int64') + y_pad, y_pad_len = fluid.layers.ctc_greedy_decoder( + x_pad, blank=0, input_length=x_pad_len) + + place = fluid.CPUPlace() + x_tensor = fluid.create_lod_tensor( + np.random.rand(8, 4).astype("float32"), [[4, 4]], place) + + x_pad_tensor = np.random.rand(2, 4, 4).astype("float32") + x_pad_len_tensor = np.array([[4], [4]]).reshape([2, 1]).astype("int64") + + exe = fluid.Executor(place) + + exe.run(fluid.default_startup_program()) + ret = exe.run(feed={ + 'x': x_tensor, + 'x_pad': x_pad_tensor, + 'x_pad_len': x_pad_len_tensor + }, + fetch_list=[y, y_pad, y_pad_len], + return_numpy=False) if __name__ == "__main__":