Paddle/python/paddle/fluid/tests/unittests/test_edit_distance_op.py

#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
from op_test import OpTest


def Levenshtein(hyp, ref):
    """ Compute the Levenshtein distance between two strings.

    :param hyp: hypothesis string in index
    :type hyp: list
    :param ref: reference string in index
    :type ref: list
    :return: the edit distance; when either input is empty this is the
        length of the other one (a plain int), otherwise a float32 scalar
        read from the dynamic-programming table
    """
    hyp_len, ref_len = len(hyp), len(ref)
    # With one side empty, the distance is just the length of the other side.
    if hyp_len == 0:
        return ref_len
    if ref_len == 0:
        return hyp_len

    # table[r, c] holds the distance between hyp[:r] and ref[:c].
    table = np.zeros((hyp_len + 1, ref_len + 1)).astype("float32")
    table[:, 0] = np.arange(hyp_len + 1)
    table[0, :] = np.arange(ref_len + 1)

    for row, hyp_tok in enumerate(hyp, start=1):
        for col, ref_tok in enumerate(ref, start=1):
            mismatch = 0 if hyp_tok == ref_tok else 1
            table[row, col] = min(
                table[row - 1, col] + 1,  # deletion
                table[row, col - 1] + 1,  # insertion
                table[row - 1, col - 1] + mismatch)  # substitution
    return table[hyp_len, ref_len]


class TestEditDistanceOp(OpTest):
    """Check the un-normalized ``edit_distance`` op on a batch of two pairs.

    The expected ``Out`` is computed with the pure-Python ``Levenshtein``
    reference, one value per hypothesis/reference pair.
    """

    def setUp(self):
        """Build the op inputs, attributes and expected outputs."""
        self.op_type = "edit_distance"
        normalized = False
        # Token ids are stored as column vectors (one token per row).
        x1 = np.transpose(np.array([[12, 3, 5, 8, 2]]).astype("int64"))
        x2 = np.transpose(np.array([[12, 4, 7, 8]]).astype("int64"))
        # Per-sequence lengths used below to slice x1 / x2 into pairs.
        self.x1_lod = [1, 4]
        self.x2_lod = [3, 1]

        num_strs = len(self.x1_lod)
        distance = np.zeros((num_strs, 1)).astype("float32")
        # Derived from the lengths list (instead of a literal 2) so the
        # expected count cannot drift when the data above is edited; this
        # matches how the sibling test classes compute it.
        sequence_num = np.array(num_strs).astype("int64")

        x1_offset = 0
        x2_offset = 0
        for i in range(num_strs):
            x1_end = x1_offset + self.x1_lod[i]
            x2_end = x2_offset + self.x2_lod[i]
            distance[i] = Levenshtein(
                hyp=x1[x1_offset:x1_end], ref=x2[x2_offset:x2_end])
            x1_offset, x2_offset = x1_end, x2_end
            if normalized:
                # Normalization divides by the reference length.
                distance[i] = distance[i] / self.x2_lod[i]

        self.attrs = {'normalized': normalized}
        self.inputs = {'Hyps': (x1, [self.x1_lod]), 'Refs': (x2, [self.x2_lod])}
        self.outputs = {'Out': distance, 'SequenceNum': sequence_num}

    def test_check_output(self):
        """Compare the op's outputs against the expected values from setUp."""
        self.check_output()


class TestEditDistanceOpNormalizedCase0(OpTest):
    """Check the normalized ``edit_distance`` op on a batch of three pairs.

    ``reset_config`` runs after the default data is assigned and before the
    expected output is computed; ``post_config`` runs as the last step of
    ``setUp``. Both are no-ops here and exist for subclasses to override.
    """

    def reset_config(self):
        """Hook for overriding the default data; does nothing here."""
        pass

    def post_config(self):
        """Hook called at the end of ``setUp``; does nothing here."""
        pass

    def setUp(self):
        """Build the op inputs, attributes and expected outputs."""
        self.op_type = "edit_distance"
        normalized = True
        hyp_row = np.array([[10, 3, 6, 5, 8, 2]]).astype("int64")
        ref_row = np.array([[10, 4, 6, 7, 8]]).astype("int64")
        # Stored as column vectors (one token per row).
        self.x1 = np.transpose(hyp_row)
        self.x2 = np.transpose(ref_row)
        # Per-sequence lengths; the second hypothesis is empty.
        self.x1_lod = [3, 0, 3]
        self.x2_lod = [2, 1, 2]

        self.reset_config()

        num_strs = len(self.x1_lod)
        expected = np.zeros((num_strs, 1)).astype("float32")
        sequence_num = np.array(num_strs).astype("int64")

        hyp_start = 0
        ref_start = 0
        for idx in range(num_strs):
            hyp_stop = hyp_start + self.x1_lod[idx]
            ref_stop = ref_start + self.x2_lod[idx]
            expected[idx] = Levenshtein(
                hyp=self.x1[hyp_start:hyp_stop],
                ref=self.x2[ref_start:ref_stop])
            hyp_start, ref_start = hyp_stop, ref_stop
            if normalized:
                # Normalization divides by the reference length.
                ref_len = self.x2_lod[idx]
                expected[idx] = expected[idx] / ref_len

        self.attrs = {'normalized': normalized}
        self.inputs = {
            'Hyps': (self.x1, [self.x1_lod]),
            'Refs': (self.x2, [self.x2_lod])
        }
        self.outputs = {'Out': expected, 'SequenceNum': sequence_num}

        self.post_config()

    def test_check_output(self):
        """Compare the op's outputs against the expected values from setUp."""
        self.check_output()


class TestEditDistanceOpNormalizedCase1(TestEditDistanceOpNormalizedCase0):
    """Variant where only the middle hypothesis is non-empty."""

    def reset_config(self):
        """Replace the default per-sequence lengths."""
        self.x1_lod, self.x2_lod = [0, 6, 0], [2, 1, 2]


class TestEditDistanceOpNormalizedCase2(TestEditDistanceOpNormalizedCase0):
    """Variant where only the last hypothesis is non-empty."""

    def reset_config(self):
        """Replace the default per-sequence lengths."""
        self.x1_lod, self.x2_lod = [0, 0, 6], [2, 2, 1]


class TestEditDistanceOpNormalizedTensor(OpTest):
    """Check the normalized ``edit_distance`` op with 2-D inputs plus lengths.

    Each row of ``x1`` / ``x2`` holds one sequence; only the first
    ``x1_lod[i]`` / ``x2_lod[i]`` entries of row ``i`` are used when the
    expected distance is computed.
    """

    def reset_config(self):
        """Assign the input rows and the number of entries used per row."""
        self.x1 = np.array([[10, 3, 0, 0], [6, 5, 8, 2]], dtype=np.int64)
        self.x2 = np.array([[10, 4, 0], [6, 7, 8]], dtype=np.int64)
        self.x1_lod = np.array([2, 4], dtype=np.int64)
        self.x2_lod = np.array([2, 3], dtype=np.int64)

    def setUp(self):
        """Build the op inputs, attributes and expected outputs."""
        self.op_type = "edit_distance"
        normalized = True

        self.reset_config()

        num_strs = len(self.x1_lod)
        expected = np.zeros((num_strs, 1)).astype("float32")
        sequence_num = np.array(num_strs).astype("int64")

        for row in range(num_strs):
            # Only the leading entries of each row take part in the distance.
            hyp_tokens = self.x1[row][0:self.x1_lod[row]]
            ref_tokens = self.x2[row][0:self.x2_lod[row]]
            expected[row] = Levenshtein(hyp=hyp_tokens, ref=ref_tokens)
            if normalized:
                # Normalization divides by the reference length.
                ref_len = self.x2_lod[row]
                expected[row] = expected[row] / ref_len

        self.attrs = {'normalized': normalized}
        self.inputs = {
            'Hyps': self.x1,
            'Refs': self.x2,
            'HypsLength': self.x1_lod,
            'RefsLength': self.x2_lod
        }
        self.outputs = {'Out': expected, 'SequenceNum': sequence_num}

    def test_check_output(self):
        """Compare the op's outputs against the expected values from setUp."""
        self.check_output()


# Run the test cases in this module through unittest when executed as a script.
if __name__ == '__main__':
    unittest.main()
Fix the grammar in copyright. (#8403) 7 years ago			`# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.`
fix copyright 7 years ago			`#`
"fix decode bug" (#7711) * "fix decode bug" * "follow commnet" * "fix error" * "fix hook bug" * fix based comment * fix copyright * fix based on comment 7 years ago			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
fix copyright 7 years ago			`#`
"fix decode bug" (#7711) * "fix decode bug" * "follow commnet" * "fix error" * "fix hook bug" * fix based comment * fix copyright * fix based on comment 7 years ago			`# http://www.apache.org/licenses/LICENSE-2.0`
fix copyright 7 years ago			`#`
"fix decode bug" (#7711) * "fix decode bug" * "follow commnet" * "fix error" * "fix hook bug" * fix based comment * fix copyright * fix based on comment 7 years ago			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

Add print_function for all python files 7 years ago			`from __future__ import print_function`

Add edit distance operator 7 years ago			`import unittest`
			`import numpy as np`
Remove python3 relative import of unittest 7 years ago			`from op_test import OpTest`
Add edit distance operator 7 years ago

			`def Levenshtein(hyp, ref):`
			`""" Compute the Levenshtein distance between two strings.`

rename some variables in ctc_edit_distance_op 7 years ago			`:param hyp: hypothesis string in index`
Add edit distance operator 7 years ago			`:type hyp: list`
rename some variables in ctc_edit_distance_op 7 years ago			`:param ref: reference string in index`
Add edit distance operator 7 years ago			`:type ref: list`
			`"""`
			`m = len(hyp)`
			`n = len(ref)`
			`if m == 0:`
			`return n`
			`if n == 0:`
			`return m`

Enable batch input in edit_distance_op 7 years ago			`dist = np.zeros((m + 1, n + 1)).astype("float32")`
Add edit distance operator 7 years ago			`for i in range(0, m + 1):`
			`dist[i][0] = i`
			`for j in range(0, n + 1):`
			`dist[0][j] = j`

			`for i in range(1, m + 1):`
			`for j in range(1, n + 1):`
			`cost = 0 if hyp[i - 1] == ref[j - 1] else 1`
			`deletion = dist[i - 1][j] + 1`
			`insertion = dist[i][j - 1] + 1`
			`substitution = dist[i - 1][j - 1] + cost`
			`dist[i][j] = min(deletion, insertion, substitution)`
			`return dist[m][n]`


Remove unnecessary prefix in test name of edit_distance_op 7 years ago			`class TestEditDistanceOp(OpTest):`
Enable batch input in edit_distance_op 7 years ago			`def setUp(self):`
			`self.op_type = "edit_distance"`
			`normalized = False`
Modify Pybind LoDTensor API according to length-based LoD (#11106) * add lod_tensor util and modify pybind * refind pybind LoDTensor API and modify LoDTensor and DataFeeder test * fix test error * fix detection map op test * fix reorder_lod_tensor test * fix seq_concat_op * fix chunk evel op test * fix target assign op * fix warp ctc op * address comments step 1: reverse reset_lod op * step 2: modify op test * add warning message * remove has_valid_lod * add back has_valid_lod * address comments * add exception catching trial 7 years ago			`x1 = np.array([[12, 3, 5, 8, 2]]).astype("int64")`
			`x2 = np.array([[12, 4, 7, 8]]).astype("int64")`
Enable batch input in edit_distance_op 7 years ago			`x1 = np.transpose(x1)`
			`x2 = np.transpose(x2)`
Support seq len equal to 0 in sequence ops (#16935) * Support seq len equal to 0 in sequence ops test=develop * Add more test cases * Fix some comments test=develop * Fix py3 error test=develop 6 years ago			`self.x1_lod = [1, 4]`
			`self.x2_lod = [3, 1]`
Enable batch input in edit_distance_op 7 years ago
Support seq len equal to 0 in sequence ops (#16935) * Support seq len equal to 0 in sequence ops test=develop * Add more test cases * Fix some comments test=develop * Fix py3 error test=develop 6 years ago			`num_strs = len(self.x1_lod)`
Enable batch input in edit_distance_op 7 years ago			`distance = np.zeros((num_strs, 1)).astype("float32")`
1. Add sequence_num as edit distance op's output 2. Fix evaluator using 'reduce_sum' op instead of 'mean' op 7 years ago			`sequence_num = np.array(2).astype("int64")`
Modify Pybind LoDTensor API according to length-based LoD (#11106) * add lod_tensor util and modify pybind * refind pybind LoDTensor API and modify LoDTensor and DataFeeder test * fix test error * fix detection map op test * fix reorder_lod_tensor test * fix seq_concat_op * fix chunk evel op test * fix target assign op * fix warp ctc op * address comments step 1: reverse reset_lod op * step 2: modify op test * add warning message * remove has_valid_lod * add back has_valid_lod * address comments * add exception catching trial 7 years ago
			`x1_offset = 0`
			`x2_offset = 0`
Enable batch input in edit_distance_op 7 years ago			`for i in range(0, num_strs):`
			`distance[i] = Levenshtein(`
Support seq len equal to 0 in sequence ops (#16935) * Support seq len equal to 0 in sequence ops test=develop * Add more test cases * Fix some comments test=develop * Fix py3 error test=develop 6 years ago			`hyp=x1[x1_offset:(x1_offset + self.x1_lod[i])],`
			`ref=x2[x2_offset:(x2_offset + self.x2_lod[i])])`
			`x1_offset += self.x1_lod[i]`
			`x2_offset += self.x2_lod[i]`
Enable batch input in edit_distance_op 7 years ago			`if normalized is True:`
Support seq len equal to 0 in sequence ops (#16935) * Support seq len equal to 0 in sequence ops test=develop * Add more test cases * Fix some comments test=develop * Fix py3 error test=develop 6 years ago			`len_ref = self.x2_lod[i]`
Enable batch input in edit_distance_op 7 years ago			`distance[i] = distance[i] / len_ref`
Modify Pybind LoDTensor API according to length-based LoD (#11106) * add lod_tensor util and modify pybind * refind pybind LoDTensor API and modify LoDTensor and DataFeeder test * fix test error * fix detection map op test * fix reorder_lod_tensor test * fix seq_concat_op * fix chunk evel op test * fix target assign op * fix warp ctc op * address comments step 1: reverse reset_lod op * step 2: modify op test * add warning message * remove has_valid_lod * add back has_valid_lod * address comments * add exception catching trial 7 years ago
Enable batch input in edit_distance_op 7 years ago			`self.attrs = {'normalized': normalized}`
Support seq len equal to 0 in sequence ops (#16935) * Support seq len equal to 0 in sequence ops test=develop * Add more test cases * Fix some comments test=develop * Fix py3 error test=develop 6 years ago			`self.inputs = {'Hyps': (x1, [self.x1_lod]), 'Refs': (x2, [self.x2_lod])}`
1. Add sequence_num as edit distance op's output 2. Fix evaluator using 'reduce_sum' op instead of 'mean' op 7 years ago			`self.outputs = {'Out': distance, 'SequenceNum': sequence_num}`
Enable batch input in edit_distance_op 7 years ago
			`def test_check_output(self):`
			`self.check_output()`


Support seq len equal to 0 in sequence ops (#16935) * Support seq len equal to 0 in sequence ops test=develop * Add more test cases * Fix some comments test=develop * Fix py3 error test=develop 6 years ago			`class TestEditDistanceOpNormalizedCase0(OpTest):`
			`def reset_config(self):`
			`pass`

support Tensor input for edit_distance op (#18162) 6 years ago			`def post_config(self):`
			`pass`

Add edit distance operator 7 years ago			`def setUp(self):`
Rename ctc_edit_distance_op to edit_distance_op 7 years ago			`self.op_type = "edit_distance"`
clean up code in ctc_edit_distance_op 7 years ago			`normalized = True`
support Tensor input for edit_distance op (#18162) 6 years ago			`self.x1 = np.array([[10, 3, 6, 5, 8, 2]]).astype("int64")`
			`self.x2 = np.array([[10, 4, 6, 7, 8]]).astype("int64")`
Support seq len equal to 0 in sequence ops (#16935) * Support seq len equal to 0 in sequence ops test=develop * Add more test cases * Fix some comments test=develop * Fix py3 error test=develop 6 years ago			`self.x1_lod = [3, 0, 3]`
			`self.x2_lod = [2, 1, 2]`
support Tensor input for edit_distance op (#18162) 6 years ago			`self.x1 = np.transpose(self.x1)`
			`self.x2 = np.transpose(self.x2)`

Support seq len equal to 0 in sequence ops (#16935) * Support seq len equal to 0 in sequence ops test=develop * Add more test cases * Fix some comments test=develop * Fix py3 error test=develop 6 years ago			`self.reset_config()`
Add edit distance operator 7 years ago
Support seq len equal to 0 in sequence ops (#16935) * Support seq len equal to 0 in sequence ops test=develop * Add more test cases * Fix some comments test=develop * Fix py3 error test=develop 6 years ago			`num_strs = len(self.x1_lod)`
Enable batch input in edit_distance_op 7 years ago			`distance = np.zeros((num_strs, 1)).astype("float32")`
support Tensor input for edit_distance op (#18162) 6 years ago			`sequence_num = np.array(num_strs).astype("int64")`
Modify Pybind LoDTensor API according to length-based LoD (#11106) * add lod_tensor util and modify pybind * refind pybind LoDTensor API and modify LoDTensor and DataFeeder test * fix test error * fix detection map op test * fix reorder_lod_tensor test * fix seq_concat_op * fix chunk evel op test * fix target assign op * fix warp ctc op * address comments step 1: reverse reset_lod op * step 2: modify op test * add warning message * remove has_valid_lod * add back has_valid_lod * address comments * add exception catching trial 7 years ago
			`x1_offset = 0`
			`x2_offset = 0`
Enable batch input in edit_distance_op 7 years ago			`for i in range(0, num_strs):`
			`distance[i] = Levenshtein(`
support Tensor input for edit_distance op (#18162) 6 years ago			`hyp=self.x1[x1_offset:(x1_offset + self.x1_lod[i])],`
			`ref=self.x2[x2_offset:(x2_offset + self.x2_lod[i])])`
Support seq len equal to 0 in sequence ops (#16935) * Support seq len equal to 0 in sequence ops test=develop * Add more test cases * Fix some comments test=develop * Fix py3 error test=develop 6 years ago			`x1_offset += self.x1_lod[i]`
			`x2_offset += self.x2_lod[i]`
Enable batch input in edit_distance_op 7 years ago			`if normalized is True:`
Support seq len equal to 0 in sequence ops (#16935) * Support seq len equal to 0 in sequence ops test=develop * Add more test cases * Fix some comments test=develop * Fix py3 error test=develop 6 years ago			`len_ref = self.x2_lod[i]`
Enable batch input in edit_distance_op 7 years ago			`distance[i] = distance[i] / len_ref`
Modify Pybind LoDTensor API according to length-based LoD (#11106) * add lod_tensor util and modify pybind * refind pybind LoDTensor API and modify LoDTensor and DataFeeder test * fix test error * fix detection map op test * fix reorder_lod_tensor test * fix seq_concat_op * fix chunk evel op test * fix target assign op * fix warp ctc op * address comments step 1: reverse reset_lod op * step 2: modify op test * add warning message * remove has_valid_lod * add back has_valid_lod * address comments * add exception catching trial 7 years ago
Add edit distance operator 7 years ago			`self.attrs = {'normalized': normalized}`
support Tensor input for edit_distance op (#18162) 6 years ago			`self.inputs = {`
			`'Hyps': (self.x1, [self.x1_lod]),`
			`'Refs': (self.x2, [self.x2_lod])`
			`}`
1. Add sequence_num as edit distance op's output 2. Fix evaluator using 'reduce_sum' op instead of 'mean' op 7 years ago			`self.outputs = {'Out': distance, 'SequenceNum': sequence_num}`
Add edit distance operator 7 years ago
support Tensor input for edit_distance op (#18162) 6 years ago			`self.post_config()`

Add edit distance operator 7 years ago			`def test_check_output(self):`
			`self.check_output()`


Support seq len equal to 0 in sequence ops (#16935) * Support seq len equal to 0 in sequence ops test=develop * Add more test cases * Fix some comments test=develop * Fix py3 error test=develop 6 years ago			`class TestEditDistanceOpNormalizedCase1(TestEditDistanceOpNormalizedCase0):`
			`def reset_config(self):`
			`self.x1_lod = [0, 6, 0]`
			`self.x2_lod = [2, 1, 2]`


			`class TestEditDistanceOpNormalizedCase2(TestEditDistanceOpNormalizedCase0):`
			`def reset_config(self):`
			`self.x1_lod = [0, 0, 6]`
			`self.x2_lod = [2, 2, 1]`


support Tensor input for edit_distance op (#18162) 6 years ago			`class TestEditDistanceOpNormalizedTensor(OpTest):`
			`def reset_config(self):`
			`self.x1 = np.array([[10, 3, 0, 0], [6, 5, 8, 2]], dtype=np.int64)`
			`self.x2 = np.array([[10, 4, 0], [6, 7, 8]], dtype=np.int64)`
			`self.x1_lod = np.array([2, 4], dtype=np.int64)`
			`self.x2_lod = np.array([2, 3], dtype=np.int64)`

			`def setUp(self):`
			`self.op_type = "edit_distance"`
			`normalized = True`

			`self.reset_config()`

			`num_strs = len(self.x1_lod)`
			`distance = np.zeros((num_strs, 1)).astype("float32")`
			`sequence_num = np.array(num_strs).astype("int64")`

			`for i in range(0, num_strs):`
			`distance[i] = Levenshtein(`
			`hyp=self.x1[i][0:self.x1_lod[i]],`
			`ref=self.x2[i][0:self.x2_lod[i]])`
			`if normalized is True:`
			`len_ref = self.x2_lod[i]`
			`distance[i] = distance[i] / len_ref`

			`self.attrs = {'normalized': normalized}`
			`self.inputs = {`
			`'Hyps': self.x1,`
			`'Refs': self.x2,`
			`'HypsLength': self.x1_lod,`
			`'RefsLength': self.x2_lod`
			`}`
			`self.outputs = {'Out': distance, 'SequenceNum': sequence_num}`

			`def test_check_output(self):`
			`self.check_output()`


Add edit distance operator 7 years ago			`if __name__ == '__main__':`
			`unittest.main()`