From dd0f8c1d89bae629a9877c2649a994642dd0794b Mon Sep 17 00:00:00 2001 From: tink2123 Date: Tue, 8 Dec 2020 19:07:39 +0800 Subject: [PATCH 1/3] update for multi-language --- configs/rec/multi_language/rec_en_number_lite_train.yml | 2 +- configs/rec/multi_language/rec_french_lite_train.yml | 4 ++-- configs/rec/multi_language/rec_german_lite_train.yml | 2 +- configs/rec/multi_language/rec_japan_lite_train.yml | 2 +- configs/rec/multi_language/rec_korean_lite_train.yml | 2 +- ppocr/data/imaug/label_ops.py | 6 ++++-- ppocr/postprocess/rec_postprocess.py | 8 +++++--- 7 files changed, 15 insertions(+), 11 deletions(-) diff --git a/configs/rec/multi_language/rec_en_number_lite_train.yml b/configs/rec/multi_language/rec_en_number_lite_train.yml index 9d0f1f0..70d825e 100644 --- a/configs/rec/multi_language/rec_en_number_lite_train.yml +++ b/configs/rec/multi_language/rec_en_number_lite_train.yml @@ -15,7 +15,7 @@ Global: use_visualdl: False infer_img: # for data or label process - character_dict_path: ppocr/utils/ic15_dict.txt + character_dict_path: ppocr/utils/dict/ic15_dict.txt character_type: ch max_text_length: 25 infer_mode: False diff --git a/configs/rec/multi_language/rec_french_lite_train.yml b/configs/rec/multi_language/rec_french_lite_train.yml index da3aad5..0b8659a 100644 --- a/configs/rec/multi_language/rec_french_lite_train.yml +++ b/configs/rec/multi_language/rec_french_lite_train.yml @@ -15,7 +15,7 @@ Global: use_visualdl: False infer_img: # for data or label process - character_dict_path: ppocr/utils/french_dict.txt + character_dict_path: ppocr/utils/dict/french_dict.txt character_type: french max_text_length: 25 infer_mode: False @@ -85,7 +85,7 @@ Eval: dataset: name: SimpleDataSet data_dir: ./train_data/ - label_file_list: ["./train_data/eval_list.txt"] + label_file_list: ["./train_data/train_list.txt"] transforms: - DecodeImage: # load image img_mode: BGR diff --git a/configs/rec/multi_language/rec_german_lite_train.yml b/configs/rec/multi_language/rec_german_lite_train.yml index 403be66..9978a21 100644 --- a/configs/rec/multi_language/rec_german_lite_train.yml +++ b/configs/rec/multi_language/rec_german_lite_train.yml @@ -15,7 +15,7 @@ Global: use_visualdl: False infer_img: # for data or label process - character_dict_path: ppocr/utils/german_dict.txt + character_dict_path: ppocr/utils/dict/german_dict.txt character_type: german max_text_length: 25 infer_mode: False diff --git a/configs/rec/multi_language/rec_japan_lite_train.yml b/configs/rec/multi_language/rec_japan_lite_train.yml index 5ff61c0..938d377 100644 --- a/configs/rec/multi_language/rec_japan_lite_train.yml +++ b/configs/rec/multi_language/rec_japan_lite_train.yml @@ -15,7 +15,7 @@ Global: use_visualdl: False infer_img: # for data or label process - character_dict_path: ppocr/utils/japan_dict.txt + character_dict_path: ppocr/utils/dict/japan_dict.txt character_type: japan max_text_length: 25 infer_mode: False diff --git a/configs/rec/multi_language/rec_korean_lite_train.yml b/configs/rec/multi_language/rec_korean_lite_train.yml index 2b2211e..7b070c4 100644 --- a/configs/rec/multi_language/rec_korean_lite_train.yml +++ b/configs/rec/multi_language/rec_korean_lite_train.yml @@ -15,7 +15,7 @@ Global: use_visualdl: False infer_img: # for data or label process - character_dict_path: ppocr/utils/korean_dict.txt + character_dict_path: ppocr/utils/dict/korean_dict.txt character_type: korean max_text_length: 25 infer_mode: False diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py index f3c9005..2932f2a 100644 --- a/ppocr/data/imaug/label_ops.py +++ b/ppocr/data/imaug/label_ops.py @@ -79,7 +79,9 @@ class BaseRecLabelEncode(object): character_dict_path=None, character_type='ch', use_space_char=False): - support_character_type = ['ch', 'en', 'en_sensitive'] + support_character_type = [ + 'ch', 'en', 'en_sensitive', 'french', 'german', 'japan', 'french' + ] assert character_type in support_character_type, "Only {} are supported now but get {}".format( support_character_type, self.character_str) @@ -87,7 +89,7 @@ class BaseRecLabelEncode(object): if character_type == "en": self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" dict_character = list(self.character_str) - elif character_type == "ch": + elif character_type in ["ch", "french", "german", "japan", "french"]: self.character_str = "" assert character_dict_path is not None, "character_dict_path should not be None when character_type is ch" with open(character_dict_path, "rb") as fin: diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index eb9be68..8f6fca9 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -23,14 +23,16 @@ class BaseRecLabelDecode(object): character_dict_path=None, character_type='ch', use_space_char=False): - support_character_type = ['ch', 'en', 'en_sensitive'] + support_character_type = [ + 'ch', 'en', 'en_sensitive', 'french', 'german', 'japan', 'french' + ] assert character_type in support_character_type, "Only {} are supported now but get {}".format( support_character_type, self.character_str) if character_type == "en": self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" dict_character = list(self.character_str) - elif character_type == "ch": + elif character_type in ["ch", "french", "german", "japan", "french"]: self.character_str = "" assert character_dict_path is not None, "character_dict_path should not be None when character_type is ch" with open(character_dict_path, "rb") as fin: @@ -150,4 +152,4 @@ class AttnLabelDecode(BaseRecLabelDecode): else: assert False, "unsupport type %s in get_beg_end_flag_idx" \ % beg_or_end - return idx \ No newline at end of file + return idx From 311569b2bca6b12ff7eaa6781b2de03c51d6e8dc Mon Sep 17 00:00:00 2001 From: tink2123 Date: Tue, 8 Dec 2020 19:09:03 +0800 Subject: [PATCH 2/3] update for multi-language --- configs/rec/multi_language/rec_french_lite_train.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/rec/multi_language/rec_french_lite_train.yml b/configs/rec/multi_language/rec_french_lite_train.yml index 0b8659a..0e8f4eb 100644 --- a/configs/rec/multi_language/rec_french_lite_train.yml +++ b/configs/rec/multi_language/rec_french_lite_train.yml @@ -85,7 +85,7 @@ Eval: dataset: name: SimpleDataSet data_dir: ./train_data/ - label_file_list: ["./train_data/train_list.txt"] + label_file_list: ["./train_data/eval_list.txt"] transforms: - DecodeImage: # load image img_mode: BGR From bccf9edf617c8c165d4b452d75485971af08891a Mon Sep 17 00:00:00 2001 From: tink2123 Date: Tue, 8 Dec 2020 19:10:57 +0800 Subject: [PATCH 3/3] update for multi-language --- ppocr/data/imaug/label_ops.py | 4 ++-- ppocr/postprocess/rec_postprocess.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py index 2932f2a..0b1d46c 100644 --- a/ppocr/data/imaug/label_ops.py +++ b/ppocr/data/imaug/label_ops.py @@ -80,7 +80,7 @@ class BaseRecLabelEncode(object): character_type='ch', use_space_char=False): support_character_type = [ - 'ch', 'en', 'en_sensitive', 'french', 'german', 'japan', 'french' + 'ch', 'en', 'en_sensitive', 'french', 'german', 'japan', 'korean' ] assert character_type in support_character_type, "Only {} are supported now but get {}".format( support_character_type, self.character_str) @@ -89,7 +89,7 @@ class BaseRecLabelEncode(object): if character_type == "en": self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" dict_character = list(self.character_str) - elif character_type in ["ch", "french", "german", "japan", "french"]: + elif character_type in ["ch", "french", "german", "japan", "korean"]: self.character_str = "" assert character_dict_path is not None, "character_dict_path should not be None when character_type is ch" with open(character_dict_path, "rb") as fin: diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index 8f6fca9..6943f84 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -24,7 +24,7 @@ class BaseRecLabelDecode(object): character_type='ch', use_space_char=False): support_character_type = [ - 'ch', 'en', 'en_sensitive', 'french', 'german', 'japan', 'french' + 'ch', 'en', 'en_sensitive', 'french', 'german', 'japan', 'korean' ] assert character_type in support_character_type, "Only {} are supported now but get {}".format( support_character_type, self.character_str) @@ -32,7 +32,7 @@ class BaseRecLabelDecode(object): if character_type == "en": self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" dict_character = list(self.character_str) - elif character_type in ["ch", "french", "german", "japan", "french"]: + elif character_type in ["ch", "french", "german", "japan", "korean"]: self.character_str = "" assert character_dict_path is not None, "character_dict_path should not be None when character_type is ch" with open(character_dict_path, "rb") as fin: