update infer_rec for srn

5 years ago · bf4863c950
parent 6832ca029f
commit bf4863c950
8 changed files with 197 additions and 161 deletions
--- a/ppocr/data/rec/dataset_traversal.py
+++ b/ppocr/data/rec/dataset_traversal.py
@@ -40,10 +40,12 @@ class LMDBReader(object):
        self.image_shape = params['image_shape']
        self.loss_type = params['loss_type']
        self.max_text_length = params['max_text_length']
-        self.num_heads = params['num_heads']
        self.mode = params['mode']
        self.drop_last = False
        self.use_tps = False
+        self.num_heads = None
+        if "num_heads" in params:
+            self.num_heads = params['num_heads']
        if "tps" in params:
            self.ues_tps = True
        self.use_distort = False
@@ -134,20 +136,6 @@ class LMDBReader(object):
                            tps=self.use_tps,
                            infer_mode=True)
                    yield norm_img
-            #elif self.mode == 'eval':
-            #    image_file_list = get_image_file_list(self.infer_img)
-            #    for single_img in image_file_list:
-            #        img = cv2.imread(single_img)
-            #        if img.shape[-1]==1 or len(list(img.shape))==2:
-            #            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
-            #        norm_img = process_image(
-            #            img=img,
-            #            image_shape=self.image_shape,
-            #            char_ops=self.char_ops,
-            #            tps=self.use_tps,
-            #            infer_mode=True
-            #        )
-            #        yield norm_img
            else:
                lmdb_sets = self.load_hierarchical_lmdb_dataset()
                if process_id == 0:
@@ -169,14 +157,22 @@ class LMDBReader(object):
                            outs = []
                            if self.loss_type == "srn":
                                outs = process_image_srn(
-                                    img, self.image_shape, self.num_heads,
-                                    self.max_text_length, label, self.char_ops,
-                                    self.loss_type)
+                                    img=img,
+                                    image_shape=self.image_shape,
+                                    num_heads=self.num_heads,
+                                    max_text_length=self.max_text_length,
+                                    label=label,
+                                    char_ops=self.char_ops,
+                                    loss_type=self.loss_type)

                            else:
                                outs = process_image(
-                                    img, self.image_shape, label, self.char_ops,
-                                    self.loss_type, self.max_text_length)
+                                    img=img,
+                                    image_shape=self.image_shape,
+                                    label=label,
+                                    char_ops=self.char_ops,
+                                    loss_type=self.loss_type,
+                                    max_text_length=self.max_text_length)
                            if outs is None:
                                continue
                            yield outs
@@ -192,8 +188,9 @@ class LMDBReader(object):
                if len(batch_outs) == self.batch_size:
                    yield batch_outs
                    batch_outs = []
-            if len(batch_outs) != 0:
-                yield batch_outs
+            if not self.drop_last:
+                if len(batch_outs) != 0:
+                    yield batch_outs

        if self.infer_img is None:
            return batch_iter_reader
--- a/ppocr/modeling/architectures/rec_model.py
+++ b/ppocr/modeling/architectures/rec_model.py
@@ -58,7 +58,10 @@ class RecModel(object):
        self.loss_type = global_params['loss_type']
        self.image_shape = global_params['image_shape']
        self.max_text_length = global_params['max_text_length']
-        self.num_heads = global_params["num_heads"]
+        if "num_heads" in params:
+            self.num_heads = global_params["num_heads"]
+        else:
+            self.num_heads = None

    def create_feed(self, mode):
        image_shape = deepcopy(self.image_shape)
--- a/ppocr/modeling/backbones/rec_resnet_vd.py
+++ b/ppocr/modeling/backbones/rec_resnet_vd.py
@@ -32,7 +32,7 @@ class ResNet():
    def __init__(self, params):
        self.layers = params['layers']
        self.is_3x3 = True
-        supported_layers = [18, 34, 50, 101, 152]
+        supported_layers = [18, 34, 50, 101, 152, 200]
        assert self.layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(supported_layers, self.layers)

--- a/ppocr/modeling/heads/rec_srn_all_head.py
+++ b/ppocr/modeling/heads/rec_srn_all_head.py
--- a/ppocr/modeling/losses/rec_srn_loss.py
+++ b/ppocr/modeling/losses/rec_srn_loss.py
@@ -35,24 +35,21 @@ class SRNLoss(object):
        lbl_weight = others['lbl_weight']

        casted_label = fluid.layers.cast(x=label, dtype='int64')
-        cost_word = fluid.layers.cross_entropy(input=word_predict, label=casted_label)
-        cost_gsrm = fluid.layers.cross_entropy(input=gsrm_predict, label=casted_label)
-        cost_vsfd = fluid.layers.cross_entropy(input=predict, label=casted_label)
-
-        #cost_word = cost_word * lbl_weight
-        #cost_gsrm = cost_gsrm * lbl_weight
-        #cost_vsfd = cost_vsfd * lbl_weight
-
-        cost_word = fluid.layers.reshape(x=fluid.layers.reduce_sum(cost_word), shape=[1])
-        cost_gsrm = fluid.layers.reshape(x=fluid.layers.reduce_sum(cost_gsrm), shape=[1])
-        cost_vsfd = fluid.layers.reshape(x=fluid.layers.reduce_sum(cost_vsfd), shape=[1])
-
-        sum_cost = fluid.layers.sum([cost_word, cost_vsfd * 2.0, cost_gsrm * 0.15])
-
-        #sum_cost = fluid.layers.sum([cost_word * 3.0, cost_vsfd, cost_gsrm * 0.15])
-        #sum_cost = cost_word
-
-        #fluid.layers.Print(cost_word,message="word_cost")
-        #fluid.layers.Print(cost_vsfd,message="img_cost")
-        return [sum_cost,cost_vsfd,cost_word]
-        #return [sum_cost, cost_vsfd, cost_word]
+        cost_word = fluid.layers.cross_entropy(
+            input=word_predict, label=casted_label)
+        cost_gsrm = fluid.layers.cross_entropy(
+            input=gsrm_predict, label=casted_label)
+        cost_vsfd = fluid.layers.cross_entropy(
+            input=predict, label=casted_label)
+
+        cost_word = fluid.layers.reshape(
+            x=fluid.layers.reduce_sum(cost_word), shape=[1])
+        cost_gsrm = fluid.layers.reshape(
+            x=fluid.layers.reduce_sum(cost_gsrm), shape=[1])
+        cost_vsfd = fluid.layers.reshape(
+            x=fluid.layers.reduce_sum(cost_vsfd), shape=[1])
+
+        sum_cost = fluid.layers.sum(
+            [cost_word, cost_vsfd * 2.0, cost_gsrm * 0.15])
+
+        return [sum_cost, cost_vsfd, cost_word]
--- a/ppocr/utils/character.py
+++ b/ppocr/utils/character.py
@@ -149,38 +149,29 @@ def cal_predicts_accuracy(char_ops,
    acc = acc_num * 1.0 / img_num
    return acc, acc_num, img_num

+
 def cal_predicts_accuracy_srn(char_ops,
-                          preds,
-                          labels,
-                          max_text_len,
-                          is_debug=False):
+                              preds,
+                              labels,
+                              max_text_len,
+                              is_debug=False):
    acc_num = 0
    img_num = 0

    total_len = preds.shape[0]
    img_num = int(total_len / max_text_len)
-    #print (img_num)
    for i in range(img_num):
        cur_label = []
        cur_pred = []
        for j in range(max_text_len):
-            if labels[j + i * max_text_len] != 37: #0
+            if labels[j + i * max_text_len] != 37:  #0
                cur_label.append(labels[j + i * max_text_len][0])
            else:
                break

-        if is_debug:
-            for j in range(max_text_len):
-                if preds[j + i * max_text_len] != 37: #0
-                    cur_pred.append(preds[j + i * max_text_len][0])
-                else:
-                    break
-            print ("cur_label: ", cur_label)
-            print ("cur_pred: ", cur_pred)
-
-
        for j in range(max_text_len + 1):
-            if j < len(cur_label) and preds[j + i * max_text_len][0] != cur_label[j]:
+            if j < len(cur_label) and preds[j + i * max_text_len][
+                    0] != cur_label[j]:
                break
            elif j == len(cur_label) and j == max_text_len:
                acc_num += 1
--- a/tools/eval_utils/eval_rec_utils.py
+++ b/tools/eval_utils/eval_rec_utils.py
@@ -123,8 +123,8 @@ def eval_rec_run(exe, config, eval_info_dict, mode):

 def test_rec_benchmark(exe, config, eval_info_dict):
    " Evaluate lmdb dataset "
-    eval_data_list = ['IIIT5k_3000', 'SVT', 'IC03_860',  \
-                      'IC13_857', 'IC15_1811', 'IC15_2077','SVTP', 'CUTE80']
+    eval_data_list = ['IIIT5k_3000', 'SVT', 'IC03_860','IC03_867',  \
+                      'IC13_857', 'IC13_1015', 'IC15_1811', 'IC15_2077','SVTP', 'CUTE80']
    eval_data_dir = config['TestReader']['lmdb_sets_dir']
    total_evaluation_data_number = 0
    total_correct_number = 0
--- a/tools/infer_rec.py
+++ b/tools/infer_rec.py
@@ -64,7 +64,6 @@ def main():
    exe = fluid.Executor(place)

    rec_model = create_module(config['Architecture']['function'])(params=config)
-
    startup_prog = fluid.Program()
    eval_prog = fluid.Program()
    with fluid.program_guard(eval_prog, startup_prog):
@@ -86,10 +85,36 @@ def main():
    for i in range(max_img_num):
        logger.info("infer_img:%s" % infer_list[i])
        img = next(blobs)
-        predict = exe.run(program=eval_prog,
-                          feed={"image": img},
-                          fetch_list=fetch_varname_list,
-                          return_numpy=False)
+        if loss_type != "srn":
+            predict = exe.run(program=eval_prog,
+                              feed={"image": img},
+                              fetch_list=fetch_varname_list,
+                              return_numpy=False)
+        else:
+            encoder_word_pos_list = []
+            gsrm_word_pos_list = []
+            gsrm_slf_attn_bias1_list = []
+            gsrm_slf_attn_bias2_list = []
+            encoder_word_pos_list.append(img[1])
+            gsrm_word_pos_list.append(img[2])
+            gsrm_slf_attn_bias1_list.append(img[3])
+            gsrm_slf_attn_bias2_list.append(img[4])
+
+            encoder_word_pos_list = np.concatenate(
+                encoder_word_pos_list, axis=0).astype(np.int64)
+            gsrm_word_pos_list = np.concatenate(
+                gsrm_word_pos_list, axis=0).astype(np.int64)
+            gsrm_slf_attn_bias1_list = np.concatenate(
+                gsrm_slf_attn_bias1_list, axis=0).astype(np.float32)
+            gsrm_slf_attn_bias2_list = np.concatenate(
+                gsrm_slf_attn_bias2_list, axis=0).astype(np.float32)
+
+            predict = exe.run(program=eval_prog, \
+                       feed={'image': img[0], 'encoder_word_pos': encoder_word_pos_list,
+                             'gsrm_word_pos': gsrm_word_pos_list, 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1_list,
+                             'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2_list}, \
+                       fetch_list=fetch_varname_list, \
+                       return_numpy=False)
        if loss_type == "ctc":
            preds = np.array(predict[0])
            preds = preds.reshape(-1)
@@ -114,7 +139,18 @@ def main():
                score = np.mean(probs[0, 1:end_pos[1]])
            preds = preds.reshape(-1)
            preds_text = char_ops.decode(preds)
-
+        elif loss_type == "srn":
+            cur_pred = []
+            preds = np.array(predict[0])
+            preds = preds.reshape(-1)
+            probs = np.array(predict[1])
+            ind = np.argmax(probs, axis=1)
+            valid_ind = np.where(preds != 37)[0]
+            if len(valid_ind) == 0:
+                continue
+            score = np.mean(probs[valid_ind, ind[valid_ind]])
+            preds = preds[:valid_ind[-1] + 1]
+            preds_text = char_ops.decode(preds)
        logger.info("\t index: {}".format(preds))
        logger.info("\t word : {}".format(preds_text))
        logger.info("\t score: {}".format(score))