!10774 add gpt2 model zoo

From: @ouyang_xx
Reviewed-by: 
Signed-off-by:
pull/10774/MERGE
mindspore-ci-bot committed by Gitee
commit ac5371b38f

File diff suppressed because it is too large

@@ -0,0 +1,141 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
CNN & DailyMail train dataset sampler
"""
import os
import sys
import shutil
import argparse
from random import random
from src.utils.tokenization import Tokenizer
def replace_split_word(read_path, output_path, tldr_str="TL;DR:", original_split='\t'):
"""
append tldr str
"""
with open(read_path, "r") as r, open(output_path, "a") as w:
line = r.readline()
while line:
article = line[:line.find(original_split)] + ' ' + tldr_str + ' '
ref = line[line.rfind(original_split) + 1:]
w.write(article + ref)
line = r.readline()
def sample(read_path, out_path, threshold=1.0, max_items=0xFFFFFFF):
"""
sample function
"""
cnt = 0
total_cnt = 0
with open(read_path, "r") as r, open(out_path, "a") as w:
line = r.readline()
while line:
total_cnt += 1
if cnt >= max_items:
break
if random() > threshold:
line = r.readline()
continue
w.write(line)
if (cnt + 1) % 3000 == 0:
print("Now Processed Samples: {}, total: {}".format(cnt, total_cnt))
cnt += 1
line = r.readline()
def clip_article(input_path, out_path, hint, max_length):
"""
clip article that the sample (article + summary) exceed max_length
"""
tokenizer = Tokenizer()
cnt = 0
with open(input_path, "r") as r, open(out_path, "a+") as a:
line = r.readline()
while line:
pos = line.rfind(hint)
article = line[:pos]
summary = line[pos:]
if len(tokenizer.encode(line)) > max_length:
l_article = tokenizer.encode(article)[:max_length - len(tokenizer.encode(summary))]
article = tokenizer.decode(l_article) + " "
if cnt % 1000 == 0:
print(article + summary)
print("==============================")
cnt += 1
a.write(article + summary)
line = r.readline()
def sampler_dataset():
"""
run CNN & DailyMail train dataset sampler
"""
parser = argparse.ArgumentParser()
parser.add_argument("--input_path", type=str, default="",
help="input file path")
parser.add_argument("--output_path", type=str, default="",
help="out file path")
parser.add_argument("--replace_hint", type=str, default="true")
parser.add_argument("--sample", type=str, default="true",
help="do sample? true or false")
parser.add_argument("--max_length", type=int, default=1022,
help="max seq_length of input_raw_dataset")
parser.add_argument("--prob", type=float, default=0.25,
help="sample rate")
parser.add_argument("--max_items", type=int, default=10000,
help="max number of document")
parser.add_argument("--hint", type=str, default="TL:DR;",
help="hint text")
args = parser.parse_args()
# two temp files: one stores the input of each stage, the other stores intermediate results.
temp_file_input = sys.path[0] + '/temp_file1_by_sampler_py.txt'
temp_file_proc = sys.path[0] + '/temp_file2_by_sampler_py.txt'
read_path = args.input_path
output_path = args.output_path
prob = args.prob
max_items = args.max_items
hint = args.hint
max_length = args.max_length
split_str = '\t'
shutil.copyfile(read_path, temp_file_input)
clip_article(temp_file_input, temp_file_proc, hint=split_str, max_length=max_length)
shutil.copyfile(temp_file_proc, temp_file_input)
os.remove(temp_file_proc)
if args.replace_hint.lower() == "true":
replace_split_word(temp_file_input, temp_file_proc, hint, split_str)
shutil.copyfile(temp_file_proc, temp_file_input)
os.remove(temp_file_proc)
if args.sample.lower() == "true":
sample(temp_file_input, temp_file_proc, prob, max_items)
shutil.copyfile(temp_file_proc, temp_file_input)
os.remove(temp_file_proc)
shutil.copyfile(temp_file_input, output_path)
os.remove(temp_file_input)
if __name__ == "__main__":
sampler_dataset()
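The random() > threshold test in sample() is a plain Bernoulli filter with keep probability `threshold`. A minimal, self-contained sketch of the same idea (illustrative names only, not part of this PR):

from random import random

def bernoulli_keep(lines, keep_prob=0.25, max_items=10000):
    """Keep each line with probability keep_prob, up to max_items lines."""
    kept = []
    for line in lines:
        if len(kept) >= max_items:
            break
        if random() <= keep_prob:  # sample() skips when random() > threshold
            kept.append(line)
    return kept

# roughly a quarter of 1000 dummy lines survive
print(len(bernoulli_keep(["line {}\n".format(i) for i in range(1000)], keep_prob=0.25)))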

@@ -0,0 +1,67 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Read weights using TensorFlow:
read the parameters of the pretrained GPT-2 model from a TensorFlow checkpoint
and save them into npy files for MindSpore to load.
*This script is based on the GPT-2 model released by OpenAI.*
"""
import argparse
import tensorflow as tf
import numpy as np
from .trans_dict import trans_dict_tf
def read_weight(ckpt_path):
"""
read weight
Args:
ckpt_path: the path of tensorflow checkpoint
"""
# list the (name, shape) pairs of all variables in the checkpoint
init_vars = tf.train.list_variables(ckpt_path)
# load each variable and save it as an npy file
save_param_num = 0
for name, _ in init_vars:
array = tf.train.load_variable(ckpt_path, name)
# strip the leading 'model/' prefix and replace '/' with '.' so the name matches the keys in trans_dict_tf
name = name[6:].replace(r"/", ".")
if name not in trans_dict_tf.keys():
print(name + " is not in this model")
else:
np.save(trans_dict_tf[name] + ".npy", array)
save_param_num += 1
# each parameter is saved as an individual .npy file named after its MindSpore parameter
print("finished!")
print("save {num} parameters.".format(num=save_param_num))
def main():
parser = argparse.ArgumentParser(description="Read GPT-2 model checkpoint weight")
parser.add_argument("--ckpt_file_path", type=str, default="",
help="The tensorflow GPT-2 model checkpoint file path")
args_opt = parser.parse_args()
ckpt_path = args_opt.ckpt_file_path
read_weight(ckpt_path=ckpt_path)
if __name__ == "__main__":
main()
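For reference, the name normalization in read_weight() drops the leading "model/" prefix and replaces "/" with ".", so the result can be looked up in trans_dict_tf. A small sketch with an invented dictionary entry (the real mapping lives in trans_dict.py; the entry below is an assumption):

# Illustrative only: this mapping entry is assumed, not copied from trans_dict.py.
trans_dict_tf = {"h0.attn.c_attn.w": "gpt2.h0.attn.c_attn.weight"}

tf_name = "model/h0/attn/c_attn/w"
ms_key = tf_name[6:].replace("/", ".")          # drop "model/" and use dots
print(ms_key)                                    # h0.attn.c_attn.w
print(trans_dict_tf.get(ms_key, ms_key + " is not in this model"))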

@@ -0,0 +1,60 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Save weights using MindSpore: load the GPT-2 model parameters from npy files and write them into a MindSpore checkpoint.
The npy files are expected in the current working directory; otherwise, adjust the paths in this script.
"""
import os
import argparse
import numpy as np
from mindspore import Tensor
from mindspore.train.serialization import save_checkpoint
from .trans_dict import trans_dict_tf
def trans_model_parameter(ckpt_name):
"""
transform model parameters
Args:
ckpt_name (str): the name of the transformed checkpoint.
"""
file_names = [name for name in os.listdir() if name.endswith(".npy")]
# to find all file names with suffix '.npy' in the current path.
new_params_list = []
for file_name in file_names:
var_name = file_name[:-4]
param_dict = {"name": var_name, "data": Tensor(np.load(file_name))}
if var_name in trans_dict_tf.values():
new_params_list.append(param_dict)
print(var_name+" has been saved")
save_checkpoint(new_params_list, ckpt_name)
# to load the parameters from npy files and save them as mindspore checkpoint
print("Finished:the parameters have been saved into mindspore checkpoint.")
def main():
parser = argparse.ArgumentParser(description="Read GPT-2 model checkpoint weight")
parser.add_argument("--output_file_name", type=str, default="",
help="The name of output checkpoint name")
args_opt = parser.parse_args()
ckpt_name = args_opt.output_file_name
trans_model_parameter(ckpt_name=ckpt_name)
if __name__ == "__main__":
main()
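Together with the previous script this forms a TensorFlow -> npy -> MindSpore conversion pipeline, which avoids needing both frameworks in the same environment. A minimal sketch of the npy-to-checkpoint step (toy parameter and file name, assumed for illustration):

import numpy as np
from mindspore import Tensor
from mindspore.train.serialization import save_checkpoint

np.save("dense.weight.npy", np.ones((2, 2), dtype=np.float32))   # stand-in for a converted parameter
params = [{"name": "dense.weight", "data": Tensor(np.load("dense.weight.npy"))}]
save_checkpoint(params, "demo.ckpt")                              # writes demo.ckpt in the working directory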

File diff suppressed because it is too large Load Diff

@@ -0,0 +1,148 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""create mindrecord data for Children's Book Test task"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import collections
import logging
import numpy as np
from mindspore.mindrecord import FileWriter
from src.utils.tokenization import Tokenizer
def create_instance(tokenizer, text, max_length=None, num_choice=None):
"""A single sample instance for cbt task."""
text = text.replace(" \t ", "\t ")
sentence = text.strip().split("\t")
context_length = len(tokenizer.encode(sentence[0]))
whole_sentence = sentence[0] + sentence[1]
whole_sentence = whole_sentence.strip()
assert whole_sentence != ""
print(" | whole sentence: ", whole_sentence)
ids = tokenizer.encode(whole_sentence)
input_length = len(ids)
pair_ids = None
output = tokenizer.prepare_for_model(ids=ids,
pair_ids=pair_ids,
add_special_tokens=True,
max_length=max_length,
padding=True,
truncate_direction="RIGHT",
return_overflowing_tokens=False,
return_attention_mask=True)
output["length"] = [context_length + 1] + [input_length + 1]
gold_answer_id = int(sentence[2])
assert gold_answer_id < 10
output["mc_labels"] = gold_answer_id
for name, value in output.items():
print(name)
print(value)
print("==================================")
return output
def write_instance_to_file(writer, instance):
"""write the instance to file"""
input_ids = instance["input_ids"]
input_mask = instance["attention_mask"]
assert len(input_ids) == len(input_mask)
length = instance["length"] # list
mc_labels = instance["mc_labels"]
features = collections.OrderedDict()
features["input_ids"] = np.asarray(input_ids)
features["input_mask"] = np.asarray(input_mask)
features["input_length"] = np.asarray(length)
features["mc_labels"] = mc_labels
writer.write_raw_data([features])
return features
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--input_file", type=str, required=True, default="", help='Input raw text file. ')
parser.add_argument("--output_file", type=str, required=True, default="", help='Output MindRecord file. ')
parser.add_argument("--num_splits", type=int, default=1,
help='Number of partitions the MindRecord file will be split into.')
parser.add_argument("--max_seq_length", type=int, required=True, help='Maximum sequence length. ')
parser.add_argument("--num_choice", type=int, required=True, help='Number of choices. ')
parser.add_argument("--vocab_file", type=str, required=True, default='', help='url of gpt2-vocab.json ')
parser.add_argument("--merge_file", type=str, required=True, default='', help='url of gpt2-merges.txt ')
args = parser.parse_args()
tokenizer = Tokenizer(vocab_file=args.vocab_file, merge_file=args.merge_file)
num_choice = args.num_choice
input_file = args.input_file
logging.info("***** Reading from input files *****")
logging.info("Input File: %s", input_file)
output_file = args.output_file
logging.info("***** Writing to output files *****")
logging.info("Output File: %s", output_file)
writer = FileWriter(output_file, args.num_splits)
data_schema = {"input_ids": {"type": "int64", "shape": [-1]},
"input_mask": {"type": "int64", "shape": [-1]},
"input_length": {"type": "int64", "shape": [-1]},
"mc_labels": {"type": "int64"}
}
writer.add_schema(data_schema, "cbt-schema")
total_written = 0
total_read = 0
logging.info("***** Reading from %s *****", input_file)
with open(input_file, "r") as f:
while True:
line = f.readline()
if not line:
break
total_read += 1
if total_read % 500 == 0:
logging.info("%d ...", total_read)
output = create_instance(tokenizer, line, args.max_seq_length, num_choice)
features = write_instance_to_file(writer, instance=output)
total_written += 1
if total_written <= 20:
logging.info("***** Example *****")
logging.info("input tokens: %s", tokenizer.decode(output["input_ids"][:-1]))
logging.info("label tokens: %s", tokenizer.decode(output["input_ids"][1:]))
for feature_name in features.keys():
feature = features[feature_name]
logging.info("%s: %s", feature_name, feature)
writer.commit()
logging.info("Wrote %d total instances", total_written)
if __name__ == "__main__":
main()
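The MindRecord writing pattern above (add_schema, write_raw_data, commit) can be exercised on its own; a toy sketch with an invented schema and values, not the CBT one:

import numpy as np
from mindspore.mindrecord import FileWriter

writer = FileWriter("toy.mindrecord", 1)        # fails if toy.mindrecord already exists
writer.add_schema({"input_ids": {"type": "int64", "shape": [-1]},
                   "mc_labels": {"type": "int64"}}, "toy-schema")
writer.write_raw_data([{"input_ids": np.asarray([50256, 15496, 995], dtype=np.int64),
                        "mc_labels": 3}])
writer.commit()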

@@ -0,0 +1,140 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""create mindrecord data for LAMBADA task"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import collections
import logging
import numpy as np
from mindspore.mindrecord import FileWriter
from src.utils.tokenization import Tokenizer
def create_instance(tokenizer, text, max_length=None):
"""A single sample instance for LAMBADA task."""
text = text.replace(" \t ", "\t ")
sentence = text.strip().split("\t")
context_length = len(tokenizer.encode(sentence[0]))
whole_sentence = sentence[0] + sentence[1]
whole_sentence = whole_sentence.strip()
assert whole_sentence != ""
print(" | whole sentence: ", whole_sentence)
ids = tokenizer.encode(whole_sentence)
input_length = len(ids)
pair_ids = None
output = tokenizer.prepare_for_model(ids=ids,
pair_ids=pair_ids,
add_special_tokens=True,
max_length=max_length,
padding=True,
truncate_direction="RIGHT",
return_overflowing_tokens=False,
return_attention_mask=True)
# each stored length counts the <bos> token but not the <eos> token
output["length"] = [context_length + 1] + [input_length + 1]
for k, v in output.items():
print(k)
print(v)
print("==================================")
return output
def write_instance_to_file(writer, instance):
"""write the instance to file"""
input_ids = instance["input_ids"]
input_mask = instance["attention_mask"]
assert len(input_ids) == len(input_mask)
length = instance["length"] # list
features = collections.OrderedDict()
features["input_ids"] = np.asarray(input_ids)
features["input_mask"] = np.asarray(input_mask)
features["input_length"] = np.asarray(length)
writer.write_raw_data([features])
return features
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--input_file", type=str, required=True, help='Input raw text file. ')
parser.add_argument("--output_file", type=str, required=True, help='Output MindRecord file. ')
parser.add_argument("--num_splits", type=int, default=1,
help='Number of partitions the MindRecord file will be split into.')
parser.add_argument("--max_seq_length", type=int, required=True, help='Maximum sequence length. ')
parser.add_argument("--vocab_file", type=str, required=True, default='', help='url of gpt2-vocab.json ')
parser.add_argument("--merge_file", type=str, required=True, default='', help='url of gpt2-merges.txt ')
args = parser.parse_args()
tokenizer = Tokenizer(vocab_file=args.vocab_file, merge_file=args.merge_file)
input_file = args.input_file
logging.info("***** Reading from input files *****")
logging.info("Input File: %s", input_file)
output_file = args.output_file
logging.info("***** Writing to output files *****")
logging.info("Output File: %s", output_file)
writer = FileWriter(output_file, args.num_splits)
data_schema = {"input_ids": {"type": "int64", "shape": [-1]},
"input_mask": {"type": "int64", "shape": [-1]},
"input_length": {"type": "int64", "shape": [-1]},
}
writer.add_schema(data_schema, "lambada-schema")
total_written = 0
total_read = 0
logging.info("***** Reading from %s *****", input_file)
with open(input_file, "r") as f:
while True:
line = f.readline()
if not line:
break
total_read += 1
if total_read % 500 == 0:
logging.info("%d ...", total_read)
output = create_instance(tokenizer, line, args.max_seq_length)
features = write_instance_to_file(writer, instance=output)
total_written += 1
if total_written <= 20:
logging.info("***** Example *****")
logging.info("input tokens: %s", tokenizer.decode(output["input_ids"][:-1]))
logging.info("label tokens: %s", tokenizer.decode(output["input_ids"][1:]))
for feature_name in features.keys():
feature = features[feature_name]
logging.info("%s: %s", feature_name, feature)
writer.commit()
logging.info("Wrote %d total instances", total_written)
if __name__ == "__main__":
main()
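The stored pair is [context_length + 1, input_length + 1], both counting the <bos> token, so the slice between the two positions covers exactly the tokens of the passage's final word, which is presumably how the LAMBADA evaluator recovers the prediction target. A toy illustration (made-up lengths and token ids):

length = [7 + 1, 9 + 1]          # toy context_length = 7, input_length = 9, "+1" for <bos>
ids = list(range(10))            # toy token ids; position 0 would be <bos>
print(ids[length[0]:length[1]])  # [8, 9]: the final-word tokens to be predicted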

@@ -0,0 +1,126 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""create mindrecord data for LM task"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import collections
import logging
import numpy as np
from mindspore.mindrecord import FileWriter
from src.utils.tokenization import Tokenizer
def create_instance(tokenizer, text, max_length=None):
"""A single sample instance for LM task."""
sentence = text.strip().split("\t")
ids = tokenizer.encode(sentence[0])
pair_ids = None
if len(sentence) == 2:
pair_ids = tokenizer.encode(sentence[1])
output = tokenizer.prepare_for_model(ids=ids,
pair_ids=pair_ids,
add_special_tokens=True,
max_length=max_length,
padding=True,
truncate_direction="LEFT",
return_overflowing_tokens=False,
return_attention_mask=True)
return output
def write_instance_to_file(writer, instance):
"""write the instance to file"""
input_ids = instance["input_ids"]
input_mask = instance["attention_mask"]
label_ids = instance["input_ids"]
assert len(input_ids) == len(label_ids)
features = collections.OrderedDict()
features["input_ids"] = np.asarray(input_ids)
features["input_mask"] = np.asarray(input_mask)
features["label_ids"] = np.asarray(label_ids)
writer.write_raw_data([features])
return features
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--input_file", type=str, required=True, help='Input raw text file. ')
parser.add_argument("--output_file", type=str, required=True, help='Output MindRecord file. ')
parser.add_argument("--num_splits", type=int, default=1,
help='Number of partitions the MindRecord file will be split into.')
parser.add_argument("--max_seq_length", type=int, required=True, help='Maximum sequence length. ')
parser.add_argument("--vocab_file", type=str, required=True, default='', help='url of gpt2-vocab.json ')
parser.add_argument("--merge_file", type=str, required=True, default='', help='url of gpt2-merges.txt ')
args = parser.parse_args()
tokenizer = Tokenizer(vocab_file=args.vocab_file, merge_file=args.merge_file)
input_file = args.input_file
logging.info("***** Reading from input files *****")
logging.info("Input File: %s", input_file)
output_file = args.output_file
logging.info("***** Writing to output files *****")
logging.info("Output File: %s", output_file)
writer = FileWriter(output_file, args.num_splits)
data_schema = {"input_ids": {"type": "int64", "shape": [-1]},
"input_mask": {"type": "int64", "shape": [-1]},
"label_ids": {"type": "int64", "shape": [-1]}
}
writer.add_schema(data_schema, "lm-schema")
total_written = 0
total_read = 0
logging.info("***** Reading from %s *****", input_file)
with open(input_file, "r") as f:
while True:
line = f.readline()
if not line:
break
total_read += 1
if total_read % 500 == 0:
logging.info("%d ...", total_read)
output = create_instance(tokenizer, line, args.max_seq_length)
features = write_instance_to_file(writer, instance=output)
total_written += 1
if total_written <= 20:
logging.info("***** Example *****")
logging.info("input tokens: %s", tokenizer.decode(output["input_ids"][:-1]))
logging.info("label tokens: %s", tokenizer.decode(output["input_ids"][1:]))
for feature_name in features.keys():
feature = features[feature_name]
logging.info("%s: %s", feature_name, feature)
writer.commit()
logging.info("Wrote %d total instances", total_written)
if __name__ == "__main__":
main()
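Note that label_ids is simply a copy of input_ids; the usual one-token shift for language modeling happens downstream, as the example logging above hints (inputs decode input_ids[:-1], labels decode input_ids[1:]). A toy illustration with made-up token ids:

ids = [101, 7, 42, 13, 99, 102]      # illustrative ids only: <bos> w1 w2 w3 w4 <eos>
inputs, labels = ids[:-1], ids[1:]   # predict token t+1 from the tokens up to t
print(inputs)                        # [101, 7, 42, 13, 99]
print(labels)                        # [7, 42, 13, 99, 102]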

@@ -0,0 +1,130 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""create mindrecord data for Summarization task"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import collections
import logging
import numpy as np
from mindspore.mindrecord import FileWriter
from src.utils import tokenization
def create_instance(tokenizer, text, max_length=None):
"""A single sample instance for Summarization task."""
sentence = text.strip().split("\t")
ids = tokenizer.encode(sentence[0])
pair_ids = None
if len(sentence) == 2:
pair_ids = tokenizer.encode(sentence[1])
if len(sentence) >= 3:
article = sentence[0]
for i in range(1, len(sentence) - 1):
article += sentence[i]
ids = tokenizer.encode(article)
pair_ids = tokenizer.encode(sentence[-1])
output = tokenizer.prepare_for_model(ids=ids,
pair_ids=pair_ids,
add_special_tokens=True,
max_length=max_length,
padding=True,
return_overflowing_tokens=False,
return_attention_mask=True)
return output
def write_instance_to_file(writer, instance):
"""write the instance to file"""
input_ids = instance["input_ids"]
input_mask = instance["attention_mask"]
label_ids = instance["input_ids"]
assert len(input_ids) == len(label_ids)
features = collections.OrderedDict()
features["input_ids"] = np.asarray(input_ids)
features["input_mask"] = np.asarray(input_mask)
features["label_ids"] = np.asarray(label_ids)
writer.write_raw_data([features])
return features
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--input_file", type=str, required=True, help='Input raw text file.')
parser.add_argument("--output_file", type=str, required=True, help='Output MindRecord file.')
parser.add_argument("--num_splits", type=int, default=1,
help='Number of partitions the MindRecord file will be split into.')
parser.add_argument("--max_seq_length", type=int, required=True, help='Maximum sequence length.')
parser.add_argument("--vocab_file", type=str, required=True, default='', help='url of gpt2-vocab.json ')
parser.add_argument("--merge_file", type=str, required=True, default='', help='url of gpt2-merges.txt ')
parser.add_argument("--mode", type=str, required=True, default='cnn_dailymail', help='mode of dataset creation')
args = parser.parse_args()
tokenizer = tokenization.Tokenizer(vocab_file=args.vocab_file, merge_file=args.merge_file, mode=args.mode)
input_file = args.input_file
logging.info("***** Reading from input files *****")
logging.info("Input File: %s", input_file)
output_file = args.output_file
logging.info("***** Writing to output files *****")
logging.info("Output File: %s", output_file)
writer = FileWriter(output_file, args.num_splits)
data_schema = {"input_ids": {"type": "int64", "shape": [-1]},
"input_mask": {"type": "int64", "shape": [-1]},
"label_ids": {"type": "int64", "shape": [-1]}
}
writer.add_schema(data_schema, "wikitext2-schema")
total_written = 0
total_read = 0
logging.info("***** Reading from %s *****", input_file)
with open(input_file, "r") as f:
while True:
line = f.readline()
if not line:
break
total_read += 1
if total_read % 500 == 0:
logging.info("%d ...", total_read)
output = create_instance(tokenizer, line, args.max_seq_length)
features = write_instance_to_file(writer, instance=output)
total_written += 1
if total_written <= 20:
logging.info("***** Example *****")
logging.info("input tokens: %s", tokenizer.decode(output["input_ids"][:-1]))
logging.info("label tokens: %s", tokenizer.decode(output["input_ids"][1:]))
for feature_name in features.keys():
feature = features[feature_name]
logging.info("%s: %s", feature_name, feature)
writer.commit()
logging.info("Wrote %d total instances", total_written)
if __name__ == "__main__":
main()
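When a raw line contains more than one tab, create_instance() above folds everything before the last field into the article (concatenated directly, without re-inserting a separator) and keeps the last field as the summary; a small illustration:

fields = "part one\tpart two\tthe summary".strip().split("\t")
article, summary = "".join(fields[:-1]), fields[-1]
print(article)   # part onepart two   (fields are concatenated directly, as in the loop above)
print(summary)   # the summary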

@@ -0,0 +1,59 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""download the CNN & DailyMail for Summarization task"""
import argparse
from datasets import load_dataset
def generate_txt(url, split_, number=None, version="3.0.0"):
"""
generate txt file of cnn_dailymail dataset
Args:
url (str): directory of dataset txt file.
split_ (str): test or train.
number (int): number of samples to export; -1 exports the whole split
version (str): "3.0.0" by default
"""
cnn = load_dataset("cnn_dailymail", version, split=split_)
if number == -1:
number = len(cnn)
f = open(url + split_ + '.txt', 'w')
for idx in range(number):
article = cnn[idx]['article']
article = article.replace('\n', ' ')
highlights = cnn[idx]['highlights']
highlights = highlights.replace('\n', ' ')
f.write(article + "\t" + highlights + '\n')
f.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Download CNN_Dailymail 3.0.0 using datasets by Huggingface')
parser.add_argument('--dir', type=str, default="", help="directory of dataset")
parser.add_argument('--split', type=str, default='test', help="[test,train]")
parser.add_argument('--num', type=int, default=-1,
help="number of samples to export, in dataset order. "
"If num is -1, the whole split is exported. Default: -1")
args = parser.parse_args()
data_directory = args.dir
split = args.split
num = args.num
generate_txt(url=data_directory, split_=split, number=num)
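Each exported example is one line with the article and the highlights joined by a tab and internal newlines flattened to spaces, which is the format the sampler and the summarization MindRecord script above expect. A toy illustration (invented strings, not real dataset content):

article = "First sentence.\nSecond sentence."
highlights = "Summary line one.\nSummary line two."
line = article.replace("\n", " ") + "\t" + highlights.replace("\n", " ") + "\n"
print(repr(line))   # 'First sentence. Second sentence.\tSummary line one. Summary line two.\n'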

@ -0,0 +1,135 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Evaluation reading comprehension result with additional answer."""
import json
import re
import string
import argparse
from collections import Counter
def get_normalize_answer_token(string_):
"""normalize the answer token, Lower text and remove punctuation, article and extra whitespace"""
def remove_articles(text):
regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
return re.sub(regex, ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(char for char in text if char not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(string_)))).split()
def calculate_f1(pred_answer, gold_answer):
"""
calculate final F1 score with addition answer
"""
f1_score = 0
pred_answer = get_normalize_answer_token(pred_answer)
gold_answer = get_normalize_answer_token(gold_answer)
common = Counter(pred_answer) & Counter(gold_answer)
num_same = sum(common.values())
# the number of same tokens between pred_answer and gold_answer
precision = 1.0 * num_same / len(pred_answer) if pred_answer else 0
recall = 1.0 * num_same / len(gold_answer) if gold_answer else 0
if not pred_answer and not gold_answer:
f1_score = 1
else:
f1_score = 2 * precision * recall / float(precision + recall) if (precision + recall) != 0 else 0.0
return f1_score
def main():
parser = argparse.ArgumentParser(description="All Task dataset preprocessing")
parser.add_argument("--input_file", type=str, default="",
help="The log file path of evaluation in Reading Comprehension. ")
parser.add_argument("--addition_file", type=str, default="", help="Coqa-dev-v1.0.json path")
args_opt = parser.parse_args()
input_file = args_opt.input_file
addition_file = args_opt.addition_file
find_word = 'Pred_answer:'
find_word_length = len(find_word)
pred_answer_list = []
with open(input_file, 'r', encoding='utf-8') as f:
while True:
line = f.readline()
if not line:
break
index = line.find(find_word)
if index != -1:
pred_answer = line[index + find_word_length:].strip()
pred_answer_list.append(pred_answer)
dataset = json.load(open(addition_file))
pred_answer_num = 0
total_f1score = 0
average_f1score = 0
data_num = len(pred_answer_list)
for story in dataset['data']:
questions = story['questions']
multiple_answers = [story['answers']]
multiple_answers += story['additional_answers'].values()
for question in questions:
pred_a = pred_answer_list[pred_answer_num]
turn_id = question['turn_id']
max_score = 0
max_group = 0
flag = 0
for i, answer in enumerate(multiple_answers):
gold_a = answer[turn_id - 1]['input_text']
score = calculate_f1(pred_a, gold_a)
if score > max_score:
max_score = score
max_group = i
# calculate the max score over the multiple answers and record its index.
gold_a = multiple_answers[max_group][turn_id - 1]['input_text']
pred_answer_num += 1
total_f1score += max_score
average_f1score = total_f1score / pred_answer_num
print('==================== data {} ===================='.format(pred_answer_num))
print('| Gold_answer:{}'.format(gold_a))
print('| Pred_answer:{}'.format(pred_a))
print('| F1_Score:{:.8f}'.format(average_f1score))
print('=====================================================\n')
if pred_answer_num >= data_num:
flag = 1
break
# Stop flag
if flag:
print('Finished evaluation with addition answer! \n')
print("********************** Testing Finished **********************")
print('| Test file name: {}'.format(input_file))
print('| Final F1 score: {:.8f}'.format(average_f1score))
print('| Total data num: {}'.format(pred_answer_num))
print("**************************************************************")
break
if __name__ == "__main__":
main()
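A worked example of the token-level F1 above: the prediction "The cat sat." and the gold answer "a cat sat down" normalize to ['cat', 'sat'] and ['cat', 'sat', 'down'], giving precision 1.0, recall 2/3 and F1 0.8. A standalone recomputation using only collections.Counter:

from collections import Counter

pred = ["cat", "sat"]               # normalized prediction tokens
gold = ["cat", "sat", "down"]       # normalized gold tokens
num_same = sum((Counter(pred) & Counter(gold)).values())   # 2 overlapping tokens
precision, recall = num_same / len(pred), num_same / len(gold)
print(2 * precision * recall / (precision + recall))       # 0.8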

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,60 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash scripts/run_cbt.sh"
echo "for example: bash scripts/run_cbt.sh"
echo "metric method: Accuracy"
echo "eval_type include: [zero-shot, finetuned]. Default: zero-shot"
echo "=============================================================================================================="
CUR_DIR=`pwd`
mkdir -p ms_log
output_log="${CUR_DIR}/ms_log/gpt2_cbt.log"
# create file and head line
echo " | Eval log file: " > $output_log
echo $output_log >> $output_log
# checkpoint path
save_finetune_ckpt_path=""
load_pretrain_ckpt_path=""
load_eval_ckpt_path=""
# dataset path
train_data_file_path=""
eval_data_file_path=""
PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
export GLOG_log_dir=${CUR_DIR}/ms_log
export GLOG_logtostderr=0
python ${PROJECT_DIR}/../run_CBT_task.py \
--device_target="Ascend" \
--device_id=4 \
--num_choice=10 \
--metric_method="Accuracy" \
--do_train="false" \
--do_eval="true" \
--eval_type="zero-shot" \
--epoch_num=1 \
--train_data_shuffle="true" \
--eval_data_shuffle="false" \
--save_finetune_ckpt_path=$save_finetune_ckpt_path \
--load_pretrain_ckpt_path=$load_pretrain_ckpt_path \
--load_finetune_ckpt_path=$load_eval_ckpt_path \
--train_data_file_path=$train_data_file_path \
--eval_data_file_path=$eval_data_file_path >> $output_log 2>&1 &

@@ -0,0 +1,68 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash scripts/run_lambada.sh"
echo "for example: bash scripts/run_lambada.sh"
echo "method metric include: [Accuracy, PPL]"
echo "eval_type include: [zero-shot, finetuned]. Default: zero-shot"
echo "=============================================================================================================="
CUR_DIR=`pwd`
mkdir -p ms_log
output_log="${CUR_DIR}/ms_log/gpt2_lambada.log"
# create file and head line
echo " | Eval log file: " > $output_log
echo $output_log >> $output_log
# checkpoint path
save_finetune_ckpt_path=""
load_pretrain_ckpt_path=""
load_eval_ckpt_path=""
# dataset path
train_data_file_path=""
eval_data_file_path=""
# tokenizer path
tokenizer_file_path=""
# stopword path
stop_word_file_path=""
PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
export GLOG_log_dir=${CUR_DIR}/ms_log
export GLOG_logtostderr=0
python ${PROJECT_DIR}/../run_lambada.py \
--device_target="Ascend" \
--device_id=1 \
--metric_method="PPL" \
--do_train="false" \
--do_eval="true" \
--eval_type="zero-shot" \
--epoch_num=1 \
--train_data_shuffle="true" \
--eval_data_shuffle="false" \
--generate_length_dynamically="true" \
--save_finetune_ckpt_path=$save_finetune_ckpt_path \
--load_pretrain_ckpt_path=$load_pretrain_ckpt_path \
--load_finetune_ckpt_path=$load_eval_ckpt_path \
--train_data_file_path=$train_data_file_path \
--eval_data_file_path=$eval_data_file_path \
--tokenizer_file_path=$tokenizer_file_path \
--stop_word_file_path=$stop_word_file_path >> $output_log 2>&1 &

@@ -0,0 +1,59 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash scripts/run_language_model.sh"
echo "for example: bash scripts/run_language_model.sh"
echo "metric method: PPL"
echo "eval_type include: [zero-shot, finetuned]. Default: zero-shot"
echo "=============================================================================================================="
CUR_DIR=`pwd`
mkdir -p ms_log
output_log="${CUR_DIR}/ms_log/gpt2_language_model.log"
# create file and head line
echo " | Eval log file: " > $output_log
echo $output_log >> $output_log
# checkpoint path
save_finetune_ckpt_path=""
load_pretrain_ckpt_path=""
load_eval_ckpt_path=""
# dataset path
train_data_file_path=""
eval_data_file_path=""
PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
export GLOG_log_dir=${CUR_DIR}/ms_log
export GLOG_logtostderr=0
python ${PROJECT_DIR}/../run_language_model.py \
--device_target="Ascend" \
--device_id=4 \
--metric_method="PPL" \
--do_train="false" \
--do_eval="true" \
--eval_type="zero-shot" \
--epoch_num=1 \
--train_data_shuffle="true" \
--eval_data_shuffle="false" \
--save_finetune_ckpt_path=$save_finetune_ckpt_path \
--load_pretrain_ckpt_path=$load_pretrain_ckpt_path \
--load_finetune_ckpt_path=$load_eval_ckpt_path \
--train_data_file_path=$train_data_file_path \
--eval_data_file_path=$eval_data_file_path >> $output_log 2>&1 &

@@ -0,0 +1,67 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash scripts/run_read_comprehension.sh"
echo "for example: bash scripts/run_read_comprehension.sh"
echo "metric method: F1"
echo "eval_type include: [zero-shot, finetuned]. Default: zero-shot"
echo "=============================================================================================================="
CUR_DIR=`pwd`
mkdir -p ms_log
output_log="${CUR_DIR}/ms_log/gpt2_read_comprehension.log"
# create file and head line
echo " | Eval log file: " > $output_log
echo $output_log >> $output_log
# checkpoint path
save_finetune_ckpt_path=""
load_pretrain_ckpt_path=""
load_eval_ckpt_path=""
# dataset path
train_data_file_path=""
eval_data_file_path=""
# tokenizer path
tokenizer_file_path=""
PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
export GLOG_log_dir=${CUR_DIR}/ms_log
export GLOG_logtostderr=0
python ${PROJECT_DIR}/../run_ReadComprehension.py \
--device_target="Ascend" \
--device_id=7 \
--metric_method="F1" \
--do_train="false" \
--do_eval="true" \
--eval_type="zero-shot" \
--epoch_num=1 \
--train_data_shuffle="true" \
--eval_data_shuffle="false" \
--save_finetune_ckpt_path=$save_finetune_ckpt_path \
--load_pretrain_ckpt_path=$load_pretrain_ckpt_path \
--load_finetune_ckpt_path=$load_eval_ckpt_path \
--train_data_file_path=$train_data_file_path \
--eval_data_file_path=$eval_data_file_path \
--tokenizer_file_path=$tokenizer_file_path \
--generate_length=55 \
--top_k=1 \
--top_p="1.0" \
--temperature="1.0" >> $output_log 2>&1 &

@@ -0,0 +1,66 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash scripts/run_summarization.sh"
echo "for example: bash scripts/run_summarization.sh"
echo "eval_load_param_mode include: [zero-shot, finetuned]. Default: finetuned"
echo "=============================================================================================================="
CUR_DIR=`pwd`
mkdir -p ms_log
output_log="${CUR_DIR}/ms_log/gpt2_summarization.log"
# create file and head line
echo " | Eval log file: " > $output_log
echo $output_log >> $output_log
# checkpoint path
save_finetune_ckpt_path=""
load_pretrain_ckpt_path=""
load_eval_ckpt_path=""
# dataset path
train_data_file_path=""
eval_data_file_path=""
# tokenizer path
tokenizer_file_path=""
PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
export GLOG_log_dir=${CUR_DIR}/ms_log
export GLOG_logtostderr=0
python ${PROJECT_DIR}/../run_summarization.py \
--device_target="Ascend" \
--device_id=0 \
--do_train="false" \
--do_eval="true" \
--metric_method="Rouge" \
--epoch_num=1 \
--train_data_shuffle="true" \
--eval_data_shuffle="false" \
--top_k=2 \
--top_p="1.0" \
--generate_length=100 \
--temperature="1.0" \
--eval_type="finetuned" \
--save_finetune_ckpt_path=$save_finetune_ckpt_path \
--load_pretrain_ckpt_path=$load_pretrain_ckpt_path \
--load_finetune_ckpt_path=$load_eval_ckpt_path \
--train_data_file_path=$train_data_file_path \
--eval_data_file_path=$eval_data_file_path \
--tokenizer_file_path=$tokenizer_file_path >> $output_log 2>&1 &

@@ -0,0 +1,67 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash scripts/run_translation.sh"
echo "for example: bash scripts/run_translation.sh"
echo "metric method: BLEU"
echo "eval_type include: [zero-shot, finetuned]. Default: zero-shot"
echo "=============================================================================================================="
CUR_DIR=`pwd`
mkdir -p ms_log
output_log="${CUR_DIR}/ms_log/gpt2_translation.log"
# create file and head line
echo " | Eval log file: " > $output_log
echo $output_log >> $output_log
# checkpoint path
save_finetune_ckpt_path=""
load_pretrain_ckpt_path=""
load_eval_ckpt_path=""
# dataset path
train_data_file_path=""
eval_data_file_path=""
# tokenizer path
tokenizer_file_path=""
PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
export GLOG_log_dir=${CUR_DIR}/ms_log
export GLOG_logtostderr=0
python ${PROJECT_DIR}/../run_translation.py \
--device_target="Ascend" \
--device_id=4 \
--metric_method="BLEU" \
--do_train="false" \
--do_eval="true" \
--eval_type="zero-shot" \
--epoch_num=1 \
--train_data_shuffle="true" \
--eval_data_shuffle="false" \
--save_finetune_ckpt_path=$save_finetune_ckpt_path \
--load_pretrain_ckpt_path=$load_pretrain_ckpt_path \
--load_finetune_ckpt_path=$load_eval_ckpt_path \
--train_data_file_path=$train_data_file_path \
--eval_data_file_path=$eval_data_file_path \
--tokenizer_file_path=$tokenizer_file_path \
--generate_length=100 \
--top_k=1 \
--top_p="1.0" \
--temperature="1.0" >> $output_log 2>&1 &

Some files were not shown because too many files have changed in this diff
