!6067 Generate cmd file for distributed_pretrain in bert scripts

Merge pull request !6067 from chenhaozhe/fix-bert-codex
pull/6067/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit 326feab42f

@ -1,19 +1,32 @@
# Contents
- [Contents](#contents)
- [BERT Description](#bert-description)
- [Model Architecture](#model-architecture)
- [Dataset](#dataset)
- [Environment Requirements](#environment-requirements)
- [Quick Start](#quick-start)
- [Script Description](#script-description)
- [Script and Sample Code](#script-and-sample-code)
- [Script Parameters](#script-parameters)
- [Dataset Preparation](#dataset-preparation)
- [Training Process](#training-process)
- [Evaluation Process](#evaluation-process)
- [Model Description](#model-description)
- [Performance](#performance)
- [Training Performance](#training-performance)
- [Evaluation Performance](#evaluation-performance)
- [Script and Sample Code](#script-and-sample-code)
- [Script Parameters](#script-parameters)
- [Pre-Training](#pre-training)
- [Fine-Tuning and Evaluation](#fine-tuning-and-evaluation)
- [Options and Parameters](#options-and-parameters)
- [Options:](#options)
- [Parameters:](#parameters)
- [Training Process](#training-process)
- [Training](#training)
- [Running on Ascend](#running-on-ascend)
- [Distributed Training](#distributed-training)
- [Running on Ascend](#running-on-ascend-1)
- [Evaluation Process](#evaluation-process)
- [Evaluation](#evaluation)
- [evaluation on cola dataset when running on Ascend](#evaluation-on-cola-dataset-when-running-on-ascend)
- [evaluation on cluener dataset when running on Ascend](#evaluation-on-cluener-dataset-when-running-on-ascend)
- [evaluation on squad v1.1 dataset when running on Ascend](#evaluation-on-squad-v11-dataset-when-running-on-ascend)
- [Model Description](#model-description)
- [Performance](#performance)
- [Pretraining Performance](#pretraining-performance)
- [Inference Performance](#inference-performance)
- [Description of Random Situation](#description-of-random-situation)
- [ModelZoo Homepage](#modelzoo-homepage)
@ -139,7 +152,7 @@ For example, the schema file of cn-wiki-128 dataset for pretraining shows as fol
├─ascend_distributed_launcher
├─__init__.py
├─hyper_parameter_config.ini          # hyper parameter for distributed pretraining
├─run_distribute_pretrain.py # script for distributed pretraining
├─get_distribute_pretrain_cmd.py # script for distributed pretraining
├─README.md
├─run_classifier.sh # shell script for standalone classifier task on ascend or gpu
├─run_ner.sh # shell script for standalone NER task on ascend or gpu

@ -5,9 +5,9 @@ The number of D chips can be automatically allocated based on the device_num set
## how to use
For example, if we want to run the distributed training of Bert model on D chip, we can in `/bert/` dir:
For example, if we want to generate the launch command of the distributed training of Bert model on D chip, we can run the following command in `/bert/` dir:
```
python ./scripts/ascend_distributed_launcher/run_distribute_pretrain.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
python ./scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
```
output:
@ -42,7 +42,8 @@ log file dir: ./LOG6/log.txt
1. Note that `hccl_2p_56_x.x.x.x.json` can be generated with [hccl_tools.py](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
2. For hyper parameters, please note that you should customize the script `hyper_parameter_config.ini`. Please note that these three hyper parameters are not allowed to be configured here:
device_id
device_num
- device_id
- device_num
- data_dir
3. For other models, please note that you should customize the option `run_script` and the corresponding `hyper_parameter_config.ini`.

@ -42,11 +42,21 @@ def parse_args():
help="Data path, it is better to use absolute path")
parser.add_argument("--hccl_config_dir", type=str, default="",
help="Hccl config path, it is better to use absolute path")
parser.add_argument("--cmd_file", type=str, default="distributed_cmd.sh",
help="Path of the generated cmd file.")
args = parser.parse_args()
return args
def append_cmd(cmd, s):
    """Return *cmd* with the shell line *s* appended, newline-terminated."""
    return cmd + s + "\n"
def append_cmd_env(cmd, key, value):
    """Return *cmd* with an ``export KEY=VALUE`` shell line appended.

    Fix: the original built the line as ``"export" + str(key)`` with no
    space after ``export``, emitting e.g. ``exportDEVICE_ID=0`` — a plain
    command name, not a valid shell export statement.
    """
    return cmd + "export " + str(key) + "=" + str(value) + "\n"
def distribute_pretrain():
"""
distribute pretrain scripts. The number of D chips can be automatically allocated
@ -92,6 +102,7 @@ def distribute_pretrain():
print("avg_core_per_rank:", avg_core_per_rank)
count = 0
cmd = ""
for instance in this_server["device"]:
device_id = instance["device_id"]
rank_id = instance["rank_id"]
@ -104,39 +115,44 @@ def distribute_pretrain():
end = start + core_gap
cmdopt = str(start) + "-" + str(end)
os.environ["DEVICE_ID"] = device_id
os.environ["RANK_ID"] = rank_id
os.environ["DEPLOY_MODE"] = "0"
os.environ["GE_USE_STATIC_MEMORY"] = "1"
cmd = append_cmd(cmd, "export DEVICE_ID=" + str(device_id))
cmd = append_cmd(cmd, "export RANK_ID=" + str(rank_id))
cmd = append_cmd(cmd, "export DEPLOY_MODE=0")
cmd = append_cmd(cmd, "export GE_USE_STATIC_MEMORY=1")
os.system("rm -rf LOG" + str(device_id))
os.system("mkdir ./LOG" + str(device_id))
os.system("cp *.py ./LOG" + str(device_id))
os.system("mkdir -p ./LOG" + str(device_id) + "/ms_log")
os.system("env > ./LOG" + str(device_id) + "/env.log")
cmd = append_cmd(cmd, "rm -rf LOG" + str(device_id))
cmd = append_cmd(cmd, "mkdir ./LOG" + str(device_id))
cmd = append_cmd(cmd, "cp *.py ./LOG" + str(device_id))
cmd = append_cmd(cmd, "mkdir -p ./LOG" + str(device_id) + "/ms_log")
cmd = append_cmd(cmd, "env > ./LOG" + str(device_id) + "/env.log")
cur_dir = os.getcwd()
os.environ["GLOG_log_dir"] = cur_dir + "/LOG" + str(device_id) + "/ms_log"
os.environ["GLOG_logtostderr"] = "0"
cmd = append_cmd_env(cmd, "GLOG_LOG_DIR", cur_dir + "/LOG" + str(device_id) + "/ms_log")
cmd = append_cmd_env(cmd, "GLOG_logtostderr", "0")
print("core_nums:", cmdopt)
print("epoch_size:", str(cfg['epoch_size']))
print("data_dir:", data_dir)
print("log_file_dir: " + cur_dir + "/LOG" + str(device_id) + "/pretraining_log.txt")
os.chdir(cur_dir + "/LOG" + str(device_id))
cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " "
cmd = append_cmd(cmd, "cd " + cur_dir + "/LOG" + str(device_id))
run_cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " "
opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt):
raise ValueError("hyper_parameter_config.ini can not setting 'device_id',"
" 'device_num' or 'data_dir'! ")
cmd += opt
cmd += " --data_dir=" + data_dir
cmd += ' --device_id=' + str(device_id) + ' --device_num=' \
run_cmd += opt
run_cmd += " --data_dir=" + data_dir
run_cmd += ' --device_id=' + str(device_id) + ' --device_num=' \
+ str(rank_size) + ' >./pretraining_log.txt 2>&1 &'
os.system(cmd)
os.chdir(cur_dir)
cmd = append_cmd(cmd, run_cmd)
cmd = append_cmd(cmd, "cd -")
cmd += "\n"
with open(args.cmd_file, "w") as f:
f.write(cmd)
if __name__ == "__main__":
distribute_pretrain()

@ -24,8 +24,11 @@ echo "For hyper parameter, please note that you should customize the scripts:
echo "=============================================================================================================="
CUR_DIR=`pwd`
python ${CUR_DIR}/scripts/ascend_distributed_launcher/run_distribute_pretrain.py \
python ${CUR_DIR}/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py \
--run_script_dir=${CUR_DIR}/run_pretrain.py \
--hyper_parameter_config_dir=${CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini \
--data_dir=$1 \
--hccl_config_dir=$2
--hccl_config_dir=$2 \
--cmd_file=distributed_cmd.sh
bash distributed_cmd.sh

@ -590,7 +590,7 @@ class BertTrainAccumulateStepsWithLossScaleCell(nn.Cell):
scaling = scaling_sens * self.degree * self.accumulation_steps
grads = self.hyper_map(F.partial(grad_scale, scaling), grads)
if self.enable_global_norm:
grads = ClipByGlobalNorm()(grad)
grads = ClipByGlobalNorm()(grads)
else:
grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
accu_overflow = self.overflow_reducer(accu_overflow)

Loading…
Cancel
Save