diff --git a/model_zoo/bert/run_pretrain.py b/model_zoo/bert/run_pretrain.py index 28c021f56f..3646ec9a51 100644 --- a/model_zoo/bert/run_pretrain.py +++ b/model_zoo/bert/run_pretrain.py @@ -21,6 +21,7 @@ import os import argparse import numpy import mindspore.communication.management as D +import mindspore.common.dtype as mstype from mindspore import context from mindspore.train.model import Model from mindspore.train.parallel_utils import ParallelMode @@ -28,6 +29,7 @@ from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell from mindspore.train.callback import Callback, ModelCheckpoint, CheckpointConfig, TimeMonitor from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.nn.optim import Lamb, Momentum, AdamWeightDecayDynamicLR +from mindspore import log as logger from src import BertNetworkWithLoss, BertTrainOneStepCell, BertTrainOneStepWithLossScaleCell from src.dataset import create_bert_dataset from src.config import cfg, bert_net_cfg @@ -55,6 +57,8 @@ class LossCallBack(Callback): def run_pretrain(): """pre-train bert_clue""" parser = argparse.ArgumentParser(description='bert pre_training') + parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'], + help='device where the code will be implemented. 
(Default: Ascend)') parser.add_argument("--distribute", type=str, default="false", help="Run distribute, default is false.") parser.add_argument("--epoch_size", type=int, default="1", help="Epoch size, default is 1.") parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.") @@ -74,11 +78,21 @@ def run_pretrain(): parser.add_argument("--schema_dir", type=str, default="", help="Schema path, it is better to use absolute path") args_opt = parser.parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id) + context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, device_id=args_opt.device_id) context.set_context(reserve_class_name_in_scope=False) + ckpt_save_dir = args_opt.checkpoint_path if args_opt.distribute == "true": - device_num = args_opt.device_num + if args_opt.device_target == 'Ascend': + D.init('hccl') + device_num = args_opt.device_num + rank = args_opt.device_id % device_num + else: + D.init('nccl') + device_num = D.get_group_size() + rank = D.get_rank() + ckpt_save_dir = args_opt.checkpoint_path + 'ckpt_' + str(rank) + '/' + context.reset_auto_parallel_context() context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True, device_num=device_num) @@ -93,12 +107,15 @@ def run_pretrain(): auto_parallel_context().set_all_reduce_fusion_split_indices([30, 90, 150, 210, 270, 330, 390, 421]) else: auto_parallel_context().set_all_reduce_fusion_split_indices([38, 93, 148, 203, 258, 313, 368, 397]) - D.init() - rank = args_opt.device_id % device_num else: rank = 0 device_num = 1 + if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32: + logger.warning('Gpu only support fp32 temporarily, run with fp32.') + bert_net_cfg.compute_type = mstype.float32 + + ds, new_repeat_count = create_bert_dataset(args_opt.epoch_size, device_num, rank, args_opt.do_shuffle, args_opt.enable_data_sink, 
                                               args_opt.data_sink_steps, args_opt.data_dir, args_opt.schema_dir) @@ -130,7 +147,7 @@ def run_pretrain(): if args_opt.enable_save_ckpt == "true": config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps, keep_checkpoint_max=args_opt.save_checkpoint_num) - ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert', config=config_ck) + ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert', directory=ckpt_save_dir, config=config_ck) callback.append(ckpoint_cb) if args_opt.checkpoint_path: diff --git a/model_zoo/bert/scripts/run_distribute_pretrain_for_gpu.sh b/model_zoo/bert/scripts/run_distribute_pretrain_for_gpu.sh new file mode 100644 index 0000000000..db911b8279 --- /dev/null +++ b/model_zoo/bert/scripts/run_distribute_pretrain_for_gpu.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the script as: " +echo "bash run_distribute_pretrain_for_gpu.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR" +echo "for example: bash run_distribute_pretrain_for_gpu.sh 8 40 /path/zh-wiki/ /path/Schema.json" +echo "It is better to use absolute path." 
+echo "==============================================================================================================" + +RANK_SIZE=$1 +EPOCH_SIZE=$2 +DATA_DIR=$3 +SCHEMA_DIR=$4 + +mpirun --allow-run-as-root -n $RANK_SIZE \ + python run_pretrain.py \ + --device_target="GPU" \ + --distribute="true" \ + --epoch_size=$EPOCH_SIZE \ + --enable_save_ckpt="true" \ + --enable_lossscale="false" \ + --do_shuffle="true" \ + --enable_data_sink="true" \ + --data_sink_steps=1 \ + --checkpoint_path="" \ + --save_checkpoint_steps=10000 \ + --save_checkpoint_num=1 \ + --data_dir=$DATA_DIR \ + --schema_dir=$SCHEMA_DIR > log.txt 2>&1 & + diff --git a/model_zoo/bert/scripts/run_standalone_pretrain.sh b/model_zoo/bert/scripts/run_standalone_pretrain.sh index 438dda58c3..d29e04689a 100644 --- a/model_zoo/bert/scripts/run_standalone_pretrain.sh +++ b/model_zoo/bert/scripts/run_standalone_pretrain.sh @@ -37,7 +37,7 @@ python run_pretrain.py \ --enable_lossscale="true" \ --do_shuffle="true" \ --enable_data_sink="true" \ - --data_sink_steps=100 \ + --data_sink_steps=1 \ --checkpoint_path="" \ --save_checkpoint_steps=10000 \ --save_checkpoint_num=1 \ diff --git a/model_zoo/bert/scripts/run_standalone_pretrain_for_gpu.sh b/model_zoo/bert/scripts/run_standalone_pretrain_for_gpu.sh new file mode 100644 index 0000000000..8ec7d60160 --- /dev/null +++ b/model_zoo/bert/scripts/run_standalone_pretrain_for_gpu.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the script as: " +echo "bash run_standalone_pretrain_for_gpu.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR" +echo "for example: bash run_standalone_pretrain_for_gpu.sh 0 40 /path/zh-wiki/ /path/Schema.json" +echo "==============================================================================================================" + +DEVICE_ID=$1 +EPOCH_SIZE=$2 +DATA_DIR=$3 +SCHEMA_DIR=$4 + +export CUDA_VISIBLE_DEVICES=$DEVICE_ID + +mkdir -p ms_log +CUR_DIR=`pwd` +export GLOG_log_dir=${CUR_DIR}/ms_log +export GLOG_logtostderr=0 +python run_pretrain.py \ + --device_target="GPU" \ + --distribute="false" \ + --epoch_size=$EPOCH_SIZE \ + --enable_save_ckpt="true" \ + --enable_lossscale="false" \ + --do_shuffle="true" \ + --enable_data_sink="true" \ + --data_sink_steps=1 \ + --checkpoint_path="" \ + --save_checkpoint_steps=10000 \ + --save_checkpoint_num=1 \ + --data_dir=$DATA_DIR \ + --schema_dir=$SCHEMA_DIR > log.txt 2>&1 &