diff --git a/model_zoo/bert/run_pretrain.py b/model_zoo/bert/run_pretrain.py index 28c021f56f..3646ec9a51 100644 --- a/model_zoo/bert/run_pretrain.py +++ b/model_zoo/bert/run_pretrain.py @@ -21,6 +21,7 @@ import os import argparse import numpy import mindspore.communication.management as D +import mindspore.common.dtype as mstype from mindspore import context from mindspore.train.model import Model from mindspore.train.parallel_utils import ParallelMode @@ -28,6 +29,7 @@ from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell from mindspore.train.callback import Callback, ModelCheckpoint, CheckpointConfig, TimeMonitor from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.nn.optim import Lamb, Momentum, AdamWeightDecayDynamicLR +from mindspore import log as logger from src import BertNetworkWithLoss, BertTrainOneStepCell, BertTrainOneStepWithLossScaleCell from src.dataset import create_bert_dataset from src.config import cfg, bert_net_cfg @@ -55,6 +57,8 @@ class LossCallBack(Callback): def run_pretrain(): """pre-train bert_clue""" parser = argparse.ArgumentParser(description='bert pre_training') + parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'], + help='device where the code will be implemented. 
(Default: Ascend)') parser.add_argument("--distribute", type=str, default="false", help="Run distribute, default is false.") parser.add_argument("--epoch_size", type=int, default="1", help="Epoch size, default is 1.") parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.") @@ -74,11 +78,21 @@ def run_pretrain(): parser.add_argument("--schema_dir", type=str, default="", help="Schema path, it is better to use absolute path") args_opt = parser.parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id) + context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, device_id=args_opt.device_id) context.set_context(reserve_class_name_in_scope=False) + ckpt_save_dir = args_opt.checkpoint_path if args_opt.distribute == "true": - device_num = args_opt.device_num + if args_opt.device_target == 'Ascend': + D.init('hccl') + device_num = args_opt.device_num + rank = args_opt.device_id % device_num + else: + D.init('nccl') + device_num = D.get_group_size() + rank = D.get_rank() + ckpt_save_dir = args_opt.checkpoint_path + 'ckpt_' + str(rank) + '/' + context.reset_auto_parallel_context() context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True, device_num=device_num) @@ -93,12 +107,15 @@ def run_pretrain(): auto_parallel_context().set_all_reduce_fusion_split_indices([30, 90, 150, 210, 270, 330, 390, 421]) else: auto_parallel_context().set_all_reduce_fusion_split_indices([38, 93, 148, 203, 258, 313, 368, 397]) - D.init() - rank = args_opt.device_id % device_num else: rank = 0 device_num = 1 + if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32: + logger.warning('Gpu only support fp32 temporarily, run with fp32.') + bert_net_cfg.compute_type = mstype.float32 + + ds, new_repeat_count = create_bert_dataset(args_opt.epoch_size, device_num, rank, args_opt.do_shuffle, args_opt.enable_data_sink, 
                                               args_opt.data_sink_steps, args_opt.data_dir, args_opt.schema_dir) @@ -130,7 +147,7 @@ def run_pretrain(): if args_opt.enable_save_ckpt == "true": config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps, keep_checkpoint_max=args_opt.save_checkpoint_num) - ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert', config=config_ck) + ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert', directory=ckpt_save_dir, config=config_ck) callback.append(ckpoint_cb) if args_opt.checkpoint_path: diff --git a/model_zoo/bert/scripts/run_distribute_pretrain_for_gpu.sh b/model_zoo/bert/scripts/run_distribute_pretrain_for_gpu.sh new file mode 100644 index 0000000000..db911b8279 --- /dev/null +++ b/model_zoo/bert/scripts/run_distribute_pretrain_for_gpu.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the script as: " +echo "bash run_distribute_pretrain_for_gpu.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR" +echo "for example: bash run_distribute_pretrain_for_gpu.sh 8 40 /path/zh-wiki/ /path/Schema.json" +echo "It is better to use absolute path." 
+echo "==============================================================================================================" + +RANK_SIZE=$1 +EPOCH_SIZE=$2 +DATA_DIR=$3 +SCHEMA_DIR=$4 + +mpirun --allow-run-as-root -n $RANK_SIZE \ + python run_pretrain.py \ + --device_target="GPU" \ + --distribute="true" \ + --epoch_size=$EPOCH_SIZE \ + --enable_save_ckpt="true" \ + --enable_lossscale="false" \ + --do_shuffle="true" \ + --enable_data_sink="true" \ + --data_sink_steps=1 \ + --checkpoint_path="" \ + --save_checkpoint_steps=10000 \ + --save_checkpoint_num=1 \ + --data_dir=$DATA_DIR \ + --schema_dir=$SCHEMA_DIR > log.txt 2>&1 & + diff --git a/model_zoo/bert/scripts/run_standalone_pretrain.sh b/model_zoo/bert/scripts/run_standalone_pretrain.sh index 438dda58c3..d29e04689a 100644 --- a/model_zoo/bert/scripts/run_standalone_pretrain.sh +++ b/model_zoo/bert/scripts/run_standalone_pretrain.sh @@ -37,7 +37,7 @@ python run_pretrain.py \ --enable_lossscale="true" \ --do_shuffle="true" \ --enable_data_sink="true" \ - --data_sink_steps=100 \ + --data_sink_steps=1 \ --checkpoint_path="" \ --save_checkpoint_steps=10000 \ --save_checkpoint_num=1 \ diff --git a/model_zoo/bert/scripts/run_standalone_pretrain_for_gpu.sh b/model_zoo/bert/scripts/run_standalone_pretrain_for_gpu.sh new file mode 100644 index 0000000000..8ec7d60160 --- /dev/null +++ b/model_zoo/bert/scripts/run_standalone_pretrain_for_gpu.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the script as: " +echo "bash run_standalone_pretrain_for_gpu.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR" +echo "for example: bash run_standalone_pretrain_for_gpu.sh 0 40 /path/zh-wiki/ /path/Schema.json" +echo "==============================================================================================================" + +DEVICE_ID=$1 +EPOCH_SIZE=$2 +DATA_DIR=$3 +SCHEMA_DIR=$4 + +export CUDA_VISIBLE_DEVICES=$DEVICE_ID + +mkdir -p ms_log +CUR_DIR=`pwd` +export GLOG_log_dir=${CUR_DIR}/ms_log +export GLOG_logtostderr=0 +python run_pretrain.py \ + --device_target="GPU" \ + --distribute="false" \ + --epoch_size=$EPOCH_SIZE \ + --enable_save_ckpt="true" \ + --enable_lossscale="false" \ + --do_shuffle="true" \ + --enable_data_sink="true" \ + --data_sink_steps=1 \ + --checkpoint_path="" \ + --save_checkpoint_steps=10000 \ + --save_checkpoint_num=1 \ + --data_dir=$DATA_DIR \ + --schema_dir=$SCHEMA_DIR > log.txt 2>&1 &