Supports weight slicing while training with multiple servers

Merge pull request from ZPaC/master-multi-server-weight-slice
Branch: pull/4377/MERGE
Committed by mindspore-ci-bot (via Gitee)
commit a3b8b4c2d6
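The diffs below let each parameter server hold only its own slice of a large weight (such as an embedding table) instead of a full copy. As a rough, self-contained C++ sketch of that partitioning idea only — SliceRange and ComputeSlice are illustrative names, not MindSpore code — an even row split by server rank could look like this:

// Illustrative sketch only: range-partition a table's rows across servers.
#include <cstddef>
#include <iostream>

struct SliceRange {
  size_t begin;  // first row owned by this server
  size_t end;    // one past the last owned row
};

// Split total_rows as evenly as possible across server_num servers;
// any remainder rows go to the lowest-ranked servers.
SliceRange ComputeSlice(size_t total_rows, size_t server_num, size_t rank_id) {
  size_t base = total_rows / server_num;
  size_t extra = total_rows % server_num;
  size_t begin = rank_id * base + (rank_id < extra ? rank_id : extra);
  size_t rows = base + (rank_id < extra ? 1 : 0);
  return {begin, begin + rows};
}

int main() {
  for (size_t rank = 0; rank < 3; ++rank) {
    SliceRange r = ComputeSlice(8000, 3, rank);
    std::cout << "server " << rank << " owns rows [" << r.begin << ", " << r.end << ")\n";
  }
  return 0;
}

With three servers and an 8000-row table, for example, the ranks would own rows [0, 2667), [2667, 5334) and [5334, 8000).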

@@ -257,6 +257,7 @@ void ParameterServer<T>::ServerHandler::HandleInitEmbeddings(const ::ps::KVMeta
                                                               const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res) {
   std::unique_lock<std::mutex> lock(ps_->mutex());
   const Key &key = req_data.keys[0];
+  MS_LOG(INFO) << "Initializing embedding table for key:" << key;
   std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> shapes =
     std::make_shared<std::vector<std::shared_ptr<std::vector<size_t>>>>();
   std::shared_ptr<std::vector<size_t>> input_shape = std::make_shared<std::vector<size_t>>();
@@ -348,6 +349,8 @@ void ParameterServer<T>::InitWeightKeyToOptims(const Key &key, const int &optim_
   }
   weight_key_to_optims_[key] = Util::optimizer_name(optim_id);
   weight_key_to_optim_op_[key] = Util::optimizer_node_name(optim_id);
+  MS_LOG(INFO) << "Initializing optimizer id for key:" << key << ", optimizer name:" << weight_key_to_optims_[key]
+               << ", optimizer op name:" << weight_key_to_optim_op_[key];
 }

 template <typename T>
@@ -355,7 +358,7 @@ void ParameterServer<T>::InitOptimInputsShape(const Keys &keys, const Values &va
   InputsShapePtr inputs_shape = std::make_shared<InputsShape>();
   int val_idx = 0;
   const Key &key = keys[0];
+  MS_LOG(INFO) << "Initializing optimizer inputs shape for key:" << key;
   if (optim_inputs_shape_.count(key) == 0) {
     optim_inputs_shape_[key] = inputs_shape;
   }
@@ -413,7 +416,7 @@ const CNodePtr ParameterServer<T>::GetCNode(const std::string &name) const {
 template <typename T>
 void ParameterServer<T>::InitWeight(const Key &key, const WeightPtr &weight) {
-  MS_LOG(INFO) << "Initializing weight for key " << key;
+  MS_LOG(INFO) << "Initializing weight for key " << key << ", server rank " << rank_id_;
   if ((weights_.count(key) == 0) || (is_embedding_[key] && weights_.count(key) != 0)) {
     weights_[key] = weight;
     tokens_[key] = 0;
@@ -432,7 +435,6 @@ void ParameterServer<T>::InitGrad(const Key &key, const GradPtr &grad) {
 template <typename T>
 void ParameterServer<T>::InitEmbeddingTable(
   const Key &key, const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) {
-  MS_LOG(INFO) << "Initializing embedding table for key " << key;
   std::shared_ptr<PServerKernel> lookup = std::make_shared<kernel::ps::EmbeddingLookUpPSKernel>(rank_id_, pserver_num_);
   lookup->InitKernel(shapes);
   embedding_lookup_ops_[key] = lookup;
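The EmbeddingLookUpPSKernel above is constructed with rank_id_ and pserver_num_, so each server only materializes and serves its own shard of the table. A hedged sketch of how a lookup over such a shard might behave — LocalEmbeddingSlice is a made-up illustration, not the actual kernel:

// Illustrative only: a server owning rows [row_begin, row_end) of a table
// answers lookups for ids in that range by translating them to local offsets.
#include <cstddef>
#include <vector>

class LocalEmbeddingSlice {
 public:
  LocalEmbeddingSlice(size_t row_begin, size_t row_end, size_t dim)
      : row_begin_(row_begin), row_end_(row_end), dim_(dim),
        data_((row_end - row_begin) * dim, 0.0f) {}

  // Fill out[i] with the vector for ids[i] when this server owns it;
  // rows owned by other servers are left zeroed and filled by peers.
  void Lookup(const std::vector<size_t> &ids, std::vector<float> *out) const {
    out->assign(ids.size() * dim_, 0.0f);
    for (size_t i = 0; i < ids.size(); ++i) {
      if (ids[i] < row_begin_ || ids[i] >= row_end_) continue;  // not our slice
      size_t local_row = ids[i] - row_begin_;                   // global -> local
      for (size_t d = 0; d < dim_; ++d) {
        (*out)[i * dim_ + d] = data_[local_row * dim_ + d];
      }
    }
  }

 private:
  size_t row_begin_;
  size_t row_end_;
  size_t dim_;
  std::vector<float> data_;
};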

@@ -89,7 +89,7 @@ void Worker<T>::Run() {
   if (!::ps::IsWorker()) {
     MS_LOG(EXCEPTION) << "The role is not worker.";
   }
-  kv_worker_ = std::make_shared<WorkerProxy<T>>(0, 0, 1);
+  kv_worker_ = std::make_shared<WorkerProxy<T>>(0, 0, 1, 2);
   running_ = true;
 }
@@ -121,7 +121,7 @@ void Worker<T>::Pull(const size_t key, void *dev_addr, const size_t size) {
   while (!kv_worker_->IsReadyForPull(key)) {
     continue;
   }
-  kv_worker_->Wait(kv_worker_->ZPull({key}, &variables));
+  kv_worker_->PullData({key}, &variables);
   auto ret = memcpy_s(dev_addr, size, variables.data(), size);
   if (ret != 0) {
     MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")";
@@ -149,7 +149,7 @@ void Worker<T>::InitPSParamData(const std::vector<size_t> &keys, void *origin_ad
   ::ps::SArray<::ps::Key> key(keys);
   ::ps::SArray<int> lens;
   lens.push_back(addr.size());
-  kv_worker_->Wait(kv_worker_->ZPush(key, addr, lens, kInitWeightsCmd));
+  kv_worker_->PushData(key, addr, lens, kInitWeightsCmd);
   init_keys_[key[0]] = true;
 }
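The two worker hunks above route traffic through PushData/PullData instead of raw ZPush/ZPull, which is where per-server slicing and reassembly would naturally be handled; the former Wait(ZPull(...)) made the blocking explicit, and memcpy_s still copies the assembled buffer afterwards. A hedged sketch of the reassembly step only, assuming each server returns one contiguous chunk in rank order (ConcatSlices is a hypothetical helper, not part of WorkerProxy):

// Illustrative only: stitch per-server chunks back into one flat weight buffer.
#include <cstddef>
#include <vector>

// slices[i] is the contiguous chunk pulled from the server with rank i.
std::vector<float> ConcatSlices(const std::vector<std::vector<float>> &slices) {
  size_t total = 0;
  for (const auto &s : slices) total += s.size();
  std::vector<float> full;
  full.reserve(total);
  for (const auto &s : slices) full.insert(full.end(), s.begin(), s.end());
  return full;
}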
@@ -269,7 +269,6 @@ void Worker<T>::InitPSEmbeddingTable(const std::vector<size_t> &keys, std::vecto
 }
 template <typename T>
-// Initialize parameters and optimizer kernels of Parameter Server.
 void Worker<T>::InitPSParamAndOptim(const std::string &param_name, tensor::TensorPtr tensor) {
   void *param_data = tensor->data_c();
   size_t param_size = LongToSize(tensor->data().nbytes());
@@ -290,6 +289,7 @@ void Worker<T>::InitPSParamAndOptim(const std::string &param_name, tensor::Tenso
   if (!init) {
     MS_LOG(INFO) << "Init paramter and optimizer in parameter server side for " << param_name
                  << ", whether init in server: " << init_in_server;
+    kv_worker_->AddKeyToServerId(param_key);
     if (!init_in_server) {
       InitPSParamData({param_key}, param_data, param_size);
     }
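kv_worker_->AddKeyToServerId(param_key) indicates the worker records which server a parameter key lives on before initializing it there. A minimal sketch of such bookkeeping, assuming a simple round-robin assignment (KeyToServer is a hypothetical class, not the WorkerProxy API):

// Illustrative only: remember which server rank owns each parameter key.
#include <cstddef>
#include <unordered_map>

class KeyToServer {
 public:
  explicit KeyToServer(size_t server_num) : server_num_(server_num) {}

  // Assign keys to servers round-robin, in the order they are first seen.
  void AddKey(size_t key) {
    if (key_to_server_.count(key) == 0) {
      key_to_server_[key] = next_++ % server_num_;
    }
  }

  size_t ServerOf(size_t key) const { return key_to_server_.at(key); }

 private:
  size_t server_num_;
  size_t next_ = 0;
  std::unordered_map<size_t, size_t> key_to_server_;
};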

[File diff suppressed because it is too large]

@@ -99,27 +99,31 @@ then
 fi
 cd ..
 export MS_ROLE=MS_PSERVER
-rm -rf ./server
-mkdir ./server
-cp ../*.py ./server
-cp *.sh ./server
-cp -r ../src ./server
-cd ./server || exit
-if [ $# == 3 ]
-then
-  mpirun --allow-run-as-root -n 1 \
-  python train.py --net=$1 --dataset=$2 --run_distribute=True \
-  --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> server.log &
-fi
-if [ $# == 4 ]
-then
-  mpirun --allow-run-as-root -n 1 \
-  python train.py --net=$1 --dataset=$2 --run_distribute=True \
-  --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> server.log &
-fi
-cd ..
+for((i=0;i<$MS_SERVER_NUM;i++));
+do
+  rm -rf ./server_$i
+  mkdir ./server_$i
+  cp ../*.py ./server_$i
+  cp *.sh ./server_$i
+  cp -r ../src ./server_$i
+  cd ./server_$i || exit
+  if [ $# == 3 ]
+  then
+    mpirun --allow-run-as-root -n 1 \
+    python train.py --net=$1 --dataset=$2 --run_distribute=True \
+    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> server_$i.log &
+  fi
+  if [ $# == 4 ]
+  then
+    mpirun --allow-run-as-root -n 1 \
+    python train.py --net=$1 --dataset=$2 --run_distribute=True \
+    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> server_$i.log &
+  fi
+  cd ..
+done
 export MS_ROLE=MS_WORKER
 rm -rf ./worker

@@ -14,19 +14,22 @@
 # ============================================================================
 import os
-# @pytest.mark.level0
-# @pytest.mark.platform_arm_ascend_training
-# @pytest.mark.platform_x86_ascend_training
-# @pytest.mark.env_single
-def test_multi_worker_full_ps_ascend_lenet():
-    return_code = os.system("bash shell_run_test.sh Ascend 8 1 127.0.0.1 8088")
+def test_ps_ascend_multi_worker_multi_server():
+    return_code = os.system("bash shell_run_test.sh Ascend 8 8 127.0.0.1 8088")
     assert return_code == 0
-# @pytest.mark.level0
-# @pytest.mark.platform_arm_ascend_training
-# @pytest.mark.platform_x86_ascend_training
-# @pytest.mark.env_onecard
-def test_full_ps_ascend_lenet():
+def test_ps_ascend():
     return_code = os.system("bash shell_run_test.sh Ascend 1 1 127.0.0.1 8088")
     assert return_code == 0
+def test_ps_gpu_multi_worker_multi_server():
+    return_code = os.system("bash shell_run_test.sh GPU 8 8 127.0.0.1 8088")
+    assert return_code == 0
+def test_ps_gpu():
+    return_code = os.system("bash shell_run_test.sh GPU 1 1 127.0.0.1 8088")
+    assert return_code == 0

[File diff suppressed because it is too large]

@@ -30,9 +30,7 @@ do
   rm -rf ${execute_path}/sched_$i/
   mkdir ${execute_path}/sched_$i/
   cd ${execute_path}/sched_$i/ || exit
-  export RANK_ID=$i
-  export DEVICE_ID=$i
-  python ${self_path}/../test_multi_worker_full_ps_lenet.py --device_target=$DEVICE_TARGET &
+  python ${self_path}/../test_multi_full_ps.py --device_target=$DEVICE_TARGET &
 done
 export MS_ROLE=MS_PSERVER
@@ -43,10 +41,11 @@ do
   cd ${execute_path}/server_$i/ || exit
   export RANK_ID=$i
   export DEVICE_ID=$i
-  python ${self_path}/../test_multi_worker_full_ps_lenet.py --device_target=$DEVICE_TARGET &
+  python ${self_path}/../test_multi_full_ps.py --device_target=$DEVICE_TARGET &
 done
 export MS_ROLE=MS_WORKER
+if [ $DEVICE_TARGET == "Ascend" ];then
 for((i=0;i<$MS_WORKER_NUM;i++));
 do
   rm -rf ${execute_path}/worker_$i/
@@ -54,8 +53,15 @@ do
   cd ${execute_path}/worker_$i/ || exit
   export RANK_ID=$i
   export DEVICE_ID=$i
-  python ${self_path}/../test_multi_worker_full_ps_lenet.py --device_target=$DEVICE_TARGET &
+  python ${self_path}/../test_multi_full_ps.py --device_target=$DEVICE_TARGET &
 done
+fi
+if [ $DEVICE_TARGET == "GPU" ];then
+  rm -rf ${execute_path}/worker/
+  mkdir ${execute_path}/worker/
+  cd ${execute_path}/worker/ || exit
+  mpirun -n $MS_WORKER_NUM python ${self_path}/../test_multi_full_ps.py --device_target=$DEVICE_TARGET &
+fi
 wait $!
 exit $?

@@ -21,12 +21,16 @@ import mindspore.nn as nn
 from mindspore.common.initializer import TruncatedNormal
 from mindspore import Tensor
 from mindspore.nn import TrainOneStepCell, WithLossCell
+from mindspore.communication.management import init, get_group_size
+# from resnet import resnet50
 parser = argparse.ArgumentParser(description="test_ps_lenet")
 parser.add_argument("--device_target", type=str, default="Ascend")
 args, _ = parser.parse_known_args()
 device_target = args.device_target
 context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
+if device_target == "GPU":
+    init('nccl')
 def conv(in_channels, out_channels, kernel_size, stride=1, padding=0):
@@ -94,7 +98,8 @@ if __name__ == "__main__":
         is_grad=False, sparse=True, reduction="mean"
     )
     net_opt = nn.Momentum(network.trainable_params(), 0.01, 0.9)
+    if device_target == "GPU":
+        context.set_auto_parallel_context(parallel_mode="data_parallel", mirror_mean=True, device_num=get_group_size())
     net_with_criterion = WithLossCell(network, criterion)
     train_network = TrainOneStepCell(net_with_criterion, net_opt)
     train_network.set_train()