You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Paddle/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh

29 lines
914 B

#!/bin/bash
# General trainning configurations
NICS=eth0
PADDLE_INIT_PORT=7164
PADDLE_INIT_PORTS_NUM=1
PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
PADDLE_INIT_PSERVERS=$(cat machines | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n/,/g')
PADDLE_INIT_USE_GPU=False
PADDLE_INIT_NUM_GRADIENT_SERVERS=${OMPI_COMM_WORLD_SIZE}
PADDLE_INIT_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
PADDLE_CLUSTER_TRAIN=True
env
# start pserver
stdbuf -oL nohup paddle pserver --port=$PADDLE_INIT_PORT --ports_num=$PADDLE_INIT_PORTS_NUM \
--ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE --nics=$NICS \
--comment=paddle_cluster_pserver \
--num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS &> logs/pserver.log &
# start trainer
# NOTE: train.py will use the above environment variables as configuration
python train.py &> logs/train.log
# kill background pservers when train finishes
ps -ef | grep pserver | awk '{print $2}' | xargs kill