parent
eec133ca6a
commit
5316c64776
@ -1,37 +0,0 @@
|
|||||||
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
HOSTS = [
|
|
||||||
"root@192.168.100.17",
|
|
||||||
"root@192.168.100.18",
|
|
||||||
]
|
|
||||||
'''
|
|
||||||
workspace configuration
|
|
||||||
'''
|
|
||||||
#root dir for workspace, can be set to any directory with a real user account
|
|
||||||
ROOT_DIR = "/home/paddle"
|
|
||||||
'''
|
|
||||||
network configuration
|
|
||||||
'''
|
|
||||||
#pserver nics
|
|
||||||
PADDLE_NIC = "eth0"
|
|
||||||
#pserver port
|
|
||||||
PADDLE_PORT = 7164
|
|
||||||
#pserver ports num
|
|
||||||
PADDLE_PORTS_NUM = 2
|
|
||||||
#pserver sparse ports num
|
|
||||||
PADDLE_PORTS_NUM_FOR_SPARSE = 2
|
|
||||||
|
|
||||||
#environments setting for all processes in cluster job
|
|
||||||
LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
|
|
@ -1,82 +0,0 @@
|
|||||||
#!/usr/bin/python
|
|
||||||
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
""" module for launching cluster job """
|
|
||||||
|
|
||||||
import os
|
|
||||||
import argparse
|
|
||||||
import socket
|
|
||||||
import copy
|
|
||||||
import time
|
|
||||||
import signal
|
|
||||||
|
|
||||||
from fabric.api import run, put, settings, env, prefix
|
|
||||||
from fabric.tasks import execute
|
|
||||||
|
|
||||||
#configuration for cluster
|
|
||||||
import conf
|
|
||||||
|
|
||||||
|
|
||||||
def refine_unknown_args(cmd_args):
    '''
    Flatten unrecognised command-line arguments into a plain token list.

    Each element of cmd_args is handled as follows:
      * "--key=value" becomes two tokens, "key" and "value". Only the
        first "=" separates key from value, so the value may itself
        contain "=" or spaces and is kept as a single token.
      * "--key" becomes the single token "key".
      * anything not starting with "--" is passed through unchanged.

    Returns a new list; cmd_args is not modified.
    '''
    new_args = []
    for arg in cmd_args:
        if not arg.startswith("--"):
            new_args.append(arg)
            continue
        # partition() splits on the first "=" only and leaves the value
        # intact, unlike a blanket split(" ") which would break values
        # containing spaces into several tokens.
        key, sep, value = arg.partition("=")
        new_args.append(key.lstrip("-"))
        if sep:
            new_args.append(value)
    return new_args
|
|
||||||
|
|
||||||
|
|
||||||
def kill_process():
    '''
    Kill every process whose `ps aux` entry contains the marker
    paddle_process_by_paddle.

    The shell pipeline, executed through fabric's run(), lists all
    processes, keeps the lines matching the marker, drops the grep
    command itself, extracts the PID column and passes the PIDs to
    kill. Output of the final xargs/kill stage is discarded.
    '''
    pipeline = ("ps aux "
                "| grep paddle_process_by_paddle "
                "| grep -v grep "
                "| awk '{print $2}' "
                "| xargs kill > /dev/null 2>&1")
    run(pipeline)
|
|
||||||
|
|
||||||
|
|
||||||
def job_prepare(jobdir, data=None):
|
|
||||||
'''
|
|
||||||
prepare job related workspace data
|
|
||||||
|
|
||||||
Assuming you already installed PaddlePaddle in all nodes which means
|
|
||||||
PaddlePaddle related bins and dependencies libraries.
|
|
||||||
Assuming the train/test data have already been installed.
|
|
||||||
This function just prepares all related models and other resources
|
|
||||||
needed at runtime.
|
|
||||||
'''
|
|
||||||
|
|
||||||
def job_create_workspace(jobdir, data=None):
|
|
||||||
'''
|
|
||||||
prepare job workspace, common file, etc.
|
|
||||||
'''
|
|
||||||
log = os.path.join(jobdir, "log")
|
|
||||||
if data is not None:
|
|
||||||
#create job dir
|
|
||||||
run('rm ' + jobdir + ' -fr && ' + 'mkdir -p ' + jobdir)
|
|
||||||
#push data and paddle bin
|
|
@ -1,27 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
|
|
||||||
#python paddle.py \
|
|
||||||
# --job_workspace="${PATH_TO_REMOTE_EXISTED_WORKSPACE}" \
|
|
||||||
# --dot_period=10 \
|
|
||||||
# --ports_num_for_sparse=2 \
|
|
||||||
# --log_period=50 \
|
|
||||||
# --num_passes=10 \
|
|
||||||
# --trainer_count=4 \
|
|
||||||
# --saving_period=1 \
|
|
||||||
# --local=0 \
|
|
||||||
# --config=./trainer_config.py \
|
|
||||||
# --save_dir=./output \
|
|
||||||
# --use_gpu=0
|
|
||||||
|
|
||||||
python paddle.py \
|
|
||||||
--job_dispatch_package="${PATH_TO_LOCAL_WORKSPACE}" \
|
|
||||||
--dot_period=10 \
|
|
||||||
--ports_num_for_sparse=2 \
|
|
||||||
--log_period=50 \
|
|
||||||
--num_passes=10 \
|
|
||||||
--trainer_count=4 \
|
|
||||||
--saving_period=1 \
|
|
||||||
--local=0 \
|
|
||||||
--config=./trainer_config.py \
|
|
||||||
--save_dir=./output \
|
|
||||||
--use_gpu=0
|
|
@ -1,39 +0,0 @@
|
|||||||
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
HOSTS = [
|
|
||||||
"root@10.1.9.7",
|
|
||||||
"root@10.1.18.7",
|
|
||||||
"root@10.1.32.9",
|
|
||||||
]
|
|
||||||
'''
|
|
||||||
workspace configuration
|
|
||||||
'''
|
|
||||||
#root dir for workspace, can be set to any directory with a real user account
|
|
||||||
ROOT_DIR = "/root"
|
|
||||||
'''
|
|
||||||
network configuration
|
|
||||||
'''
|
|
||||||
#pserver nics
|
|
||||||
PADDLE_NIC = "eth0"
|
|
||||||
#pserver port
|
|
||||||
PADDLE_PORT = 7164
|
|
||||||
#pserver ports num
|
|
||||||
PADDLE_PORTS_NUM = 1
|
|
||||||
#pserver sparse ports num
|
|
||||||
PADDLE_PORTS_NUM_FOR_SPARSE = 1
|
|
||||||
#whether the trainer uses the GPU
|
|
||||||
PADDLE_USE_GPU = "False"
|
|
||||||
#environments setting for all processes in cluster job
|
|
||||||
LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
|
|
@ -1,11 +0,0 @@
|
|||||||
FROM docker.paddlepaddlehub.com/paddle:0.10.0rc2
|
|
||||||
RUN apt-get update && apt-get install -y openssh-server
|
|
||||||
RUN mkdir /var/run/sshd
|
|
||||||
|
|
||||||
RUN echo 'root:root' |chpasswd
|
|
||||||
|
|
||||||
RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
|
|
||||||
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
|
|
||||||
|
|
||||||
EXPOSE 22
|
|
||||||
CMD ["/usr/sbin/sshd", "-D"]
|
|
@ -1,23 +0,0 @@
|
|||||||
apiVersion: extensions/v1beta1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: ssh-servers
|
|
||||||
spec:
|
|
||||||
replicas: 3
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: ssh-servers
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: ssh-servers
|
|
||||||
image: docker.paddlepaddlehub.com/paddlessh
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 1Gi
|
|
||||||
requests:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 1Gi
|
|
||||||
ports:
|
|
||||||
- containerPort: 22
|
|
@ -1,14 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
python paddle.py \
|
|
||||||
--job_dispatch_package="/root/wuyi/fabric_submit/workspace" \
|
|
||||||
--dot_period=10 \
|
|
||||||
--ports_num_for_sparse=1 \
|
|
||||||
--log_period=50 \
|
|
||||||
--num_passes=5 \
|
|
||||||
--trainer_count=2 \
|
|
||||||
--saving_period=1 \
|
|
||||||
--local=0 \
|
|
||||||
--config=./trainer_config.py \
|
|
||||||
--save_dir=./output \
|
|
||||||
--use_gpu=0
|
|
@ -1,43 +0,0 @@
|
|||||||
# Build this image: docker build -t mpi .
|
|
||||||
#
|
|
||||||
|
|
||||||
FROM paddlepaddle/paddle:0.10.0rc3
|
|
||||||
|
|
||||||
ENV DEBIAN_FRONTEND noninteractive
|
|
||||||
|
|
||||||
RUN apt-get update -y && \
|
|
||||||
apt-get upgrade -y && \
|
|
||||||
apt-get install -y openssh-server zip unzip vim sudo \
|
|
||||||
gcc gfortran openmpi-checkpoint binutils wget curl git openmpi-bin openmpi-common libopenmpi-dev && \
|
|
||||||
pip install mpi4py numpy virtualenv scipy matplotlib lxml sqlalchemy suds ipython obspy && \
|
|
||||||
mkdir /var/run/sshd && \
|
|
||||||
echo 'root:tutorial' | chpasswd && \
|
|
||||||
sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
|
|
||||||
# SSH login fix. Otherwise user is kicked off after login
|
|
||||||
sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
|
|
||||||
echo "export VISIBLE=now" >> /etc/profile && \
|
|
||||||
adduser --disabled-password --gecos "" tutorial && \
|
|
||||||
echo "tutorial ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers && \
|
|
||||||
mkdir /home/tutorial/.ssh/
|
|
||||||
|
|
||||||
ENV HOME /home/tutorial
|
|
||||||
ENV NOTVISIBLE "in users profile"
|
|
||||||
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
# Set-Up SSH with our Github deploy key
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
|
|
||||||
ADD ssh/config /home/tutorial/.ssh/config
|
|
||||||
ADD ssh/id_rsa.mpi /home/tutorial/.ssh/id_rsa
|
|
||||||
ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/id_rsa.pub
|
|
||||||
ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/authorized_keys
|
|
||||||
|
|
||||||
#---------------------------------------------------------------
|
|
||||||
#LD_LIBRARY_PATH
|
|
||||||
#---------------------------------------------------------------
|
|
||||||
|
|
||||||
RUN export LD_LIBRARY_PATH=/usr/lib/openmpi/lib/
|
|
||||||
|
|
||||||
WORKDIR /home/tutorial
|
|
||||||
EXPOSE 22
|
|
||||||
CMD ["/usr/sbin/sshd", "-D"]
|
|
@ -1,25 +0,0 @@
|
|||||||
apiVersion: extensions/v1beta1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: mpi-header
|
|
||||||
labels:
|
|
||||||
app: mpi-header
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: mpi-header
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- image: typhoon1986/paddle-openmpi
|
|
||||||
name : mpi-header
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 2Gi
|
|
||||||
requests:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 2Gi
|
|
||||||
ports:
|
|
||||||
- containerPort: 22
|
|
@ -1,26 +0,0 @@
|
|||||||
apiVersion: extensions/v1beta1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: mpi-nodes
|
|
||||||
labels:
|
|
||||||
app: mpi-nodes
|
|
||||||
spec:
|
|
||||||
replicas: 3
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: mpi-nodes
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- image: typhoon1986/paddle-openmpi
|
|
||||||
name : mpi-nodes
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 2Gi
|
|
||||||
requests:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 2Gi
|
|
||||||
ports:
|
|
||||||
- containerPort: 22
|
|
||||||
imagePullPolicy: Always
|
|
@ -1 +0,0 @@
|
|||||||
StrictHostKeyChecking no
|
|
@ -1,27 +0,0 @@
|
|||||||
-----BEGIN RSA PRIVATE KEY-----
|
|
||||||
MIIEogIBAAKCAQEA7PWLZmgdJ508dD15T6+xqGDvL9Ehzo9SgsnN6xJ+qpUvvOi4
|
|
||||||
1axW0AqR4MnPTg/uuvk+x4tUpuufOW4w22UTGjsdvmIVWa9ujLtcRiN3YPY+SU+Y
|
|
||||||
O5FfqKg7r/hBn+/GMcSoffwSs7vVgmhBBnp/mJh2O1cOAFZEe98/47mbg3/kHBAk
|
|
||||||
36NOQktaU3l48B38EhBTnjWfcEGm1HcTRPFxXV5Wiko6ZhKFEuHcTVKng4ROtUqE
|
|
||||||
mgHyI0aB7TAxg4na0ejItsYWEPWGeDOw6ms/4MwylxNosWzHFPW9p4zgLCLNr+b6
|
|
||||||
bDDfYKjXZflAuTQtQhLmJUwD9uuYLAijpSE2fQIDAQABAoIBADgcgRET8Gt0CV/B
|
|
||||||
OtvKz/f+VEVvcWD3gWNlJDTZIVOFllNWjIZUlA4ZoqenQkbK8Q4nfV1FOht4yjCQ
|
|
||||||
TlN1oMtiWk297i5Zo4UBzPzy4w774I39oh/g8dT/WXr2/5s+7SDV38xNh6Q2A34o
|
|
||||||
79T35wUcfUrZ93/O7dKjb/6d8hx2FMha0wVKqY4lmG1lQE3bbx3kakec0PdvU5kO
|
|
||||||
YHKlpqj3pMR7CpMa+4yL/iXFwWYmnK+uu+zw7JR7PwvH1CzrnvW438wjQ1QmYbSx
|
|
||||||
mHHOE89X67Lsl5hn81qYWBhpwAlBwi1qscsE0cV9GcFyKqWFqZsj5coM9u3CRfvy
|
|
||||||
lrWe1OUCgYEA+LBUFEd3Hxs4sFiYElJ8R9SAs1udaqPvAl01hTEijJLfYlMMVs/y
|
|
||||||
rgNN7j22zjDak2f8QdyMJZX7EZdRmdYcHO0csYOwbYvalzcnwk+U3mxmdD3r4xSo
|
|
||||||
DSvkJ70fogAqUlcVIg2re6fCmZVJQTvMQYTVEM8zQomJRt/Lb2esSfsCgYEA8+zv
|
|
||||||
44aToe8uqiDs4w8guRW7LCDkTw4z4IVo9JUibIaPjaAs5bZEBXSB43EEywXCR75H
|
|
||||||
fML0rU1PVvKh1rqcvZdVzm+XMWVr3asPk0sapaiHaTcmyZvJRDxxqbLFp0zRP1T6
|
|
||||||
cCtXNFdHWU4KiuKrUi6cDyOKchpfkSZa4seiT+cCgYB+n4FgBfdQPlMB70oW4irn
|
|
||||||
g/q32CjxuGCk6oKqu5bkzo+xB6obtavSEFqouIGQwO056tNVUY+GP7Rjg5GH663K
|
|
||||||
yKw4cl3tmS0Gm43B8TVSfw03mKO3rrfWZQe5eCFYIg9qd26KNT2gK435FzsCXQkm
|
|
||||||
PxUhhu6JrW/ZR2/U3Iur6wKBgADrWLAb1ryagSuE+j+U1AO+kDkHWrTtkcZ72jxp
|
|
||||||
v3p3O11GSEUJXdJDcSXhTCpTuDq6/dv7hB6PFwh126RKicKxKlKf2wsFndV1Cpb8
|
|
||||||
hnovW2tLGOtTmfuW2rrQAKyzvmolsNfxYd/BoHQ2thV16z1hDZeFA8WQUeHjKh6G
|
|
||||||
sBbrAoGATdtQlaUxx4izua6k02ihkxx/cRYwDl2N8UDvDBHokS7vJFMX8b8NpsGg
|
|
||||||
zMElnqSpu/pe/0UG7N2MtPF6uyMcX8AZzzcsRkiMkDvWJzYt8Jpf+Eyd/uryF+Yv
|
|
||||||
yrXaOEY83tm6x/fny5ZaZmk8lNth7bfWywuTMkZLX3fYpWtIeE4=
|
|
||||||
-----END RSA PRIVATE KEY-----
|
|
@ -1 +0,0 @@
|
|||||||
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDs9YtmaB0nnTx0PXlPr7GoYO8v0SHOj1KCyc3rEn6qlS+86LjVrFbQCpHgyc9OD+66+T7Hi1Sm6585bjDbZRMaOx2+YhVZr26Mu1xGI3dg9j5JT5g7kV+oqDuv+EGf78YxxKh9/BKzu9WCaEEGen+YmHY7Vw4AVkR73z/juZuDf+QcECTfo05CS1pTeXjwHfwSEFOeNZ9wQabUdxNE8XFdXlaKSjpmEoUS4dxNUqeDhE61SoSaAfIjRoHtMDGDidrR6Mi2xhYQ9YZ4M7Dqaz/gzDKXE2ixbMcU9b2njOAsIs2v5vpsMN9gqNdl+UC5NC1CEuYlTAP265gsCKOlITZ9 oweidner@peahi
|
|
@ -1,32 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# General training configurations
|
|
||||||
|
|
||||||
NICS=eth0
|
|
||||||
PADDLE_INIT_PORT=7164
|
|
||||||
PADDLE_INIT_PORTS_NUM=1
|
|
||||||
PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
|
|
||||||
PADDLE_INIT_PSERVERS=$(cat machines | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n/,/g')
|
|
||||||
PADDLE_INIT_USE_GPU=False
|
|
||||||
|
|
||||||
PADDLE_INIT_NUM_GRADIENT_SERVERS=${OMPI_COMM_WORLD_SIZE}
|
|
||||||
PADDLE_INIT_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
|
|
||||||
PADDLE_CLUSTER_TRAIN=True
|
|
||||||
|
|
||||||
env
|
|
||||||
|
|
||||||
# start pserver
|
|
||||||
stdbuf -oL nohup paddle pserver \
|
|
||||||
--port=$PADDLE_INIT_PORT \
|
|
||||||
--ports_num=$PADDLE_INIT_PORTS_NUM \
|
|
||||||
--ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \
|
|
||||||
--nics=$NICS \
|
|
||||||
--comment=paddle_cluster_pserver \
|
|
||||||
--num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS \
|
|
||||||
&> logs/pserver.log &
|
|
||||||
|
|
||||||
# start trainer
|
|
||||||
# NOTE: train.py will use the above environment variables as configuration
|
|
||||||
python train.py &> logs/train.log
|
|
||||||
|
|
||||||
# kill background pservers when train finishes
|
|
||||||
ps -ef | grep pserver | awk '{print $2}' | xargs kill
|
|
Loading…
Reference in new issue