parent
eec133ca6a
commit
5316c64776
@ -1,37 +0,0 @@
|
||||
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
HOSTS = [
|
||||
"root@192.168.100.17",
|
||||
"root@192.168.100.18",
|
||||
]
|
||||
'''
|
||||
workspace configuration
|
||||
'''
|
||||
#root dir for workspace, can be set to any directory with a real user account
|
||||
ROOT_DIR = "/home/paddle"
|
||||
'''
|
||||
network configuration
|
||||
'''
|
||||
#pserver nics
|
||||
PADDLE_NIC = "eth0"
|
||||
#pserver port
|
||||
PADDLE_PORT = 7164
|
||||
#pserver ports num
|
||||
PADDLE_PORTS_NUM = 2
|
||||
#pserver sparse ports num
|
||||
PADDLE_PORTS_NUM_FOR_SPARSE = 2
|
||||
|
||||
#environments setting for all processes in cluster job
|
||||
LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
|
@ -1,82 +0,0 @@
|
||||
#!/usr/bin/python
|
||||
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" module for launching cluster job """
|
||||
|
||||
import os
|
||||
import argparse
|
||||
import socket
|
||||
import copy
|
||||
import time
|
||||
import signal
|
||||
|
||||
from fabric.api import run, put, settings, env, prefix
|
||||
from fabric.tasks import execute
|
||||
|
||||
#configuration for cluster
|
||||
import conf
|
||||
|
||||
|
||||
def refine_unknown_args(cmd_args):
    '''
    Normalize pass-through command-line arguments into a flat token list.

    Each "--key=value" argument becomes two tokens, "key" and "value". The
    split happens at the first "=" only, so the value is kept as one token
    even when it contains further "=" characters or spaces. A bare "--flag"
    only has its leading dashes removed. Any argument that does not start
    with "--" is copied through unchanged.

    Args:
        cmd_args: iterable of argument strings.

    Returns:
        A new list of tokens; cmd_args itself is not modified.
    '''
    new_args = []
    for arg in cmd_args:
        if not arg.startswith("--"):
            new_args.append(arg)
            continue
        # partition() cuts at the first "=" only and never touches the value,
        # so values containing spaces stay a single token.
        key, has_value, value = arg.partition("=")
        new_args.append(key.lstrip("-"))
        if has_value:
            new_args.append(value)
    return new_args
|
||||
|
||||
|
||||
def kill_process():
    '''
    Kill every process whose "ps aux" line mentions paddle_process_by_paddle.

    The shell pipeline is executed through fabric's run(); the output of the
    final kill (stdout and stderr) is discarded.
    '''
    pipeline = ("ps aux "
                "| grep paddle_process_by_paddle "
                "| grep -v grep "
                "| awk '{print $2}' "
                "| xargs kill > /dev/null 2>&1")
    run(pipeline)
|
||||
|
||||
|
||||
def job_prepare(jobdir, data=None):
|
||||
'''
|
||||
prepare job related workspace data
|
||||
|
||||
Assuming you already installed PaddlePaddle in all nodes which means
|
||||
PaddlePaddle related bins and dependencies libraries.
|
||||
Assuming the train/test data have already been installed.
|
||||
This function just prepare all related model and other resources
|
||||
needed at runtime.
|
||||
'''
|
||||
|
||||
def job_create_workspace(jobdir, data=None):
|
||||
'''
|
||||
prepare job workspace, common file, etc.
|
||||
'''
|
||||
log = os.path.join(jobdir, "log")
|
||||
if data is not None:
|
||||
#create job dir
|
||||
run('rm ' + jobdir + ' -fr && ' + 'mkdir -p ' + jobdir)
|
||||
#push data and paddle bin
|
@ -1,27 +0,0 @@
|
||||
#!/bin/sh
|
||||
|
||||
#python paddle.py \
|
||||
# --job_workspace="${PATH_TO_REMOTE_EXISTED_WORKSPACE}" \
|
||||
# --dot_period=10 \
|
||||
# --ports_num_for_sparse=2 \
|
||||
# --log_period=50 \
|
||||
# --num_passes=10 \
|
||||
# --trainer_count=4 \
|
||||
# --saving_period=1 \
|
||||
# --local=0 \
|
||||
# --config=./trainer_config.py \
|
||||
# --save_dir=./output \
|
||||
# --use_gpu=0
|
||||
|
||||
python paddle.py \
|
||||
--job_dispatch_package="${PATH_TO_LOCAL_WORKSPACE}" \
|
||||
--dot_period=10 \
|
||||
--ports_num_for_sparse=2 \
|
||||
--log_period=50 \
|
||||
--num_passes=10 \
|
||||
--trainer_count=4 \
|
||||
--saving_period=1 \
|
||||
--local=0 \
|
||||
--config=./trainer_config.py \
|
||||
--save_dir=./output \
|
||||
--use_gpu=0
|
@ -1,39 +0,0 @@
|
||||
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
HOSTS = [
|
||||
"root@10.1.9.7",
|
||||
"root@10.1.18.7",
|
||||
"root@10.1.32.9",
|
||||
]
|
||||
'''
|
||||
workspace configuration
|
||||
'''
|
||||
#root dir for workspace, can be set to any directory with a real user account
|
||||
ROOT_DIR = "/root"
|
||||
'''
|
||||
network configuration
|
||||
'''
|
||||
#pserver nics
|
||||
PADDLE_NIC = "eth0"
|
||||
#pserver port
|
||||
PADDLE_PORT = 7164
|
||||
#pserver ports num
|
||||
PADDLE_PORTS_NUM = 1
|
||||
#pserver sparse ports num
|
||||
PADDLE_PORTS_NUM_FOR_SPARSE = 1
|
||||
#trainer whether use gpu
|
||||
PADDLE_USE_GPU = "False"
|
||||
#environments setting for all processes in cluster job
|
||||
LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
|
@ -1,11 +0,0 @@
|
||||
FROM docker.paddlepaddlehub.com/paddle:0.10.0rc2
|
||||
RUN apt-get update && apt-get install -y openssh-server
|
||||
RUN mkdir /var/run/sshd
|
||||
|
||||
RUN echo 'root:root' |chpasswd
|
||||
|
||||
RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
|
||||
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
|
||||
|
||||
EXPOSE 22
|
||||
CMD ["/usr/sbin/sshd", "-D"]
|
@ -1,23 +0,0 @@
|
||||
apiVersion: extensions/v1beta1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: ssh-servers
|
||||
spec:
|
||||
replicas: 3
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: ssh-servers
|
||||
spec:
|
||||
containers:
|
||||
- name: ssh-servers
|
||||
image: docker.paddlepaddlehub.com/paddlessh
|
||||
resources:
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 1Gi
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 1Gi
|
||||
ports:
|
||||
- containerPort: 22
|
@ -1,14 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
python paddle.py \
|
||||
--job_dispatch_package="/root/wuyi/fabric_submit/workspace" \
|
||||
--dot_period=10 \
|
||||
--ports_num_for_sparse=1 \
|
||||
--log_period=50 \
|
||||
--num_passes=5 \
|
||||
--trainer_count=2 \
|
||||
--saving_period=1 \
|
||||
--local=0 \
|
||||
--config=./trainer_config.py \
|
||||
--save_dir=./output \
|
||||
--use_gpu=0
|
@ -1,43 +0,0 @@
|
||||
# Build this image: docker build -t mpi .
|
||||
#
|
||||
|
||||
FROM paddlepaddle/paddle:0.10.0rc3
|
||||
|
||||
ENV DEBIAN_FRONTEND noninteractive
|
||||
|
||||
RUN apt-get update -y && \
|
||||
apt-get upgrade -y && \
|
||||
apt-get install -y openssh-server zip unzip vim sudo \
|
||||
gcc gfortran openmpi-checkpoint binutils wget curl git openmpi-bin openmpi-common libopenmpi-dev && \
|
||||
pip install mpi4py numpy virtualenv scipy matplotlib lxml sqlalchemy suds ipython obspy && \
|
||||
mkdir /var/run/sshd && \
|
||||
echo 'root:tutorial' | chpasswd && \
|
||||
sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
|
||||
# SSH login fix. Otherwise user is kicked off after login
|
||||
sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
|
||||
echo "export VISIBLE=now" >> /etc/profile && \
|
||||
adduser --disabled-password --gecos "" tutorial && \
|
||||
echo "tutorial ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers && \
|
||||
mkdir /home/tutorial/.ssh/
|
||||
|
||||
ENV HOME /home/tutorial
|
||||
ENV NOTVISIBLE "in users profile"
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Set-Up SSH with our Github deploy key
|
||||
# ------------------------------------------------------------
|
||||
|
||||
ADD ssh/config /home/tutorial/.ssh/config
|
||||
ADD ssh/id_rsa.mpi /home/tutorial/.ssh/id_rsa
|
||||
ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/id_rsa.pub
|
||||
ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/authorized_keys
|
||||
|
||||
#---------------------------------------------------------------
|
||||
#LD_LIBRARY_PATH
|
||||
#---------------------------------------------------------------
|
||||
|
||||
# Use ENV rather than "RUN export": a variable exported inside a RUN step is
# gone once that step's shell exits, so it never takes effect afterwards.
ENV LD_LIBRARY_PATH /usr/lib/openmpi/lib/
|
||||
|
||||
WORKDIR /home/tutorial
|
||||
EXPOSE 22
|
||||
CMD ["/usr/sbin/sshd", "-D"]
|
@ -1,25 +0,0 @@
|
||||
apiVersion: extensions/v1beta1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: mpi-header
|
||||
labels:
|
||||
app: mpi-header
|
||||
spec:
|
||||
replicas: 1
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: mpi-header
|
||||
spec:
|
||||
containers:
|
||||
- image: typhoon1986/paddle-openmpi
|
||||
name : mpi-header
|
||||
resources:
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 2Gi
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 2Gi
|
||||
ports:
|
||||
- containerPort: 22
|
@ -1,26 +0,0 @@
|
||||
apiVersion: extensions/v1beta1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: mpi-nodes
|
||||
labels:
|
||||
app: mpi-nodes
|
||||
spec:
|
||||
replicas: 3
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: mpi-nodes
|
||||
spec:
|
||||
containers:
|
||||
- image: typhoon1986/paddle-openmpi
|
||||
name : mpi-nodes
|
||||
resources:
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 2Gi
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 2Gi
|
||||
ports:
|
||||
- containerPort: 22
|
||||
imagePullPolicy: Always
|
@ -1 +0,0 @@
|
||||
StrictHostKeyChecking no
|
@ -1,27 +0,0 @@
|
||||
-----BEGIN RSA PRIVATE KEY-----
|
||||
MIIEogIBAAKCAQEA7PWLZmgdJ508dD15T6+xqGDvL9Ehzo9SgsnN6xJ+qpUvvOi4
|
||||
1axW0AqR4MnPTg/uuvk+x4tUpuufOW4w22UTGjsdvmIVWa9ujLtcRiN3YPY+SU+Y
|
||||
O5FfqKg7r/hBn+/GMcSoffwSs7vVgmhBBnp/mJh2O1cOAFZEe98/47mbg3/kHBAk
|
||||
36NOQktaU3l48B38EhBTnjWfcEGm1HcTRPFxXV5Wiko6ZhKFEuHcTVKng4ROtUqE
|
||||
mgHyI0aB7TAxg4na0ejItsYWEPWGeDOw6ms/4MwylxNosWzHFPW9p4zgLCLNr+b6
|
||||
bDDfYKjXZflAuTQtQhLmJUwD9uuYLAijpSE2fQIDAQABAoIBADgcgRET8Gt0CV/B
|
||||
OtvKz/f+VEVvcWD3gWNlJDTZIVOFllNWjIZUlA4ZoqenQkbK8Q4nfV1FOht4yjCQ
|
||||
TlN1oMtiWk297i5Zo4UBzPzy4w774I39oh/g8dT/WXr2/5s+7SDV38xNh6Q2A34o
|
||||
79T35wUcfUrZ93/O7dKjb/6d8hx2FMha0wVKqY4lmG1lQE3bbx3kakec0PdvU5kO
|
||||
YHKlpqj3pMR7CpMa+4yL/iXFwWYmnK+uu+zw7JR7PwvH1CzrnvW438wjQ1QmYbSx
|
||||
mHHOE89X67Lsl5hn81qYWBhpwAlBwi1qscsE0cV9GcFyKqWFqZsj5coM9u3CRfvy
|
||||
lrWe1OUCgYEA+LBUFEd3Hxs4sFiYElJ8R9SAs1udaqPvAl01hTEijJLfYlMMVs/y
|
||||
rgNN7j22zjDak2f8QdyMJZX7EZdRmdYcHO0csYOwbYvalzcnwk+U3mxmdD3r4xSo
|
||||
DSvkJ70fogAqUlcVIg2re6fCmZVJQTvMQYTVEM8zQomJRt/Lb2esSfsCgYEA8+zv
|
||||
44aToe8uqiDs4w8guRW7LCDkTw4z4IVo9JUibIaPjaAs5bZEBXSB43EEywXCR75H
|
||||
fML0rU1PVvKh1rqcvZdVzm+XMWVr3asPk0sapaiHaTcmyZvJRDxxqbLFp0zRP1T6
|
||||
cCtXNFdHWU4KiuKrUi6cDyOKchpfkSZa4seiT+cCgYB+n4FgBfdQPlMB70oW4irn
|
||||
g/q32CjxuGCk6oKqu5bkzo+xB6obtavSEFqouIGQwO056tNVUY+GP7Rjg5GH663K
|
||||
yKw4cl3tmS0Gm43B8TVSfw03mKO3rrfWZQe5eCFYIg9qd26KNT2gK435FzsCXQkm
|
||||
PxUhhu6JrW/ZR2/U3Iur6wKBgADrWLAb1ryagSuE+j+U1AO+kDkHWrTtkcZ72jxp
|
||||
v3p3O11GSEUJXdJDcSXhTCpTuDq6/dv7hB6PFwh126RKicKxKlKf2wsFndV1Cpb8
|
||||
hnovW2tLGOtTmfuW2rrQAKyzvmolsNfxYd/BoHQ2thV16z1hDZeFA8WQUeHjKh6G
|
||||
sBbrAoGATdtQlaUxx4izua6k02ihkxx/cRYwDl2N8UDvDBHokS7vJFMX8b8NpsGg
|
||||
zMElnqSpu/pe/0UG7N2MtPF6uyMcX8AZzzcsRkiMkDvWJzYt8Jpf+Eyd/uryF+Yv
|
||||
yrXaOEY83tm6x/fny5ZaZmk8lNth7bfWywuTMkZLX3fYpWtIeE4=
|
||||
-----END RSA PRIVATE KEY-----
|
@ -1 +0,0 @@
|
||||
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDs9YtmaB0nnTx0PXlPr7GoYO8v0SHOj1KCyc3rEn6qlS+86LjVrFbQCpHgyc9OD+66+T7Hi1Sm6585bjDbZRMaOx2+YhVZr26Mu1xGI3dg9j5JT5g7kV+oqDuv+EGf78YxxKh9/BKzu9WCaEEGen+YmHY7Vw4AVkR73z/juZuDf+QcECTfo05CS1pTeXjwHfwSEFOeNZ9wQabUdxNE8XFdXlaKSjpmEoUS4dxNUqeDhE61SoSaAfIjRoHtMDGDidrR6Mi2xhYQ9YZ4M7Dqaz/gzDKXE2ixbMcU9b2njOAsIs2v5vpsMN9gqNdl+UC5NC1CEuYlTAP265gsCKOlITZ9 oweidner@peahi
|
@ -1,32 +0,0 @@
|
||||
#!/bin/bash
# General training configurations.
#
# The PADDLE_* values are exported because train.py (started below) reads
# them from its environment; plain shell assignments would stay local to this
# script and never reach the child process.

NICS=eth0
export PADDLE_INIT_PORT=7164
export PADDLE_INIT_PORTS_NUM=1
export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
# Comma-separated list built by joining the lines of the "machines" file.
export PADDLE_INIT_PSERVERS=$(cat machines | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n/,/g')
export PADDLE_INIT_USE_GPU=False

# OMPI_COMM_WORLD_SIZE / OMPI_COMM_WORLD_RANK are expected to be provided by
# the Open MPI launcher that starts this script.
export PADDLE_INIT_NUM_GRADIENT_SERVERS=${OMPI_COMM_WORLD_SIZE}
export PADDLE_INIT_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
export PADDLE_CLUSTER_TRAIN=True

# Print the resulting environment to stdout.
env

# start pserver
stdbuf -oL nohup paddle pserver \
  --port=$PADDLE_INIT_PORT \
  --ports_num=$PADDLE_INIT_PORTS_NUM \
  --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \
  --nics=$NICS \
  --comment=paddle_cluster_pserver \
  --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS \
  &> logs/pserver.log &

# start trainer
# NOTE: train.py will use the above environment variables as configuration
python train.py &> logs/train.log

# kill background pservers when train finishes
# Match on the --comment tag passed to the pserver above and drop the grep
# process itself, so unrelated processes whose command line merely contains
# "pserver" are left alone.
ps -ef | grep paddle_cluster_pserver | grep -v grep | awk '{print $2}' | xargs kill
|
Loading…
Reference in new issue