Cluster train doc for v2 API (#2072)
* update cluster train v2 doc * WIP cluster train doc * update * cluster train doc * add TOC for en doc * fix sphix build issue * fix error links * fix link errors * fix image link * polish cluster train docs * update general distributed training document * fix sphinx compile error * fix doc image errorrevert-4814-Add_sequence_project_op
parent
ba5ebe5300
commit
63ffe5250a
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
After Width: | Height: | Size: 142 KiB |
After Width: | Height: | Size: 33 KiB |
@ -0,0 +1,100 @@
|
||||
import gzip
|
||||
import math
|
||||
|
||||
import paddle.v2 as paddle
|
||||
|
||||
embsize = 32
|
||||
hiddensize = 256
|
||||
N = 5
|
||||
|
||||
|
||||
def wordemb(inlayer):
    """Return an embedding layer of width ``embsize`` for ``inlayer``.

    The embedding parameter is always registered under the name "_proj"
    with sparse updates enabled (presumably so every call shares one
    table -- confirm against Paddle's parameter-sharing rules).
    """
    projection_attr = paddle.attr.Param(
        name="_proj",
        initial_std=0.001,
        learning_rate=1,
        l2_rate=0,
        sparse_update=True)
    return paddle.layer.embedding(
        input=inlayer, size=embsize, param_attr=projection_attr)
|
||||
|
||||
|
||||
def main():
    """Build and train the 5-gram word-embedding model on imikolov.

    Four context-word inputs are embedded, concatenated and passed through
    a sigmoid hidden layer and a softmax output that is trained against
    the fifth word. Every 100 batches the parameters are written to a
    gzip-compressed tar file and the model is evaluated on the test set.
    """
    # for local training
    cluster_train = False

    if cluster_train:
        paddle.init(
            use_gpu=False,
            trainer_count=2,
            port=7164,
            ports_num=1,
            ports_num_for_sparse=1,
            num_gradient_servers=1)
    else:
        paddle.init(use_gpu=False, trainer_count=1)

    word_dict = paddle.dataset.imikolov.build_dict()
    dict_size = len(word_dict)

    def word_input(layer_name):
        # One integer word-id input whose range is the dictionary size.
        return paddle.layer.data(
            name=layer_name, type=paddle.data_type.integer_value(dict_size))

    # Created in the same order as the hand-written version: the four
    # context inputs first, then the label input.
    context_words = [
        word_input(slot)
        for slot in ("firstw", "secondw", "thirdw", "fourthw")
    ]
    nextword = word_input("fifthw")

    contextemb = paddle.layer.concat(
        input=[wordemb(word) for word in context_words])

    hidden_weight_attr = paddle.attr.Param(
        initial_std=1. / math.sqrt(embsize * 8), learning_rate=1)
    hidden1 = paddle.layer.fc(
        input=contextemb,
        size=hiddensize,
        act=paddle.activation.Sigmoid(),
        layer_attr=paddle.attr.Extra(drop_rate=0.5),
        bias_attr=paddle.attr.Param(learning_rate=2),
        param_attr=hidden_weight_attr)
    predictword = paddle.layer.fc(
        input=hidden1,
        size=dict_size,
        bias_attr=paddle.attr.Param(learning_rate=2),
        act=paddle.activation.Softmax())

    def event_handler(event):
        # Only act at the end of every 100th batch.
        if not isinstance(event, paddle.event.EndIteration):
            return
        if event.batch_id % 100 != 0:
            return

        snapshot_name = "batch-" + str(event.batch_id) + ".tar.gz"
        with gzip.open(snapshot_name, 'w') as f:
            trainer.save_parameter_to_tar(f)

        test_batches = paddle.batch(
            paddle.dataset.imikolov.test(word_dict, N), 32)
        result = trainer.test(test_batches)
        print("Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
            event.pass_id, event.batch_id, event.cost, event.metrics,
            result.metrics))

    cost = paddle.layer.classification_cost(input=predictword, label=nextword)

    parameters = paddle.parameters.create(cost)
    adagrad = paddle.optimizer.AdaGrad(
        learning_rate=3e-3,
        regularization=paddle.optimizer.L2Regularization(8e-4))
    # `trainer` is looked up by event_handler when it is called, so it only
    # has to exist before training starts.
    trainer = paddle.trainer.SGD(
        cost, parameters, adagrad, is_local=not cluster_train)

    train_batches = paddle.batch(
        paddle.dataset.imikolov.train(word_dict, N), 32)
    trainer.train(
        train_batches, num_passes=30, event_handler=event_handler)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@ -0,0 +1,123 @@
|
||||
import math
|
||||
import os
|
||||
import paddle.v2 as paddle
|
||||
import pickle
|
||||
|
||||
embsize = 32
|
||||
hiddensize = 256
|
||||
N = 5
|
||||
cluster_train_file = "./train_data_dir/train/train.txt"
|
||||
cluster_test_file = "./test_data_dir/test/test.txt"
|
||||
node_id = os.getenv("OMPI_COMM_WORLD_RANK")
|
||||
if not node_id:
|
||||
raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK")
|
||||
|
||||
|
||||
def wordemb(inlayer):
    """Return an embedding layer of width ``embsize`` for ``inlayer``.

    The embedding parameter is always registered under the name "_proj"
    with sparse updates enabled (presumably so every call shares one
    table -- confirm against Paddle's parameter-sharing rules).
    """
    projection_attr = paddle.attr.Param(
        name="_proj",
        initial_std=0.001,
        learning_rate=1,
        l2_rate=0,
        sparse_update=True)
    return paddle.layer.embedding(
        input=inlayer, size=embsize, param_attr=projection_attr)
|
||||
|
||||
|
||||
def cluster_reader_cluster(filename, node_id):
    """Create a reader over one node's shard of a split CSV data file.

    The shard that is opened is ``"<filename>-<NNNNN>"`` where ``NNNNN`` is
    ``node_id`` as a zero-padded 5-digit number (e.g. ``train.txt-00002``).

    Args:
        filename: Path prefix of the split data files.
        node_id: Index of the shard to read; anything ``int()`` accepts.

    Returns:
        A zero-argument generator function. Each call re-opens the shard
        and yields one tuple of ints per non-blank line, parsed from
        comma-separated fields.

    Raises:
        ValueError: (while iterating) if a field is not an integer.
        IOError: (while iterating) if the shard file cannot be opened.
    """

    def cluster_reader():
        with open("-".join([filename, "%05d" % int(node_id)]), "r") as f:
            for line in f:
                # A blank line carries no sample; int("") would raise
                # ValueError and abort the whole pass.
                if not line.strip():
                    continue
                yield tuple(int(cell) for cell in line.split(","))

    return cluster_reader
|
||||
|
||||
|
||||
def main():
    """Train the 5-gram word-embedding model, configured from the environment.

    Reads PADDLE_CLUSTER_TRAIN and the PADDLE_INIT_* variables to decide
    between local and cluster initialisation, loads the pickled word
    dictionary, builds the network (four embedded context words ->
    sigmoid hidden layer -> softmax over the dictionary) and trains it for
    30 passes on this node's training shard. Every 100 batches the model
    is evaluated on this node's test shard and the metrics are printed.
    """
    # get arguments from env

    # for local training
    TRUTH = ["true", "True", "TRUE", "1", "yes", "Yes", "YES"]
    cluster_train = os.getenv('PADDLE_CLUSTER_TRAIN', "False") in TRUTH
    # Parse into a real bool exactly like cluster_train: the raw string
    # "False" is truthy in Python, and the local training script passes a
    # bool for the same argument.
    use_gpu = os.getenv('PADDLE_INIT_USE_GPU', "False") in TRUTH

    if not cluster_train:
        paddle.init(
            use_gpu=use_gpu,
            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")))
    else:
        paddle.init(
            use_gpu=use_gpu,
            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")),
            port=int(os.getenv("PADDLE_INIT_PORT", "7164")),
            ports_num=int(os.getenv("PADDLE_INIT_PORTS_NUM", "1")),
            ports_num_for_sparse=int(
                os.getenv("PADDLE_INIT_PORTS_NUM_FOR_SPARSE", "1")),
            num_gradient_servers=int(
                os.getenv("PADDLE_INIT_NUM_GRADIENT_SERVERS", "1")),
            trainer_id=int(os.getenv("PADDLE_INIT_TRAINER_ID", "0")),
            pservers=os.getenv("PADDLE_INIT_PSERVERS", "127.0.0.1"))

    # NOTE(review): pickle.load can execute arbitrary code; only load a
    # dictionary file that was generated by a trusted job.
    # `with` closes the file even if unpickling fails.
    with open("thirdparty/wuyi_train_thdpty/word_dict.pickle", "r") as fn:
        word_dict = pickle.load(fn)
    dict_size = len(word_dict)

    firstword = paddle.layer.data(
        name="firstw", type=paddle.data_type.integer_value(dict_size))
    secondword = paddle.layer.data(
        name="secondw", type=paddle.data_type.integer_value(dict_size))
    thirdword = paddle.layer.data(
        name="thirdw", type=paddle.data_type.integer_value(dict_size))
    fourthword = paddle.layer.data(
        name="fourthw", type=paddle.data_type.integer_value(dict_size))
    nextword = paddle.layer.data(
        name="fifthw", type=paddle.data_type.integer_value(dict_size))

    Efirst = wordemb(firstword)
    Esecond = wordemb(secondword)
    Ethird = wordemb(thirdword)
    Efourth = wordemb(fourthword)

    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
    hidden1 = paddle.layer.fc(input=contextemb,
                              size=hiddensize,
                              act=paddle.activation.Sigmoid(),
                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
                              bias_attr=paddle.attr.Param(learning_rate=2),
                              param_attr=paddle.attr.Param(
                                  initial_std=1. / math.sqrt(embsize * 8),
                                  learning_rate=1))
    predictword = paddle.layer.fc(input=hidden1,
                                  size=dict_size,
                                  bias_attr=paddle.attr.Param(learning_rate=2),
                                  act=paddle.activation.Softmax())

    def event_handler(event):
        # Every 100th finished batch: evaluate on this node's test shard
        # and report training and testing metrics together.
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                result = trainer.test(
                    paddle.batch(
                        cluster_reader_cluster(cluster_test_file, node_id),
                        32))
                print("Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics,
                    result.metrics))

    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
    parameters = paddle.parameters.create(cost)
    adagrad = paddle.optimizer.AdaGrad(
        learning_rate=3e-3,
        regularization=paddle.optimizer.L2Regularization(8e-4))
    # `trainer` is looked up by event_handler when it is called, so it only
    # has to exist before training starts.
    trainer = paddle.trainer.SGD(cost,
                                 parameters,
                                 adagrad,
                                 is_local=not cluster_train)
    trainer.train(
        paddle.batch(cluster_reader_cluster(cluster_train_file, node_id), 32),
        num_passes=30,
        event_handler=event_handler)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@ -0,0 +1,41 @@
|
||||
import paddle.v2 as paddle
|
||||
import tarfile
|
||||
import os
|
||||
import pickle
|
||||
|
||||
SPLIT_COUNT = 3
|
||||
N = 5
|
||||
|
||||
|
||||
def file_len(fd):
    """Return the number of lines (items) left in the iterable ``fd``.

    ``fd`` is consumed by the count. An empty input yields 0; the previous
    enumerate-based loop never bound its counter in that case and raised
    UnboundLocalError.
    """
    return sum(1 for _ in fd)
|
||||
|
||||
|
||||
def split_from_reader_by_line(filename, reader, split_count):
    """Dump ``reader`` to a CSV file and split it into ``split_count`` shards.

    Every sample produced by ``reader()`` is written to ``filename`` as one
    line of comma-separated ``str()`` values. The lines are then copied, in
    order, into exactly ``split_count`` shard files named
    ``"<filename>-00000"``, ``"<filename>-00001"``, ... (5-digit suffix, the
    naming the cluster reader expects).

    The split is done in Python rather than with the external ``split``
    command: no shell string is built from ``filename``, and a shard file
    exists for every index even when the line count does not divide evenly
    (the old ``total / split_count + 1`` lines-per-file rule could produce
    fewer files than ``split_count``). Shard sizes differ by at most one
    line; the earlier shards receive the extra lines.

    Args:
        filename: Path of the combined output file; also the shard prefix.
        reader: Zero-argument callable returning an iterable of samples,
            each sample being an iterable of values.
        split_count: Number of shard files to create.

    Raises:
        ZeroDivisionError: if ``split_count`` is 0.
    """
    import itertools

    total_line_count = 0
    with open(filename, "w") as combined:
        for sample in reader():
            combined.write(",".join(str(field) for field in sample))
            combined.write("\n")
            total_line_count += 1

    # The first `extra` shards get one line more than `base`.
    base, extra = divmod(total_line_count, split_count)
    with open(filename, "r") as combined:
        for shard_id in range(split_count):
            quota = base + (1 if shard_id < extra else 0)
            with open("%s-%05d" % (filename, shard_id), "w") as shard:
                # islice keeps pulling from the same file iterator, so each
                # shard continues where the previous one stopped.
                shard.writelines(itertools.islice(combined, quota))
|
||||
|
||||
|
||||
word_dict = paddle.dataset.imikolov.build_dict()
|
||||
with open("word_dict.pickle", "w") as dict_f:
|
||||
pickle.dump(word_dict, dict_f)
|
||||
|
||||
split_from_reader_by_line("train.txt",
|
||||
paddle.dataset.imikolov.train(word_dict, N),
|
||||
SPLIT_COUNT)
|
||||
split_from_reader_by_line("test.txt",
|
||||
paddle.dataset.imikolov.test(word_dict, N),
|
||||
SPLIT_COUNT)
|
@ -0,0 +1,39 @@
|
||||
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
HOSTS = [
|
||||
"root@10.1.9.7",
|
||||
"root@10.1.18.7",
|
||||
"root@10.1.32.9",
|
||||
]
|
||||
'''
|
||||
workspace configuration
|
||||
'''
|
||||
# root dir for the workspace; can be set to any directory owned by a real user account
|
||||
ROOT_DIR = "/root"
|
||||
'''
|
||||
network configuration
|
||||
'''
|
||||
#pserver nics
|
||||
PADDLE_NIC = "eth0"
|
||||
#pserver port
|
||||
PADDLE_PORT = 7164
|
||||
#pserver ports num
|
||||
PADDLE_PORTS_NUM = 1
|
||||
#pserver sparse ports num
|
||||
PADDLE_PORTS_NUM_FOR_SPARSE = 1
|
||||
#trainer whether use gpu
|
||||
PADDLE_USE_GPU = "False"
|
||||
#environments setting for all processes in cluster job
|
||||
LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
|
@ -0,0 +1,11 @@
|
||||
FROM docker.paddlepaddlehub.com/paddle:0.10.0rc2
|
||||
RUN apt-get update && apt-get install -y openssh-server
|
||||
RUN mkdir /var/run/sshd
|
||||
|
||||
RUN echo 'root:root' |chpasswd
|
||||
|
||||
RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
|
||||
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
|
||||
|
||||
EXPOSE 22
|
||||
CMD ["/usr/sbin/sshd", "-D"]
|
@ -0,0 +1,23 @@
|
||||
apiVersion: extensions/v1beta1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: ssh-servers
|
||||
spec:
|
||||
replicas: 3
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: ssh-servers
|
||||
spec:
|
||||
containers:
|
||||
- name: ssh-servers
|
||||
image: docker.paddlepaddlehub.com/paddlessh
|
||||
resources:
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 1Gi
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 1Gi
|
||||
ports:
|
||||
- containerPort: 22
|
@ -0,0 +1,14 @@
|
||||
#!/bin/bash
|
||||
|
||||
python paddle.py \
|
||||
--job_dispatch_package="/root/wuyi/fabric_submit/workspace" \
|
||||
--dot_period=10 \
|
||||
--ports_num_for_sparse=1 \
|
||||
--log_period=50 \
|
||||
--num_passes=5 \
|
||||
--trainer_count=2 \
|
||||
--saving_period=1 \
|
||||
--local=0 \
|
||||
--config=./trainer_config.py \
|
||||
--save_dir=./output \
|
||||
--use_gpu=0
|
@ -0,0 +1,43 @@
|
||||
# Build this image: docker build -t mpi .
|
||||
#
|
||||
|
||||
FROM paddledev/paddle:0.10.0rc3
|
||||
|
||||
ENV DEBIAN_FRONTEND noninteractive
|
||||
|
||||
RUN apt-get update -y && \
|
||||
apt-get upgrade -y && \
|
||||
apt-get install -y openssh-server zip unzip vim sudo \
|
||||
gcc gfortran openmpi-checkpoint binutils wget curl git openmpi-bin openmpi-common libopenmpi-dev && \
|
||||
pip install mpi4py numpy virtualenv scipy matplotlib lxml sqlalchemy suds ipython obspy && \
|
||||
mkdir /var/run/sshd && \
|
||||
echo 'root:tutorial' | chpasswd && \
|
||||
sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
|
||||
# SSH login fix. Otherwise user is kicked off after login
|
||||
sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
|
||||
echo "export VISIBLE=now" >> /etc/profile && \
|
||||
adduser --disabled-password --gecos "" tutorial && \
|
||||
echo "tutorial ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers && \
|
||||
mkdir /home/tutorial/.ssh/
|
||||
|
||||
ENV HOME /home/tutorial
|
||||
ENV NOTVISIBLE "in users profile"
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Set-Up SSH with our Github deploy key
|
||||
# ------------------------------------------------------------
|
||||
|
||||
ADD ssh/config /home/tutorial/.ssh/config
|
||||
ADD ssh/id_rsa.mpi /home/tutorial/.ssh/id_rsa
|
||||
ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/id_rsa.pub
|
||||
ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/authorized_keys
|
||||
|
||||
#---------------------------------------------------------------
|
||||
#LD_LIBRARY_PATH
|
||||
#---------------------------------------------------------------
|
||||
|
||||
# `RUN export ...` only affects the shell of that single build step; ENV
# records the variable in the image so later steps and containers see it.
ENV LD_LIBRARY_PATH /usr/lib/openmpi/lib/
|
||||
|
||||
WORKDIR /home/tutorial
|
||||
EXPOSE 22
|
||||
CMD ["/usr/sbin/sshd", "-D"]
|
@ -0,0 +1,25 @@
|
||||
apiVersion: extensions/v1beta1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: mpi-header
|
||||
labels:
|
||||
app: mpi-header
|
||||
spec:
|
||||
replicas: 1
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: mpi-header
|
||||
spec:
|
||||
containers:
|
||||
- image: typhoon1986/paddle-openmpi
|
||||
name : mpi-header
|
||||
resources:
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 2Gi
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 2Gi
|
||||
ports:
|
||||
- containerPort: 22
|
@ -0,0 +1,26 @@
|
||||
apiVersion: extensions/v1beta1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: mpi-nodes
|
||||
labels:
|
||||
app: mpi-nodes
|
||||
spec:
|
||||
replicas: 3
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: mpi-nodes
|
||||
spec:
|
||||
containers:
|
||||
- image: typhoon1986/paddle-openmpi
|
||||
name : mpi-nodes
|
||||
resources:
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 2Gi
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 2Gi
|
||||
ports:
|
||||
- containerPort: 22
|
||||
imagePullPolicy: Always
|
@ -0,0 +1 @@
|
||||
StrictHostKeyChecking no
|
@ -0,0 +1,27 @@
|
||||
-----BEGIN RSA PRIVATE KEY-----
|
||||
MIIEogIBAAKCAQEA7PWLZmgdJ508dD15T6+xqGDvL9Ehzo9SgsnN6xJ+qpUvvOi4
|
||||
1axW0AqR4MnPTg/uuvk+x4tUpuufOW4w22UTGjsdvmIVWa9ujLtcRiN3YPY+SU+Y
|
||||
O5FfqKg7r/hBn+/GMcSoffwSs7vVgmhBBnp/mJh2O1cOAFZEe98/47mbg3/kHBAk
|
||||
36NOQktaU3l48B38EhBTnjWfcEGm1HcTRPFxXV5Wiko6ZhKFEuHcTVKng4ROtUqE
|
||||
mgHyI0aB7TAxg4na0ejItsYWEPWGeDOw6ms/4MwylxNosWzHFPW9p4zgLCLNr+b6
|
||||
bDDfYKjXZflAuTQtQhLmJUwD9uuYLAijpSE2fQIDAQABAoIBADgcgRET8Gt0CV/B
|
||||
OtvKz/f+VEVvcWD3gWNlJDTZIVOFllNWjIZUlA4ZoqenQkbK8Q4nfV1FOht4yjCQ
|
||||
TlN1oMtiWk297i5Zo4UBzPzy4w774I39oh/g8dT/WXr2/5s+7SDV38xNh6Q2A34o
|
||||
79T35wUcfUrZ93/O7dKjb/6d8hx2FMha0wVKqY4lmG1lQE3bbx3kakec0PdvU5kO
|
||||
YHKlpqj3pMR7CpMa+4yL/iXFwWYmnK+uu+zw7JR7PwvH1CzrnvW438wjQ1QmYbSx
|
||||
mHHOE89X67Lsl5hn81qYWBhpwAlBwi1qscsE0cV9GcFyKqWFqZsj5coM9u3CRfvy
|
||||
lrWe1OUCgYEA+LBUFEd3Hxs4sFiYElJ8R9SAs1udaqPvAl01hTEijJLfYlMMVs/y
|
||||
rgNN7j22zjDak2f8QdyMJZX7EZdRmdYcHO0csYOwbYvalzcnwk+U3mxmdD3r4xSo
|
||||
DSvkJ70fogAqUlcVIg2re6fCmZVJQTvMQYTVEM8zQomJRt/Lb2esSfsCgYEA8+zv
|
||||
44aToe8uqiDs4w8guRW7LCDkTw4z4IVo9JUibIaPjaAs5bZEBXSB43EEywXCR75H
|
||||
fML0rU1PVvKh1rqcvZdVzm+XMWVr3asPk0sapaiHaTcmyZvJRDxxqbLFp0zRP1T6
|
||||
cCtXNFdHWU4KiuKrUi6cDyOKchpfkSZa4seiT+cCgYB+n4FgBfdQPlMB70oW4irn
|
||||
g/q32CjxuGCk6oKqu5bkzo+xB6obtavSEFqouIGQwO056tNVUY+GP7Rjg5GH663K
|
||||
yKw4cl3tmS0Gm43B8TVSfw03mKO3rrfWZQe5eCFYIg9qd26KNT2gK435FzsCXQkm
|
||||
PxUhhu6JrW/ZR2/U3Iur6wKBgADrWLAb1ryagSuE+j+U1AO+kDkHWrTtkcZ72jxp
|
||||
v3p3O11GSEUJXdJDcSXhTCpTuDq6/dv7hB6PFwh126RKicKxKlKf2wsFndV1Cpb8
|
||||
hnovW2tLGOtTmfuW2rrQAKyzvmolsNfxYd/BoHQ2thV16z1hDZeFA8WQUeHjKh6G
|
||||
sBbrAoGATdtQlaUxx4izua6k02ihkxx/cRYwDl2N8UDvDBHokS7vJFMX8b8NpsGg
|
||||
zMElnqSpu/pe/0UG7N2MtPF6uyMcX8AZzzcsRkiMkDvWJzYt8Jpf+Eyd/uryF+Yv
|
||||
yrXaOEY83tm6x/fny5ZaZmk8lNth7bfWywuTMkZLX3fYpWtIeE4=
|
||||
-----END RSA PRIVATE KEY-----
|
@ -0,0 +1 @@
|
||||
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDs9YtmaB0nnTx0PXlPr7GoYO8v0SHOj1KCyc3rEn6qlS+86LjVrFbQCpHgyc9OD+66+T7Hi1Sm6585bjDbZRMaOx2+YhVZr26Mu1xGI3dg9j5JT5g7kV+oqDuv+EGf78YxxKh9/BKzu9WCaEEGen+YmHY7Vw4AVkR73z/juZuDf+QcECTfo05CS1pTeXjwHfwSEFOeNZ9wQabUdxNE8XFdXlaKSjpmEoUS4dxNUqeDhE61SoSaAfIjRoHtMDGDidrR6Mi2xhYQ9YZ4M7Dqaz/gzDKXE2ixbMcU9b2njOAsIs2v5vpsMN9gqNdl+UC5NC1CEuYlTAP265gsCKOlITZ9 oweidner@peahi
|
@ -0,0 +1,28 @@
|
||||
#!/bin/bash
# General training configuration for one MPI rank: starts a parameter server
# in the background, runs train.py, then stops the parameter server.

NICS=eth0
# These are exported because train.py reads its configuration from the
# environment; plain shell variables are not visible to child processes.
export PADDLE_INIT_PORT=7164
export PADDLE_INIT_PORTS_NUM=1
export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
# Join the lines of the "machines" file into one comma-separated list.
export PADDLE_INIT_PSERVERS=$(cat machines | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n/,/g')
export PADDLE_INIT_USE_GPU=False

export PADDLE_INIT_NUM_GRADIENT_SERVERS=${OMPI_COMM_WORLD_SIZE}
export PADDLE_INIT_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
export PADDLE_CLUSTER_TRAIN=True

env

# Both commands below redirect into logs/, so make sure it exists.
mkdir -p logs

# start pserver
stdbuf -oL nohup paddle pserver --port=$PADDLE_INIT_PORT --ports_num=$PADDLE_INIT_PORTS_NUM \
  --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE --nics=$NICS \
  --comment=paddle_cluster_pserver \
  --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS &> logs/pserver.log &

# start trainer
# NOTE: train.py will use the above environment variables as configuration
python train.py &> logs/train.log

# kill background pservers when train finishes
ps -ef | grep pserver | awk '{print $2}' | xargs kill
|
Loading…
Reference in new issue