@ -0,0 +1,62 @@
set -e

# Convert an HH:MM:SS(.ms) timestamp into seconds.
function clock_to_seconds() {
  hours=`echo $1 | awk -F ':' '{print $1}'`
  mins=`echo $1 | awk -F ':' '{print $2}'`
  secs=`echo $1 | awk -F ':' '{print $3}'`
  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
}

function infer() {
  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
  topology=$1
  layer_num=$2
  bs=$3
  thread=`nproc`
  # do not use more threads than samples in a batch
  if [ $thread -gt $bs ]; then
    thread=$bs
  fi
  log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log"

  models_in="models/${topology}-${layer_num}/pass-00000/"
  if [ ! -d $models_in ]; then
    echo "Please run ./run_mkl_infer.sh first to save the model."
    exit 0
  fi
  log_period=$((256 / bs))
  paddle train --job=test \
    --config="${topology}.py" \
    --use_gpu=False \
    --trainer_count=$thread \
    --log_period=$log_period \
    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
    --init_model_path=$models_in \
    2>&1 | tee ${log}

  # Calculate the elapsed time of the last 5 log periods (1280 samples in total);
  # everything before that is treated as warm-up.
  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  start_sec=`clock_to_seconds $start`
  end_sec=`clock_to_seconds $end`
  fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec)" >> ${log}
  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}

if [ ! -f "train.list" ]; then
  echo " " > train.list
fi
if [ ! -f "test.list" ]; then
  echo " " > test.list
fi
if [ ! -d "logs" ]; then
  mkdir logs
fi

# inference benchmark
for batchsize in 1 2 4 8 16; do
  infer googlenet v1 $batchsize
  infer resnet 50 $batchsize
  infer vgg 19 $batchsize
done

@ -0,0 +1,39 @@
set -e

function train() {
  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
  topology=$1
  layer_num=$2
  bs=$3
  thread=`nproc`
  # each trainer uses only 1 core to avoid conflicts
  log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
  args="batch_size=${bs},layer_num=${layer_num}"
  config="${topology}.py"
  paddle train --job=time \
    --config=$config \
    --use_gpu=False \
    --trainer_count=$thread \
    --log_period=10 \
    --test_period=100 \
    --config_args=$args \
    2>&1 | tee ${log}

  # avg_time is the average time per batch in ms, taken from the last log line;
  # convert it into images per second.
  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}

if [ ! -f "train.list" ]; then
  echo " " > train.list
fi
if [ ! -d "logs" ]; then
  mkdir logs
fi

# training benchmark
for batchsize in 64 128 256; do
  train vgg 19 $batchsize
  train resnet 50 $batchsize
  train googlenet v1 $batchsize
done

@ -1,23 +1,29 @@
# Executor Design Doc

## Motivation
In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it first creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).

We use the executor to do the runtime evaluation of a `ProgramDesc`: it runs the `ProgramDesc` like an interpreter. The `ProgramDesc` contains the intrinsics (operators, in this case) and the variables they use, and the executor explicitly executes this stored, precompiled code.

## Overview

An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks, and each block contains the protobuf definition of all the parameters and operators in that block. The `block_id` specifies the entrance block, and the `Scope` is the container of all the variable instances, which is persistent throughout different runs.

## Executor

The `Executor` explicitly executes all the intrinsics (operators, here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates the Variables and Operators, then runs all the operators in sequence, one by one. This is very similar to pushing a stack frame when entering a block: the executor cleans up all the temporary variables when a mini-batch is finished, but it does not have the stack-frame pop process. [[code]](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)

The executor does not do runtime optimization, meaning it does not intelligently parse the dependencies of each operator and choose which one to run first and in which order. It also does not do graph partitioning, i.e. dividing the `ProgramDesc` into several small pieces and executing them on different devices.

### The interface

```c++
Executor(places);
```

An executor does not own any computing resources; a user can only construct an executor with the specified places.

### Running an Executor

```c++
void Run(ProgramDesc, Scope, block_id, create_local_scope);
```

An `Executor` only provides a unified way to execute a `ProgramDesc`. The `ProgramDesc` is the target to be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block, and `create_local_scope` is a boolean that states whether the temporary variables will be destroyed after the execution is finished.

@ -0,0 +1,42 @@
# Cluster Training Using Fabric

## Prepare a Linux cluster

Under the directory `paddle/scripts/cluster_train_v2/fabric/docker_cluster`, run `kubectl create -f ssh_servers.yaml` to launch a test cluster, and use `kubectl get po -o wide` to get the IP addresses of these nodes.

## Launching the Cluster Job

`paddle.py` provides automated scripts to start all PaddlePaddle cluster processes on the different nodes. By default, all command line options can be set as `paddle.py` command options, and `paddle.py` will transparently and automatically pass them down to the underlying PaddlePaddle processes.

`paddle.py` provides two distinct command options to make job launching easier.

- `job_dispatch_package`: set it to a local `workspace` directory; it will be dispatched to all the nodes configured in `conf.py`. This helps users who frequently modify and access workspace files, since repeated multi-node workspace deployment would otherwise be tedious.
- `job_workspace`: set it to an already deployed workspace directory; `paddle.py` will skip the dispatch stage and directly launch the cluster job on all nodes. This helps reduce the dispatch latency.

`cluster_train/run.sh` provides a sample command line to run the `doc/howto/usage/cluster/src/word2vec` cluster job; just modify `job_dispatch_package` and `job_workspace` to your own directories, then:
```
sh run.sh
```

The cluster job will start in a few seconds.

## Killing the Cluster Job

`paddle.py` captures the `Ctrl + C` (SIGINT) signal to automatically terminate all the processes it launched, so just interrupt `paddle.py` to kill a cluster job. You can also kill the job manually if the program crashes.

## Checking the Cluster Training Result

Check the logs in `$workspace/log` for details; each node has the same log structure. A quick way to inspect them is sketched after the log files listed below.

`paddle_trainer.INFO`
provides almost all the internal output logs of training, the same as local training. Check runtime model convergence here.

`paddle_pserver2.INFO`
provides the pserver running log, which helps diagnose distributed errors.

`server.log`
provides the stderr and stdout of the parameter server process. Check the error log if training fails.

`train.log`
provides the stderr and stdout of the trainer process. Check the error log if training fails.
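
As a minimal illustration (assuming the `$workspace/log` layout described above; the exact log contents depend on your PaddlePaddle build), the logs on a node can be inspected like this:

```bash
# Follow the trainer log to watch convergence.
tail -n 20 "$workspace/log/paddle_trainer.INFO"
# Scan the parameter server output for errors after a failed run.
grep -i "error" "$workspace/log/server.log" || echo "no errors found in server.log"
```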

## Checking the Model Output

After the run finishes, the model files will be written to the `output` directory on node 0.
The `nodefile` in the workspace indicates the node ID of the current cluster job.

@ -0,0 +1,43 @@
# Cluster Training Using Fabric

## Prepare a Linux cluster

Running `kubectl create -f ssh_servers.yaml` under the directory `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get the IP addresses of these nodes.

## Launching Cluster Job

`paddle.py` provides automated scripts to start all PaddlePaddle cluster processes on the different nodes. By default, all command line options can be set as `paddle.py` command options, and `paddle.py` will transparently and automatically pass these options down to the underlying PaddlePaddle processes.

`paddle.py` provides two distinct command options for easy job launching; a usage sketch follows the list below.

- `job_dispatch_package`: set it to a local `workspace` directory; it will be dispatched to all the nodes configured in `conf.py`. This helps users who frequently modify and access workspace files, since repeated multi-node workspace deployment is otherwise tedious.
- `job_workspace`: set it to an already deployed workspace directory; `paddle.py` will skip the dispatch stage and directly launch the cluster job on all nodes. This helps reduce the dispatch latency.
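
As a rough sketch only (the paths below are placeholders, and every option other than `job_workspace` is an ordinary trainer option that `paddle.py` forwards transparently, as described above), a launch command could look like this:

```bash
# Hypothetical example: launch from an already deployed workspace and forward
# regular trainer options (--config, --use_gpu, --trainer_count) to every node.
python paddle.py \
  --job_workspace="/home/paddle/my_workspace" \
  --config="./trainer_config.py" \
  --use_gpu=False \
  --trainer_count=4
```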

`cluster_train/run.sh` provides a sample command line to run the `demo/recommendation` cluster job; just modify `job_dispatch_package` and `job_workspace` to your own directories, then:
```
sh run.sh
```

The cluster job will start in several seconds.

## Kill Cluster Job

`paddle.py` captures the `Ctrl + C` (SIGINT) signal to automatically kill all the processes it launched, so just interrupt `paddle.py` to kill a cluster job. You should kill the job manually if the program crashes.

## Check Cluster Training Result

Check the logs in `$workspace/log` for details; each node has the same log structure.

`paddle_trainer.INFO`
provides almost all the internal output logs of training, the same as local training. Check runtime model convergence here.

`paddle_pserver2.INFO`
provides the parameter server's running log, which helps diagnose distributed errors.

`server.log`
provides the stderr and stdout of the parameter server process. Check the error log if training crashes.

`train.log`
provides the stderr and stdout of the trainer process. Check the error log if training crashes.

## Check Model Output

After one pass finishes, the model files will be written to the `output` directory on node 0.
The `nodefile` in the workspace indicates the node id of the current cluster job.
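
As a hedged illustration (the user, IP, and workspace path are placeholders in the doc's bracket convention; node 0's IP can be taken from `kubectl get po -o wide`), the trained model could be copied back like this:

```bash
# Placeholder user/IP/path: substitute the values for your own cluster.
scp -r [user]@[node0IP]:[workspace]/output ./cluster_output
```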

@ -0,0 +1 @@
k8s_aws_en.md

@ -0,0 +1,41 @@
# Cluster Training Using OpenMPI

## Prepare an OpenMPI cluster

Run the following commands to start a 3-node MPI cluster and one "head" node.

```bash
cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
kubectl create -f head.yaml
kubectl create -f mpi-nodes.yaml
```

Then you can log in to every OpenMPI node using ssh without entering any passwords.
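
To double-check the passwordless login (a small sketch; the `tutorial` user and key file below simply mirror the commands in the next section, and the node IPs come from `kubectl get po -o wide`):

```bash
# Print each MPI node's hostname over ssh; no password prompt should appear.
for ip in $(kubectl get po -o wide | grep nodes | awk '{print $6}'); do
  ssh -i ssh/id_rsa.mpi.pub tutorial@$ip hostname
done
```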

## Launching Cluster Job

Follow these steps to launch a PaddlePaddle training job on the OpenMPI cluster:

```bash
# find out node IP addresses
kubectl get po -o wide
# generate a "machines" file containing node IP addresses
kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
# copy necessary files onto the "head" node
scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
# log in to the head node using ssh
ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
# --------------- in head node ---------------
# prepare training data
python prepare.py
# copy training data and dict file to MPI nodes
cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
# create a directory for storing log files
mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
# copy training data to every node
scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
# start the job
mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh
```
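
Once the job is running, one way to peek at progress is sketched below; it reuses the `machines` hostfile and the `logs` directory created above (the actual log file names depend on `start_mpi_train.sh`):

```bash
# List the newest files in each node's log directory.
mpirun -hostfile machines -n 3 ls -lt /home/tutorial/logs
```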