Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix-6581
@@ -0,0 +1,62 @@
set -e

function clock_to_seconds() {
  hours=`echo $1 | awk -F ':' '{print $1}'`
  mins=`echo $1 | awk -F ':' '{print $2}'`
  secs=`echo $1 | awk -F ':' '{print $3}'`
  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
}
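# Example (input assumed for illustration): clock_to_seconds "01:02:03.50"
# prints 3723.50, i.e. 1*3600 + 2*60 + 3.50.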

function infer() {
  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
  topology=$1
  layer_num=$2
  bs=$3
  thread=`nproc`
  if [ $thread -gt $bs ]; then
    thread=$bs
  fi
  log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log"

  models_in="models/${topology}-${layer_num}/pass-00000/"
  if [ ! -d $models_in ]; then
    echo "Please run ./run_mkl_infer.sh first to save the model."
    exit 0
  fi
  # log one line every 256 samples, i.e. every $((256 / bs)) batches
  log_period=$((256 / bs))
  paddle train --job=test \
    --config="${topology}.py" \
    --use_gpu=False \
    --trainer_count=$thread \
    --log_period=$log_period \
    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
    --init_model_path=$models_in \
    2>&1 | tee ${log}

  # Compute throughput from the last 5 logging periods (5 * 256 = 1280 samples);
  # the time before that is treated as warm-up and ignored.
  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  start_sec=`clock_to_seconds $start`
  end_sec=`clock_to_seconds $end`
  fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec)" >> ${log}
  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}

if [ ! -f "train.list" ]; then
  echo " " > train.list
fi
if [ ! -f "test.list" ]; then
  echo " " > test.list
fi
if [ ! -d "logs" ]; then
  mkdir logs
fi

# inference benchmark
for batchsize in 1 2 4 8 16; do
  infer googlenet v1 $batchsize
  infer resnet 50 $batchsize
  infer vgg 19 $batchsize
done
@@ -0,0 +1,39 @@
set -e

function train() {
  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
  topology=$1
  layer_num=$2
  bs=$3
  thread=`nproc`
  # with trainer_count=nproc, each trainer uses one core, avoiding conflicts
  log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
  args="batch_size=${bs},layer_num=${layer_num}"
  config="${topology}.py"
  paddle train --job=time \
    --config=$config \
    --use_gpu=False \
    --trainer_count=$thread \
    --log_period=10 \
    --test_period=100 \
    --config_args=$args \
    2>&1 | tee ${log}

  # The last log line carries a field like "avg=<time per batch>"; given the
  # * 1000 factor below it is in ms, so fps = bs / avg_time * 1000.
  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}

if [ ! -f "train.list" ]; then
  echo " " > train.list
fi
if [ ! -d "logs" ]; then
  mkdir logs
fi

# training benchmark
for batchsize in 64 128 256; do
  train vgg 19 $batchsize
  train resnet 50 $batchsize
  train googlenet v1 $batchsize
done
@@ -1,23 +1,27 @@
# Executor Design Doc

## Motivation
In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage users to describe the training process with deep learning programming paradigms. When the user-written Python program is executed, it creates a protobuf message
[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).

The executor runs the `ProgramDesc` like an interpreter: the `ProgramDesc` contains the intrinsics (operators) and the variables they use, and the executor explicitly executes this stored, precompiled code.

## Overview

An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks, and each block contains the protobuf definition of all the parameters and operators in that block. The `block_id` specifies the entrance block, and the `Scope` is the container of all the variable instances, which persists across different runs.

## Executor

The `Executor` evaluates all the operators in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates the Variables and Operators, then runs all the operators in sequence [[code]](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc). This is similar to pushing a stack frame when entering a block: the temporary variables are destroyed when the mini-batch is finished, although there is no explicit stack-frame pop.

The executor does not do runtime optimization, i.e. it does not intelligently parse the dependencies between operators to choose which to run and in which order. Nor does it do graph partitioning, i.e. dividing the `ProgramDesc` into several small pieces to execute them on different devices.

### Interface

```c++
Executor(places);
```

An executor does not own any computing resources; a user can only construct an executor with specified places.

```c++
void Run(ProgramDesc, Scope, block_id, create_local_scope);
```

An executor only provides a unified way to execute a `ProgramDesc`: the `ProgramDesc` is the target to be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block, and `create_local_scope` determines whether the temporary variables are destroyed after the execution finishes.
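Putting the two interfaces together, a minimal usage sketch might look like the following. The include paths, namespaces, and argument-passing conventions are assumptions for illustration, not a definitive API reference:

```c++
// Sketch only: assumes `program` was produced from the Python-side ProgramDesc.
#include <vector>
#include "paddle/framework/executor.h"

void RunOnePass(const paddle::framework::ProgramDesc& program) {
  // An executor owns no computing resources; it is only bound to places.
  std::vector<paddle::platform::Place> places{paddle::platform::CPUPlace()};
  paddle::framework::Executor executor(places);

  // The Scope persists across runs and holds all variable instances.
  paddle::framework::Scope scope;

  // Run block 0 (the entrance block); create_local_scope=true lets the
  // executor drop temporary variables once the mini-batch finishes.
  executor.Run(program, &scope, /*block_id=*/0, /*create_local_scope=*/true);
}
```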
@@ -0,0 +1,42 @@
# Cluster Training Using Fabric

## Prepare a Linux Cluster

Run `kubectl create -f ssh_servers.yaml` under the directory `paddle/scripts/cluster_train_v2/fabric/docker_cluster` to launch a test cluster, and use `kubectl get po -o wide` to get the IP addresses of these nodes.

## Launching a Cluster Job

`paddle.py` provides automated scripts to start all PaddlePaddle cluster processes on the different nodes. By default, all command line options can be set as `paddle.py` command options, and `paddle.py` will transparently and automatically apply them to the underlying PaddlePaddle processes.

`paddle.py` provides two dedicated command options for easy job launching:

- `job_dispatch_package`: set it to a local `workspace` directory; it will be dispatched to all nodes configured in `conf.py`. This helps users who frequently modify and access workspace files, for whom repeated multi-node workspace deployment would otherwise be tedious.
- `job_workspace`: set it to an already deployed workspace directory; `paddle.py` will skip the dispatch stage and directly launch the cluster job on all nodes. This helps reduce the heavy dispatch latency.

`cluster_train/run.sh` provides a sample command to run the `doc/howto/usage/cluster/src/word2vec` cluster job. Just point `job_dispatch_package` and `job_workspace` at your own directories, then:
```
sh run.sh
```

The cluster job will start in a few seconds.

## Killing a Cluster Job

`paddle.py` captures the `Ctrl + C` SIGINT signal to automatically kill all processes it launched, so simply interrupting `paddle.py` kills the cluster job. You can also kill the job manually if the program crashes.

## Checking Cluster Training Results

See the logs under `$workspace/log` for details; every node has the same log structure.

`paddle_trainer.INFO`
Provides almost all of the internal training logs, the same as in local training. Check runtime model convergence here.

`paddle_pserver2.INFO`
Provides the pserver running log, which helps diagnose distributed errors.

`server.log`
Provides the stderr and stdout of the parameter server process. Check its error log if training fails.

`train.log`
Provides the stderr and stdout of the training process. Check its error log if training fails.

## Checking Model Output

When the run completes, the model files are written to the `output` directory on node 0.
The `nodefile` in the workspace indicates the node id of the current cluster job.
@@ -0,0 +1,43 @@
# Cluster Training Using Fabric

## Prepare a Linux Cluster

Running `kubectl create -f ssh_servers.yaml` under the directory `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get the IP addresses of these nodes.

## Launching a Cluster Job

`paddle.py` provides automated scripts to start all PaddlePaddle cluster processes on the different nodes. By default, all command line options can be set as `paddle.py` command options, and `paddle.py` will transparently and automatically apply them to the underlying PaddlePaddle processes.

`paddle.py` provides two dedicated command options for easy job launching:

- `job_dispatch_package`: set it to a local `workspace` directory; it will be dispatched to all nodes configured in `conf.py`. This helps when you frequently manipulate workspace files; otherwise, repeated multi-node workspace deployment becomes very tedious.
- `job_workspace`: set it to an already deployed workspace directory; `paddle.py` will skip the dispatch stage and directly launch the cluster job on all nodes. This helps reduce the heavy dispatch latency.

`cluster_train/run.sh` provides a sample command line to run the `demo/recommendation` cluster job. Just point `job_dispatch_package` and `job_workspace` at your own directories, then:
```
sh run.sh
```
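For reference, a minimal sketch of what such a `run.sh` might contain, using only the two options documented above; the exact flag spelling and any further `paddle.py` options are assumptions for illustration:

```bash
# Hypothetical sketch of run.sh: dispatch a local workspace and launch the job.
# Flag names mirror the paddle.py options documented above; verify against
# your copy of paddle.py before use.
python paddle.py \
  --job_dispatch_package="${PWD}/my_workspace"

# Alternatively, if the workspace is already deployed on every node:
# python paddle.py --job_workspace="/home/paddle/deployed_workspace"
```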

The cluster job will start in several seconds.

## Killing a Cluster Job

`paddle.py` can capture the `Ctrl + C` SIGINT signal to automatically kill all processes launched by it, so just stop `paddle.py` to kill the cluster job. You should kill the job manually if the program crashes.

## Checking Cluster Training Results

Check the logs in `$workspace/log` for details; each node has the same log structure.

`paddle_trainer.INFO`
It provides almost all of the internal training logs, the same as in local training. Check runtime model convergence here.

`paddle_pserver2.INFO`
It provides the parameter server running log, which helps diagnose distributed errors.

`server.log`
It provides the stderr and stdout of the parameter server process. Check the error log if training crashes.

`train.log`
It provides the stderr and stdout of the trainer process. Check the error log if training crashes.

## Checking Model Output

After one pass has finished, the model files will be written to the `output` directory on node 0.
The `nodefile` in the workspace indicates the node id of the current cluster job.
@@ -0,0 +1 @@
k8s_aws_en.md
@@ -0,0 +1,41 @@
# Cluster Training Using OpenMPI

## Prepare an OpenMPI Cluster

Run the following commands to start a 3-node MPI cluster and one "head" node:

```bash
cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
kubectl create -f head.yaml
kubectl create -f mpi-nodes.yaml
```

Then you can log in to every OpenMPI node using ssh without entering any password.

## Launching a Cluster Job

Follow these steps to launch a PaddlePaddle training job on the OpenMPI cluster:

```bash
# find out node IP addresses
kubectl get po -o wide
# generate a "machines" file containing node IP addresses
kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
# copy necessary files onto the "head" node
scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
# log in to the head node using ssh
ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
# --------------- on the head node ---------------
# prepare training data
python prepare.py
# copy training data and dict file to MPI nodes
cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
# create a directory for storing log files
mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
# copy training data to every node
scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
# start the job
mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh
```