parent
f4ff8d26d6
commit
67bf06ce73
File diff suppressed because it is too large
Load Diff
Before Width: | Height: | Size: 232 KiB After Width: | Height: | Size: 116 KiB |
Before Width: | Height: | Size: 244 KiB After Width: | Height: | Size: 236 KiB |
@ -1,43 +0,0 @@
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: paddle-cluster-job
|
||||
spec:
|
||||
parallelism: 3
|
||||
completions: 3
|
||||
template:
|
||||
metadata:
|
||||
name: paddle-cluster-job
|
||||
spec:
|
||||
volumes:
|
||||
- name: jobpath
|
||||
hostPath:
|
||||
path: /home/work/paddle_output
|
||||
containers:
|
||||
- name: trainer
|
||||
image: registry.baidu.com/public/paddle:mypaddle
|
||||
# Use an absolute interpreter path so the command does not depend on the image's working directory.
command: ["/bin/bash", "-c", "/root/start.sh"]
|
||||
env:
|
||||
- name: JOB_NAME
|
||||
value: paddle-cluster-job
|
||||
- name: JOB_PATH
|
||||
value: /home/jobpath
|
||||
- name: JOB_NAMESPACE
|
||||
value: default
|
||||
- name: TRAIN_CONFIG_DIR
|
||||
value: recommendation
|
||||
- name: CONF_PADDLE_NIC
|
||||
value: eth0
|
||||
- name: CONF_PADDLE_PORT
|
||||
value: "7164"
|
||||
- name: CONF_PADDLE_PORTS_NUM
|
||||
value: "2"
|
||||
- name: CONF_PADDLE_PORTS_NUM_SPARSE
|
||||
value: "2"
|
||||
- name: CONF_PADDLE_GRADIENT_NUM
|
||||
value: "3"
|
||||
volumeMounts:
|
||||
- name: jobpath
|
||||
mountPath: /home/jobpath
|
||||
restartPolicy: Never
|
||||
|
@ -0,0 +1,7 @@
|
||||
FROM alpine
|
||||
|
||||
# coreutils provides GNU `split`, which get_data.sh calls with `--number=l/N`.
# --no-cache fetches the package index on the fly and keeps it out of the image layer.
RUN apk --no-cache upgrade && apk --no-cache add coreutils
|
||||
# Plain local directory: COPY is sufficient; ADD's archive/URL handling is not needed.
COPY quick_start /quick_start
|
||||
# Plain local file: COPY is sufficient; ADD's archive/URL handling is not needed.
COPY get_data.sh /bin/
|
||||
RUN chmod +x /bin/get_data.sh
|
||||
ENTRYPOINT ["/bin/get_data.sh"]
|
@ -0,0 +1,6 @@
|
||||
To build the PaddlePaddle data preparation image used in the tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../k8s_aws_en.md), run the following commands:
|
||||
|
||||
```
|
||||
cp -r ../../../../../../demo/quick_start .
|
||||
docker build . -t prepare-data-image-name
|
||||
```
|
@ -0,0 +1,26 @@
|
||||
#!/bin/sh
|
||||
|
||||
# Root directory that receives the prepared data. Abort immediately when OUT_DIR
# is unset or empty so the script never ends up writing under "/".
out_dir=${OUT_DIR:?OUT_DIR must be set}
|
||||
# Number of shards train.txt is split into. Abort immediately when SPLIT_COUNT
# is unset or empty instead of passing a malformed --number=l/ to split.
split_count=${SPLIT_COUNT:?SPLIT_COUNT must be set}
|
||||
|
||||
set -e
|
||||
|
||||
mkdir -p $out_dir
|
||||
cp -r /quick_start $out_dir/
|
||||
|
||||
mkdir -p $out_dir/0/data
|
||||
cd $out_dir/0/data
|
||||
wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
|
||||
tar zxvf preprocessed_data.tar.gz
|
||||
rm preprocessed_data.tar.gz
|
||||
|
||||
split -d --number=l/$split_count -a 5 train.txt train.
|
||||
mv train.00000 train.txt
|
||||
|
||||
cd $out_dir
|
||||
# Index of the last shard. Shell arithmetic is used instead of `expr` because
# `expr` exits with status 1 when the result is 0 (split_count=1), which would
# abort the script under `set -e`.
end=$((split_count - 1))
|
||||
for i in $(seq 1 $end); do
|
||||
mkdir -p $i/data
|
||||
cp -r 0/data/* $i/data
|
||||
# Promote this shard's zero-padded chunk (train.NNNNN) to be its train.txt.
mv $i/data/train.$(printf %05d $i) $i/data/train.txt
|
||||
done;
|
@ -0,0 +1,6 @@
|
||||
FROM paddledev/paddle:cpu-latest
|
||||
|
||||
COPY start.sh /root/
|
||||
COPY start_paddle.py /root/
|
||||
RUN chmod +x /root/start.sh
|
||||
# Exec form passes each element verbatim: the flag must be exactly "-c" (no leading
# space), otherwise bash treats " -c" as a script filename and fails to start.
CMD ["bash", "-c", "/root/start.sh"]
|
@ -0,0 +1,5 @@
|
||||
To build the PaddlePaddle training image used in the tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../k8s_aws_en.md), run the following command:
|
||||
|
||||
```
|
||||
docker build . -t train-image-name
|
||||
```
|
@ -1,19 +1,19 @@
|
||||
#!/bin/sh
|
||||
|
||||
set -eu
|
||||
|
||||
jobconfig=${JOB_PATH}"/"${JOB_NAME}"/"${TRAIN_CONFIG_DIR}
|
||||
cd /root
|
||||
cp -rf $jobconfig .
|
||||
cd $TRAIN_CONFIG_DIR
|
||||
|
||||
cp -rf $jobconfig/* .
|
||||
|
||||
python /root/start_paddle.py \
|
||||
--dot_period=10 \
|
||||
--ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM \
|
||||
--ports_num=$CONF_PADDLE_PORTS_NUM \
|
||||
--ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM_SPARSE \
|
||||
--log_period=50 \
|
||||
--num_passes=10 \
|
||||
--trainer_count=4 \
|
||||
--trainer_count=$TRAINER_COUNT \
|
||||
--saving_period=1 \
|
||||
--local=0 \
|
||||
--config=./trainer_config.py \
|
||||
--config=trainer_config.lr.py \
|
||||
--use_gpu=0
|
After Width: | Height: | Size: 87 KiB |
Loading…
Reference in new issue