Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into seq_error

optimizer
wanghaoshuang 7 years ago
commit 87d90d2afd

1
.gitignore vendored

@ -27,7 +27,6 @@ third_party/
cmake-build-*
# generated while compiling
python/paddle/v2/fluid/core.so
paddle/pybind/pybind.h
CMakeFiles
cmake_install.cmake

@ -1,4 +1,4 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -19,7 +19,7 @@ set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
include(system)
project(paddle CXX C Go)
project(paddle CXX C)
message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
"${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
@ -60,6 +60,7 @@ option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
@ -141,11 +142,11 @@ include(external/boost) # download boost
include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/nccl)
include(external/cares)
include(external/grpc)
include(cudnn) # set cudnn libraries, must before configure
include(cupti)
include(configure) # add paddle env configuration
include(generic) # simplify cmake module
include(package) # set paddle packages
@ -174,7 +175,7 @@ set(EXTERNAL_LIBS
)
if(WITH_GPU)
include(cuda)
include(cuda)
endif(WITH_GPU)
if(WITH_MKLML)
@ -201,17 +202,18 @@ endif()
# "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
# placed after this block, because they depends on it.
if(WITH_GOLANG)
enable_language(Go)
add_subdirectory(go)
endif(WITH_GOLANG)
set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
add_subdirectory(paddle)
if(WITH_PYTHON)
add_subdirectory(python)
add_subdirectory(python)
endif()
if(WITH_DOC)

@ -22,7 +22,8 @@ COPY ./paddle/scripts/docker/root/ /root/
RUN apt-get update && \
apt-get install -y \
git python-pip python-dev openssh-server bison libnccl-dev \
git python-pip python-dev openssh-server bison \
libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-matplotlib gcc-4.8 g++-4.8 \

@ -21,16 +21,6 @@ RUN apt-get update && \
wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
apt-get clean -y
# Install Go and glide
RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
tar -xz -C /usr/local && \
mkdir /root/gopath && \
mkdir /root/gopath/bin && \
mkdir /root/gopath/src
ENV GOROOT=/usr/local/go GOPATH=/root/gopath
# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
# git credential to skip password typing
RUN git config --global credential.helper store

@ -188,7 +188,7 @@ Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

@ -1,18 +1,35 @@
#FROM python:2.7.14
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
RUN apt-get update && apt-get install -y python
RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
# so we must build one with distribute support to install in this image.
# you can get mirror list here:
# https://launchpad.net/ubuntu/+archivemirrors
ARG UBUNTU_MIRROR
RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
RUN pip install -U kubernetes opencv-python
RUN pip install paddlepaddle
# if network is slowly, you may need to add proxy here.
# ENV https_proxy=
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
RUN pip uninstall -y paddlepaddle
# unset proxy if it is setted.
# ENV https_proxy=""
# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
# so we must build one with distribute support to install in this image.
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib
# tf k8s
RUN pip install tensorflow==1.4.0
ADD tf_k8s /usr/bin
RUN chmod +x /usr/bin/tf_k8s
ADD vgg16_tf.py /workspace/
# below lines may change a lot for debugging
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl && \
chmod +x /usr/bin/paddle_k8s
ENV LD_LIBRARY_PATH=/usr/local/lib
RUN chmod +x /usr/bin/paddle_k8s
ADD vgg16_fluid.py vgg16_v2.py /workspace/

@ -8,23 +8,24 @@
- cpu MHz : 2101.000
- cache size : 20480 KB
### Blas settings
Setting environment variable: `MKL_NUM_THREADS=1`.
### Single Node Single Thread
- PServer Count: 10
- Trainer Count: 20
- Metrics: samples / sec
| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
| TensorFlow | - | - | - | - |
| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 |
### Different Batch Size
- PServer Count: 10
- Trainer Count: 20
- Per trainer CPU Core: 1
- Metrics: samples / sec
| Batch Size | 32 | 64 | 128 | 256 |

@ -11,7 +11,7 @@ spec:
paddle-job: vgg16job
spec:
imagePullSecrets:
- name: job-registry-secret
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer

@ -0,0 +1,82 @@
#!/bin/bash
check_trainer_ret() {
ret=$1
stdbuf -oL echo "job returned $ret...setting pod return message..."
stdbuf -oL echo "==============================="
if [ $ret -eq 136 ] ; then
echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
elif [ $ret -eq 139 ] ; then
echo "Segmentation Fault" > /dev/termination-log
elif [ $ret -eq 1 ] ; then
echo "General Error" > /dev/termination-log
elif [ $ret -eq 134 ] ; then
echo "Program Abort" > /dev/termination-log
fi
stdbuf -oL echo "termination log wroted..."
exit $ret
}
g_pservers=""
g_trainers=""
wait_running_pods(){
pserver_label="tf-job-pserver=${JOB_NAME}"
trainer_label="tf-job-trainer=${JOB_NAME}"
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
}
start_tf_pserver(){
wait_running_pods
label="tf-job-pserver=${JOB_NAME}"
pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
--job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
}
start_tf_trainer(){
wait_running_pods
label="tf-job-trainer=${JOB_NAME}"
trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
--job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
check_trainer_ret $?
}
start_tf(){
if [[ "${TF_JOB_NAME}" == "worker" ]]; then
start_tf_trainer
else
start_tf_pserver
fi
}
usage() {
echo "usage: tf_k8s [<args>]:"
echo " start_tf Start tensorflow jobs"
}
case "$1" in
start_tf)
start_tf
;;
--help)
usage
;;
*)
usage
;;
esac

@ -0,0 +1,56 @@
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16job-tf-pserver
spec:
replicas: 10
template:
metadata:
labels:
tf-job-pserver: vgg16job-tf
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
imagePullPolicy: Always
command: ["tf_k8s", "start_tf"]
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PORT
value: "32036"
- name: ENTRY
value: "python vgg16_tf.py"
- name: JOB_NAME
value: vgg16job-tf
- name: PSERVERS_NUM
value: "10"
- name: TF_JOB_NAME
value: "ps"
- name: TRAINERS_NUM
value: "20"
- name: BATCH_SIZE
value: "128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: NUM_PASSES
value: "1"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4

@ -0,0 +1,58 @@
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16job-tf-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
tf-job-trainer: vgg16job-tf
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
imagePullPolicy: Always
command: ["tf_k8s", "start_tf"]
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PORT
value: "32036"
- name: JOB_NAME
value: vgg16job-tf
- name: TF_JOB_NAME
value: "worker"
- name: ENTRY
value: "python vgg16_tf.py"
- name: PSERVERS_NUM
value: "10"
- name: BATCH_SIZE
value: "128"
- name: TRAINERS_NUM
value: "20"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: NUM_PASSES
value: "1"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never

@ -1,4 +1,4 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -68,6 +68,21 @@ parser.add_argument(
type=str2bool,
default=True,
help='Whether to run as local mode.')
parser.add_argument(
"--ps_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs")
parser.add_argument(
"--trainer_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs")
# Flags for defining the tf.train.Server
parser.add_argument(
"--task_index", type=int, default=0, help="Index of task within the job")
args = parser.parse_args()
@ -180,8 +195,9 @@ def main():
iters += 1
num_samples += len(data)
print(
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
% (pass_id, iters, loss, acc, time.time() - ts)
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
% (pass_id, iters, loss, acc,
len(data) / (time.time() - ts))
) # The accuracy is the accumulation of batches, but not the current batch.
pass_elapsed = time.time() - start_time
@ -209,27 +225,24 @@ def main():
batch_size=args.batch_size)
train_loop(exe, fluid.default_main_program())
else:
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # all pserver endpoints
eplist = []
for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, "6174"]))
pserver_endpoints = ",".join(eplist)
print("pserver endpoints: ", pserver_endpoints)
trainers = int(os.getenv("TRAINERS")) # total trainer count
print("trainers total: ", trainers)
current_endpoint = os.getenv(
"POD_IP") + ":6174" # current pserver endpoint
training_role = os.getenv(
"TRAINING_ROLE",
"TRAINER") # get the training role: trainer/pserver
t = fluid.DistributeTranspiler()
t.transpile(
optimize_ops,
params_grads,
pservers=pserver_endpoints,
trainer_id=args.task_index,
pservers=args.ps_hosts,
trainers=trainers)
if training_role == "PSERVER":
current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
"PADDLE_INIT_PORT")
if not current_endpoint:
print("need env SERVER_ENDPOINT")
exit(1)

File diff suppressed because it is too large Load Diff

@ -1,4 +1,4 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.

@ -1,4 +1,4 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@ -1,4 +1,4 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@ -1,4 +1,4 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@ -1,4 +1,4 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@ -1,4 +1,4 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@ -1,4 +1,4 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@ -1,4 +1,4 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@ -1,4 +1,4 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@ -1,4 +1,4 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@ -1,4 +1,4 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@ -1,4 +1,4 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -59,6 +59,7 @@ endif(NOT WITH_GOLANG)
if(NOT WITH_GPU)
add_definitions(-DHPPL_STUB_FUNC)
add_definitions("-DCUPTI_LIB_PATH=\"\"")
list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
else()
@ -73,7 +74,14 @@ else()
if(NOT CUDNN_FOUND)
message(FATAL_ERROR "Paddle needs cudnn to compile")
endif()
if(CUPTI_FOUND)
include_directories(${CUPTI_INCLUDE_DIR})
add_definitions(-DPADDLE_WITH_CUPTI)
add_definitions("-DCUPTI_LIB_PATH=\"${CUPTI_LIBRARY_PATH}\"")
else()
add_definitions("-DCUPTI_LIB_PATH=\"\"")
message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.")
endif()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
# Include cuda and cudnn

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save