commit
c0876cf686
@ -1,18 +1,35 @@
|
|||||||
#FROM python:2.7.14
|
|
||||||
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
|
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
|
||||||
RUN apt-get update && apt-get install -y python
|
|
||||||
RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
|
# you can get mirror list here:
|
||||||
# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
|
# https://launchpad.net/ubuntu/+archivemirrors
|
||||||
# so we must build one with distribute support to install in this image.
|
ARG UBUNTU_MIRROR
|
||||||
|
# Rewrite the apt sources to ${UBUNTU_MIRROR} when that build arg is set.
# The sed script is double-quoted *inside* the single-quoted bash command so the
# whole command stays one argument and bash expands the mirror URL as one word.
RUN /bin/bash -c 'if [[ -n "${UBUNTU_MIRROR}" ]]; then sed -i "s#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g" /etc/apt/sources.list; fi'
|
||||||
|
|
||||||
|
# Install Python, pip and the runtime libraries. The apt package lists are
# removed in the same layer so they do not end up in the image.
RUN apt-get update && \
    apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev && \
    rm -rf /var/lib/apt/lists/*
|
||||||
|
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -U kubernetes opencv-python
|
||||||
|
|
||||||
RUN pip install paddlepaddle
|
RUN pip install paddlepaddle
|
||||||
|
# If the network is slow, you may need to set a proxy here.
|
||||||
|
# ENV https_proxy=
|
||||||
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
|
# Call paddle.dataset.cifar.train10() at build time (presumably to cache the
# CIFAR data in the image -- confirm). `python -c` is used so the step does not
# depend on the shell's `echo` turning "\n" into a newline.
RUN python -c 'import paddle.v2 as paddle; paddle.dataset.cifar.train10()'
|
||||||
RUN pip uninstall -y paddlepaddle
|
RUN pip uninstall -y paddlepaddle
|
||||||
|
# Unset the proxy if it was set above.
|
||||||
|
# ENV https_proxy=""
|
||||||
|
|
||||||
|
# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
|
||||||
|
# so we must build one with distribute support to install in this image.
|
||||||
|
# Copy the locally built wheel(s) from the build context and install them.
# COPY is used because these are plain local files (no extraction or download).
COPY *.whl /
RUN pip install --no-cache-dir /*.whl && rm -f /*.whl
|
||||||
|
ENV LD_LIBRARY_PATH=/usr/local/lib
|
||||||
|
|
||||||
|
# tf k8s
|
||||||
|
# Pinned TensorFlow release; --no-cache-dir keeps pip's cache out of the layer.
RUN pip install --no-cache-dir tensorflow==1.4.0
|
||||||
|
# Install the tf_k8s launcher script into /usr/bin and make it executable.
COPY tf_k8s /usr/bin/
RUN chmod +x /usr/bin/tf_k8s
|
||||||
|
# TensorFlow benchmark entry script; COPY is preferred over ADD for local files.
COPY vgg16_tf.py /workspace/
|
||||||
|
|
||||||
# below lines may change a lot for debugging
|
# below lines may change a lot for debugging
|
||||||
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
|
# NOTE(review): downloaded at build time from the `develop` branch with no
# checksum, so the fetched script can differ between builds.
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
|
||||||
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
|
# NOTE(review): downloaded at build time from the `develop` branch with no
# checksum, so the fetched helper can differ between builds.
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
|
||||||
ADD *.whl /
|
RUN chmod +x /usr/bin/paddle_k8s
|
||||||
RUN pip install /*.whl && rm -f /*.whl && \
|
|
||||||
chmod +x /usr/bin/paddle_k8s
|
|
||||||
ENV LD_LIBRARY_PATH=/usr/local/lib
|
|
||||||
ADD vgg16_fluid.py vgg16_v2.py /workspace/
|
# Paddle benchmark entry scripts; COPY is preferred over ADD for local files.
COPY vgg16_fluid.py vgg16_v2.py /workspace/
|
||||||
|
@ -0,0 +1,82 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Report a trainer's exit status and terminate the script with that status.
#
# Arguments:
#   $1 - exit status of the trainer command (treated as 0 when omitted).
#
# For the statuses listed below a short description is written to
# /dev/termination-log; any other status writes nothing there.
# This function never returns: it always ends with `exit`.
check_trainer_ret() {
  local ret="${1:-0}"
  local reason=""

  stdbuf -oL echo "job returned $ret...setting pod return message..."
  stdbuf -oL echo "==============================="

  # Map the known exit statuses to a human-readable reason.
  case "$ret" in
    136) reason="Error Arithmetic Operation(Floating Point Exception)" ;;
    139) reason="Segmentation Fault" ;;
    1)   reason="General Error" ;;
    134) reason="Program Abort" ;;
  esac

  if [ -n "$reason" ]; then
    echo "$reason" > /dev/termination-log
  fi

  stdbuf -oL echo "termination log wroted..."
  exit "$ret"
}
|
||||||
|
|
||||||
|
# Endpoint lists for the job, filled in by wait_running_pods with the output of
# `k8s_tools.py fetch_endpoints`.
g_pservers=""
g_trainers=""

# Block until the job's pserver and trainer pods are reported running, then
# record their endpoints in g_pservers / g_trainers.
#
# Reads from the environment: JOB_NAME, PSERVERS_NUM, TRAINERS_NUM, PORT.
wait_running_pods(){
  local k8s_tools="/root/k8s_tools.py"
  local pserver_label="tf-job-pserver=${JOB_NAME}"
  local trainer_label="tf-job-trainer=${JOB_NAME}"

  # Arguments are quoted so an unexpected space or glob character in an
  # environment value cannot split or expand into extra arguments.
  stdbuf -oL python "${k8s_tools}" wait_pods_running "${pserver_label}" "${PSERVERS_NUM}"
  stdbuf -oL python "${k8s_tools}" wait_pods_running "${trainer_label}" "${TRAINERS_NUM}"

  g_pservers=$(python "${k8s_tools}" fetch_endpoints "${pserver_label}" "${PORT}")
  g_trainers=$(python "${k8s_tools}" fetch_endpoints "${trainer_label}" "${PORT}")
}
|
||||||
|
|
||||||
|
# Run this pod as a TensorFlow parameter server.
#
# Waits for the job's pods, asks k8s_tools.py for this pod's id, then runs
# ${ENTRY} from ${TRAINER_PACKAGE} with the cluster flags appended.
# Reads from the environment: JOB_NAME, ENTRY, TF_JOB_NAME, TRAINER_PACKAGE.
start_tf_pserver(){
  wait_running_pods

  local selector="tf-job-pserver=${JOB_NAME}"
  local task_index
  task_index=$(python /root/k8s_tools.py fetch_id ${selector})

  # Assemble the command line piece by piece; it is word-split by `sh -c`.
  local launch="${ENTRY}"
  launch="${launch} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers}"
  launch="${launch} --job_name=${TF_JOB_NAME} --task_index=${task_index}"

  stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${launch}"
}
|
||||||
|
|
||||||
|
# Run this pod as a TensorFlow worker (trainer).
#
# Waits for the job's pods, asks k8s_tools.py for this pod's id, runs ${ENTRY}
# from ${TRAINER_PACKAGE} with the cluster flags and batch size appended, and
# hands the command's exit status to check_trainer_ret.
# Reads from the environment: JOB_NAME, ENTRY, TF_JOB_NAME, BATCH_SIZE,
# TRAINER_PACKAGE.
start_tf_trainer(){
  wait_running_pods

  local selector="tf-job-trainer=${JOB_NAME}"
  local task_index
  task_index=$(python /root/k8s_tools.py fetch_id ${selector})

  # Assemble the command line piece by piece; it is word-split by `sh -c`.
  local launch="${ENTRY}"
  launch="${launch} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers}"
  launch="${launch} --job_name=${TF_JOB_NAME} --task_index=${task_index}"
  launch="${launch} --batch_size=${BATCH_SIZE}"

  stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${launch}"
  check_trainer_ret $?
}
|
||||||
|
|
||||||
|
# Pick the role from ${TF_JOB_NAME}: "worker" starts a trainer, any other
# value starts a parameter server.
start_tf(){
  case "${TF_JOB_NAME}" in
    worker) start_tf_trainer ;;
    *)      start_tf_pserver ;;
  esac
}
|
||||||
|
|
||||||
|
# Print the command-line help text to stdout.
usage() {
  printf '%s\n' \
    "usage: tf_k8s [<args>]:" \
    " start_tf Start tensorflow jobs"
}
|
||||||
|
|
||||||
|
# Command-line entry point: `start_tf` launches the job; every other argument
# (including --help and no argument at all) prints the usage text.
case "$1" in
  start_tf)
    start_tf
    ;;
  *)
    usage
    ;;
esac
|
@ -0,0 +1,56 @@
|
|||||||
|
# ReplicaSet running the TensorFlow parameter servers for the vgg16 benchmark.
# Each pod runs `tf_k8s start_tf`; TF_JOB_NAME=ps selects the pserver role.
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
  name: vgg16job-tf-pserver
spec:
  # Keep in sync with PSERVERS_NUM below.
  replicas: 10
  template:
    metadata:
      labels:
        # tf_k8s selects pserver pods with the label tf-job-pserver=${JOB_NAME}.
        tf-job-pserver: vgg16job-tf
    spec:
      hostNetwork: true
      imagePullSecrets:
      - name: job-registry-secret
      containers:
      - name: pserver
        image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
        imagePullPolicy: Always
        command: ["tf_k8s", "start_tf"]
        ports:
        # Declared port aligned with the PORT env var below, which is what
        # tf_k8s passes to `fetch_endpoints` to build the host lists.
        # NOTE(review): the original declared 30236 here while PORT was 32036;
        # if 30236 was the intended port, change PORT in BOTH manifests instead.
        - name: jobport-32036
          containerPort: 32036
        env:
        - name: PORT
          value: "32036"
        - name: ENTRY
          value: "python vgg16_tf.py"
        - name: JOB_NAME
          value: vgg16job-tf
        - name: PSERVERS_NUM
          value: "10"
        - name: TF_JOB_NAME
          value: "ps"
        - name: TRAINERS_NUM
          value: "20"
        - name: BATCH_SIZE
          value: "128"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: NUM_PASSES
          value: "1"
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: "status.podIP"
        resources:
          requests:
            memory: 10Gi
            cpu: 4
          limits:
            memory: 10Gi
            cpu: 4
|
@ -0,0 +1,58 @@
|
|||||||
|
# Job running the TensorFlow workers (trainers) for the vgg16 benchmark.
# Each pod runs `tf_k8s start_tf`; TF_JOB_NAME=worker selects the trainer role.
apiVersion: batch/v1
kind: Job
metadata:
  name: vgg16job-tf-trainer
spec:
  # Keep both values in sync with TRAINERS_NUM below.
  parallelism: 20
  completions: 20
  template:
    metadata:
      labels:
        # tf_k8s selects trainer pods with the label tf-job-trainer=${JOB_NAME}.
        tf-job-trainer: vgg16job-tf
    spec:
      imagePullSecrets:
      - name: job-registry-secret
      hostNetwork: true
      containers:
      - name: trainer
        image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
        imagePullPolicy: Always
        command: ["tf_k8s", "start_tf"]
        ports:
        # Declared port aligned with the PORT env var below, which is what
        # tf_k8s passes to `fetch_endpoints` to build the host lists.
        # NOTE(review): the original declared 30236 here while PORT was 32036;
        # if 30236 was the intended port, change PORT in BOTH manifests instead.
        - name: jobport-32036
          containerPort: 32036
        env:
        - name: PORT
          value: "32036"
        - name: JOB_NAME
          value: vgg16job-tf
        - name: TF_JOB_NAME
          value: "worker"
        - name: ENTRY
          value: "python vgg16_tf.py"
        - name: PSERVERS_NUM
          value: "10"
        - name: BATCH_SIZE
          value: "128"
        - name: TRAINERS_NUM
          value: "20"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: NUM_PASSES
          value: "1"
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: "status.podIP"
        resources:
          requests:
            memory: 40Gi
            cpu: 2
          limits:
            memory: 40Gi
            cpu: 2
      restartPolicy: Never
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,41 @@
|
|||||||
|
# Locate the CUPTI headers and shared library. Does nothing unless WITH_GPU.
#
# Inputs:  WITH_GPU, CUPTI_ROOT (cache variable and/or environment variable),
#          CUDA_TOOLKIT_ROOT_DIR, CUDA_CUDART_LIBRARY.
# Outputs: CUPTI_INCLUDE_DIR, CUPTI_LIBRARY, CUPTI_LIBRARY_PATH, CUPTI_FOUND.
if(NOT WITH_GPU)
    return()
endif()


set(CUPTI_ROOT "/usr" CACHE PATH "CUPTI ROOT")
find_path(CUPTI_INCLUDE_DIR cupti.h
    PATHS ${CUPTI_ROOT} ${CUPTI_ROOT}/include
    $ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include
    ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include
    NO_DEFAULT_PATH
    )

# Directory holding the CUDA runtime library; used as an extra search hint.
get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)

# Default to x86_64 and switch to the configured processor when CMake knows it.
# The variable is tested by name: `if(NOT ${CMAKE_SYSTEM_PROCESSOR})` would
# treat the expanded value as a variable name and fail to parse when it is empty.
set(TARGET_ARCH "x86_64")
if(CMAKE_SYSTEM_PROCESSOR)
    set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
endif()

list(APPEND CUPTI_CHECK_LIBRARY_DIRS
    ${CUPTI_ROOT}
    ${CUPTI_ROOT}/lib64
    ${CUPTI_ROOT}/lib
    ${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu
    $ENV{CUPTI_ROOT}
    $ENV{CUPTI_ROOT}/lib64
    $ENV{CUPTI_ROOT}/lib
    /usr/lib
    ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64)
find_library(CUPTI_LIBRARY NAMES libcupti.so libcupti.dylib # libcupti_static.a
    PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist}
    NO_DEFAULT_PATH
    DOC "Path to cuPTI library.")

get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY)
# CUPTI counts as found only when both the header and the library were located.
if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY)
    set(CUPTI_FOUND ON)
else()
    set(CUPTI_FOUND OFF)
endif()
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,72 @@
|
|||||||
|
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License. */
|
||||||
|
|
||||||
|
#pragma once

// uint64_t / uint32_t and std::string are used in the interface below, so
// include their headers directly instead of relying on transitive includes.
#include <stdint.h>

#include <string>

#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/profiler.pb.h"

namespace paddle {
namespace platform {

///////////////////////
// WARN: Under Development. Don't depend on it yet.
//////////////////////

// DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
// 2. Collect cuda statistics: start/end ts, memory, etc.
// 3. Generate a protobuf for further analysis.
//
// This is a pure interface; obtain an instance through GetDeviceTracer().
class DeviceTracer {
 public:
  // Statistics of one kernel execution. The fields mirror the arguments of
  // AddKernelRecords().
  struct KernelRecord {
    uint64_t start_ns;
    uint64_t end_ns;
    uint32_t device_id;
    uint32_t stream_id;
    uint32_t correlation_id;
  };

  virtual ~DeviceTracer() {}

  // Needs to be called once before use.
  virtual void Enable() = 0;

  // Needs to be called once after use.
  virtual void Disable() = 0;

  // Add a pair to correlate internal cuda id with high level
  // annotation (string). So cuda statistics can be represented by
  // human-readable annotations.
  virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;

  // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
  // added before for human readability.
  virtual void AddKernelRecords(uint64_t start, uint64_t end,
                                uint32_t device_id, uint32_t stream_id,
                                uint32_t correlation_id) = 0;

  // Generate a proto after done (Disabled).
  virtual proto::Profile GenProfile() = 0;

  // Whether the tracer is currently enabled.
  virtual bool IsEnabled() = 0;
};

// Get a DeviceTracer.
DeviceTracer* GetDeviceTracer();

// Set a name for the cuda kernel operation being launched by the thread.
void SetCurAnnotation(const char* anno);

// Clear the name after the operation is done.
void ClearCurAnnotation();

}  // namespace platform
}  // namespace paddle
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue