commit
c0876cf686
@ -1,18 +1,35 @@
|
||||
# Image for running distributed VGG16 benchmarks (PaddlePaddle Fluid vs.
# TensorFlow) on Kubernetes, with GPU support (CUDA 8.0 / cuDNN 5).
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04

# you can get a mirror list here:
#   https://launchpad.net/ubuntu/+archivemirrors
ARG UBUNTU_MIRROR
# Double-quote the sed expression so ${UBUNTU_MIRROR} is expanded by the
# inner bash, instead of relying on fragile nested single quotes.
RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i "s#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g" /etc/apt/sources.list; fi'

# Install Python and helpers in a single layer; drop the apt cache in the
# same layer so it does not bloat the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        iputils-ping \
        libgtk2.0-dev \
        python \
        python-dev \
        python-pip \
    && rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache-dir -U kubernetes opencv-python

# Pre-download the cifar10 dataset into the image so benchmark pods need no
# network access at run time. paddlepaddle is installed only to fetch the
# data and is uninstalled right afterwards.
RUN pip install --no-cache-dir paddlepaddle
# if the network is slow, you may need to add a proxy here.
# ENV https_proxy=
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
RUN pip uninstall -y paddlepaddle
# unset the proxy if it is set.
# ENV https_proxy=""

# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
# so we must build one with distribute support to install in this image.
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib

# TensorFlow counterpart of the benchmark plus its k8s launcher script.
RUN pip install --no-cache-dir tensorflow==1.4.0
ADD tf_k8s /usr/bin
RUN chmod +x /usr/bin/tf_k8s
ADD vgg16_tf.py /workspace/

# below lines may change a lot for debugging
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
RUN chmod +x /usr/bin/paddle_k8s
ADD vgg16_fluid.py vgg16_v2.py /workspace/
|
||||
|
@ -0,0 +1,82 @@
|
||||
#!/bin/bash
# Launcher used as the container entrypoint for distributed TensorFlow
# benchmark pods on Kubernetes. Configuration comes from environment
# variables injected by the pod spec:
#   JOB_NAME, PORT, TF_JOB_NAME ("ps" or "worker"), ENTRY, TRAINER_PACKAGE,
#   PSERVERS_NUM, TRAINERS_NUM, BATCH_SIZE.

# Translate the trainer's exit code into a readable message in the
# Kubernetes termination log, then exit with that same code.
check_trainer_ret() {
  local ret="$1"
  stdbuf -oL echo "job returned $ret...setting pod return message..."
  stdbuf -oL echo "==============================="

  # 128+N codes correspond to fatal signals (136=SIGFPE, 139=SIGSEGV,
  # 134=SIGABRT). Quote $ret so an empty value cannot break the test.
  if [ "$ret" -eq 136 ] ; then
    echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
  elif [ "$ret" -eq 139 ] ; then
    echo "Segmentation Fault" > /dev/termination-log
  elif [ "$ret" -eq 1 ] ; then
    echo "General Error" > /dev/termination-log
  elif [ "$ret" -eq 134 ] ; then
    echo "Program Abort" > /dev/termination-log
  fi
  stdbuf -oL echo "termination log written..."
  exit "$ret"
}

# Comma-separated host:port endpoint lists, populated by wait_running_pods.
g_pservers=""
g_trainers=""

# Block until all pserver and trainer pods of this job are running, then
# collect their endpoints into g_pservers / g_trainers.
wait_running_pods(){
  local pserver_label="tf-job-pserver=${JOB_NAME}"
  local trainer_label="tf-job-trainer=${JOB_NAME}"

  stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
  stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}

  g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
  g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
}

# Start a long-running parameter-server task. The task index is this pod's
# position among the pods carrying the pserver label.
start_tf_pserver(){
  wait_running_pods

  local label="tf-job-pserver=${JOB_NAME}"
  local pserver_id
  pserver_id=$(python /root/k8s_tools.py fetch_id ${label})

  local cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
    --job_name=${TF_JOB_NAME} --task_index=${pserver_id}"

  stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
}

# Start a trainer (worker) task and record its exit status in the
# termination log via check_trainer_ret.
start_tf_trainer(){
  wait_running_pods

  local label="tf-job-trainer=${JOB_NAME}"
  local trainer_id
  trainer_id=$(python /root/k8s_tools.py fetch_id ${label})

  local cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
    --job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"

  stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
  check_trainer_ret $?
}

# Dispatch on TF_JOB_NAME: "worker" pods run trainers, everything else
# (i.e. "ps") runs a parameter server.
start_tf(){
  if [[ "${TF_JOB_NAME}" == "worker" ]]; then
    start_tf_trainer
  else
    start_tf_pserver
  fi
}

usage() {
  echo "usage: tf_k8s [<args>]:"
  echo "  start_tf         Start tensorflow jobs"
}

case "$1" in
  start_tf)
    start_tf
    ;;
  --help)
    usage
    ;;
  *)
    usage
    ;;
esac
|
@ -0,0 +1,56 @@
|
||||
# Parameter-server ReplicaSet for the distributed TensorFlow VGG16 benchmark.
# Keeps 10 long-running "ps" tasks alive; trainers are started by the
# companion batch Job manifest.
# NOTE(review): extensions/v1beta1 ReplicaSet is deprecated (apps/v1 on
# modern clusters) -- kept as-is for the cluster version this targets.
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
  name: vgg16job-tf-pserver
spec:
  replicas: 10
  template:
    metadata:
      labels:
        # tf_k8s discovers peers by this label (tf-job-pserver=${JOB_NAME}).
        tf-job-pserver: vgg16job-tf
    spec:
      # hostNetwork lets pservers bind directly on node IPs.
      hostNetwork: true
      imagePullSecrets:
      - name: job-registry-secret
      containers:
      - name: pserver
        image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
        imagePullPolicy: Always
        # tf_k8s is the launcher script baked into the image.
        command: ["tf_k8s", "start_tf"]
        ports:
        # NOTE(review): declared containerPort (30236) differs from the PORT
        # env value (32036) used for endpoint discovery -- confirm intended.
        - name: jobport-30236
          containerPort: 30236
        env:
        - name: PORT            # port used by tf_k8s for endpoint discovery
          value: "32036"
        - name: ENTRY           # command tf_k8s runs inside TRAINER_PACKAGE
          value: "python vgg16_tf.py"
        - name: JOB_NAME        # label value used for peer discovery
          value: vgg16job-tf
        - name: PSERVERS_NUM    # number of pserver pods to wait for
          value: "10"
        - name: TF_JOB_NAME     # "ps" -> pserver branch in tf_k8s
          value: "ps"
        - name: TRAINERS_NUM    # number of trainer pods to wait for
          value: "20"
        - name: BATCH_SIZE
          value: "128"
        - name: TRAINER_PACKAGE # working directory containing the entry script
          value: "/workspace"
        - name: NUM_PASSES
          value: "1"
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: "status.podIP"
        resources:
          requests:
            memory: 10Gi
            cpu: 4
          limits:
            memory: 10Gi
            cpu: 4
|
@ -0,0 +1,58 @@
|
||||
# Trainer Job for the distributed TensorFlow VGG16 benchmark: runs 20
# one-shot worker pods against the pserver ReplicaSet of the same JOB_NAME.
apiVersion: batch/v1
kind: Job
metadata:
  name: vgg16job-tf-trainer
spec:
  # Run all 20 trainers concurrently; the Job completes when all succeed.
  parallelism: 20
  completions: 20
  template:
    metadata:
      labels:
        # tf_k8s discovers trainer peers by this label.
        tf-job-trainer: vgg16job-tf
    spec:
      imagePullSecrets:
      - name: job-registry-secret
      # hostNetwork lets trainers bind directly on node IPs.
      hostNetwork: true
      containers:
      - name: trainer
        image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
        imagePullPolicy: Always
        command: ["tf_k8s", "start_tf"]
        ports:
        # NOTE(review): declared containerPort (30236) differs from the PORT
        # env value (32036) used for endpoint discovery -- confirm intended.
        - name: jobport-30236
          containerPort: 30236
        env:
        - name: PORT            # port used by tf_k8s for endpoint discovery
          value: "32036"
        - name: JOB_NAME        # label value used for peer discovery
          value: vgg16job-tf
        - name: TF_JOB_NAME     # "worker" -> trainer branch in tf_k8s
          value: "worker"
        - name: ENTRY           # command tf_k8s runs inside TRAINER_PACKAGE
          value: "python vgg16_tf.py"
        - name: PSERVERS_NUM    # number of pserver pods to wait for
          value: "10"
        - name: BATCH_SIZE
          value: "128"
        - name: TRAINERS_NUM    # number of trainer pods to wait for
          value: "20"
        - name: TRAINER_PACKAGE # working directory containing the entry script
          value: "/workspace"
        - name: NUM_PASSES
          value: "1"
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: "status.podIP"
        resources:
          requests:
            memory: 40Gi
            cpu: 2
          limits:
            memory: 40Gi
            cpu: 2
      # Failed trainers are not restarted in place; the termination log
      # written by tf_k8s records why they exited.
      restartPolicy: Never
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,41 @@
|
||||
# Locate NVIDIA CUPTI (CUDA Profiling Tools Interface) headers and library.
# Outputs: CUPTI_INCLUDE_DIR, CUPTI_LIBRARY, CUPTI_LIBRARY_PATH, CUPTI_FOUND.
if(NOT WITH_GPU)
    return()
endif()


set(CUPTI_ROOT "/usr" CACHE PATH "CUPTI ROOT")
find_path(CUPTI_INCLUDE_DIR cupti.h
    PATHS ${CUPTI_ROOT} ${CUPTI_ROOT}/include
          $ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include
          ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include
    NO_DEFAULT_PATH
)

# The directory holding libcudart is a good hint for sibling CUDA libraries.
get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)

set(TARGET_ARCH "x86_64")
# BUGFIX: was `if(NOT ${CMAKE_SYSTEM_PROCESSOR})`, which dereferences the
# value (testing a variable literally named e.g. "x86_64") and errors out
# when CMAKE_SYSTEM_PROCESSOR is empty. Test the variable itself: override
# the x86_64 default whenever CMake detected a processor.
if(CMAKE_SYSTEM_PROCESSOR)
    set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
endif()

list(APPEND CUPTI_CHECK_LIBRARY_DIRS
    ${CUPTI_ROOT}
    ${CUPTI_ROOT}/lib64
    ${CUPTI_ROOT}/lib
    ${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu
    $ENV{CUPTI_ROOT}
    $ENV{CUPTI_ROOT}/lib64
    $ENV{CUPTI_ROOT}/lib
    /usr/lib
    ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64)
find_library(CUPTI_LIBRARY NAMES libcupti.so libcupti.dylib # libcupti_static.a
    PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist}
    NO_DEFAULT_PATH
    DOC "Path to cuPTI library.")

get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY)
if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY)
    set(CUPTI_FOUND ON)
else()
    set(CUPTI_FOUND OFF)
endif()
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,72 @@
|
||||
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

// Include what we use: uint32_t/uint64_t and std::string were previously
// relied upon transitively through the headers below.
#include <cstdint>
#include <string>

#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/profiler.pb.h"

namespace paddle {
namespace platform {

///////////////////////
// WARN: Under Development. Don't depend on it yet.
//////////////////////

// DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
// 2. Collect cuda statistics: start/end ts, memory, etc.
// 3. Generate a protobuf for further analysis.
class DeviceTracer {
 public:
  // Raw timing/identity record for one CUDA kernel launch.
  struct KernelRecord {
    uint64_t start_ns;        // kernel start timestamp, nanoseconds
    uint64_t end_ns;          // kernel end timestamp, nanoseconds
    uint32_t device_id;       // CUDA device ordinal
    uint32_t stream_id;       // CUDA stream the kernel ran on
    uint32_t correlation_id;  // CUPTI id linking the record to an annotation
  };

  virtual ~DeviceTracer() {}
  // Needs to be called once before use.
  virtual void Enable() = 0;
  // Needs to be called once after use.
  virtual void Disable() = 0;

  // Add a pair to correlate internal cuda id with high level
  // annotation (string). So cuda statistics can be represented by
  // human-readable annotations.
  virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;

  // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
  // added before for human readability.
  virtual void AddKernelRecords(uint64_t start, uint64_t end,
                                uint32_t device_id, uint32_t stream_id,
                                uint32_t correlation_id) = 0;

  // Generate a proto after done (Disabled).
  virtual proto::Profile GenProfile() = 0;

  // Whether Enable() has been called (and Disable() has not).
  virtual bool IsEnabled() = 0;
};

// Get a DeviceTracer.
DeviceTracer* GetDeviceTracer();

// Set a name for the cuda kernel operation being launched by the thread.
void SetCurAnnotation(const char* anno);
// Clear the name after the operation is done.
void ClearCurAnnotation();

}  // namespace platform
}  // namespace paddle
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue