update due to upstream's change

tonyyang-svail-patch-1
Yibing Liu 7 years ago
commit c0876cf686

@@ -146,6 +146,7 @@ include(external/cares)
 include(external/grpc)
 include(cudnn)     # set cudnn libraries, must before configure
+include(cupti)
 include(configure) # add paddle env configuration
 include(generic)   # simplify cmake module
 include(package)   # set paddle packages

@@ -1,18 +1,35 @@
 #FROM python:2.7.14
 FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
-RUN apt-get update && apt-get install -y python
-RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
-# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
-# so we must build one with distribute support to install in this image.
+
+# you can get mirror list here:
+#   https://launchpad.net/ubuntu/+archivemirrors
+ARG UBUNTU_MIRROR
+RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
+
+RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
+RUN pip install -U kubernetes opencv-python
+
 RUN pip install paddlepaddle
 # if network is slowly, you may need to add proxy here.
 # ENV https_proxy=
 RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
 RUN pip uninstall -y paddlepaddle
 # unset proxy if it is setted.
 # ENV https_proxy=""
+
+# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
+# so we must build one with distribute support to install in this image.
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl
+ENV LD_LIBRARY_PATH=/usr/local/lib
+
+# tf k8s
+RUN pip install tensorflow==1.4.0
+ADD tf_k8s /usr/bin
+RUN chmod +x /usr/bin/tf_k8s
+ADD vgg16_tf.py /workspace/
+
 # below lines may change a lot for debugging
 ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
 ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
-ADD *.whl /
-RUN pip install /*.whl && rm -f /*.whl && \
-    chmod +x /usr/bin/paddle_k8s
-ENV LD_LIBRARY_PATH=/usr/local/lib
+RUN chmod +x /usr/bin/paddle_k8s
 ADD vgg16_fluid.py vgg16_v2.py /workspace/
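For context, building and publishing this image might look like the following sketch (the mirror URL is illustrative, and a CI-built wheel with WITH_DISTRIBUTE=ON is assumed to sit in the build context for the `ADD *.whl /` step; the image name matches the Kubernetes manifests below):

    docker build --build-arg UBUNTU_MIRROR="http://mirrors.tuna.tsinghua.edu.cn/ubuntu" \
        -t registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16 .
    docker push registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16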

@@ -0,0 +1,82 @@
#!/bin/bash

check_trainer_ret() {
  ret=$1
  stdbuf -oL echo "job returned $ret...setting pod return message..."
  stdbuf -oL echo "==============================="

  if [ $ret -eq 136 ] ; then
    echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
  elif [ $ret -eq 139 ] ; then
    echo "Segmentation Fault" > /dev/termination-log
  elif [ $ret -eq 1 ] ; then
    echo "General Error" > /dev/termination-log
  elif [ $ret -eq 134 ] ; then
    echo "Program Abort" > /dev/termination-log
  fi
  stdbuf -oL echo "termination log written..."
  exit $ret
}

g_pservers=""
g_trainers=""

wait_running_pods(){
  pserver_label="tf-job-pserver=${JOB_NAME}"
  trainer_label="tf-job-trainer=${JOB_NAME}"

  stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
  stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}

  g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
  g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
}

start_tf_pserver(){
  wait_running_pods

  label="tf-job-pserver=${JOB_NAME}"
  pserver_id=$(python /root/k8s_tools.py fetch_id ${label})

  cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
    --job_name=${TF_JOB_NAME} --task_index=${pserver_id}"

  stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
}

start_tf_trainer(){
  wait_running_pods

  label="tf-job-trainer=${JOB_NAME}"
  trainer_id=$(python /root/k8s_tools.py fetch_id ${label})

  cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
    --job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"

  stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
  check_trainer_ret $?
}

start_tf(){
  if [[ "${TF_JOB_NAME}" == "worker" ]]; then
    start_tf_trainer
  else
    start_tf_pserver
  fi
}

usage() {
  echo "usage: tf_k8s [<args>]:"
  echo "  start_tf         Start tensorflow jobs"
}

case "$1" in
  start_tf)
    start_tf
    ;;
  --help)
    usage
    ;;
  *)
    usage
    ;;
esac
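The manifests below run this script as the container command (`tf_k8s start_tf`); a hypothetical standalone invocation with the same environment the pods provide would be:

    JOB_NAME=vgg16job-tf TF_JOB_NAME=ps PORT=32036 \
    PSERVERS_NUM=10 TRAINERS_NUM=20 BATCH_SIZE=128 \
    TRAINER_PACKAGE=/workspace ENTRY="python vgg16_tf.py" \
    tf_k8s start_tf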

@@ -0,0 +1,56 @@
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
  name: vgg16job-tf-pserver
spec:
  replicas: 10
  template:
    metadata:
      labels:
        tf-job-pserver: vgg16job-tf
    spec:
      hostNetwork: true
      imagePullSecrets:
      - name: job-registry-secret
      containers:
      - name: pserver
        image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
        imagePullPolicy: Always
        command: ["tf_k8s", "start_tf"]
        ports:
        - name: jobport-30236
          containerPort: 30236
        env:
        - name: PORT
          value: "32036"
        - name: ENTRY
          value: "python vgg16_tf.py"
        - name: JOB_NAME
          value: vgg16job-tf
        - name: PSERVERS_NUM
          value: "10"
        - name: TF_JOB_NAME
          value: "ps"
        - name: TRAINERS_NUM
          value: "20"
        - name: BATCH_SIZE
          value: "128"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: NUM_PASSES
          value: "1"
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: "status.podIP"
        resources:
          requests:
            memory: 10Gi
            cpu: 4
          limits:
            memory: 10Gi
            cpu: 4

@@ -0,0 +1,58 @@
apiVersion: batch/v1
kind: Job
metadata:
  name: vgg16job-tf-trainer
spec:
  parallelism: 20
  completions: 20
  template:
    metadata:
      labels:
        tf-job-trainer: vgg16job-tf
    spec:
      imagePullSecrets:
      - name: job-registry-secret
      hostNetwork: true
      containers:
      - name: trainer
        image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
        imagePullPolicy: Always
        command: ["tf_k8s", "start_tf"]
        ports:
        - name: jobport-30236
          containerPort: 30236
        env:
        - name: PORT
          value: "32036"
        - name: JOB_NAME
          value: vgg16job-tf
        - name: TF_JOB_NAME
          value: "worker"
        - name: ENTRY
          value: "python vgg16_tf.py"
        - name: PSERVERS_NUM
          value: "10"
        - name: BATCH_SIZE
          value: "128"
        - name: TRAINERS_NUM
          value: "20"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: NUM_PASSES
          value: "1"
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: "status.podIP"
        resources:
          requests:
            memory: 40Gi
            cpu: 2
          limits:
            memory: 40Gi
            cpu: 2
      restartPolicy: Never
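A minimal launch sketch for the two workloads above, assuming the manifests are saved as tf_pserver.yaml and tf_trainer.yaml (file names are illustrative):

    kubectl create -f tf_pserver.yaml
    kubectl create -f tf_trainer.yaml
    kubectl get pods -l tf-job-pserver=vgg16job-tf
    kubectl get pods -l tf-job-trainer=vgg16job-tf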

@@ -68,6 +68,21 @@ parser.add_argument(
     type=str2bool,
     default=True,
     help='Whether to run as local mode.')
+parser.add_argument(
+    "--ps_hosts",
+    type=str,
+    default="",
+    help="Comma-separated list of hostname:port pairs")
+parser.add_argument(
+    "--trainer_hosts",
+    type=str,
+    default="",
+    help="Comma-separated list of hostname:port pairs")
+
+# Flags for defining the tf.train.Server
+parser.add_argument(
+    "--task_index", type=int, default=0, help="Index of task within the job")

 args = parser.parse_args()
@@ -180,8 +195,9 @@ def main():
                 iters += 1
                 num_samples += len(data)
                 print(
-                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
-                    % (pass_id, iters, loss, acc, time.time() - ts)
+                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
+                    % (pass_id, iters, loss, acc,
+                       len(data) / (time.time() - ts))
                 )  # The accuracy is the accumulation of batches, but not the current batch.

         pass_elapsed = time.time() - start_time
@@ -209,27 +225,24 @@
             batch_size=args.batch_size)
         train_loop(exe, fluid.default_main_program())
     else:
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # all pserver endpoints
-        eplist = []
-        for ip in pserver_ips.split(","):
-            eplist.append(':'.join([ip, "6174"]))
-        pserver_endpoints = ",".join(eplist)
-        print("pserver endpoints: ", pserver_endpoints)
         trainers = int(os.getenv("TRAINERS"))  # total trainer count
         print("trainers total: ", trainers)
-        current_endpoint = os.getenv(
-            "POD_IP") + ":6174"  # current pserver endpoint
         training_role = os.getenv(
             "TRAINING_ROLE",
             "TRAINER")  # get the training role: trainer/pserver
         t = fluid.DistributeTranspiler()
         t.transpile(
             optimize_ops,
             params_grads,
-            pservers=pserver_endpoints,
+            trainer_id=args.task_index,
+            pservers=args.ps_hosts,
             trainers=trainers)
         if training_role == "PSERVER":
+            current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
+                "PADDLE_INIT_PORT")
             if not current_endpoint:
                 print("need env SERVER_ENDPOINT")
                 exit(1)
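With this change the pserver endpoint list comes from --ps_hosts rather than being assembled from PADDLE_INIT_PSERVERS, and the trainer id comes from --task_index. A hypothetical pserver-side invocation (endpoints invented for illustration):

    TRAINING_ROLE=PSERVER TRAINERS=20 POD_IP=192.168.1.2 PADDLE_INIT_PORT=6174 \
    python vgg16_fluid.py --local false --ps_hosts=192.168.1.2:6174 \
        --trainer_hosts=192.168.1.3:6174,192.168.1.4:6174 --task_index=0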

File diff suppressed because it is too large.

@@ -59,6 +59,7 @@ endif(NOT WITH_GOLANG)

 if(NOT WITH_GPU)
     add_definitions(-DHPPL_STUB_FUNC)
+    add_definitions("-DCUPTI_LIB_PATH=\"\"")

     list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
@@ -73,7 +74,14 @@ else()
     if(NOT CUDNN_FOUND)
         message(FATAL_ERROR "Paddle needs cudnn to compile")
     endif()
+    if(CUPTI_FOUND)
+        include_directories(${CUPTI_INCLUDE_DIR})
+        add_definitions(-DPADDLE_WITH_CUPTI)
+        add_definitions("-DCUPTI_LIB_PATH=\"${CUPTI_LIBRARY_PATH}\"")
+    else()
+        add_definitions("-DCUPTI_LIB_PATH=\"\"")
+        message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.")
+    endif()
     set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")

     # Include cuda and cudnn

@@ -155,7 +155,8 @@ endif()

 include_directories(${CUDA_INCLUDE_DIRS})
 list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
 if(NOT WITH_DSO)
-    list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
+    # TODO(panyx0718): CUPTI only allows DSO?
+    list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
 endif(NOT WITH_DSO)

 # setting nvcc arch flags

@@ -0,0 +1,41 @@
if(NOT WITH_GPU)
    return()
endif()

set(CUPTI_ROOT "/usr" CACHE PATH "CUPTI ROOT")
find_path(CUPTI_INCLUDE_DIR cupti.h
    PATHS ${CUPTI_ROOT} ${CUPTI_ROOT}/include
    $ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include
    ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include
    NO_DEFAULT_PATH
)

get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)

set(TARGET_ARCH "x86_64")
if(NOT ${CMAKE_SYSTEM_PROCESSOR})
    set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
endif()

list(APPEND CUPTI_CHECK_LIBRARY_DIRS
    ${CUPTI_ROOT}
    ${CUPTI_ROOT}/lib64
    ${CUPTI_ROOT}/lib
    ${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu
    $ENV{CUPTI_ROOT}
    $ENV{CUPTI_ROOT}/lib64
    $ENV{CUPTI_ROOT}/lib
    /usr/lib
    ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64)
find_library(CUPTI_LIBRARY NAMES libcupti.so libcupti.dylib # libcupti_static.a
    PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist}
    NO_DEFAULT_PATH
    DOC "Path to cuPTI library.")

get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY)
if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY)
    set(CUPTI_FOUND ON)
else()
    set(CUPTI_FOUND OFF)
endif()
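If CUPTI lives somewhere the search lists above do not cover, the root can be overridden at configure time; CUPTI_ROOT is also honored as an environment variable by the find calls above. A sketch with an illustrative path:

    cmake .. -DWITH_GPU=ON -DCUPTI_ROOT=/usr/local/cuda/extras/CUPTI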

@@ -56,7 +56,7 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
-    shape_inference data_transform lod_tensor)
+    shape_inference data_transform lod_tensor profiler)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)

 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)

@@ -80,7 +80,7 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
 cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
-    framework_proto backward glog lod_rank_table profiler feed_fetch_method)
+    framework_proto backward glog lod_rank_table feed_fetch_method)
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)

@@ -167,4 +167,6 @@ message BlockDesc {
 // Please refer to
 // https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md
 // for more details.
+// TODO(panyx0718): A model can have multiple programs. Need a
+// way to distinguish them. Maybe ID or name?
 message ProgramDesc { repeated BlockDesc blocks = 1; }

@@ -31,8 +31,14 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
   os << "{";
   for (auto &v : lod) {
     os << "{";
+    bool is_first = true;
     for (auto &i : v) {
-      os << i << ",";
+      if (is_first) {
+        os << i;
+        is_first = false;
+      } else {
+        os << ", " << i;
+      }
     }
     os << "}";
   }

@@ -125,6 +125,8 @@ class OpDesc {
   BlockDesc *Block() { return this->block_; }

+  const BlockDesc &BlockRef() const { return *this->block_; }
+
   void SetBlock(BlockDesc *block) { this->block_ = block; }

  private:

@@ -32,24 +32,12 @@ void ReadBinaryFile(const std::string& filename, std::string& contents) {
   inputfs.close();
 }

-bool IsParameter(const framework::VarDesc* var,
-                 const framework::ProgramDesc& main_program) {
-  if (var->Persistable()) {
-    // There are many unreachable variables in the program
-    for (size_t i = 0; i < main_program.Size(); ++i) {
-      const framework::BlockDesc& block = main_program.Block(i);
-      for (auto* op : block.AllOps()) {
-        if (op->Type() == framework::kFeedOpType) {
-          continue;
-        }
-        for (auto input_argument_name : op->InputArgumentNames()) {
-          if (input_argument_name == var->Name()) {
-            return true;
-          }
-        }
-      }
-    }
-  }
+bool IsPersistable(const framework::VarDesc* var) {
+  if (var->Persistable() &&
+      var->GetType() != framework::proto::VarType::FEED_MINIBATCH &&
+      var->GetType() != framework::proto::VarType::FETCH_LIST) {
+    return true;
+  }
   return false;
 }

@@ -65,8 +53,8 @@ void LoadPersistables(framework::Executor& executor,
   std::vector<std::string> paramlist;
   for (auto* var : global_block.AllVars()) {
-    if (IsParameter(var, main_program)) {
-      VLOG(3) << "parameter's name: " << var->Name();
+    if (IsPersistable(var)) {
+      VLOG(3) << "persistable variable's name: " << var->Name();
       framework::VarDesc* new_var = load_block->Var(var->Name());
       new_var->SetShape(var->GetShape());

@@ -101,7 +89,6 @@ void LoadPersistables(framework::Executor& executor,
   executor.Run(*load_program, &scope, 0, true, true);
-  VLOG(3) << "Ran loading successfully";

   delete load_program;
 }

@@ -30,5 +30,5 @@ inference_test(label_semantic_roles)
 inference_test(recognize_digits ARGS mlp conv)
 inference_test(recommender_system)
 #inference_test(rnn_encoder_decoder)
-inference_test(understand_sentiment)
+inference_test(understand_sentiment ARGS conv)
 inference_test(word2vec)

@@ -32,16 +32,42 @@ TEST(inference, label_semantic_roles) {
   paddle::framework::LoDTensor word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1,
       ctx_p2, mark;
   paddle::framework::LoD lod{{0, 4, 10}};
+  int64_t word_dict_len = 44068;
+  int64_t predicate_dict_len = 3162;
+  int64_t mark_dict_len = 2;

-  SetupLoDTensor(word, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
-  SetupLoDTensor(
-      predicate, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
-  SetupLoDTensor(ctx_n2, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
-  SetupLoDTensor(ctx_n1, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
-  SetupLoDTensor(ctx_0, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
-  SetupLoDTensor(ctx_p1, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
-  SetupLoDTensor(ctx_p2, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
-  SetupLoDTensor(mark, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(word,
+                 lod,
+                 static_cast<int64_t>(0),
+                 static_cast<int64_t>(word_dict_len - 1));
+  SetupLoDTensor(predicate,
+                 lod,
+                 static_cast<int64_t>(0),
+                 static_cast<int64_t>(predicate_dict_len - 1));
+  SetupLoDTensor(ctx_n2,
+                 lod,
+                 static_cast<int64_t>(0),
+                 static_cast<int64_t>(word_dict_len - 1));
+  SetupLoDTensor(ctx_n1,
+                 lod,
+                 static_cast<int64_t>(0),
+                 static_cast<int64_t>(word_dict_len - 1));
+  SetupLoDTensor(ctx_0,
+                 lod,
+                 static_cast<int64_t>(0),
+                 static_cast<int64_t>(word_dict_len - 1));
+  SetupLoDTensor(ctx_p1,
+                 lod,
+                 static_cast<int64_t>(0),
+                 static_cast<int64_t>(word_dict_len - 1));
+  SetupLoDTensor(ctx_p2,
+                 lod,
+                 static_cast<int64_t>(0),
+                 static_cast<int64_t>(word_dict_len - 1));
+  SetupLoDTensor(mark,
+                 lod,
+                 static_cast<int64_t>(0),
+                 static_cast<int64_t>(mark_dict_len - 1));

   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&word);

@@ -31,7 +31,12 @@ TEST(inference, understand_sentiment) {
   paddle::framework::LoDTensor words;
   paddle::framework::LoD lod{{0, 4, 10}};
-  SetupLoDTensor(words, lod, static_cast<int64_t>(0), static_cast<int64_t>(10));
+  int64_t word_dict_len = 5147;
+
+  SetupLoDTensor(words,
+                 lod,
+                 static_cast<int64_t>(0),
+                 static_cast<int64_t>(word_dict_len - 1));

   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&words);

@@ -31,12 +31,12 @@ TEST(inference, word2vec) {
   paddle::framework::LoDTensor first_word, second_word, third_word, fourth_word;
   paddle::framework::LoD lod{{0, 1}};
-  int64_t dict_size = 2072;  // Hard-coding the size of dictionary
+  int64_t dict_size = 2073;  // The size of dictionary

-  SetupLoDTensor(first_word, lod, static_cast<int64_t>(0), dict_size);
-  SetupLoDTensor(second_word, lod, static_cast<int64_t>(0), dict_size);
-  SetupLoDTensor(third_word, lod, static_cast<int64_t>(0), dict_size);
-  SetupLoDTensor(fourth_word, lod, static_cast<int64_t>(0), dict_size);
+  SetupLoDTensor(first_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(second_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(third_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);

   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&first_word);

@@ -101,8 +101,8 @@ void TestInference(const std::string& dirname,
   if (IsCombined) {
     // All parameters are saved in a single file.
     // Hard-coding the file names of program and parameters in unittest.
-    // Users are free to specify different filename
-    // (provided: the filenames are changed in the python api as well: io.py)
+    // The file names should be consistent with that used in Python API
+    // `fluid.io.save_inference_model`.
     std::string prog_filename = "__model_combined__";
     std::string param_filename = "__params_combined__";
     inference_program = paddle::inference::Load(executor,

@@ -11,6 +11,8 @@ function(op_library TARGET)
   set(cc_srcs)
   set(cu_srcs)
   set(cu_cc_srcs)
+  set(cudnn_cu_cc_srcs)
+  set(CUDNN_FILE)
   set(op_common_deps operator op_registry math_function)
   set(options "")
   set(oneValueArgs "")
@@ -30,10 +32,16 @@ function(op_library TARGET)
     if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
       list(APPEND cu_srcs ${TARGET}.cu)
     endif()
+    string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
+    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
+      list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
+    endif()
   else()
     foreach(src ${op_library_SRCS})
       if (${src} MATCHES ".*\\.cu$")
         list(APPEND cu_srcs ${src})
+      elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
+        list(APPEND cudnn_cu_cc_srcs ${src})
       elseif(${src} MATCHES ".*\\.cu.cc$")
         list(APPEND cu_cc_srcs ${src})
       elseif(${src} MATCHES ".*\\.cc$")
@@ -54,7 +62,7 @@ function(op_library TARGET)
     set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
   endif()
   if (WITH_GPU)
-    nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
+    nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
       ${op_common_deps})
   else()
     cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
@@ -98,6 +106,12 @@ function(op_library TARGET)
     set(pybind_flag 1)
   endif()

+  # pybind USE_OP_DEVICE_KERNEL for CUDNN
+  list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len)
+  if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0)
+    file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
+  endif()
+
   # pybind USE_OP
   if (${pybind_flag} EQUAL 0)
     file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
@@ -152,43 +166,24 @@ op_library(lstm_op DEPS sequence2batch lstm_compute)
 op_library(lstmp_op DEPS sequence2batch lstm_compute)
 op_library(gru_op DEPS sequence2batch gru_compute)
 op_library(recurrent_op DEPS executor)
-op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function)
+op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 op_library(cos_sim_op DEPS cos_sim_functor)
 op_library(parallel_do_op DEPS executor)
 op_library(create_reader_op DEPS reader)

 # Regist multiple Kernel to pybind
 if (WITH_GPU)
-  op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS
-    vol2col depthwise_conv)
-  op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function)
-  op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling)
-  op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc
-    conv_transpose_cudnn_op.cu.cc DEPS vol2col)
-  file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d, CUDNN);\n")
-  file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(pool2d, CUDNN);\n")
-  file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d_transpose, CUDNN);\n")
+  op_library(conv_op DEPS vol2col depthwise_conv)
 else()
-  op_library(conv_op SRCS conv_op.cc DEPS vol2col)
-  op_library(pool_op SRCS pool_op.cc DEPS pooling)
-  op_library(conv_transpose_op SRCS conv_transpose_op.cc DEPS vol2col)
+  op_library(conv_op DEPS vol2col)
 endif()
+op_library(pool_op DEPS pooling)
+op_library(conv_transpose_op DEPS vol2col)

 cc_library(batch_size_like SRCS batch_size_like.cc DEPS op_registry)
-op_library(fill_constant_batch_size_like_op
-  SRCS fill_constant_batch_size_like_op.cc fill_constant_batch_size_like_op.cu.cc
-  DEPS batch_size_like)
-op_library(uniform_random_batch_size_like_op
-  SRCS uniform_random_batch_size_like_op.cc
-  DEPS batch_size_like uniform_random_op)
-op_library(gaussian_random_batch_size_like_op
-  SRCS gaussian_random_batch_size_like_op.cc
-  DEPS batch_size_like gaussian_random_op)
+op_library(fill_constant_batch_size_like_op DEPS batch_size_like)
+op_library(uniform_random_batch_size_like_op DEPS batch_size_like uniform_random_op)
+op_library(gaussian_random_batch_size_like_op DEPS batch_size_like gaussian_random_op)

 # FIXME(typhoonzero): save/load depends lodtensor serialization functions
 op_library(save_op DEPS lod_tensor)

@@ -94,6 +94,38 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
     }
   }

+  void ArgMaxMatch(const Tensor& dist, int* match_indices, T* match_dist,
+                   T overlap_threshold) const {
+    constexpr T kEPS = static_cast<T>(1e-6);
+    int64_t row = dist.dims()[0];
+    int64_t col = dist.dims()[1];
+    auto* dist_data = dist.data<T>();
+    for (int64_t j = 0; j < col; ++j) {
+      if (match_indices[j] != -1) {
+        // the j-th column has been matched to one entity.
+        continue;
+      }
+      int max_row_idx = -1;
+      T max_dist = -1;
+      for (int i = 0; i < row; ++i) {
+        T dist = dist_data[i * col + j];
+        if (dist < kEPS) {
+          // distance is 0 between i-th row and j-th column
+          continue;
+        }
+        if (dist >= overlap_threshold && dist > max_dist) {
+          max_row_idx = i;
+          max_dist = dist;
+        }
+      }
+      if (max_row_idx != -1) {
+        PADDLE_ENFORCE_EQ(match_indices[j], -1);
+        match_indices[j] = max_row_idx;
+        match_dist[j] = max_dist;
+      }
+    }
+  }
+
   void Compute(const framework::ExecutionContext& context) const override {
     auto* dist_mat = context.Input<LoDTensor>("DistMat");
     auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices");
@@ -120,13 +152,21 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
     int* indices = match_indices->data<int>();
     T* dist = match_dist->data<T>();
+    auto type = context.Attr<std::string>("match_type");
+    auto threshold = context.Attr<float>("dist_threshold");
     if (n == 1) {
       BipartiteMatch(*dist_mat, indices, dist);
+      if (type == "per_prediction") {
+        ArgMaxMatch(*dist_mat, indices, dist, threshold);
+      }
     } else {
       auto lod = dist_mat->lod().back();
       for (size_t i = 0; i < lod.size() - 1; ++i) {
         Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]);
         BipartiteMatch(one_ins, indices + i * col, dist + i * col);
+        if (type == "per_prediction") {
+          ArgMaxMatch(one_ins, indices + i * col, dist + i * col, threshold);
+        }
       }
     }
   }
@@ -147,6 +187,19 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
              "This tensor can contain LoD information to represent a batch of "
              "inputs. One instance of this batch can contain different numbers of "
              "entities.");
+    AddAttr<std::string>(
+        "match_type",
+        "(string, default: bipartite) "
+        "The type of matching method, should be 'bipartite' or "
+        "'per_prediction', 'bipartite' by default.")
+        .SetDefault("bipartite")
+        .InEnum({"bipartite", "per_prediction"});
+    AddAttr<float>(
+        "dist_threshold",
+        "(float, default: 0.5) "
+        "If `match_type` is 'per_prediction', this threshold is to determine "
+        "the extra matching bboxes based on the maximum distance.")
+        .SetDefault(0.5);
     AddOutput("ColToRowMatchIndices",
               "(Tensor) A 2-D Tensor with shape [N, M] in int type. "
               "N is the batch size. If ColToRowMatchIndices[i][j] is -1, it "
@@ -168,10 +221,10 @@ distance matrix. For input 2D matrix, the bipartite matching algorithm can
 find the matched column for each row, also can find the matched row for
 each column. And this operator only calculate matched indices from column
 to row. For each instance, the number of matched indices is the number of
-of columns of the input ditance matrix.
+columns of the input distance matrix.

 There are two outputs to save matched indices and distance.
-A simple description, this algothrim matched the best (maximum distance)
+A simple description, this algorithm matched the best (maximum distance)
 row entity to the column entity and the matched indices are not duplicated
 in each row of ColToRowMatchIndices. If the column entity is not matched
 any row entity, set -1 in ColToRowMatchIndices.
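A small worked example (numbers invented): for a 2x3 DistMat [[0.6, 0.2, 0.9], [0.4, 0.8, 0.1]], bipartite matching first takes the global maximum 0.9 (column 2 -> row 0), then 0.8 among the remaining rows and columns (column 1 -> row 1), leaving column 0 unmatched, so ColToRowMatchIndices is [[-1, 1, 0]]. With match_type 'per_prediction' and dist_threshold 0.5, the extra ArgMaxMatch pass then assigns column 0 to its best row above the threshold (row 0, distance 0.6), giving [[0, 1, 0]].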

@@ -1,3 +1,5 @@
+proto_library(profiler_proto SRCS profiler.proto)
+
 if(WITH_GPU)
     cc_library(enforce SRCS enforce.cc DEPS)
 else()

@@ -37,7 +39,8 @@ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
 nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)

-cc_library(profiler SRCS profiler.cc DEPS device_context)
+cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS})
+cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)

 nv_test(float16_gpu_test SRCS float16_test.cu)

File diff suppressed because it is too large.

@@ -0,0 +1,72 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/profiler.pb.h"

namespace paddle {
namespace platform {

///////////////////////
// WARN: Under Development. Don't depend on it yet.
//////////////////////

// DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
// 2. Collect cuda statistics: start/end ts, memory, etc.
// 3. Generate a protobuf for further analysis.
class DeviceTracer {
 public:
  struct KernelRecord {
    uint64_t start_ns;
    uint64_t end_ns;
    uint32_t device_id;
    uint32_t stream_id;
    uint32_t correlation_id;
  };

  virtual ~DeviceTracer() {}

  // Needs to be called once before use.
  virtual void Enable() = 0;
  // Needs to be called once after use.
  virtual void Disable() = 0;

  // Add a pair to correlate internal cuda id with high level
  // annotation (string). So cuda statistics can be represented by
  // human-readable annotations.
  virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;

  // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
  // added before for human readability.
  virtual void AddKernelRecords(uint64_t start, uint64_t end,
                                uint32_t device_id, uint32_t stream_id,
                                uint32_t correlation_id) = 0;

  // Generate a proto after done (Disabled).
  virtual proto::Profile GenProfile() = 0;

  virtual bool IsEnabled() = 0;
};

// Get a DeviceTracer.
DeviceTracer* GetDeviceTracer();

// Set a name for the cuda kernel operation being launched by the thread.
void SetCurAnnotation(const char* anno);
// Clear the name after the operation is done.
void ClearCurAnnotation();

}  // namespace platform
}  // namespace paddle

Some files were not shown because too many files have changed in this diff.
