Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add-code-exp

local_add_cudnn_lstm
ZhenWang 7 years ago
commit 7b9d9d7089

@ -22,6 +22,27 @@ ENV HOME /root
# Add bash enhancements
COPY ./paddle/scripts/docker/root/ /root/
# Prepare packages for Python
RUN apt-get update && \
apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \
xz-utils tk-dev libffi-dev liblzma-dev
# Install Python3.6
RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \
tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \
./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \
wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \
tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \
CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
make -j8 > /dev/null && make altinstall > /dev/null
# Install Python3.7
RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \
tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \
CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
make -j8 > /dev/null && make altinstall > /dev/null
RUN apt-get update && \
apt-get install -y --allow-downgrades patchelf \
python3 python3-dev python3-pip \
@ -74,6 +95,12 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
RUN pip3 install -U wheel && \
pip3 install -U docopt PyYAML sphinx==1.5.6 && \
pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
pip3.6 install -U wheel && \
pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \
pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \
pip3.7 install -U wheel && \
pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \
pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \
easy_install -U pip && \
pip install -U pip setuptools wheel && \
pip install -U docopt PyYAML sphinx==1.5.6 && \
@ -82,22 +109,34 @@ RUN pip3 install -U wheel && \
RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3 install opencv-python && \
pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3.6 install opencv-python && \
pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3.7 install opencv-python && \
pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip install opencv-python
#For docstring checker
RUN pip3 install pylint pytest astroid isort
RUN pip3.6 install pylint pytest astroid isort
RUN pip3.7 install pylint pytest astroid isort
RUN pip install pylint pytest astroid isort LinkChecker
COPY ./python/requirements.txt /root/
RUN pip3 install -r /root/requirements.txt
RUN pip3.6 install -r /root/requirements.txt
RUN pip3.7 install -r /root/requirements.txt
RUN pip install -r /root/requirements.txt
# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
RUN apt-get install -y libssl-dev libffi-dev
RUN pip3 install certifi urllib3[secure]
RUN pip3.6 install certifi urllib3[secure]
RUN pip3.7 install certifi urllib3[secure]
RUN pip install certifi urllib3[secure]

@ -109,7 +109,8 @@ function(op_library TARGET)
# Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op")
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()

@ -116,8 +116,14 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
if (NOT WIN32)
cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler transfer_scope_cache)
else()
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler)
shape_inference data_transform lod_tensor)
endif(NOT WIN32)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)

@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
@ -391,8 +392,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
int64_t max_memory_size = GetEagerDeletionThreshold();
std::unique_ptr<GarbageCollector<Tensor>> gc;
// WhileOp would set keep_kids to false
// WhileGradOp would need the scopes created in WhileOp
// WhileOp would set keep_kids to true,
// because WhileGradOp needs the scopes created in WhileOp.
// Perhaps, we should not perform eager deletion in WhileOp
// The scopes and variables created by WhileOp would be deleted
// in WhileGradOp.
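To make the keep_kids reasoning concrete, a minimal toy sketch follows (ToyScope and its members are illustrative stand-ins, not the Paddle Scope API): the forward op creates one child scope per step, and the backward op can only read them if eager deletion of the kids is skipped.

// Toy illustration (not the Paddle Scope API) of why the forward op must
// keep its child scopes alive until the backward op has consumed them.
#include <cassert>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

struct ToyScope {
  std::unordered_map<std::string, float> vars;  // variables local to this scope
  std::vector<std::unique_ptr<ToyScope>> kids;  // per-step child scopes

  ToyScope* NewScope() {
    kids.emplace_back(new ToyScope());
    return kids.back().get();
  }
  void DropKids() { kids.clear(); }  // eager deletion
};

int main() {
  ToyScope parent;
  // "WhileOp" forward: one child scope per loop step.
  for (int step = 0; step < 3; ++step) {
    ToyScope* s = parent.NewScope();
    s->vars["hidden"] = static_cast<float>(step);
  }

  bool keep_kids = true;  // what WhileOp asks of the executor
  if (!keep_kids) parent.DropKids();

  // "WhileGradOp" backward: it needs the forward step scopes.
  assert(parent.kids.size() == 3 && "the gradient op would fail if kids were dropped");
  return 0;
}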

@ -83,6 +83,7 @@ void NaiveExecutor::Run() {
for (auto &op : ops_) {
VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
<< " on scope " << scope_;
op->SetIsCalledByExecutor(false);
op->Run(*scope_, place_);
}
}

@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/profiler.h"
@ -33,11 +34,6 @@ DEFINE_bool(check_nan_inf, false,
namespace paddle {
namespace framework {
// Combine two hash values to a single hash.
inline size_t CombineHash(size_t seed, size_t a) {
return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN),
std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain),
@ -797,17 +793,6 @@ void OperatorWithKernel::TransferInplaceVarsBack(
Scope* OperatorWithKernel::TryTransferData(
const Scope& scope, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars) const {
// In the inference scenario, the scopes will be reused across the batches, so
// the `new_scope` here will result in GPU memory explosion over the running of
// operators.
// We use a thread_local cache to fix that issue: the key in the cache is the
// combination of the `scope` argument, from_kernel_type, and target_kernel_type.
// Have a discussion with @Superjomn or the inference developers if you change
// this logic, since changes here might not be tested in the other scenarios.
#ifdef PADDLE_ON_INFERENCE
thread_local std::unordered_map<size_t, Scope*> infer_transfer_scope_cache;
#endif
Scope* new_scope = nullptr;
for (auto& var_name_item : Inputs()) {
for (auto& var_name : var_name_item.second) {
@ -838,23 +823,23 @@ Scope* OperatorWithKernel::TryTransferData(
VLOG(30) << "Transform Variable " << var_name << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
#ifdef PADDLE_ON_INFERENCE
size_t infer_cache_key =
CombineHash(OpKernelType::Hash()(kernel_type_for_var),
OpKernelType::Hash()(expected_kernel_key));
infer_cache_key =
CombineHash(infer_cache_key, std::hash<const Scope*>()(&scope));
auto it = infer_transfer_scope_cache.find(infer_cache_key);
if (it != infer_transfer_scope_cache.end()) {
new_scope = infer_transfer_scope_cache[infer_cache_key];
} else {
new_scope = &scope.NewScope();
infer_transfer_scope_cache[infer_cache_key] = new_scope;
// In the inference scenario, the scopes will be reused across the
// batches, so the `new_scope` here will result in GPU memory explosion
// over the running of operators.
// We use a thread_local cache to fix that issue: the key in the cache is
// the combination of the `scope` argument, from_kernel_type, and
// target_kernel_type.
// Have a discussion with @Superjomn or the inference developers if you
// change this logic, since changes here might not be tested in the other
// scenarios.
// If this op is not called by an Executor or ParallelExecutor, it should
// be called by a NaiveExecutor; the NaiveExecutor caches the scopes and
// variables, which is quite different behavior.
if (!run_by_executor_) {
new_scope = TryCreateTransferScope(kernel_type_for_var,
expected_kernel_key, &scope);
}
#endif
if (new_scope == nullptr) {
if (!new_scope) {
new_scope = &scope.NewScope();
}
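The caching pattern described in the comment can be sketched in isolation as follows; ToyScope and GetOrCreateTransferScope are illustrative names, and only CombineHash mirrors the helper actually added in transfer_scope_cache.h.

// Self-contained sketch of a thread_local transfer-scope cache keyed by the
// combined hash of (from kernel type, to kernel type, execution scope).
#include <functional>
#include <memory>
#include <unordered_map>

struct ToyScope {};

// Same combination scheme as CombineHash in transfer_scope_cache.h.
inline size_t CombineHash(size_t seed, size_t a) {
  return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

ToyScope* GetOrCreateTransferScope(size_t from_kernel_hash,
                                   size_t to_kernel_hash,
                                   const ToyScope* exec_scope) {
  // One cache per thread, so no locking is needed.
  thread_local std::unordered_map<size_t, std::unique_ptr<ToyScope>> cache;

  size_t key = CombineHash(from_kernel_hash, to_kernel_hash);
  key = CombineHash(key, std::hash<const ToyScope*>()(exec_scope));

  auto it = cache.find(key);
  if (it == cache.end()) {
    it = cache.emplace(key, std::unique_ptr<ToyScope>(new ToyScope())).first;
  }
  return it->second.get();  // reused batch after batch, so no per-batch growth
}

int main() {
  ToyScope exec_scope;
  // Two "batches" with the same kernel pair and scope hit the same entry.
  ToyScope* a = GetOrCreateTransferScope(1, 2, &exec_scope);
  ToyScope* b = GetOrCreateTransferScope(1, 2, &exec_scope);
  return a == b ? 0 : 1;
}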

@ -127,6 +127,8 @@ class OperatorBase {
//! Get all outputs variable names
virtual std::vector<std::string> OutputVars(bool has_intermediate) const;
void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
protected:
std::string type_;
// NOTE: in case of OpGrad, inputs_ contains:
@ -139,6 +141,8 @@ class OperatorBase {
// IG (Inputs Gradients)
VariableNameMap outputs_;
AttributeMap attrs_;
// Whether this operator executes in an Executor.
bool run_by_executor_{true};
private:
void GenerateTemporaryNames();

@ -0,0 +1,62 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/transfer_scope_cache.h"
namespace paddle {
namespace framework {
// Holds all the transfer scopes across the process.
std::unordered_map<size_t, Scope*>& global_transfer_data_cache() {
typedef std::unordered_map<size_t, Scope*> map_t;
thread_local std::unique_ptr<map_t> x(new map_t);
return *x;
}
// Holds all the transfer scopes for this thread.
std::unordered_set<Scope*>& global_transfer_scope_cache() {
typedef std::unordered_set<Scope*> set_t;
thread_local std::unique_ptr<set_t> x(new set_t);
return *x;
}
// Try to create a transfer scope. If a cached scope matches the
// requirement, just return that one.
// Inputs:
// @type0: the source kernel type.
// @type1: the target kernel type.
// @scope: the execution scope of this op.
// Returns: a scope used to hold the transferred data across the different
// kernel types.
Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
const Scope* scope) {
Scope* new_scope{nullptr};
size_t infer_cache_key =
CombineHash(OpKernelType::Hash()(type0), OpKernelType::Hash()(type1));
infer_cache_key =
CombineHash(infer_cache_key, std::hash<const Scope*>()(scope));
auto it = global_transfer_data_cache().find(infer_cache_key);
if (it != global_transfer_data_cache().end()) {
new_scope = global_transfer_data_cache()[infer_cache_key];
} else {
new_scope = &scope->NewScope();
global_transfer_data_cache()[infer_cache_key] = new_scope;
}
global_transfer_scope_cache().insert(new_scope);
return new_scope;
}
} // namespace framework
} // namespace paddle

@ -0,0 +1,41 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <thread> // NOLINT
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
std::unordered_map<size_t, Scope*>& global_transfer_data_cache();
std::unordered_set<Scope*>& global_transfer_scope_cache();
// Combine two hash values to a single hash.
static size_t CombineHash(size_t seed, size_t a) {
return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
const Scope* scope);
void RemoveKidsFromTransferScopeCache(Scope* scope);
} // namespace framework
} // namespace paddle

@ -4,6 +4,7 @@ endif()
# analysis and tensorrt must be added before creating static library,
# otherwise, there would be undefined reference to them in static library.
add_subdirectory(analysis)
add_subdirectory(utils)
if (TENSORRT_FOUND)
add_subdirectory(tensorrt)
endif()

@ -30,7 +30,9 @@ cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager)
cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce)
cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder DEPS zero_copy_tensor)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
analysis_config paddle_pass_builder zero_copy_tensor reset_tensor_array)
cc_test(test_paddle_inference_api
SRCS api_tester.cc

@ -46,6 +46,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
prog_file = other.prog_file;
param_file = other.param_file;
specify_input_name = other.specify_input_name;
cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
// fields from this.
enable_ir_optim = other.enable_ir_optim;
use_feed_fetch_ops = other.use_feed_fetch_ops;
@ -72,6 +73,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
prog_file = other.prog_file;
param_file = other.param_file;
specify_input_name = other.specify_input_name;
cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
// fields from this.
enable_ir_optim = other.enable_ir_optim;
use_feed_fetch_ops = other.use_feed_fetch_ops;

@ -31,11 +31,11 @@
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#endif
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(profile);
DECLARE_int32(paddle_num_threads);
namespace paddle {
@ -67,7 +67,7 @@ bool AnalysisPredictor::Init(
#endif
// no matter with or without MKLDNN
paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
if (!PrepareScope(parent_scope)) {
return false;
@ -160,6 +160,14 @@ bool AnalysisPredictor::PrepareExecutor() {
return true;
}
void AnalysisPredictor::SetMkldnnThreadID(int tid) {
#ifdef PADDLE_WITH_MKLDNN
platform::set_cur_thread_id(tid);
#else
LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
#endif
}
bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data,
int batch_size) {
@ -167,7 +175,6 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
inference::Timer timer;
timer.tic();
// set feed variable
std::vector<framework::LoDTensor> feeds;
framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
if (!SetFeed(inputs, scope)) {
LOG(ERROR) << "fail to set feed";
@ -208,17 +215,29 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
framework::DDim ddim = framework::make_ddim(inputs[i].shape);
void *input_ptr;
if (inputs[i].dtype == PaddleDType::INT64) {
input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
input_ptr = input.mutable_data<int64_t>(ddim, place_);
} else if (inputs[i].dtype == PaddleDType::FLOAT32) {
input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
input_ptr = input.mutable_data<float>(ddim, place_);
} else {
LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
return false;
}
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
inputs[i].data.length());
if (platform::is_cpu_place(place_)) {
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
inputs[i].data.length());
} else {
#ifdef PADDLE_WITH_CUDA
auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length(),
0); // stream 0 for sync copy
#else
PADDLE_THROW("Not compiled with CUDA, should not reach here.");
#endif
}
// TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
framework::LoD lod;
for (auto &level : inputs[i].lod) {
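The place-dependent copy decision above reduces to the following self-contained sketch; the USE_CUDA macro and the CopyFeedBuffer helper are hypothetical, and plain CUDA runtime calls stand in for memory::Copy.

// Copy a host-side feed buffer into tensor memory that was allocated on the
// target place (a host pointer for CPU, a device pointer for GPU).
#include <cstring>
#include <vector>
#ifdef USE_CUDA
#include <cuda_runtime.h>
#endif

bool CopyFeedBuffer(void* dst, const void* src, size_t bytes, bool dst_on_gpu) {
  if (!dst_on_gpu) {
    std::memcpy(dst, src, bytes);  // the CPU branch above
    return true;
  }
#ifdef USE_CUDA
  // Synchronous host-to-device copy, analogous to memory::Copy on stream 0.
  return cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice) == cudaSuccess;
#else
  return false;  // built without CUDA: a GPU place should be unreachable
#endif
}

int main() {
  std::vector<float> host_feed = {1.f, 2.f, 3.f};
  std::vector<float> tensor_buf(host_feed.size());  // stands in for mutable_data()
  return CopyFeedBuffer(tensor_buf.data(), host_feed.data(),
                        host_feed.size() * sizeof(float),
                        /*dst_on_gpu=*/false) ? 0 : 1;
}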

@ -69,6 +69,8 @@ class AnalysisPredictor : public PaddlePredictor {
framework::Scope *scope() { return scope_.get(); }
framework::ProgramDesc &program() { return *inference_program_; }
void SetMkldnnThreadID(int tid);
protected:
bool PrepareProgram(const std::shared_ptr<framework::ProgramDesc> &program);
bool PrepareScope(const std::shared_ptr<framework::Scope> &parent_scope);

@ -24,11 +24,11 @@ limitations under the License. */
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool(profile, false, "Turn on profiler for fluid");
DECLARE_int32(paddle_num_threads);
namespace paddle {
namespace {
@ -76,7 +76,7 @@ bool NativePaddlePredictor::Init(
#endif
// no matter with or without MKLDNN
paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
@ -139,7 +139,6 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
Timer timer;
timer.tic();
// set feed variable
std::vector<framework::LoDTensor> feeds;
framework::Scope *scope = sub_scope_ != nullptr ? sub_scope_ : scope_.get();
if (!SetFeed(inputs, scope)) {
LOG(ERROR) << "fail to set feed";
@ -195,17 +194,30 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
framework::DDim ddim = framework::make_ddim(inputs[i].shape);
void *input_ptr;
if (inputs[i].dtype == PaddleDType::INT64) {
input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
input_ptr = input.mutable_data<int64_t>(ddim, place_);
} else if (inputs[i].dtype == PaddleDType::FLOAT32) {
input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
input_ptr = input.mutable_data<float>(ddim, place_);
} else {
LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
return false;
}
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
inputs[i].data.length());
if (platform::is_cpu_place(place_)) {
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
inputs[i].data.length());
} else {
#ifdef PADDLE_WITH_CUDA
auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length(),
0); // stream 0 for sync copy
#else
PADDLE_THROW("Not compiled with CUDA, should not reach here.");
#endif
}
// TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
framework::LoD lod;
for (auto &level : inputs[i].lod) {

@ -46,8 +46,6 @@ if(WITH_GPU)
endif()
endif(NOT WIN32)
endif()
include_directories("D:/Paddle/")
include_directories("${PADDLE_LIB}")
include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
include_directories("${PADDLE_LIB}/third_party/install/glog/include")

@ -51,9 +51,9 @@ struct AnalysisConfig : public NativeConfig {
int max_batch_size = 1);
bool use_tensorrt() const { return use_tensorrt_; }
void EnableMKLDNN();
// NOTE this is just for internal development, please do not use it.
// NOT stable yet.
void EnableMKLDNN();
bool use_mkldnn() const { return use_mkldnn_; }
friend class ::paddle::AnalysisPredictor;

@ -186,6 +186,19 @@ struct NativeConfig : public PaddlePredictor::Config {
// Specify the variable's name of each input if input tensors don't follow the
// `feeds` and `fetches` of the phase `save_inference_model`.
bool specify_input_name{false};
// Set and get the number of cpu math library threads.
void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) {
cpu_math_library_num_threads_ = cpu_math_library_num_threads;
}
int cpu_math_library_num_threads() const {
return cpu_math_library_num_threads_;
}
protected:
// Number of CPU math library (such as MKL, OpenBLAS) threads for each
// instance.
int cpu_math_library_num_threads_{1};
};
// A factory to help create different predictors.
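For callers, the new accessors fit the usual configure-then-create flow. A usage sketch, assuming a Paddle inference build with the public paddle_inference_api.h header; the model path is purely illustrative.

#include <memory>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

std::unique_ptr<paddle::PaddlePredictor> MakePredictor() {
  paddle::NativeConfig config;
  config.model_dir = "/path/to/model";  // hypothetical path
  config.use_gpu = false;
  // MKL/OpenBLAS threads for this predictor instance; applied through
  // platform::SetNumThreads() during Init(), as shown in the hunks above.
  config.SetCpuMathLibraryNumThreads(4);
  return paddle::CreatePaddlePredictor<paddle::NativeConfig>(config);
}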

@ -74,7 +74,7 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
# ocr
set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
if (NOT EXISTS ${OCR_INSTALL_DIR})
inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
endif()
inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
@ -88,31 +88,31 @@ inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet
# anakin
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
# anakin rnn1
set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin")
set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
--datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
DEPS inference_anakin_api_shared SERIAL)
# anakin mobilenet
if(WITH_GPU)
set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api_shared dynload_cuda SERIAL)
endif()
# anakin rnn1
set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin")
set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
--datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
DEPS inference_anakin_api_shared SERIAL)
# anakin mobilenet
if(WITH_GPU)
set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api_shared dynload_cuda SERIAL)
endif()
endif()
if(WITH_GPU AND TENSORRT_FOUND)
set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
endif()
inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
endif()
inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
endif()

@ -27,6 +27,7 @@ void SetConfig(AnalysisConfig *cfg) {
cfg->device = 0;
cfg->enable_ir_optim = true;
cfg->specify_input_name = true;
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {

@ -53,6 +53,8 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
os << GenSpaces(num_spaces)
<< "specify_input_name: " << config.specify_input_name << "\n";
os << GenSpaces(num_spaces)
<< "cpu_num_threads: " << config.cpu_math_library_num_threads() << "\n";
num_spaces--;
os << GenSpaces(num_spaces) << "}\n";
return os;

@ -42,6 +42,7 @@ DEFINE_bool(use_analysis, true,
"Running the inference program in analysis mode.");
DECLARE_bool(profile);
DECLARE_int32(paddle_num_threads);
namespace paddle {
namespace inference {
@ -206,22 +207,23 @@ void TestMultiThreadPrediction(
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
std::vector<std::thread> threads;
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
predictors.emplace_back(CreateTestPredictor(config, use_analysis));
for (int tid = 1; tid < num_threads; ++tid) {
predictors.emplace_back(predictors.front()->Clone());
}
auto main_predictor = CreateTestPredictor(config, use_analysis);
size_t total_time{0};
for (int tid = 0; tid < num_threads; ++tid) {
threads.emplace_back([&, tid]() {
#ifdef PADDLE_WITH_MKLDNN
platform::set_cur_thread_id(static_cast<int>(tid) + 1);
#endif
// Each thread should have local inputs and outputs.
// The inputs of each thread are all the same.
std::vector<PaddleTensor> outputs_tid;
auto &predictor = predictors[tid];
// To ensure correct thread binding, please clone the predictor
// inside the thread pool.
auto predictor = main_predictor->Clone();
#ifdef PADDLE_WITH_MKLDNN
if (use_analysis) {
static_cast<AnalysisPredictor *>(predictor.get())
->SetMkldnnThreadID(static_cast<int>(tid) + 1);
}
#endif
// warmup run
LOG(INFO) << "Running thread " << tid << ", warm up run...";
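The clone-inside-the-thread pattern adopted here can be summarized by this sketch (assumes a Paddle inference build; RunThreaded and its arguments are placeholders).

#include <thread>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void RunThreaded(paddle::PaddlePredictor *main_predictor,
                 const std::vector<paddle::PaddleTensor> &inputs,
                 int num_threads) {
  std::vector<std::thread> threads;
  for (int tid = 0; tid < num_threads; ++tid) {
    threads.emplace_back([&] {
      // Clone inside the worker so that thread-local state (e.g. the MKLDNN
      // thread id) is bound to the thread that actually runs the predictor.
      auto predictor = main_predictor->Clone();
      std::vector<paddle::PaddleTensor> outputs;  // per-thread outputs
      predictor->Run(inputs, &outputs);
    });
  }
  for (auto &t : threads) t.join();
}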

@ -0,0 +1,2 @@
cc_library(benchmark SRCS benchmark.cc DEPS enforce)
cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)

@ -0,0 +1,49 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/utils/benchmark.h"
#include <sstream>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
std::string Benchmark::SerializeToString() const {
std::stringstream ss;
ss << "-----------------------------------------------------\n";
ss << "name\t";
ss << "batch_size\t";
ss << "num_threads\t";
ss << "latency\t";
ss << "qps";
ss << '\n';
ss << name_ << "\t";
ss << batch_size_ << "\t";
ss << num_threads_ << "\t";
ss << latency_ << "\t";
ss << 1000 / latency_;
ss << '\n';
return ss.str();
}
void Benchmark::PersistToFile(const std::string &path) const {
std::ofstream file(path, std::ios::app);
PADDLE_ENFORCE(file.is_open(), "Can not open %s to add benchmark", path);
file << SerializeToString();
file.flush();
file.close();
}
} // namespace inference
} // namespace paddle

@ -0,0 +1,52 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <fstream>
#include <iostream>
namespace paddle {
namespace inference {
/*
* Helper class to calculate the performance.
*/
struct Benchmark {
int batch_size() const { return batch_size_; }
void SetBatchSize(int x) { batch_size_ = x; }
int num_threads() const { return num_threads_; }
void SetNumThreads(int x) { num_threads_ = x; }
bool use_gpu() const { return use_gpu_; }
void SetUseGpu() { use_gpu_ = true; }
int latency() const { return latency_; }
void SetLatency(int x) { latency_ = x; }
const std::string& name() const { return name_; }
void SetName(const std::string& name) { name_ = name; }
std::string SerializeToString() const;
void PersistToFile(const std::string& path) const;
private:
bool use_gpu_{false};
int batch_size_{0};
int latency_;
int num_threads_{1};
std::string name_;
};
} // namespace inference
} // namespace paddle
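A usage sketch for the helper above, assuming the header is on the include path; the name, numbers, and output file are illustrative. Note that PersistToFile opens the file in append mode, so repeated runs accumulate one record each.

#include "paddle/fluid/inference/utils/benchmark.h"

int main() {
  paddle::inference::Benchmark b;
  b.SetName("mobilenet");
  b.SetBatchSize(8);
  b.SetNumThreads(4);
  b.SetLatency(25);  // average latency in ms, so the reported qps is 1000 / 25
  b.PersistToFile("benchmark_record.txt");
  return 0;
}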

Some files were not shown because too many files have changed in this diff.
