merge nn.py

revert-14324-fix_vlog
barrierye 6 years ago
commit 8c1e304307

1
.gitignore vendored

@ -28,3 +28,4 @@ third_party/
build_*
# clion workspace.
cmake-build-*
model_test

@ -69,6 +69,7 @@ option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
option(WITH_INFERENCE "Compile fluid inference library" ON)
option(ON_INFER "Turn on inference optimization." OFF)
option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" OFF)
option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
@ -179,6 +180,7 @@ include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/cares)
include(external/cub)
include(external/xxhash) # download xxhash
if (NOT WIN32)
# there is no official support of snappystream, warpctc, nccl, cupti in windows
@ -301,3 +303,8 @@ if(WITH_DOC)
find_python_module(recommonmark REQUIRED)
add_subdirectory(doc)
endif()
if (ON_INFER)
message(WARNING "On inference mode, will take place some specific optimization.")
add_definitions(-DPADDLE_ON_INFERENCE)
endif()

@ -75,14 +75,14 @@ RUN pip3 install -U wheel && \
pip3 install -U docopt PyYAML sphinx==1.5.6 && \
pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
easy_install -U pip && \
pip install -U wheel && \
pip install -U pip setuptools wheel && \
pip install -U docopt PyYAML sphinx==1.5.6 && \
pip install sphinx-rtd-theme==0.1.9 recommonmark
RUN pip3 install pre-commit 'ipython==5.3.0' && \
RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3 install opencv-python && \
pip install pre-commit 'ipython==5.3.0' && \
pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip install opencv-python

@ -0,0 +1,46 @@
INCLUDE(ExternalProject)
set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash)
set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash)
set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include")
IF(WITH_STATIC_LIB)
SET(BUILD_CMD make lib)
ELSE()
SET(BUILD_CMD sed -i "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
ENDIF()
ExternalProject_Add(
extern_xxhash
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/Cyan4973/xxHash"
GIT_TAG "v0.6.5"
PREFIX ${XXHASH_SOURCE_DIR}
DOWNLOAD_NAME "xxhash"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
PATCH_COMMAND
BUILD_COMMAND ${BUILD_CMD}
INSTALL_COMMAND export PREFIX=${XXHASH_INSTALL_DIR}/ && make install
TEST_COMMAND ""
)
set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a")
INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR})
add_library(xxhash STATIC IMPORTED GLOBAL)
set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES})
include_directories(${XXHASH_INCLUDE_DIR})
add_dependencies(xxhash extern_xxhash)
LIST(APPEND external_project_dependencies xxhash)
IF(WITH_C_API)
INSTALL(DIRECTORY ${XXHASH_INCLUDE_DIR} DESTINATION third_party/xxhash)
IF(ANDROID)
INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib/${ANDROID_ABI})
ELSE()
INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib)
ENDIF()
ENDIF()

@ -14,6 +14,9 @@
# make package for paddle fluid shared and static library
function(copy TARGET)
if (NOT ON_INFER)
message(WARNING "Turn on the ON_INFER flag when building inference_lib only.")
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DSTS DEPS)
@ -31,7 +34,7 @@ function(copy TARGET)
foreach(index RANGE ${len})
list(GET copy_lib_SRCS ${index} src)
list(GET copy_lib_DSTS ${index} dst)
add_custom_command(TARGET ${TARGET} PRE_BUILD
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND mkdir -p "${dst}"
COMMAND cp -r "${src}" "${dst}"
COMMENT "copying ${src} -> ${dst}")
@ -67,6 +70,13 @@ copy(boost_lib
DEPS boost
)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash")
copy(xxhash_lib
SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS xxhash
)
if(NOT PROTOBUF_FOUND)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
copy(protobuf_lib
@ -186,7 +196,7 @@ copy(cmake_cache
DSTS ${FLUID_INSTALL_DIR})
# This command generates a complete fluid library for both train and inference
add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
# Following commands generate a inference-only fluid library
# third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR}

@ -86,7 +86,7 @@ paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'
paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer'))
paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None))
paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None))
@ -107,7 +107,7 @@ paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label',
paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None))
paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None))
paddle.fluid.layers.squeeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.unsqueeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None))
@ -174,7 +174,9 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None
paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)

@ -252,9 +252,9 @@ std::vector<ir::Node *> SortOpsAndDelayOptimizeOp(const ir::Graph &graph) {
std::vector<ir::Node *> sorted_ret;
for (size_t i = 0; i < ret.size(); ++i) {
if (i < last_backward) {
if (boost::get<int>(ret[i]->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) ==
static_cast<int>(OpRole::kOptimize)) {
if (static_cast<bool>(boost::get<int>(ret[i]->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) &
static_cast<int>(OpRole::kOptimize))) {
optimize_ops.push_back(ret[i]);
} else {
sorted_ret.push_back(ret[i]);

@ -120,19 +120,25 @@ size_t GraphNum(const Graph &graph) {
std::deque<ir::Node *> q_nodes;
std::vector<std::unordered_set<ir::Node *>> graph_nodes;
std::unordered_set<ir::Node *> g_nodes;
// q_set used to record records in the queue.
std::unordered_set<ir::Node *> q_set;
size_t graph_count = 0;
auto traverse_nodes = [&visited_nodes,
&q_nodes](const std::vector<ir::Node *> &nodes) {
std::copy_if(
nodes.begin(), nodes.end(), std::back_inserter(q_nodes),
[&visited_nodes](Node *node) { return !visited_nodes.count(node); });
auto traverse_nodes = [&visited_nodes, &q_nodes,
&q_set](const std::vector<ir::Node *> &nodes) {
for (auto n : nodes) {
if (visited_nodes.count(n) == 0 && q_set.count(n) == 0) {
q_nodes.push_back(n);
q_set.insert(n);
}
}
};
while (visited_nodes.size() != nodes.size()) {
if (!q_nodes.empty()) {
auto cur_node = q_nodes.front();
q_nodes.pop_front();
q_set.erase(cur_node);
visited_nodes.insert(cur_node);
g_nodes.insert(cur_node);
traverse_nodes(cur_node->inputs);
@ -146,6 +152,7 @@ size_t GraphNum(const Graph &graph) {
for (auto &n : nodes) {
if (visited_nodes.count(n) == 0) {
q_nodes.push_back(n);
q_set.insert(n);
break;
}
}

@ -18,6 +18,82 @@ limitations under the License. */
namespace paddle {
namespace framework {
// NOTE The vector<LoDTensor> can't be replaced with the class LoDTensorArray
// directly, because there are many vector<LoDTensor> used accross the project,
// and some of them are treated as LoDTensorArray.
#if !defined(PADDLE_ON_INFERENCE)
using LoDTensorArray = std::vector<LoDTensor>;
}
#else // !PADDLE_ON_INFERENCE
#pragma message "LoDTensorArray is replaced with the inference one."
/*
* A LoDTensorArray which will not deallocate buffer when resized, fix the data
* diff in inference, and more performance friendly in the concurrency
* scenerios.
*/
class LoDTensorArray {
public:
LoDTensorArray() = default;
using iterator = std::vector<LoDTensor>::iterator;
using const_iterator = std::vector<LoDTensor>::const_iterator;
const_iterator begin() const { return array_.begin(); }
const_iterator end() const { return array_.begin() + size_; }
iterator begin() { return array_.begin(); }
iterator end() { return array_.begin() + size_; }
void push_back(const LoDTensor& x) {
if (size_ < array_.size()) {
array_[size_++] = x;
} else {
array_.push_back(x);
++size_;
}
}
void resize(size_t size) {
if (array_.size() < size) {
array_.resize(size);
}
size_ = size;
}
void emplace_back() { array_.emplace_back(); }
void emplace_back(LoDTensor&& x) { array_.emplace_back(std::move(x)); }
LoDTensor& back() { return array_.back(); }
size_t space() const { return array_.size(); }
void reserve(size_t size) {
// Naive warning to tell user this array might be to large. The memory and
// buffer used by this TensorArray will not be deleted during the training
// and inference phase, so attention not to make it expand too long.
if (size > 800UL) {
LOG(WARNING) << "TensorArray has more than 800 items";
}
array_.reserve(size);
}
bool empty() const { return size_ == 0UL; }
void clear() { size_ = 0UL; }
LoDTensor& operator[](size_t id) { return array_[id]; }
const LoDTensor& operator[](size_t id) const { return array_[id]; }
LoDTensor& at(size_t id) { return array_.at(id); }
const LoDTensor& at(size_t id) const { return array_.at(id); }
size_t size() const { return size_; }
private:
size_t size_{0};
std::vector<LoDTensor> array_;
};
#endif // !PADDLE_ON_INFERENCE
} // namespace framework
} // namespace paddle

@ -542,6 +542,33 @@ class CPUVector : public std::vector<T, std::allocator<T>> {
this->reserve(this->size() + size_t(end - begin));
this->insert(this->end(), begin, end);
}
const T *CUDAData(platform::Place place) const {
PADDLE_THROW(
"Vector::CUDAData() method is not supported in CPU-only version");
}
T *CUDAMutableData(platform::Place place) {
PADDLE_THROW(
"Vector::CUDAMutableData() method is not supported in CPU-only "
"version");
}
const T *Data(platform::Place place) const {
PADDLE_ENFORCE(
platform::is_cpu_place(place),
"Vector::Data() method is not supported when not in CPUPlace");
return this->data();
}
T *MutableData(platform::Place place) {
PADDLE_ENFORCE(
platform::is_cpu_place(place),
"Vector::MutableData() method is not supported when not in CPUPlace");
return this->data();
}
const void *Handle() const { return static_cast<const void *>(this); }
};
template <typename T>

@ -146,22 +146,5 @@ void NaiveExecutor::CleanFeedFetchOps() {
ops_.swap(ops);
}
void NaiveExecutor::EnableMKLDNN(const ProgramDesc &program) {
#ifdef PADDLE_WITH_MKLDNN
VLOG(3) << "use_mkldnn=True";
for (size_t block_id = 0; block_id < program.Size(); ++block_id) {
auto *block = const_cast<ProgramDesc &>(program).MutableBlock(block_id);
for (auto *op : block->AllOps()) {
if (op->HasAttr("use_mkldnn")) {
op->SetAttr("use_mkldnn", true);
}
}
}
#else
LOG(WARNING)
<< "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
#endif
}
} // namespace framework
} // namespace paddle

@ -48,8 +48,6 @@ class NaiveExecutor {
void CleanFeedFetchOps();
void EnableMKLDNN(const ProgramDesc& program);
protected:
void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id);

@ -71,6 +71,8 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
static_cast<int>(OpRole::kLoss) |
static_cast<int>(OpRole::kBackward),
static_cast<int>(OpRole::kOptimize) |
static_cast<int>(OpRole::kLRSched),
static_cast<int>(OpRole::kNotSpecified)})
.SetDefault(static_cast<int>(OpRole::kNotSpecified));
AddAttr<std::vector<std::string>>(OpRoleVarAttrName(),

@ -20,17 +20,20 @@ limitations under the License. */
namespace paddle {
namespace framework {
//////////////////////////
// Don't add more roles to make this too complicated!
//////////////////////////
enum class OpRole {
kForward = 0x0000,
kBackward = 0x0001,
kOptimize = 0x0002,
// RPC role is for send/recv releated op
kRPC = 0x0003,
kRPC = 0x0004,
// Dist role is for split_byref/split_selected_rows/concat
// used for distributed training.
kDist = 0x0004,
kDist = 0x0008,
// Tag all learning rate scheduler operators.
kLRSched = 0x0005,
kLRSched = 0x0016,
kLoss = 0x0100,
// The default value of op's role. This should be only used for unittests and

@ -78,6 +78,8 @@ class Scope {
/// Drop all kids scopes belonged to this scope.
void DropKids();
std::list<Scope*>& kids() const { return kids_; }
/// Find if a scope exists in the kid scopes
bool HasKid(const Scope* scope) const;

@ -25,7 +25,6 @@ DEFINE_int32(dist_threadpool_size, 0,
namespace paddle {
namespace framework {
std::unique_ptr<ThreadPool> ThreadPool::threadpool_(nullptr);
std::once_flag ThreadPool::init_flag_;
@ -47,8 +46,7 @@ void ThreadPool::Init() {
}
}
ThreadPool::ThreadPool(int num_threads)
: total_threads_(num_threads), idle_threads_(num_threads), running_(true) {
ThreadPool::ThreadPool(int num_threads) : running_(true) {
threads_.resize(num_threads);
for (auto& thread : threads_) {
// TODO(Yancey1989): binding the thread on the specify CPU number
@ -59,6 +57,7 @@ ThreadPool::ThreadPool(int num_threads)
ThreadPool::~ThreadPool() {
{
// notify all threads to stop running
std::lock_guard<std::mutex> l(mutex_);
running_ = false;
scheduled_.notify_all();
}
@ -69,36 +68,24 @@ ThreadPool::~ThreadPool() {
}
}
void ThreadPool::Wait() {
std::unique_lock<std::mutex> lock(mutex_);
completed_.wait(lock, [=] { return Done() == true; });
}
void ThreadPool::TaskLoop() {
while (running_) {
while (true) {
std::unique_lock<std::mutex> lock(mutex_);
scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
if (!running_) {
break;
scheduled_.wait(
lock, [this] { return !this->tasks_.empty() || !this->running_; });
if (!running_ || tasks_.empty()) {
return;
}
// pop a task from the task queue
auto task = std::move(tasks_.front());
tasks_.pop();
--idle_threads_;
lock.unlock();
// run the task
task();
{
std::unique_lock<std::mutex> lock(mutex_);
++idle_threads_;
if (Done()) {
completed_.notify_all();
}
}
}
}

@ -57,15 +57,6 @@ class ThreadPool {
~ThreadPool();
// Returns the number of threads created by the constructor.
size_t Threads() const { return total_threads_; }
// Returns the number of currently idle threads.
size_t IdleThreads() {
std::unique_lock<std::mutex> lock(mutex_);
return idle_threads_;
}
// Run pushes a function to the task queue and returns a std::future
// object. To wait for the completion of the task, call
// std::future::wait().
@ -94,25 +85,13 @@ class ThreadPool {
});
std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
tasks_.push(std::move(task));
lock.unlock();
scheduled_.notify_one();
return f;
}
// Wait until all the tasks are completed.
void Wait();
private:
DISABLE_COPY_AND_ASSIGN(ThreadPool);
// If the task queue is empty and avaialbe is equal to the number of
// threads, means that all tasks are completed. Note: this function
// is not thread-safe. Returns true if all tasks are completed.
// Note: don't delete the data member total_threads_ and use
// threads_.size() instead; because you'd need to lock the mutex
// before accessing threads_.
bool Done() { return tasks_.empty() && idle_threads_ == total_threads_; }
// The constructor starts threads to run TaskLoop, which retrieves
// and runs tasks from the queue.
void TaskLoop();
@ -125,14 +104,11 @@ class ThreadPool {
static std::once_flag init_flag_;
std::vector<std::unique_ptr<std::thread>> threads_;
const size_t total_threads_;
size_t idle_threads_;
std::queue<Task> tasks_;
std::mutex mutex_;
bool running_;
std::condition_variable scheduled_;
std::condition_variable completed_;
};
class ThreadPoolIO : ThreadPool {

@ -19,10 +19,11 @@ limitations under the License. */
namespace framework = paddle::framework;
void do_sum(framework::ThreadPool* pool, std::atomic<int>* sum, int cnt) {
std::vector<std::future<void>> fs;
void do_sum(std::vector<std::future<void>>* fs, std::mutex* mu,
std::atomic<int>* sum, int cnt) {
for (int i = 0; i < cnt; ++i) {
fs.push_back(framework::Async([sum]() { sum->fetch_add(1); }));
std::lock_guard<std::mutex> l(*mu);
fs->push_back(framework::Async([sum]() { sum->fetch_add(1); }));
}
}
@ -40,18 +41,21 @@ TEST(ThreadPool, ConcurrentInit) {
}
TEST(ThreadPool, ConcurrentRun) {
framework::ThreadPool* pool = framework::ThreadPool::GetInstance();
std::atomic<int> sum(0);
std::vector<std::thread> threads;
std::vector<std::future<void>> fs;
std::mutex fs_mu;
int n = 50;
// sum = (n * (n + 1)) / 2
for (int i = 1; i <= n; ++i) {
std::thread t(do_sum, pool, &sum, i);
std::thread t(do_sum, &fs, &fs_mu, &sum, i);
threads.push_back(std::move(t));
}
for (auto& t : threads) {
t.join();
}
pool->Wait();
for (auto& t : fs) {
t.wait();
}
EXPECT_EQ(sum, ((n + 1) * n) / 2);
}

@ -30,7 +30,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
endif()
# Create static library
cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor)
cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array)
if(NOT APPLE)
# TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
@ -40,7 +40,7 @@ endif()
# Create shared library
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api)
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array)
set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
if(NOT APPLE)

@ -107,6 +107,9 @@ void Analyzer::Run(Argument* argument) {
passes.push_back("mkldnn_placement_pass");
}
#endif
// infer_clean_graph_pass should be the first default pass
// after mkldnn_placement_pass.
passes.push_back("infer_clean_graph_pass");
for (auto& pass : ir_passes_) {
if (!disabled_ir_passes_.count(pass)) {
passes.push_back(pass);

@ -67,7 +67,6 @@ class Analyzer : public OrderedRegistry<PassManager> {
// larger fusion.
const std::vector<std::string> all_ir_passes_{{
// Manual update the passes here.
"infer_clean_graph_pass", //
"attention_lstm_fuse_pass", //
"seqconv_eltadd_relu_fuse_pass", //
"embedding_fc_lstm_fuse_pass", //

@ -18,7 +18,8 @@ if(APPLE)
endif(APPLE)
set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB})
set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB}
)
if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor)
@ -31,10 +32,17 @@ function(inference_api_test TARGET_NAME)
set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cc_test(${TARGET_NAME}
SRCS ${inference_test_SRC}
DEPS "${inference_deps}"
ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
if (WITH_GPU)
cc_test(${TARGET_NAME}
SRCS ${inference_test_SRC}
DEPS "${inference_deps}"
ARGS --dirname=${PYTHON_TESTS_DIR}/book/ --fraction_of_gpu_memory_to_use=0.15)
else()
cc_test(${TARGET_NAME}
SRCS ${inference_test_SRC}
DEPS "${inference_deps}"
ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
endif()
if(inference_test_ARGS)
set_tests_properties(${TARGET_NAME}
PROPERTIES DEPENDS "${inference_test_ARGS}")
@ -42,7 +50,8 @@ function(inference_api_test TARGET_NAME)
endif(WITH_TESTING)
endfunction(inference_api_test)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope)
cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api)

@ -82,6 +82,7 @@ bool AnalysisPredictor::Init(
// Get the feed_target_names and fetch_target_names
PrepareFeedFetch();
return true;
}
@ -109,6 +110,10 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
return false;
}
VLOG(3) << "predict cost: " << timer.toc() << "ms";
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
tensor_array_batch_cleaner_.ResetTensorArray();
return true;
}
@ -322,6 +327,9 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
bool AnalysisPredictor::ZeroCopyRun() {
executor_->Run();
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
tensor_array_batch_cleaner_.ResetTensorArray();
return true;
}

@ -18,6 +18,7 @@
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/string/printf.h"
@ -88,6 +89,7 @@ class AnalysisPredictor : public PaddlePredictor {
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious
// concurrency problems, so cache them.
std::vector<framework::LoDTensor> feed_tensors_;
details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
};
} // namespace paddle

@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h"
@ -157,6 +158,10 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
return false;
}
VLOG(3) << "predict cost: " << timer.toc() << "ms";
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
tensor_array_batch_cleaner_.ResetTensorArray();
return true;
}

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save