Merge pull request #15 from PaddlePaddle/develop

merge to local
revert-16807-engine2-interface
lujun 6 years ago committed by GitHub
commit 8ba177593b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -24,7 +24,7 @@ set(BOOST_PROJECT "extern_boost")
# So we use 1.41.0 here.
set(BOOST_VER "1.41.0")
set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")

@ -44,7 +44,7 @@ ExternalProject_Add(
# 3. keep only zlib, cares, protobuf, boringssl under "third_party",
# checkout and clean other dirs under third_party
# 4. remove .git, and package the directory.
URL "http://paddlepaddledeps.cdn.bcebos.com/grpc-v1.10.x.tar.gz"
URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
URL_MD5 "1f268a2aff6759839dccd256adcc91cf"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""

@ -34,7 +34,7 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
SET(TIME_VERSION "2019.0.1.20181227")
IF(WIN32)
SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
SET(MKLML_URL "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
@ -43,7 +43,7 @@ ELSE()
#TODO(intel-huying):
# Erf function support is enabled in the mklml library temporarily; it will be replaced by the official version later.
SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)

@ -10,6 +10,9 @@ paddle.fluid.default_startup_program (ArgSpec(args=[], varargs=None, keywords=No
paddle.fluid.default_main_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '5430f54ab4895f9f47db6bebbaf71659'))
paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b54f403e57825a1592aece03afe3afb6'))
paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ef753f5cec69fef9ae6ad8b867b33a2'))
paddle.fluid.cuda_places (ArgSpec(args=['device_ids'], varargs=None, keywords=None, defaults=(None,)), ('document', '7d9a51fc9cf3c5245b5227080a8064c3'))
paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', '4c0cd83f0b401fc2ff84c70974e5d210'))
paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd0c3ebd813c39958c92b78e3eef7e912'))
paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03'))
paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'f482e93b38b4018796969a2e1dde479d'))
@ -44,7 +47,7 @@ paddle.fluid.AsyncExecutor.run (ArgSpec(args=['self', 'program', 'data_feed', 'f
paddle.fluid.AsyncExecutor.save_model (ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None), ('document', 'c8ac0dfcb3b187aba25d03af7fea56b2'))
paddle.fluid.AsyncExecutor.stop (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '5f23d043607bb5d55e466ec3f578e093'))
paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'e1af7fd53cf868554f312779fc803864'))
paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', 'a8c7793803cf976680d9478e378fa356'))
paddle.fluid.CompiledProgram.with_inference_optimize (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None), ('document', '9e5b009d850191a010e859189c127fd8'))
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None
@ -58,6 +61,12 @@ paddle.fluid.io.load_params (ArgSpec(args=['executor', 'dirname', 'main_program'
paddle.fluid.io.load_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '28df5bfe26ca7a077f91156abb0fe6d2'))
paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)), ('document', '70f4f53f13572436ac72d1c8b5efeb9d'))
paddle.fluid.io.load_inference_model (ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '7a5255386075dac3c75b7058254fcdcb'))
paddle.fluid.io.PyReader.__init__ (ArgSpec(args=['self', 'feed_list', 'capacity', 'use_double_buffer', 'iterable'], varargs=None, keywords=None, defaults=(True, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.io.PyReader.decorate_batch_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a3fefec8bacd6ce83f49906a9d05e779'))
paddle.fluid.io.PyReader.decorate_sample_generator (ArgSpec(args=['self', 'sample_generator', 'batch_size', 'drop_last', 'places'], varargs=None, keywords=None, defaults=(True, None)), ('document', '7abd9cf7d695bab5bb6cf7ded5903cb2'))
paddle.fluid.io.PyReader.decorate_sample_list_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', 'faef298f73e91aedcfaf5d184f3109b7'))
paddle.fluid.io.PyReader.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'ff1cc1e2beb8824d453656c72c28ddfb'))
paddle.fluid.io.PyReader.start (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'b7ea0a548991924e4cfe61a577b8e56d'))
paddle.fluid.initializer.ConstantInitializer.__init__ (ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.initializer.UniformInitializer.__init__ (ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.initializer.NormalInitializer.__init__ (ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@ -222,6 +231,7 @@ paddle.fluid.layers.teacher_student_sigmoid_loss (ArgSpec(args=['input', 'label'
paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None), ('document', '431a4301c35032166ec029f7432c80a7'))
paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607'))
paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329'))
paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393'))
paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139'))
paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc'))
paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', 'b0a1c2fc51c27a106da28f3308c41f5e'))
@ -229,7 +239,7 @@ paddle.fluid.layers.shuffle (ArgSpec(args=['reader', 'buffer_size'], varargs=Non
paddle.fluid.layers.batch (ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', 'f563d376d35e1a4c4db100fd11b381a0'))
paddle.fluid.layers.double_buffer (ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '07e5b796674796eb1ef3fee9c10d24e3'))
paddle.fluid.layers.random_data_generator (ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,)), ('document', '9b7f0f86ec24bbc97643cadcb6499cff'))
paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '13dabc57863f62ab3141586784ee356b'))
paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '4357643685cfd65454ba5a15f0151709'))
paddle.fluid.layers.create_py_reader_by_data (ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)), ('document', '350f74d93fab9adb2ac4950f1c26416b'))
paddle.fluid.layers.Preprocessor.__init__ (ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.layers.Preprocessor.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@ -510,6 +520,7 @@ paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, ke
paddle.fluid.recordio_writer.convert_reader_to_recordio_file (ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', '65c7523e86f0c50bb729b01667f36310'))
paddle.fluid.recordio_writer.convert_reader_to_recordio_files (ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', 'bc643f0f5f1b9db57ff0d8a57d379bd7'))
paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
paddle.reader.cache (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '1676886070eb607cb608f7ba47be0d3c'))
paddle.reader.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d'))
paddle.reader.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb'))
paddle.reader.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None), ('document', '884291104e1c3f37f33aae44b7deeb0d'))

@ -27,7 +27,7 @@ if [ -d "$TENSORRT_INCLUDE_DIR" -a -d "$TENSORRT_LIB_DIR" ]; then
fi
PREFIX=inference-vis-demos%2F
URL_ROOT=http://paddlemodels.cdn.bcebos.com/${PREFIX}
URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX}
# download vis_demo data
function download() {

@ -115,14 +115,14 @@ inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_test
# ocr
set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
if (NOT EXISTS ${OCR_INSTALL_DIR})
inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
endif()
inference_analysis_api_test_with_refer_result(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc SERIAL)
# mobilenet with transpose op
set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")
if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz")
inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz")
endif()
inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc SERIAL)

@ -1,5 +1,5 @@
include(ExternalProject)
set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url")
set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com" CACHE STRING "inference download url")
set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
"A path setting inference demo download directories.")

@ -61,4 +61,6 @@ nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocat
cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
cc_test(allocator_facade_test SRCS allocator_facade_test.cc DEPS allocator_facade)
cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade)
cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade)

@ -0,0 +1,100 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_int64(gpu_allocator_retry_time);
#endif
namespace paddle {
namespace memory {
namespace allocation {
//! Run allocate test cases for different places
void AllocateTestCases() {
auto &instance = AllocatorFacade::Instance();
platform::Place place;
size_t size = 1024;
{
place = platform::CPUPlace();
size = 1024;
auto cpu_allocation = instance.Alloc(place, size);
ASSERT_NE(cpu_allocation, nullptr);
ASSERT_NE(cpu_allocation->ptr(), nullptr);
ASSERT_EQ(cpu_allocation->place(), place);
ASSERT_EQ(cpu_allocation->size(), size);
}
#ifdef PADDLE_WITH_CUDA
{
place = platform::CUDAPlace(0);
size = 1024;
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{
// Allocate 2GB gpu memory
place = platform::CUDAPlace(0);
size = 2 * static_cast<size_t>(1 << 30);
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{
place = platform::CUDAPinnedPlace();
size = (1 << 20);
auto cuda_pinned_allocation =
instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
ASSERT_NE(cuda_pinned_allocation, nullptr);
ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
ASSERT_EQ(cuda_pinned_allocation->place(), place);
ASSERT_GE(cuda_pinned_allocation->size(), size);
}
#endif
}
// Runs the shared allocation cases with the GPU memory flags given as
// explicit megabyte values.
TEST(Allocator, SpecifyGpuMemory) {
#ifdef PADDLE_WITH_CUDA
  FLAGS_gpu_allocator_retry_time = 500;
  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;

  // The fraction is set to 0.0 so that FLAGS_initial_gpu_memory_in_mb and
  // FLAGS_reallocate_gpu_memory_in_mb are the flags under test.
  FLAGS_fraction_of_gpu_memory_to_use = 0.0;
  FLAGS_initial_gpu_memory_in_mb = 512;   // 512 MB
  FLAGS_reallocate_gpu_memory_in_mb = 4;  // 4 MB
#endif
  AllocateTestCases();
}
} // namespace allocation
} // namespace memory
} // namespace paddle

@ -19,6 +19,8 @@
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_int64(gpu_allocator_retry_time);
#endif
@ -26,13 +28,8 @@ namespace paddle {
namespace memory {
namespace allocation {
TEST(allocator, allocator) {
#ifdef PADDLE_WITH_CUDA
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif
//! Run allocate test cases for different places
void AllocateTestCases() {
auto &instance = AllocatorFacade::Instance();
platform::Place place;
size_t size = 1024;
@ -82,6 +79,16 @@ TEST(allocator, allocator) {
#endif
}
TEST(Allocator, Allocator) {
#ifdef PADDLE_WITH_CUDA
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif
AllocateTestCases();
}
} // namespace allocation
} // namespace memory
} // namespace paddle

@ -37,6 +37,8 @@ DEFINE_bool(init_allocated_mem, false,
"that initializing the allocated memory with a small value "
"during unit testing.");
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(benchmark);
namespace paddle {
@ -153,12 +155,18 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' to change the fraction of GPU usage.\n\n";
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
<< "or 'FLAGS_initial_gpu_memory_in_mb' "
<< "or 'FLAGS_reallocate_gpu_memory_in_mb' "
<< "to change the memory size for GPU usage.\n"
<< "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
<< FLAGS_fraction_of_gpu_memory_to_use
<< ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
<< FLAGS_initial_gpu_memory_in_mb
<< ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
<< FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
}
});

@ -9,3 +9,5 @@ endif(${WITH_GPU})
cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS memory_block system_allocator glog)
cc_test(buddy_allocator_test SRCS buddy_allocator_test.cc DEPS buddy_allocator)

@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include <algorithm>
#include <utility>
#include "glog/logging.h"
DEFINE_bool(free_idle_memory, false,
@ -36,9 +40,10 @@ BuddyAllocator::~BuddyAllocator() {
"have actually been freed";
while (!pool_.empty()) {
auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
VLOG(10) << "Free from block (" << block << ", " << block->size(cache_)
<< ")";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
system_allocator_->Free(block, block->size(cache_), block->index(cache_));
cache_.invalidate(block);
pool_.erase(pool_.begin());
}
@ -71,7 +76,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
// refill the pool if failure
if (it == pool_.end()) {
it = RefillPool();
it = RefillPool(size);
// if still failure, fail fatally
if (it == pool_.end()) {
return nullptr;
@ -184,19 +189,28 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
return static_cast<MemoryBlock*>(p)->data();
}
BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
size_t request_bytes) {
size_t allocate_bytes = max_chunk_size_;
size_t index = 0;
#ifdef PADDLE_WITH_CUDA
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the maximum allocation size for the first allocation.
max_chunk_size_ = platform::GpuMaxChunkSize();
// Compute the allocation size for gpu for the first allocation.
allocate_bytes = std::max(platform::GpuInitAllocSize(), request_bytes);
} else {
// Reallocation size
if (realloc_size_ == 0) {
realloc_size_ = platform::GpuReallocSize();
}
allocate_bytes = std::max(realloc_size_, request_bytes);
}
}
#endif
// Allocate a new maximum sized block
size_t index = 0;
void* p = system_allocator_->Alloc(&index, max_chunk_size_);
// Allocate a new block
void* p = system_allocator_->Alloc(&index, allocate_bytes);
if (p == nullptr) return pool_.end();
@ -204,7 +218,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
<< " from system allocator";
static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
max_chunk_size_, nullptr, nullptr);
allocate_bytes, nullptr, nullptr);
// gpu fallback allocation
if (system_allocator_->UseGpu() &&
@ -212,10 +226,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
fallback_alloc_count_++;
}
total_free_ += max_chunk_size_;
total_free_ += allocate_bytes;
// dump the block into pool
return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
return pool_.insert(IndexSizeAddress(index, allocate_bytes, p)).first;
}
BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
@ -286,12 +300,12 @@ void BuddyAllocator::CleanIdleFallBackAlloc() {
VLOG(10) << "Return block " << block << " to fallback allocator.";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
system_allocator_->Free(block, block->size(cache_), block->index(cache_));
cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= max_chunk_size_;
total_free_ -= block->size(cache_);
fallback_alloc_count_--;
// If no fall allocation exists, return directly
@ -322,12 +336,12 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
VLOG(10) << "Return block " << block << " to base allocator.";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
system_allocator_->Free(block, block->size(cache_), block->index(cache_));
cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= max_chunk_size_;
total_free_ -= block->size(cache_);
if (!shall_free_alloc()) return;
}

@ -60,7 +60,7 @@ class BuddyAllocator {
void* SystemAlloc(size_t size);
/*! \brief If existing chunks are not suitable, refill pool */
PoolSet::iterator RefillPool();
PoolSet::iterator RefillPool(size_t request_bytes);
/**
* \brief Find the suitable chunk from existing pool and split
@ -89,6 +89,8 @@ class BuddyAllocator {
size_t min_chunk_size_; // the minimum size of each chunk
size_t max_chunk_size_; // the maximum size of each chunk
size_t realloc_size_ = 0; // the size of re-allocated chunk
private:
/**
* \brief A list of free allocation

@ -0,0 +1,133 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include <memory>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
namespace paddle {
namespace memory {
namespace detail {
constexpr static int test_gpu_id = 0;
// Allocates `size_bytes` from `allocator`, checks how Used() changes, frees
// the memory again and verifies that Used() is back at its starting value.
// A `size_bytes` of 0 skips the allocation and only performs the final checks.
void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
  bool freed = false;
  const size_t used_bytes = allocator->Used();

  if (size_bytes > 0) {
    void* p = allocator->Alloc(size_bytes);
    // Stop here on failure: continuing would hand a null pointer to
    // allocator->Free() through the deleter below (a shared_ptr with a custom
    // deleter invokes it even when the stored pointer is null).
    ASSERT_NE(p, nullptr);

    // Requests below the max chunk size are expected to be served by the
    // buddy pool; larger ones go to the SystemAllocator.
#ifdef PADDLE_WITH_CUDA
    const size_t max_chunk_size = platform::GpuMaxChunkSize();
#else
    const size_t max_chunk_size = platform::CpuMaxChunkSize();
#endif
    if (size_bytes < max_chunk_size) {
      // Not allocate from SystemAllocator
      EXPECT_GE(allocator->Used(), used_bytes + size_bytes);
    } else {
      // Allocate from SystemAllocator doesn't count in Used()
      EXPECT_EQ(allocator->Used(), used_bytes);
    }

    // The shared_ptr releases the memory when it leaves this scope, i.e.
    // before the final accounting checks run.
    int* intp = static_cast<int*>(p);
    std::shared_ptr<int> ptr(intp, [&](int* raw) {
      allocator->Free(raw);
      freed = true;
    });
  } else {
    freed = true;
  }

  EXPECT_EQ(used_bytes, allocator->Used());
  EXPECT_TRUE(freed);
}
#ifdef PADDLE_WITH_CUDA
// Exercises a GPU-backed BuddyAllocator with
// FLAGS_fraction_of_gpu_memory_to_use set to 0.01, using requests of 10 bytes,
// 10 << 10 bytes, 10 << 20 bytes and 2 * (1 << 30) bytes, in that order.
TEST(BuddyAllocator, GpuFraction) {
  FLAGS_fraction_of_gpu_memory_to_use = 0.01;

  BuddyAllocator buddy_allocator(
      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());

  const size_t request_sizes[] = {
      static_cast<size_t>(10), static_cast<size_t>(10 << 10),
      static_cast<size_t>(10 << 20), 2 * static_cast<size_t>(1 << 30)};
  for (size_t request : request_sizes) {
    TestBuddyAllocator(&buddy_allocator, request);
  }
}
// Exercises a GPU-backed BuddyAllocator with FLAGS_initial_gpu_memory_in_mb
// (100) larger than FLAGS_reallocate_gpu_memory_in_mb (50).
TEST(BuddyAllocator, InitRealloc) {
FLAGS_initial_gpu_memory_in_mb = 100;
FLAGS_reallocate_gpu_memory_in_mb = 50;
// With these flags the max chunk size is expected to be 100 << 20 bytes.
EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(100 << 20));
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
// Less than initial size and reallocate size
TestBuddyAllocator(&buddy_allocator, 10 << 20);
// Between initial size and reallocate size and not exceed pool
TestBuddyAllocator(&buddy_allocator, 80 << 20);
// Less than reallocate size and exceed pool
TestBuddyAllocator(&buddy_allocator, 40 << 20);
// Greater than reallocate size and exceed pool
TestBuddyAllocator(&buddy_allocator, 80 << 20);
// Greater than initial size and reallocate size
TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
}
// Exercises a GPU-backed BuddyAllocator with FLAGS_reallocate_gpu_memory_in_mb
// (10) larger than FLAGS_initial_gpu_memory_in_mb (5).
TEST(BuddyAllocator, ReallocSizeGreaterThanInit) {
FLAGS_initial_gpu_memory_in_mb = 5;
FLAGS_reallocate_gpu_memory_in_mb = 10;
// With these flags the max chunk size is expected to be 10 << 20 bytes.
EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(10 << 20));
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
// Less than initial size and reallocate size
TestBuddyAllocator(&buddy_allocator, 1 << 20);
// Between initial size and reallocate size and not exceed pool
// NOTE(review): 3 MB is below both configured sizes (5 MB and 10 MB), so the
// "between" wording looks inaccurate -- confirm the intended scenario.
TestBuddyAllocator(&buddy_allocator, 3 << 20);
// Less than initial size and exceed pool
TestBuddyAllocator(&buddy_allocator, 3 << 20);
// Less than reallocate size and not exceed pool (now pool is 15 MB, used 7
// MB)
TestBuddyAllocator(&buddy_allocator, 7 << 20);
// Less than reallocate size and exceed pool
TestBuddyAllocator(&buddy_allocator, 8 << 20);
// Greater than initial size and reallocate size
TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
}
#endif
} // namespace detail
} // namespace memory
} // namespace paddle

@ -32,6 +32,9 @@ limitations under the License. */
DECLARE_bool(use_pinned_memory);
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
namespace paddle {
namespace memory {
namespace detail {
@ -119,11 +122,18 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
gpu_alloc_size_ += size;
return p;
} else {
LOG(WARNING)
<< "Cannot malloc " << size / 1024.0 / 1024.0
<< " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use "
"environment variable to a lower value. Current value is "
<< FLAGS_fraction_of_gpu_memory_to_use;
// Report the failed request size together with the current value of every
// flag the message names. Adjacent string literals are concatenated verbatim,
// so each fragment ends with its own separating space.
LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
             << " MB GPU memory. Please shrink "
                "FLAGS_fraction_of_gpu_memory_to_use or "
                "FLAGS_initial_gpu_memory_in_mb or "
                "FLAGS_reallocate_gpu_memory_in_mb "
                "environment variable to a lower value. "
             << "Current FLAGS_fraction_of_gpu_memory_to_use value is "
             << FLAGS_fraction_of_gpu_memory_to_use
             << ". Current FLAGS_initial_gpu_memory_in_mb value is "
             << FLAGS_initial_gpu_memory_in_mb
             << ". Current FLAGS_reallocate_gpu_memory_in_mb value is "
             << FLAGS_reallocate_gpu_memory_in_mb;
return nullptr;
}
}

@ -0,0 +1,128 @@
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fsp_op.h"
namespace paddle {
namespace operators {
// Forward FSP operator: validates the inputs X and Y and infers the shape of
// the output Out.
class FSPOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Requires X, Y and Out to be present, both inputs to be rank 4, and
  // dimensions 2 and 3 of X and Y to match. Out is then given the shape
  // [X[0], X[1], Y[1]] and shares the LoD of X.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of FSPOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of FSPOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of FSPOp should not be null.");

    auto x_shape = ctx->GetInputDim("X");
    auto y_shape = ctx->GetInputDim("Y");

    PADDLE_ENFORCE(
        x_shape.size() == 4,
        "The Input(X) must have shape [batch_size, channel, height, width].");
    PADDLE_ENFORCE(
        y_shape.size() == 4,
        "The Input(Y) must have shape [batch_size, channel, height, width].");
    PADDLE_ENFORCE(
        (x_shape[2] == y_shape[2]) && (x_shape[3] == y_shape[3]),
        "The Input(X) and Input(Y) should have the same height and width.");

    const auto batch_size = x_shape[0];
    const auto x_channel = x_shape[1];
    const auto y_channel = y_shape[1];
    ctx->SetOutputDim("Out", {batch_size, x_channel, y_channel});
    ctx->ShareLoD("X", "Out");
  }

 protected:
  // Builds the kernel type from the data type of X and the current device
  // context, using the plain library and any data layout.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    const auto input_type = ctx.Input<Tensor>("X")->type();
    return framework::OpKernelType(input_type, ctx.device_context(),
                                   framework::DataLayout::kAnyLayout,
                                   framework::LibraryType::kPlain);
  }
};
// Declares the inputs, the output and the user-facing documentation of the
// "fsp" operator.
class FSPOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  // Registers Input(X), Input(Y), Output(Out) and the operator comment.
  void Make() override {
    AddInput("X",
             "(Tensor) The input of FSP op with shape [batch_size, x_channel, "
             "height, width]");
    // Adjacent string literals are concatenated verbatim, so every fragment
    // must carry its own separating space.
    AddInput("Y",
             "(Tensor) The input of FSP op with shape "
             "[batch_size, y_channel, height, width]. "
             "The y_channel can be different with the x_channel of Input(X)"
             " while the other dimensions must be the same with Input(X)'s.");
    AddOutput(
        "Out",
        "(Tensor) The output of FSP op with shape "
        "[batch_size, x_channel, y_channel]. The x_channel is the channel "
        "of Input(X) and the y_channel is the channel of Input(Y).");
    AddComment(R"DOC(
    This op is used to calculate the flow of solution procedure (FSP) matrix of two feature maps.
    Given feature map x with shape [x_channel, h, w] and feature map y with shape
    [y_channel, h, w], we can get the fsp matrix of x and y in two steps:

        step 1: reshape x into matrix with shape [x_channel, h * w] and reshape and
                transpose y into matrix with shape [h * w, y_channel]
        step 2: multiply x and y and divide the product by h * w to get fsp matrix
                with shape [x_channel, y_channel]

    The output is a batch of fsp matrices.
    )DOC");
  }
};
// Backward operator definition for "fsp_grad": the gradient of each input has
// the same shape as that input.
class FSPOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Shape inference for the gradients.
  //
  // Requires X, Y and Out@GRAD as inputs. X@GRAD and Y@GRAD are both
  // optional; each one that is present receives the dims of its forward
  // input.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null");

    const auto x_shape = ctx->GetInputDim("X");
    const auto y_shape = ctx->GetInputDim("Y");

    const auto dx_name = framework::GradVarName("X");
    const auto dy_name = framework::GradVarName("Y");

    if (ctx->HasOutput(dx_name)) {
      ctx->SetOutputDim(dx_name, x_shape);
    }
    if (ctx->HasOutput(dy_name)) {
      ctx->SetOutputDim(dy_name, y_shape);
    }
  }

  // Selects the kernel from the data type of Out@GRAD and the current device
  // context.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    const auto* out_grad =
        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
    return framework::OpKernelType(out_grad->type(), ctx.device_context());
  }
};
} // namespace operators
} // namespace paddle
// Short alias for the operator namespace, used by the registration macros
// below.
namespace ops = paddle::operators;
// Registers the forward operator "fsp" with its operator class, its proto
// maker and a default grad-op description maker.
REGISTER_OPERATOR(fsp, ops::FSPOp, ops::FSPOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
// Registers the backward operator "fsp_grad".
REGISTER_OPERATOR(fsp_grad, ops::FSPOpGrad);
// CPU kernels of the forward operator, instantiated for float and double.
REGISTER_OP_CPU_KERNEL(
fsp, ops::FSPOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::FSPOpKernel<paddle::platform::CPUDeviceContext, double>);
// CPU kernels of the backward operator, instantiated for float and double.
REGISTER_OP_CPU_KERNEL(
fsp_grad, ops::FSPGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::FSPGradOpKernel<paddle::platform::CPUDeviceContext, double>);

@ -0,0 +1,24 @@
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/fsp_op.h"
// Short aliases for the operator and platform namespaces, used by the
// registration macros below.
namespace ops = paddle::operators;
namespace plat = paddle::platform;
// CUDA kernels of the forward operator "fsp", instantiated for float and
// double.
REGISTER_OP_CUDA_KERNEL(fsp, ops::FSPOpKernel<plat::CUDADeviceContext, float>,
ops::FSPOpKernel<plat::CUDADeviceContext, double>);
// CUDA kernels of the backward operator "fsp_grad", instantiated for float
// and double.
REGISTER_OP_CUDA_KERNEL(fsp_grad,
ops::FSPGradOpKernel<plat::CUDADeviceContext, float>,
ops::FSPGradOpKernel<plat::CUDADeviceContext, double>);

@ -0,0 +1,136 @@
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;

// Forward kernel of the "fsp" operator.
//
// Per batch entry, X is described as an [x_channel, height * width] matrix
// and Y as a transposed operand of shape [height * width, y_channel]. Out
// receives their product scaled by 1 / (height * width).
template <typename DeviceContext, typename T>
class FSPOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* x = context.Input<Tensor>("X");
    auto* y = context.Input<Tensor>("Y");
    auto* output = context.Output<Tensor>("Out");
    output->mutable_data<T>(context.GetPlace());

    const auto x_shape = x->dims();
    const auto y_shape = y->dims();
    const auto batch_size = x_shape[0];
    const auto x_channel = x_shape[1];
    const auto y_channel = y_shape[1];
    // Number of spatial positions per channel (height * width of X).
    const auto spatial = x_shape[2] * x_shape[3];

    // Left operand: one [x_channel, spatial] matrix per batch entry.
    math::MatDescriptor lhs_desc;
    lhs_desc.height_ = x_channel;
    lhs_desc.width_ = spatial;
    lhs_desc.batch_size_ = batch_size;
    lhs_desc.stride_ = x_channel * spatial;

    // Right operand: Y flagged as transposed, described as
    // [spatial, y_channel] per batch entry.
    math::MatDescriptor rhs_desc;
    rhs_desc.height_ = spatial;
    rhs_desc.width_ = y_channel;
    rhs_desc.batch_size_ = batch_size;
    rhs_desc.stride_ = y_channel * spatial;
    rhs_desc.trans_ = true;

    // alpha averages the product over the spatial positions; beta = 0
    // means the previous content of Out does not contribute.
    const T alpha = static_cast<T>(1.0 / spatial);
    const T beta = static_cast<T>(0.0);

    auto blas = math::GetBlas<DeviceContext, T>(context);
    blas.MatMul(*x, lhs_desc, *y, rhs_desc, alpha, output, beta);
  }
};
// Backward kernel of the "fsp" operator.
//
// With Out@GRAD described as a batch of [x_channel, y_channel] matrices:
//   X@GRAD = Out@GRAD   * Y / (h * w), Y viewed as [y_channel, h * w]
//   Y@GRAD = Out@GRAD^T * X / (h * w), X viewed as [x_channel, h * w]
// Either gradient output may be absent; only the present ones are computed.
template <typename DeviceContext, typename T>
class FSPGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
    auto* d_y = context.Output<Tensor>(framework::GradVarName("Y"));
    // Nothing was requested, so there is nothing to compute.
    if (!d_x && !d_y) {
      return;
    }

    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
    const auto d_out_shape = d_out->dims();
    const auto batch_size = d_out_shape[0];
    const auto x_channel = d_out_shape[1];
    const auto y_channel = d_out_shape[2];

    auto blas = math::GetBlas<DeviceContext, T>(context);
    math::SetConstant<DeviceContext, T> set_zero;
    const auto& dev_ctx = context.template device_context<DeviceContext>();
    const T beta = static_cast<T>(0.0);

    if (d_x) {
      d_x->mutable_data<T>(context.GetPlace());
      set_zero(dev_ctx, d_x, static_cast<T>(0));

      auto* y = context.Input<Tensor>("Y");
      const auto y_shape = y->dims();
      // Spatial size is taken from Y in this branch.
      const int64_t spatial = y_shape[2] * y_shape[3];

      // Out@GRAD as [x_channel, y_channel] per batch entry.
      math::MatDescriptor grad_desc;
      grad_desc.height_ = x_channel;
      grad_desc.width_ = y_channel;
      grad_desc.batch_size_ = batch_size;
      grad_desc.stride_ = x_channel * y_channel;

      // Y as [y_channel, spatial] per batch entry.
      math::MatDescriptor y_desc;
      y_desc.height_ = y_channel;
      y_desc.width_ = spatial;
      y_desc.batch_size_ = batch_size;
      y_desc.stride_ = y_channel * spatial;

      blas.MatMul(*d_out, grad_desc, *y, y_desc,
                  static_cast<T>(1.0 / spatial), d_x, beta);
    }

    if (d_y) {
      d_y->mutable_data<T>(context.GetPlace());
      set_zero(dev_ctx, d_y, static_cast<T>(0));

      auto* x = context.Input<Tensor>("X");
      const auto x_shape = x->dims();
      // Spatial size is taken from X in this branch.
      const int64_t spatial = x_shape[2] * x_shape[3];

      // Out@GRAD flagged as transposed, described as [y_channel, x_channel]
      // per batch entry.
      math::MatDescriptor grad_desc;
      grad_desc.height_ = y_channel;
      grad_desc.width_ = x_channel;
      grad_desc.batch_size_ = batch_size;
      grad_desc.stride_ = x_channel * y_channel;
      grad_desc.trans_ = true;

      // X as [x_channel, spatial] per batch entry.
      math::MatDescriptor x_desc;
      x_desc.height_ = x_channel;
      x_desc.width_ = spatial;
      x_desc.batch_size_ = batch_size;
      x_desc.stride_ = x_channel * spatial;

      blas.MatMul(*d_out, grad_desc, *x, x_desc,
                  static_cast<T>(1.0 / spatial), d_y, beta);
    }
  }
};
} // namespace operators
} // namespace paddle

@ -17,7 +17,9 @@ function(reader_library TARGET_NAME)
PARENT_SCOPE)
endfunction()
cc_library(py_reader SRCS py_reader.cc DEPS reader)
cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool)
reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader)
reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc)
reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc)
@ -26,7 +28,7 @@ reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_o
reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS buffered_reader)
reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
reader_library(create_py_reader_op SRCS create_py_reader_op.cc)
reader_library(create_py_reader_op SRCS create_py_reader_op.cc DEPS py_reader)
if (NOT WIN32 AND NOT ON_INFER)
cc_library(ctr_reader SRCS ctr_reader.cc DEPS gzstream reader zlib)
@ -38,7 +40,7 @@ cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc)
# Export local libraries to parent
# set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE)
op_library(read_op)
op_library(read_op DEPS py_reader buffered_reader)
foreach(src ${LOCAL_READER_LIBS})
set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs")

@ -16,6 +16,7 @@
#include <condition_variable> // NOLINT
#include <deque>
#include <utility>
#include "paddle/fluid/platform/enforce.h"
@ -34,7 +35,7 @@ class BlockingQueue {
explicit BlockingQueue(size_t capacity, bool speed_test_mode = false)
: capacity_(capacity), speed_test_mode_(speed_test_mode), closed_(false) {
PADDLE_ENFORCE_GT(
capacity_, 0,
capacity_, static_cast<size_t>(0),
"The capacity of a reader::BlockingQueue must be greater than 0.");
}

@ -30,8 +30,10 @@ BufferedReader::~BufferedReader() {
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
PADDLE_ENFORCE(cudaStreamDestroy(stream));
for (auto &event : events) PADDLE_ENFORCE(cudaEventDestroy(event));
PADDLE_ENFORCE(cudaStreamDestroy(stream_));
for (auto &event : events_) {
PADDLE_ENFORCE(cudaEventDestroy(event));
}
}
#endif
}
@ -46,15 +48,15 @@ BufferedReader::BufferedReader(
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
compute_stream =
compute_stream_ =
((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance()
.Get(place_)))
->stream();
events.resize(buffer_size);
for (auto &event : events) {
events_.resize(buffer_size);
for (auto &event : events_) {
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
}
PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking));
}
#endif
cpu_buffer_.resize(buffer_size);
@ -73,7 +75,7 @@ void BufferedReader::ReadAsync(size_t i) {
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
PADDLE_ENFORCE(cudaEventRecord(events[i], compute_stream));
PADDLE_ENFORCE(cudaEventRecord(events_[i], compute_stream_));
}
#endif
position_.emplace(thread_pool_.enqueue([this, i]() -> size_t {
@ -91,7 +93,7 @@ void BufferedReader::ReadAsync(size_t i) {
// commands from different streams cannot run concurrently.
if (platform::is_gpu_place(place_)) {
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0));
PADDLE_ENFORCE(cudaStreamWaitEvent(stream_, events_[i], 0));
TensorVec &gpu = gpu_buffer_[i];
gpu.resize(cpu.size());
platform::RecordEvent record_event("BufferedReader:MemoryCopy");
@ -106,12 +108,14 @@ void BufferedReader::ReadAsync(size_t i) {
if (platform::is_cuda_pinned_place(cpu_place)) {
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CUDAPinnedPlace>(cpu_place),
cpu_ptr, size, stream);
cpu_ptr, size, stream_);
} else if ((platform::is_gpu_place(cpu_place))) {
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
size, stream);
size, stream_);
} else {
// if cpu place is not pinned, async copy is slower than sync copy,
// so we use sync copy instead.
// TODO(zcd): The default stream should not be used here.
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
@ -119,7 +123,7 @@ void BufferedReader::ReadAsync(size_t i) {
}
gpu[i].set_lod(cpu[i].lod());
}
PADDLE_ENFORCE(cudaStreamSynchronize(stream));
PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
}
#endif
return i;

@ -15,6 +15,7 @@
#pragma once
#include <list>
#include <memory>
#include <queue>
#include <vector>
#include "ThreadPool.h"
@ -63,9 +64,9 @@ class BufferedReader : public framework::DecoratedReader {
std::vector<TensorVec> gpu_buffer_;
size_t prev_pos_{-1UL};
#ifdef PADDLE_WITH_CUDA
cudaStream_t stream;
cudaStream_t compute_stream;
std::vector<cudaEvent_t> events;
cudaStream_t stream_;
cudaStream_t compute_stream_;
std::vector<cudaEvent_t> events_;
#endif
};

@ -12,37 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
#include "paddle/fluid/operators/reader/py_reader.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
// File reader that takes its batches from a LoDTensorBlockingQueue.
class PyReader : public framework::FileReader {
 public:
  // Stores the queue after checking that it is not null.
  explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue)
      : framework::FileReader() {
    PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
    queue_ = queue;
  }

  // Pops the next batch into *out; *out is left empty when the pop reports
  // failure.
  void ReadNext(std::vector<framework::LoDTensor>* out) override {
    bool popped = false;
    *out = queue_->Pop(&popped);
    if (popped) {
      return;
    }
    out->clear();
  }

  // Closes the queue when the reader is destroyed.
  ~PyReader() { queue_->Close(); }

  // Closes the queue.
  void Shutdown() override { queue_->Close(); }

  // Reopens the queue.
  void Start() override { queue_->ReOpen(); }

 private:
  std::shared_ptr<LoDTensorBlockingQueue> queue_;
};
class CreatePyReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;

@ -0,0 +1,42 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/py_reader.h"
#include <memory>
namespace paddle {
namespace operators {
namespace reader {
// Stores the queue after checking that it is not null.
PyReader::PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue)
    : framework::FileReader() {
  PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
  queue_ = queue;
}

// Pops the next batch into *out; *out is left empty when the pop reports
// failure.
void PyReader::ReadNext(std::vector<framework::LoDTensor>* out) {
  bool popped = false;
  *out = queue_->Pop(&popped);
  if (popped) {
    return;
  }
  out->clear();
}

// Closes the queue when the reader is destroyed.
PyReader::~PyReader() {
  queue_->Close();
}

// Closes the queue.
void PyReader::Shutdown() {
  queue_->Close();
}

// Reopens the queue.
void PyReader::Start() {
  queue_->ReOpen();
}
} // namespace reader
} // namespace operators
} // namespace paddle

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save