From dee6d1606f35a6cda8b788bb163763ca8a1d3257 Mon Sep 17 00:00:00 2001 From: Haihao Shen Date: Fri, 31 Aug 2018 15:35:36 +0800 Subject: [PATCH 01/85] Enable conv and batch norm by default --- python/paddle/fluid/transpiler/inference_transpiler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index f79fcb24bb..02fefe32df 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -60,11 +60,10 @@ class InferenceTranspiler(object): if not isinstance(scope, core.Scope): raise TypeError("scope should be as Scope type or None") use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False)) + self._fuse_batch_norm(program, place, scope) if use_mkldnn: self._fuse_relu_mkldnn(program) self._fuse_conv_bias_mkldnn(program) - else: - self._fuse_batch_norm(program, place, scope) def _fuse_relu_mkldnn(self, program): ''' From 5ec2fb0c93d0e799ea1fc215be0072488399c31e Mon Sep 17 00:00:00 2001 From: nhzlx Date: Fri, 31 Aug 2018 11:32:35 +0000 Subject: [PATCH 02/85] add flexibledfs for find path between two nodes --- .../inference/analysis/data_flow_graph.cc | 37 ++++++++++ .../inference/analysis/data_flow_graph.h | 3 + .../analysis/data_flow_graph_tester.cc | 71 +++++++++++++++++++ 3 files changed, 111 insertions(+) diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc index 100a7504b8..e4f4bbf43c 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -480,6 +480,8 @@ void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { for (auto *out : op_nodes[i]->outlinks) { if (follow_up_input_names.count(out->name())) { filtered_subgraph_outlinks.push_back(out); + } else { + out->SetDeleted(); } } PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL); @@ -487,6 +489,41 @@ void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { } } +void FlexibleDFS(const std::vector &source, bool reverse, + const std::function &enter, + const std::function &leave) { + typedef struct { + const Node *node; + bool leave; + } FNode; + std::vector stack; + for (auto &node : source) { + stack.push_back(FNode{node, false}); + } + std::unordered_set visited; + while (!stack.empty()) { + auto fnode = stack.back(); + stack.pop_back(); + + if (fnode.leave) { + if (leave && !leave(fnode.node)) return; + } + if (visited.count(fnode.node)) continue; + visited.insert(fnode.node); + + if (enter && !enter(fnode.node)) return; + + if (leave) stack.push_back(FNode{fnode.node, true}); + const std::vector iter_nodes = + reverse == true ? fnode.node->inlinks : fnode.node->outlinks; + for (const Node *node : iter_nodes) { + if (!visited.count(node)) { + stack.push_back(FNode{node, false}); + } + } + } +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index 437e097acd..4fefc175f3 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -204,6 +204,9 @@ std::pair, std::vector> ExtractInputAndOutputOfSubGraph(std::vector &graph); // NOLINT void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph); +void FlexibleDFS(const std::vector &source, bool reverse, + const std::function &enter, + const std::function &leave); } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc index 1682011c3d..040ca19514 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc @@ -160,6 +160,77 @@ TEST(DataFlowGraph, Build_IR_Graph) { ASSERT_EQ(graph.nodes.size(), ir_graph.Nodes().size()); } +// FlexibleDFS +/* + * Graph topology + * inputs: 0 + * 0 -> 1 + * 1 -> 2 + * 1 -> 3 + * 3 -> 4 + * 4 -> 5 + * 5 -> 2 + */ +TEST(DataFlowGraph, flexibledfs) { + DataFlowGraph graph; + + for (int i = 0; i < 6; i++) { + auto* node = graph.nodes.Create(Node::Type::kValue); + node->SetName("node-" + std::to_string(i)); + } + + auto add_link = [&](int i, int j) { + Node* source = graph.nodes.GetMutable(i); + Node* target = graph.nodes.GetMutable(j); + target->inlinks.push_back(source); + source->outlinks.push_back(target); + }; + + add_link(0, 1); + add_link(1, 2); + add_link(1, 3); + add_link(3, 4); + add_link(4, 5); + add_link(5, 2); + graph.Build(); + + std::vector order; + FlexibleDFS(graph.inputs(), false, nullptr, [&order](const Node* n) { + order.push_back(n); + return true; + }); + + ASSERT_EQ(order.size(), 6UL); + + order.clear(); + // reverse dfs + FlexibleDFS(graph.outputs(), true, nullptr, [&order](const Node* n) { + order.push_back(n); + return true; + }); + + ASSERT_EQ(order.size(), 6UL); + + // If we delete + Node* last_node = graph.nodes.GetMutable(2); + Node* direct_node = graph.nodes.GetMutable(1); + std::vector source_nodes; + for (Node* node : last_node->inlinks) { + if (node != direct_node) source_nodes.push_back(node); + } + + bool has_cycle = false; + FlexibleDFS(source_nodes, true, nullptr, + [&has_cycle, direct_node](const Node* n) { + if (n == direct_node) { + has_cycle = true; + return false; + } + return true; + }); + ASSERT_TRUE(has_cycle); +} + } // namespace analysis } // namespace inference } // namespace paddle From 82a1b35b9b64d96e2715e58cd4779a1324f1e139 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 4 Sep 2018 11:25:43 +0800 Subject: [PATCH 03/85] Revert "Revert "Add CudnnHolder and use it in Conv and ConvTranspose op"" This reverts commit 151e169eb75a8ee96e0c1e50605fa811cb65acf4. --- paddle/fluid/framework/rw_lock.h | 71 ++++++++++++++++++ paddle/fluid/operators/conv_cudnn_op.cu.cc | 57 +++++++------- .../operators/conv_transpose_cudnn_op.cu.cc | 59 +++++++-------- paddle/fluid/platform/device_context.cc | 74 ++++++++++++++++--- paddle/fluid/platform/device_context.h | 8 +- 5 files changed, 196 insertions(+), 73 deletions(-) diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h index a068d3543d..da163835e8 100644 --- a/paddle/fluid/framework/rw_lock.h +++ b/paddle/fluid/framework/rw_lock.h @@ -56,5 +56,76 @@ struct RWLock { }; #endif +class RWLockGuard { + public: + enum Status { kUnLock, kWRLock, kRDLock }; + + RWLockGuard(RWLock* rw_lock, Status init_status) + : lock_(rw_lock), status_(Status::kUnLock) { + switch (init_status) { + case Status::kRDLock: { + RDLock(); + break; + } + case Status::kWRLock: { + WRLock(); + break; + } + case Status::kUnLock: { + break; + } + } + } + + void WRLock() { + switch (status_) { + case Status::kUnLock: { + lock_->WRLock(); + status_ = Status::kWRLock; + break; + } + case Status::kWRLock: { + break; + } + case Status::kRDLock: { + PADDLE_THROW( + "Please unlock read lock first before invoking write lock."); + break; + } + } + } + + void RDLock() { + switch (status_) { + case Status::kUnLock: { + lock_->RDLock(); + status_ = Status::kRDLock; + break; + } + case Status::kRDLock: { + break; + } + case Status::kWRLock: { + PADDLE_THROW( + "Please unlock write lock first before invoking read lock."); + break; + } + } + } + + void UnLock() { + if (status_ != Status::kUnLock) { + lock_->UNLock(); + status_ = Status::kUnLock; + } + } + + ~RWLockGuard() { UnLock(); } + + private: + RWLock* lock_; + Status status_; +}; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 22cbf680c0..4a7a6bcf71 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -118,7 +118,6 @@ class CUDNNConvOpKernel : public framework::OpKernel { output_channels / groups * output_height * output_width * output_depth; int group_offset_filter = filter->numel() / groups; // ------------------- cudnn conv workspace --------------------- - void* cudnn_workspace = nullptr; size_t workspace_size_in_bytes; // final workspace to allocate. size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; if (user_workspace_size > 0) { @@ -159,20 +158,18 @@ class CUDNNConvOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - // Allocate on GPU memory - platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); - cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv forward --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; for (int i = 0; i < groups; i++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, - cudnn_filter_desc, filter_data + i * group_offset_filter, - cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, - &beta, cudnn_output_desc, output_data + i * group_offset_out)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, + cudnn_filter_desc, filter_data + i * group_offset_filter, + cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, + &beta, cudnn_output_desc, output_data + i * group_offset_out)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } - // Release the cudnn workspace - paddle::memory::Free(gpu, cudnn_workspace); } }; @@ -314,11 +311,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { cudnn_filter_desc, filter_algo, &tmp_size)); workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); } - // ------------------- cudnn conv workspace --------------------- - // Already on GPU - void* cudnn_workspace = nullptr; - platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); - cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + // ------------------- cudnn conv backward data --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; if (input_grad) { @@ -326,12 +319,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset input_grad. for (int i = 0; i < groups; i++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, - filter_data + i * group_offset_filter, cudnn_output_grad_desc, - output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, - input_grad_data + i * group_offset_in)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, + filter_data + i * group_offset_filter, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, + data_algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_input_desc, input_grad_data + i * group_offset_in)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } // ------------------- cudnn conv backward filter --------------------- @@ -339,16 +335,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset filter_grad. for (int i = 0; i < groups; i++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, - cudnn_output_grad_desc, output_grad_data + i * group_offset_out, - cudnn_conv_desc, filter_algo, cudnn_workspace, - workspace_size_in_bytes, &beta, cudnn_filter_desc, - filter_grad_data + i * group_offset_filter)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_input_desc, + input_data + i * group_offset_in, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, + filter_algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_filter_desc, filter_grad_data + i * group_offset_filter)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } - // Release the cudnn workspace - paddle::memory::Free(gpu, cudnn_workspace); } }; diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc index 82fff68e75..73831611d0 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -76,7 +76,6 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { conv_desc.descriptor(paddings, strides, dilations); // ------------------- cudnn conv workspace --------------------- - void* cudnn_workspace = nullptr; size_t workspace_size_in_bytes; // final workspace to allocate. size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; if (user_workspace_size > 0) { @@ -100,25 +99,21 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); - // Allocate on GPU memory - platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); - cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); - // ------------------- cudnn conv transpose forward --------------------- int input_offset = input->numel() / input->dims()[0] / groups; int output_offset = output->numel() / output->dims()[0] / groups; int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, - cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, - algo, cudnn_workspace, workspace_size_in_bytes, &beta, - cudnn_output_desc, output_data + output_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, + cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, + algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_output_desc, output_data + output_offset * g)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } - - // Release the cudnn workspace - paddle::memory::Free(gpu, cudnn_workspace); } }; @@ -206,11 +201,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { std::max(workspace_size_in_bytes, bwd_filter_ws_size); } - // ------------------- cudnn conv workspace --------------------- - // Already on GPU - void* cudnn_workspace = nullptr; - platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); - cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv backward data --------------------- // FIXME(typhoonzero): template type T may not be the same as cudnn call. int input_offset = input->numel() / input->dims()[0] / groups; @@ -222,12 +212,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_filter_desc, - filter_data + filter_offset * g, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, - input_grad_data + input_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_filter_desc, + filter_data + filter_offset * g, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data + input_offset * g)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } @@ -237,17 +230,17 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset filter_grad. // Gradient with respect to the filter for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_input_desc, - input_data + input_offset * g, cudnn_conv_desc, filter_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc, - filter_grad_data + filter_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_input_desc, + input_data + input_offset * g, cudnn_conv_desc, filter_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_filter_desc, filter_grad_data + filter_offset * g)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } - - // Release the cudnn workspace - paddle::memory::Free(gpu, cudnn_workspace); } }; diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 2cc26da013..3ec20ad7e5 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -16,6 +16,9 @@ limitations under the License. */ #include #include "paddle/fluid/memory/memory.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/framework/rw_lock.h" +#endif namespace paddle { namespace platform { @@ -142,7 +145,59 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { mutable unsigned int* semaphore_; }; -CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { +class CudnnHolder { + public: + CudnnHolder(const cudaStream_t* stream, const CUDAPlace& place) + : workspace_(nullptr), workspace_len_(0), stream_(stream), place_(place) { + PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); + PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_)); + } + + cudnnHandle_t cudnn_handle() const { return cudnn_handle_; } + + void RunFunc(const std::function& cudnn_func, + size_t required_workspace_len) { + std::lock_guard lock(mtx_); + if (required_workspace_len > workspace_len_) { + ReallocateWorkspace(required_workspace_len); + } + cudnn_func(workspace_); + } + + ~CudnnHolder() { + PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); + if (workspace_ != nullptr) { + paddle::memory::Free(place_, workspace_); + } + } + + private: + void ReallocateWorkspace(size_t required_workspace_len) { + if (required_workspace_len <= workspace_len_) { + return; + } + void* new_workspace = paddle::memory::Alloc(place_, required_workspace_len); + if (workspace_ != nullptr) { + // Maybe someone is using the current workspace + PADDLE_ENFORCE(cudaStreamSynchronize(*stream_)); + paddle::memory::Free(place_, workspace_); + } + workspace_ = new_workspace; + workspace_len_ = required_workspace_len; + } + + cudnnHandle_t cudnn_handle_; + void* workspace_; + size_t workspace_len_; + + const cudaStream_t* stream_; // not owned; + const CUDAPlace place_; + + std::mutex mtx_; +}; + +CUDADeviceContext::CUDADeviceContext(CUDAPlace place) + : place_(place), cudnn_holder_(nullptr) { SetDeviceId(place_.device); compute_capability = GetCUDAComputeCapability(place_.device); multi_process = GetCUDAMultiProcessors(place_.device); @@ -154,10 +209,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); if (dynload::HasCUDNN()) { - PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); - PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_)); - } else { - cudnn_handle_ = nullptr; + cudnn_holder_.reset(new CudnnHolder(&stream_, place)); } } @@ -165,9 +217,6 @@ CUDADeviceContext::~CUDADeviceContext() { SetDeviceId(place_.device); Wait(); PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); - if (cudnn_handle_ != nullptr) { - PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); - } eigen_stream_.reset(); eigen_device_.reset(); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); @@ -196,7 +245,14 @@ cublasHandle_t CUDADeviceContext::cublas_handle() const { return cublas_handle_; } -cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; } +cudnnHandle_t CUDADeviceContext::cudnn_handle() const { + return cudnn_holder_->cudnn_handle(); +} + +void CUDADeviceContext::RunCudnnFuncWithWorkspace( + const std::function& cudnn_func, size_t workspace_len) const { + cudnn_holder_->RunFunc(cudnn_func, workspace_len); +} cudaStream_t CUDADeviceContext::stream() const { return stream_; } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index b97dad20db..3ed49fc423 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -69,6 +69,7 @@ struct DefaultDeviceContextType { #ifdef PADDLE_WITH_CUDA class EigenCudaStreamDevice; +class CudnnHolder; class CUDADeviceContext : public DeviceContext { public: @@ -96,6 +97,11 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return cudnn handle in the device context. */ cudnnHandle_t cudnn_handle() const; + /*! \brief Run a cudnn function with the workspace provided by + * CUDADeviceContext */ + void RunCudnnFuncWithWorkspace(const std::function& cudnn_func, + size_t workspace_len) const; + /*! \brief Return cuda stream in the device context. */ cudaStream_t stream() const; @@ -111,8 +117,8 @@ class CUDADeviceContext : public DeviceContext { std::unique_ptr eigen_device_; std::unique_ptr eigen_stream_; + std::unique_ptr cudnn_holder_; cudaStream_t stream_; - cudnnHandle_t cudnn_handle_; cublasHandle_t cublas_handle_; int compute_capability; From 7b577b92e04ff3ac62eefe2837f90eb4d266b413 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 4 Sep 2018 11:27:24 +0800 Subject: [PATCH 04/85] fix a memory bug in CudnnHolder --- paddle/fluid/platform/device_context.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 3ec20ad7e5..c6f1d1f3d5 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -176,13 +176,12 @@ class CudnnHolder { if (required_workspace_len <= workspace_len_) { return; } - void* new_workspace = paddle::memory::Alloc(place_, required_workspace_len); if (workspace_ != nullptr) { // Maybe someone is using the current workspace PADDLE_ENFORCE(cudaStreamSynchronize(*stream_)); paddle::memory::Free(place_, workspace_); } - workspace_ = new_workspace; + workspace_ = paddle::memory::Alloc(place_, required_workspace_len); workspace_len_ = required_workspace_len; } From 6edfae4234ebe28d3c14954b0117536ced65758f Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 7 Sep 2018 14:34:21 +0800 Subject: [PATCH 05/85] reset received vars on pserver --- .../distributed/request_handler_impl.cc | 13 ------ .../distributed/request_handler_impl.h | 5 --- .../fluid/operators/distributed/rpc_server.cc | 8 ++++ .../fluid/operators/distributed/rpc_server.h | 6 ++- paddle/fluid/operators/listen_and_serv_op.cc | 42 ++++++++++++++++--- paddle/fluid/operators/listen_and_serv_op.h | 10 ++++- 6 files changed, 58 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 31159a0259..849e412504 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -67,24 +67,11 @@ bool RequestSendHandler::Handle(const std::string& varname, LOG(FATAL) << "sync: Can not find server side var: " << varname; return false; } - - if (invar->IsType()) { - std::unique_lock lock(mutex_sparse_vars_); - sparse_vars_.push_back(invar); - } } } return true; } -void RequestSendHandler::ResetSparseVarRecorder() { - std::unique_lock lock(mutex_sparse_vars_); - for (auto* var : sparse_vars_) { - var->GetMutable()->mutable_rows()->clear(); - } - sparse_vars_.clear(); -} - bool RequestGetHandler::Handle(const std::string& varname, framework::Scope* scope, framework::Variable* invar, diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h index 87185500f2..8be5b21bb8 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ b/paddle/fluid/operators/distributed/request_handler_impl.h @@ -41,11 +41,6 @@ class RequestSendHandler final : public RequestHandler { bool Handle(const std::string& varname, framework::Scope* scope, framework::Variable* var, framework::Variable** outvar, const std::string& out_var_name = "") override; - void ResetSparseVarRecorder(); - - private: - std::mutex mutex_sparse_vars_; - std::vector sparse_vars_; }; class RequestGetHandler final : public RequestHandler { diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index 406e7294c1..084480ae48 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -101,6 +101,8 @@ void RPCServer::Complete() { { std::unique_lock lock(mutex_); client_num_--; + need_reset_all_vars_ = true; + VLOG(4) << "decrease client_num to: " << client_num_; if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { barrier_counter_[kRequestGet]--; @@ -109,6 +111,11 @@ void RPCServer::Complete() { barrier_cond_.notify_all(); } +bool RPCServer::NeedResetAllVars() { + std::unique_lock lock(mutex_); + return need_reset_all_vars_; +} + int RPCServer::GetClientNum() { std::unique_lock lock(mutex_); return client_num_; @@ -120,6 +127,7 @@ void RPCServer::ResetBarrierCounter() { for (auto& t : barrier_counter_) { t.second = 0; } + need_reset_all_vars_ = false; } void RPCServer::RegisterRPC(const std::string& rpc_name, diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h index d813ba03e2..d88e8c640f 100644 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -49,7 +49,8 @@ class RPCServer { bind_address_(address), exit_flag_(false), selected_port_(0), - client_num_(client_num) {} + client_num_(client_num), + need_reset_all_vars_(false) {} virtual ~RPCServer() {} virtual void StartServer() = 0; @@ -86,6 +87,8 @@ class RPCServer { void ResetBarrierCounter(); RPCServerProfiler& Profiler() { return profiler_; } + bool NeedResetAllVars(); + protected: virtual void ShutDownImpl() = 0; @@ -104,6 +107,7 @@ class RPCServer { std::atomic exit_flag_; int selected_port_; int client_num_; + bool need_reset_all_vars_; std::unordered_map rpc_call_map_; std::unordered_map rpc_thread_num_; diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 4cc2159d9f..1933e6a5d0 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "gflags/gflags.h" #include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" @@ -101,9 +102,10 @@ static int64_t GetTimestamp() { void ListenAndServOp::RunSyncLoop( framework::Executor *executor, framework::ProgramDesc *program, - framework::Scope *recv_scope, + framework::Scope *recv_scope, platform::DeviceContext *dev_ctx, const std::vector &prefetch_block_id_list, - const int checkpoint_point_block_id) const { + const int checkpoint_point_block_id, + const std::vector &recv_varnames) const { VLOG(2) << "RunSyncLoop"; size_t num_blocks = program->Size(); auto optimize_blocks = @@ -166,8 +168,8 @@ void ListenAndServOp::RunSyncLoop( VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; // reset received sparse vars to avoid reuse it in the next mini-batch - dynamic_cast(request_send_handler_.get()) - ->ResetSparseVarRecorder(); + ResetReceivedVars(recv_varnames, recv_scope, dev_ctx, + !rpc_service_->NeedResetAllVars()); rpc_service_->SetCond(distributed::kRequestGet); rpc_service_->WaitBarrier(distributed::kRequestGet); @@ -175,6 +177,33 @@ void ListenAndServOp::RunSyncLoop( } // while(true) } +void ListenAndServOp::ResetReceivedVars( + const std::vector &recv_varnames, framework::Scope *recv_scope, + platform::DeviceContext *dev_ctx, bool only_sparse_vars) const { + for (auto &varname : recv_varnames) { + auto var = recv_scope->FindVar(varname); + if (var == nullptr) { + VLOG(2) << "can not find var " << varname << " in received scope"; + continue; + } + if (var->IsType()) { + var->GetMutable()->mutable_rows()->clear(); + } + if (!only_sparse_vars) { + if (var->IsType()) { + math::set_constant(*dev_ctx, var->GetMutable(), + static_cast(0)); + } else if (var->IsType()) { + math::set_constant(*dev_ctx, var->GetMutable(), + static_cast(0)); + } else { + PADDLE_THROW( + "received var should be in [SelectedRows, LoDTensor, Tensor]"); + } + } + } +} + void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, framework::ProgramDesc *program, framework::Scope *recv_scope) const { @@ -258,6 +287,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, bool sync_mode = Attr("sync_mode"); auto fan_in = Attr("Fanin"); + auto inputs = Inputs("X"); PADDLE_ENFORCE(!rpc_service_); std::string endpoint = Attr("endpoint"); @@ -351,8 +381,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, // Write to a file of server selected port for python use. SavePort(); if (sync_mode) { - RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list, - checkpoint_block_id); + RunSyncLoop(&executor, program, &recv_scope, &dev_ctx, + prefetch_block_id_list, checkpoint_block_id, inputs); } else { RunAsyncLoop(&executor, program, &recv_scope); } diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 978969cc51..f84baa36eb 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { @@ -48,8 +49,10 @@ class ListenAndServOp : public framework::OperatorBase { void RunSyncLoop(framework::Executor* executor, framework::ProgramDesc* program, framework::Scope* recv_scope, + platform::DeviceContext* dev_ctx, const std::vector& prefetch_block_id_list, - const int checkpoint_point_block_id) const; + const int checkpoint_point_block_id, + const std::vector& recv_varnames) const; void RunAsyncLoop(framework::Executor* executor, framework::ProgramDesc* program, @@ -64,6 +67,11 @@ class ListenAndServOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override; + void ResetReceivedVars(const std::vector& recv_varnames, + framework::Scope* recv_scope, + platform::DeviceContext* dev_ctx, + bool only_sparse_vars = true) const; + protected: mutable std::shared_ptr rpc_service_; mutable std::shared_ptr request_send_handler_; From 580f55fa0f73c7d418e92672253708c648599710 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 10 Sep 2018 10:38:07 +0800 Subject: [PATCH 06/85] update by comment --- paddle/fluid/operators/listen_and_serv_op.cc | 9 +++++---- paddle/fluid/operators/listen_and_serv_op.h | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 1933e6a5d0..abbb3d06d1 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -167,9 +167,8 @@ void ListenAndServOp::RunSyncLoop( recv_scope); VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; - // reset received sparse vars to avoid reuse it in the next mini-batch ResetReceivedVars(recv_varnames, recv_scope, dev_ctx, - !rpc_service_->NeedResetAllVars()); + rpc_service_->NeedResetAllVars()); rpc_service_->SetCond(distributed::kRequestGet); rpc_service_->WaitBarrier(distributed::kRequestGet); @@ -179,7 +178,7 @@ void ListenAndServOp::RunSyncLoop( void ListenAndServOp::ResetReceivedVars( const std::vector &recv_varnames, framework::Scope *recv_scope, - platform::DeviceContext *dev_ctx, bool only_sparse_vars) const { + platform::DeviceContext *dev_ctx, bool reset_all) const { for (auto &varname : recv_varnames) { auto var = recv_scope->FindVar(varname); if (var == nullptr) { @@ -187,9 +186,11 @@ void ListenAndServOp::ResetReceivedVars( continue; } if (var->IsType()) { + VLOG(3) << "reset sparse var: " << varname; var->GetMutable()->mutable_rows()->clear(); } - if (!only_sparse_vars) { + if (UNLIKELY(reset_all)) { + VLOG(3) << "reset dense var: " << varname; if (var->IsType()) { math::set_constant(*dev_ctx, var->GetMutable(), static_cast(0)); diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index f84baa36eb..5102c963b9 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -70,7 +70,7 @@ class ListenAndServOp : public framework::OperatorBase { void ResetReceivedVars(const std::vector& recv_varnames, framework::Scope* recv_scope, platform::DeviceContext* dev_ctx, - bool only_sparse_vars = true) const; + bool reset_all = false) const; protected: mutable std::shared_ptr rpc_service_; From 926e1077ca07b86c42b87a418efcf07fb820e3af Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 24 Aug 2018 18:57:36 +0800 Subject: [PATCH 07/85] version --- paddle/fluid/framework/CMakeLists.txt | 4 +++- paddle/fluid/framework/framework.proto | 13 ++++++++++- paddle/fluid/framework/program_desc.cc | 8 +++++++ paddle/fluid/framework/program_desc.h | 2 ++ paddle/fluid/framework/version.cc | 28 ++++++++++++++++++++++++ paddle/fluid/framework/version.h | 30 ++++++++++++++++++++++++++ paddle/fluid/inference/io.cc | 3 +++ paddle/fluid/pybind/protobuf.cc | 5 ++++- paddle/fluid/pybind/pybind.cc | 4 ++++ python/paddle/fluid/framework.py | 3 +++ python/paddle/fluid/io.py | 5 +++++ 11 files changed, 102 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/framework/version.cc create mode 100644 paddle/fluid/framework/version.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index cc7938b2ac..8af9a8f68a 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -116,7 +116,9 @@ cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope gl endif(NOT WIN32) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) -cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog) + +cc_library(version SRCS version.cc) +cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index c658843581..8517d01cfe 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -16,6 +16,13 @@ syntax = "proto2"; option optimize_for = LITE_RUNTIME; package paddle.framework.proto; +// Any incompatible changes to ProgramDesc and its dependencies should +// raise the version defined version.h. +// +// Serailization and Deserialization codes should be modified in a way +// that supports old versions following the version and compatibility policy. +message Version { optional int64 version = 1 [ default = -1 ]; } + enum AttrType { INT = 0; FLOAT = 1; @@ -180,4 +187,8 @@ message BlockDesc { // for more details. // TODO(panyx0718): A model can have multiple programs. Need a // way to distinguish them. Maybe ID or name? -message ProgramDesc { repeated BlockDesc blocks = 1; } +message ProgramDesc { + repeated BlockDesc blocks = 1; + + optional Version version = 2; +} diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index a63944eaee..f2e0b79c4b 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/version.h" namespace paddle { namespace framework { @@ -31,6 +32,10 @@ void ProgramDesc::Flush() { for (auto &block : blocks_) { block->Flush(); } + // If not loaded, use current code version. + if (desc_.version().version() < 0) { + desc_.mutable_version()->set_version(kCurProgramVersion); + } } proto::ProgramDesc *ProgramDesc::Proto() { @@ -38,7 +43,10 @@ proto::ProgramDesc *ProgramDesc::Proto() { return &desc_; } +int ProgramDesc::Version() const { return desc_.version().version(); } + ProgramDesc::ProgramDesc() { + desc_.mutable_version()->set_version(kCurProgramVersion); auto *block = desc_.mutable_blocks()->Add(); block->set_idx(kRootBlockIndex); block->set_parent_idx(kNoneBlockIndex); diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index a0e81cade1..9cf3714b6a 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -57,6 +57,8 @@ class ProgramDesc { proto::ProgramDesc *Proto(); + int Version() const; + // The output variable of feed_op is referenced as feed_target. // This function is used to collect the output variable's name of all // feed_ops. diff --git a/paddle/fluid/framework/version.cc b/paddle/fluid/framework/version.cc new file mode 100644 index 0000000000..b0d5c26a31 --- /dev/null +++ b/paddle/fluid/framework/version.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/version.h" +#include + +namespace paddle { +namespace framework { +bool IsProgramVersionSupported(int version) { + static int num_supported = + sizeof(kSupportedProgramVersion) / sizeof(kSupportedProgramVersion[0]); + return std::find(kSupportedProgramVersion, + kSupportedProgramVersion + num_supported, + version) != kSupportedProgramVersion + num_supported; +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/version.h b/paddle/fluid/framework/version.h new file mode 100644 index 0000000000..2960ac9782 --- /dev/null +++ b/paddle/fluid/framework/version.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace paddle { +namespace framework { + +// The program version the current codes generate. +constexpr int kCurProgramVersion = 0; + +// The program version that was generated by previous or current codes +// and supported by current codes. +constexpr int kSupportedProgramVersion[] = {0}; + +bool IsProgramVersionSupported(int version); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index cef7b2a7e3..fa59cca383 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/version.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/pybind/pybind.h" @@ -124,6 +125,7 @@ std::unique_ptr Load(framework::Executor* executor, std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); + PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version())); LoadPersistables(executor, scope, *main_program, dirname, ""); return main_program; @@ -138,6 +140,7 @@ std::unique_ptr Load( std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); + PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version())); LoadPersistables(executor, scope, *main_program, "", param_filename); return main_program; diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index f21f8d23f9..67501186d1 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -137,7 +137,10 @@ void BindProgramDesc(pybind11::module *m) { PADDLE_ENFORCE(desc->ParseFromString(data), "Fail to parse ProgramDesc from string. This could " "be a bug of Paddle."); - }); + }) + .def("_version", [](pd::ProgramDesc &self) -> int64_t { + return self.Proto()->version().version(); + }); } void BindBlockDesc(pybind11::module *m) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 5b20b87174..191241de7d 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -517,6 +517,10 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_glog", framework::InitGLOG); m.def("init_devices", [](bool init_p2p) { framework::InitDevices(init_p2p); }); + m.def("_supported_version", []() { + std::vector supported_versions; + return supported_versions; + }); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_dist", IsCompiledWithDIST); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index b0e0d27ff7..8892606486 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1564,6 +1564,9 @@ class Program(object): """ return self.desc + def _version(self): + return self.desc._version() + def clone(self, for_test=False): """ Create a new, duplicated program. diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 5c4ec99c53..f72ca0a8d5 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -750,6 +750,11 @@ def load_inference_model(dirname, program_desc_str = f.read() program = Program.parse_from_string(program_desc_str) + # TODO(panyx0718): Link to our version and compatibility guide. + if program._version() != 0: + raise ValueError("Unsupported program version: %d\n" % + program._version()) + # Binary data also need versioning. load_persistables(executor, dirname, program, params_filename) if pserver_endpoints: From 56a977d4363a1af3a0944dfdafcef01636a642ee Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 27 Aug 2018 18:11:06 +0800 Subject: [PATCH 08/85] add test --- paddle/fluid/framework/CMakeLists.txt | 2 ++ paddle/fluid/framework/version_test.cc | 26 ++++++++++++++++++++++++++ paddle/fluid/pybind/pybind.cc | 3 +++ python/paddle/fluid/io.py | 2 +- 4 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/framework/version_test.cc diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 8af9a8f68a..1c9130305c 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -118,6 +118,8 @@ endif(NOT WIN32) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) cc_library(version SRCS version.cc) +cc_test(version_test SRCS version_test.cc DEPS version) + cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) diff --git a/paddle/fluid/framework/version_test.cc b/paddle/fluid/framework/version_test.cc new file mode 100644 index 0000000000..cc57f713d8 --- /dev/null +++ b/paddle/fluid/framework/version_test.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/version.h" +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { +TEST(Variable, GetMutable) { + EXPECT_TRUE(IsProgramVersionSupported(0)); + EXPECT_FALSE(IsProgramVersionSupported(1)); + EXPECT_FALSE(IsProgramVersionSupported(-1)); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 191241de7d..6d85d01477 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -33,6 +33,7 @@ limitations under the License. */ #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/version.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/platform/enforce.h" @@ -534,6 +535,8 @@ All parameter, weight, gradient are variables in Paddle. m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); + m.def("_is_program_version_supported", IsProgramVersionSupported); + BindProgramDesc(&m); BindBlockDesc(&m); BindVarDsec(&m); diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index f72ca0a8d5..3e02e14167 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -751,7 +751,7 @@ def load_inference_model(dirname, program = Program.parse_from_string(program_desc_str) # TODO(panyx0718): Link to our version and compatibility guide. - if program._version() != 0: + if not core._is_program_version_supported(program._version()): raise ValueError("Unsupported program version: %d\n" % program._version()) # Binary data also need versioning. From c69cf6dde879318ea10a7ddc0dd3dabd4d9be358 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 27 Aug 2018 20:23:28 +0800 Subject: [PATCH 09/85] fix --- paddle/fluid/framework/framework.proto | 2 +- paddle/fluid/inference/io.cc | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 8517d01cfe..460401df54 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -21,7 +21,7 @@ package paddle.framework.proto; // // Serailization and Deserialization codes should be modified in a way // that supports old versions following the version and compatibility policy. -message Version { optional int64 version = 1 [ default = -1 ]; } +message Version { optional int64 version = 1 [ default = 0 ]; } enum AttrType { INT = 0; diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index fa59cca383..1d20643ce0 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -125,7 +125,8 @@ std::unique_ptr Load(framework::Executor* executor, std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); - PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version())); + PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), + "model version %d is not supported.", main_program->Version()); LoadPersistables(executor, scope, *main_program, dirname, ""); return main_program; @@ -140,7 +141,8 @@ std::unique_ptr Load( std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); - PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version())); + PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), + "model version %d is not supported.", main_program->Version()); LoadPersistables(executor, scope, *main_program, "", param_filename); return main_program; From 4313d870a2a4e99c3a039949224fff41750b1e52 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 27 Aug 2018 20:56:38 +0800 Subject: [PATCH 10/85] refine --- paddle/fluid/framework/CMakeLists.txt | 4 ++-- paddle/fluid/framework/lod_tensor.cc | 7 +++++-- paddle/fluid/framework/version.cc | 10 +++++++++- paddle/fluid/framework/version.h | 15 ++++++++++++--- paddle/fluid/framework/version_test.cc | 6 +++++- paddle/fluid/inference/io.cc | 6 ++++-- 6 files changed, 37 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 1c9130305c..d998109df2 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -56,9 +56,9 @@ else() cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) endif() if (NOT WIN32) -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio) +cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) else() -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto) +cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) endif (NOT WIN32) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index adeb26e4e7..1e7da9a69c 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/framework/version.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" @@ -251,8 +252,8 @@ void AppendLoD(LoD *lod, const LoD &lod_length) { void SerializeToStream(std::ostream &os, const LoDTensor &tensor, const platform::DeviceContext &dev_ctx) { { // the 1st field, uint32_t version for LoDTensor - constexpr uint32_t version = 0; - os.write(reinterpret_cast(&version), sizeof(version)); + os.write(reinterpret_cast(&kCurTensorVersion), + sizeof(kCurTensorVersion)); } { // the 2st field, LoD information @@ -281,6 +282,8 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor, // the 1st field, unit32_t version for LoDTensor uint32_t version; is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE(framework::IsTensorVersionSupported(version), + "tensor version %u is not supported.", version); PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); } { diff --git a/paddle/fluid/framework/version.cc b/paddle/fluid/framework/version.cc index b0d5c26a31..3d559e26e0 100644 --- a/paddle/fluid/framework/version.cc +++ b/paddle/fluid/framework/version.cc @@ -17,12 +17,20 @@ limitations under the License. */ namespace paddle { namespace framework { -bool IsProgramVersionSupported(int version) { +bool IsProgramVersionSupported(int64_t version) { static int num_supported = sizeof(kSupportedProgramVersion) / sizeof(kSupportedProgramVersion[0]); return std::find(kSupportedProgramVersion, kSupportedProgramVersion + num_supported, version) != kSupportedProgramVersion + num_supported; } + +bool IsTensorVersionSupported(uint32_t version) { + static int num_supported = + sizeof(kSupportedTensorVersion) / sizeof(kSupportedTensorVersion[0]); + return std::find(kSupportedTensorVersion, + kSupportedTensorVersion + num_supported, + version) != kSupportedTensorVersion + num_supported; +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/version.h b/paddle/fluid/framework/version.h index 2960ac9782..bf07fc288d 100644 --- a/paddle/fluid/framework/version.h +++ b/paddle/fluid/framework/version.h @@ -12,19 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include + #pragma once namespace paddle { namespace framework { // The program version the current codes generate. -constexpr int kCurProgramVersion = 0; +constexpr int64_t kCurProgramVersion = 0; // The program version that was generated by previous or current codes // and supported by current codes. -constexpr int kSupportedProgramVersion[] = {0}; +constexpr int64_t kSupportedProgramVersion[] = {0}; + +// Due to historical reasons, tensor version use uint32_t. +constexpr uint32_t kCurTensorVersion = 0; + +constexpr uint32_t kSupportedTensorVersion[] = {0}; + +bool IsProgramVersionSupported(int64_t version); -bool IsProgramVersionSupported(int version); +bool IsTensorVersionSupported(uint32_t version); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/version_test.cc b/paddle/fluid/framework/version_test.cc index cc57f713d8..e8c5f25600 100644 --- a/paddle/fluid/framework/version_test.cc +++ b/paddle/fluid/framework/version_test.cc @@ -17,10 +17,14 @@ namespace paddle { namespace framework { -TEST(Variable, GetMutable) { +TEST(Version, Basic) { EXPECT_TRUE(IsProgramVersionSupported(0)); EXPECT_FALSE(IsProgramVersionSupported(1)); EXPECT_FALSE(IsProgramVersionSupported(-1)); + + EXPECT_TRUE(IsTensorVersionSupported(0)); + EXPECT_FALSE(IsTensorVersionSupported(1)); + EXPECT_FALSE(IsTensorVersionSupported(-1)); } } // namespace framework } // namespace paddle diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 1d20643ce0..e246a06fd0 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -126,7 +126,8 @@ std::unique_ptr Load(framework::Executor* executor, std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), - "model version %d is not supported.", main_program->Version()); + "model version %ld is not supported.", + main_program->Version()); LoadPersistables(executor, scope, *main_program, dirname, ""); return main_program; @@ -142,7 +143,8 @@ std::unique_ptr Load( std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), - "model version %d is not supported.", main_program->Version()); + "model version %ld is not supported.", + main_program->Version()); LoadPersistables(executor, scope, *main_program, "", param_filename); return main_program; From ff47eaf45f5d5cc9715aa455467f1af0edfd8872 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 28 Aug 2018 09:34:02 +0800 Subject: [PATCH 11/85] clean --- paddle/fluid/framework/program_desc.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index f2e0b79c4b..5b9073373e 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -32,10 +32,6 @@ void ProgramDesc::Flush() { for (auto &block : blocks_) { block->Flush(); } - // If not loaded, use current code version. - if (desc_.version().version() < 0) { - desc_.mutable_version()->set_version(kCurProgramVersion); - } } proto::ProgramDesc *ProgramDesc::Proto() { From 9b7c3f9615a3a35d4bde29e4f7154b45e5de7786 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 28 Aug 2018 09:42:34 +0800 Subject: [PATCH 12/85] refine --- paddle/fluid/framework/version.h | 8 ++++++++ python/paddle/fluid/io.py | 1 - 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/version.h b/paddle/fluid/framework/version.h index bf07fc288d..3a1a492701 100644 --- a/paddle/fluid/framework/version.h +++ b/paddle/fluid/framework/version.h @@ -19,6 +19,11 @@ limitations under the License. */ namespace paddle { namespace framework { +// Note: +// Program and Tensor that pass the IsXXXVersionSupported should +// be supported by the current codes. Otherwise, it's a compatibility +// bug. + // The program version the current codes generate. constexpr int64_t kCurProgramVersion = 0; @@ -27,8 +32,11 @@ constexpr int64_t kCurProgramVersion = 0; constexpr int64_t kSupportedProgramVersion[] = {0}; // Due to historical reasons, tensor version use uint32_t. +// The tensor version the current codes generate. constexpr uint32_t kCurTensorVersion = 0; +// The tensor version that was generated by previous or current codes +// and supported by current codes. constexpr uint32_t kSupportedTensorVersion[] = {0}; bool IsProgramVersionSupported(int64_t version); diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 3e02e14167..656fafa0cb 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -750,7 +750,6 @@ def load_inference_model(dirname, program_desc_str = f.read() program = Program.parse_from_string(program_desc_str) - # TODO(panyx0718): Link to our version and compatibility guide. if not core._is_program_version_supported(program._version()): raise ValueError("Unsupported program version: %d\n" % program._version()) From 0904f07d4655b82543aba0baca7aecf81a6ff98b Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 28 Aug 2018 11:15:31 +0800 Subject: [PATCH 13/85] polish --- paddle/fluid/pybind/pybind.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 6d85d01477..20fc08e21d 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -518,10 +518,6 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_glog", framework::InitGLOG); m.def("init_devices", [](bool init_p2p) { framework::InitDevices(init_p2p); }); - m.def("_supported_version", []() { - std::vector supported_versions; - return supported_versions; - }); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_dist", IsCompiledWithDIST); From e762d85de41ebc8d60a31b79c6a21c23a5afa0d5 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 10 Sep 2018 11:36:42 +0800 Subject: [PATCH 14/85] clean --- paddle/fluid/framework/program_desc.cc | 2 +- paddle/fluid/framework/program_desc.h | 2 +- paddle/fluid/framework/version.cc | 2 +- paddle/fluid/framework/version.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index 5b9073373e..589905828f 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -39,7 +39,7 @@ proto::ProgramDesc *ProgramDesc::Proto() { return &desc_; } -int ProgramDesc::Version() const { return desc_.version().version(); } +int64_t ProgramDesc::Version() const { return desc_.version().version(); } ProgramDesc::ProgramDesc() { desc_.mutable_version()->set_version(kCurProgramVersion); diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index 9cf3714b6a..2ec0e9d7a0 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -57,7 +57,7 @@ class ProgramDesc { proto::ProgramDesc *Proto(); - int Version() const; + int64_t Version() const; // The output variable of feed_op is referenced as feed_target. // This function is used to collect the output variable's name of all diff --git a/paddle/fluid/framework/version.cc b/paddle/fluid/framework/version.cc index 3d559e26e0..81c0392bf3 100644 --- a/paddle/fluid/framework/version.cc +++ b/paddle/fluid/framework/version.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/fluid/framework/version.h b/paddle/fluid/framework/version.h index 3a1a492701..9945bc58c6 100644 --- a/paddle/fluid/framework/version.h +++ b/paddle/fluid/framework/version.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From 681514e15ffbba78def454402f24d5a56f66546c Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 10 Sep 2018 12:20:08 +0800 Subject: [PATCH 15/85] Make all scope pointer to shared --- .../fast_threaded_ssa_graph_executor.cc | 3 +- .../fast_threaded_ssa_graph_executor.h | 11 ++++--- .../framework/details/fetch_op_handle.cc | 2 +- .../fluid/framework/details/fetch_op_handle.h | 4 +-- .../scope_buffered_ssa_graph_executor.cc | 3 +- .../scope_buffered_ssa_graph_executor.h | 5 +-- .../details/threaded_ssa_graph_executor.cc | 3 +- .../details/threaded_ssa_graph_executor.h | 11 ++++--- paddle/fluid/framework/parallel_executor.cc | 31 ++++++++++++------- paddle/fluid/framework/parallel_executor.h | 21 +++++++------ paddle/fluid/framework/scope.cc | 11 ++++--- paddle/fluid/framework/scope.h | 2 +- 12 files changed, 63 insertions(+), 44 deletions(-) diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 7606f2bc06..a9b89614ae 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -22,7 +22,8 @@ namespace framework { namespace details { FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( - const ExecutionStrategy &strategy, const std::vector &local_scopes, + const ExecutionStrategy &strategy, + const std::vector> &local_scopes, const std::vector &places, std::unique_ptr &&graph) : strategy_(strategy), diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index dad3a231cb..fb615d70b7 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -29,16 +29,17 @@ namespace details { class OpHandleBase; class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { public: - FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &places, - std::unique_ptr &&graph); + FastThreadedSSAGraphExecutor( + const ExecutionStrategy &strategy, + const std::vector> &local_scopes, + const std::vector &places, + std::unique_ptr &&graph); FeedFetchList Run(const std::vector &fetch_tensors) override; const ir::Graph &Graph() const override; private: ExecutionStrategy strategy_; - std::vector local_scopes_; + std::vector> local_scopes_; std::vector places_; std::unique_ptr graph_; diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index fe18b2060c..2f4aefd39d 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -22,7 +22,7 @@ namespace framework { namespace details { FetchOpHandle::FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset, - std::vector *local_scopes) + std::vector> *local_scopes) : OpHandleBase(node), data_(data), offset_(offset), diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h index 6ce42f92d7..a207e36b8a 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.h +++ b/paddle/fluid/framework/details/fetch_op_handle.h @@ -29,7 +29,7 @@ namespace details { struct FetchOpHandle : public OpHandleBase { public: FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset, - std::vector *local_scopes); + std::vector> *local_scopes); ~FetchOpHandle(); @@ -47,7 +47,7 @@ struct FetchOpHandle : public OpHandleBase { private: FeedFetchList *data_; size_t offset_; - std::vector *local_scopes_; + std::vector> *local_scopes_; std::vector tensors_; }; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 5bd974d6b7..bf5671c679 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -23,7 +23,8 @@ namespace paddle { namespace framework { namespace details { ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( - ExecutionStrategy strategy, std::vector local_scopes, + ExecutionStrategy strategy, + std::vector> local_scopes, std::vector var_infos, std::vector places, std::unique_ptr &&underlying_executor) : strategy_(std::move(strategy)), diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 5e87e0bf50..ec31755af5 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -37,7 +37,8 @@ struct VariableInfo { class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { public: ScopeBufferedSSAGraphExecutor( - ExecutionStrategy strategy, std::vector local_scopes, + ExecutionStrategy strategy, + std::vector> local_scopes, std::vector var_infos, std::vector places, std::unique_ptr&& underlying_executor); @@ -52,7 +53,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { ExecutionStrategy strategy_; std::unique_ptr underlying_executor_; - std::vector local_scopes_; + std::vector> local_scopes_; std::vector var_infos_; std::vector places_; }; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index c9e331ef35..cc6f444363 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -21,7 +21,8 @@ namespace paddle { namespace framework { namespace details { ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( - const ExecutionStrategy &strategy, const std::vector &local_scopes, + const ExecutionStrategy &strategy, + const std::vector> &local_scopes, const std::vector &places, std::unique_ptr &&graph) : graph_(std::move(graph)), diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 9135c1f5d4..2a74af6c3d 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -38,10 +38,11 @@ namespace details { class ThreadedSSAGraphExecutor : public SSAGraphExecutor { public: - ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &places, - std::unique_ptr &&graph); + ThreadedSSAGraphExecutor( + const ExecutionStrategy &strategy, + const std::vector> &local_scopes, + const std::vector &places, + std::unique_ptr &&graph); const ir::Graph &Graph() const override { return *graph_; } // Run a SSAGraph by a thread pool @@ -57,7 +58,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { private: std::unique_ptr graph_; std::unique_ptr<::ThreadPool> pool_; - std::vector local_scopes_; + std::vector> local_scopes_; std::vector places_; platform::DeviceContextPool fetch_ctxs_; ExceptionHolder exception_holder_; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 81cb24bdda..93c74deb3e 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -39,7 +39,8 @@ std::unique_ptr ApplyParallelExecutorPass( const ProgramDesc &main_program, const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶m_names, - const std::vector &local_scopes, const bool use_cuda, + const std::vector> &local_scopes, + const bool use_cuda, #ifdef PADDLE_WITH_CUDA const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) { #else @@ -66,8 +67,8 @@ std::unique_ptr ApplyParallelExecutorPass( &loss_var_name); multi_devices_pass->SetNotOwned>( "params", ¶m_names); - multi_devices_pass->SetNotOwned>("local_scopes", - &local_scopes); + multi_devices_pass->SetNotOwned>>( + "local_scopes", &local_scopes); multi_devices_pass->SetNotOwned("strategy", &strategy); #ifdef PADDLE_WITH_CUDA @@ -100,8 +101,8 @@ class ParallelExecutorPrivate { : places_(places) {} std::vector places_; - std::vector local_scopes_; - Scope *global_scope_; + std::vector> local_scopes_; + std::shared_ptr global_scope_; std::unique_ptr executor_; #ifdef PADDLE_WITH_CUDA @@ -112,7 +113,7 @@ class ParallelExecutorPrivate { bool use_all_reduce_; }; -std::vector &ParallelExecutor::GetLocalScopes() { +std::vector> &ParallelExecutor::GetLocalScopes() { return member_->local_scopes_; } @@ -121,7 +122,8 @@ ParallelExecutor::ParallelExecutor( const std::unordered_set ¶ms, const std::unordered_set &bcast_vars, const ProgramDesc &main_program, const std::string &loss_var_name, - Scope *scope, const std::vector &local_scopes, + const std::shared_ptr &scope, + const std::vector> &local_scopes, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, size_t num_trainers, size_t trainer_id) : member_(new ParallelExecutorPrivate(places)) { @@ -142,13 +144,13 @@ ParallelExecutor::ParallelExecutor( member_->own_local_scope_ = true; member_->local_scopes_.emplace_back(member_->global_scope_); for (size_t i = 1; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(&scope->NewScope()); + member_->local_scopes_.emplace_back(scope->NewSharedScope()); } } else { member_->own_local_scope_ = false; PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size()); for (size_t i = 0; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); + member_->local_scopes_.emplace_back(local_scopes[i]->NewSharedScope()); } } @@ -321,7 +323,7 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes( for (size_t i = 0; i < tensors.size(); ++i) { auto &map = tensors[i]; - auto *scope = member_->local_scopes_[i]; + auto &scope = member_->local_scopes_[i]; for (auto &pair : map) { auto *trg = scope->Var(pair.first)->GetMutable(); trg->ShareDataWith(pair.second); @@ -351,8 +353,15 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( ParallelExecutor::~ParallelExecutor() { if (member_->own_local_scope_) { + std::vector local_scopes_ptrs; + local_scopes_ptrs.reserve(member_->local_scopes_.size()); for (size_t i = 1; i < member_->local_scopes_.size(); ++i) { - member_->global_scope_->DeleteScope(member_->local_scopes_[i]); + local_scopes_ptrs.emplace_back(member_->local_scopes_[i].get()); + member_->local_scopes_[i].reset(); + } + + for (size_t i = 0; i != local_scopes_ptrs.size(); ++i) { + member_->global_scope_->DeleteScope(local_scopes_ptrs[i]); } } } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 5fb748fa20..ce1076e44b 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -39,19 +39,20 @@ class ParallelExecutor { DISABLE_COPY_AND_ASSIGN(ParallelExecutor); public: - explicit ParallelExecutor(const std::vector &places, - const std::unordered_set ¶ms, - const std::unordered_set &bcast_vars, - const ProgramDesc &main_program, - const std::string &loss_var_name, Scope *scope, - const std::vector &local_scopes, - const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy, - size_t num_trainers = 1, size_t trainer_id = 0); + explicit ParallelExecutor( + const std::vector &places, + const std::unordered_set ¶ms, + const std::unordered_set &bcast_vars, + const ProgramDesc &main_program, const std::string &loss_var_name, + const std::shared_ptr &scope, + const std::vector> &local_scopes, + const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy, size_t num_trainers = 1, + size_t trainer_id = 0); ~ParallelExecutor(); - std::vector &GetLocalScopes(); + std::vector> &GetLocalScopes(); /** * Feed tensors to local scopes. The size of tensors should be equal to the diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 50f374e370..fa6bf4429d 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -38,8 +38,8 @@ Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { std::unique_lock lock(mutex_); - kids_.push_back(new Scope(this)); - return *kids_.back(); + kids_.push_back(std::shared_ptr(new Scope(this))); + return kids_.back().get(); } Variable* Scope::Var(const std::string& name) { @@ -68,7 +68,6 @@ const Scope* Scope::FindScope(const Variable* var) const { void Scope::DropKids() { std::unique_lock lock(mutex_); - for (Scope* s : kids_) delete s; kids_.clear(); } @@ -84,8 +83,12 @@ std::vector Scope::LocalVarNames() const { void Scope::DeleteScope(Scope* scope) const { std::unique_lock lock(mutex_); - auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); + auto it = std::find_if(this->kids_.begin(), this->kids_.end(), + [&scope](const std::shared_ptr& kid) { + return kid.get() == scope; + }); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); + it->reset(); this->kids_.erase(it); // When making memory benchmark on Fluid, we have to delete scope sync. if (FLAGS_benchmark || FLAGS_eager_delete_scope) { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index e246241c0a..0ba5d34798 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -105,7 +105,7 @@ class Scope { Variable* FindVarLocally(const std::string& name) const; // Scope in `kids_` are owned by this class. - mutable std::list kids_; + mutable std::list> kids_; Scope const* parent_{nullptr}; DISABLE_COPY_AND_ASSIGN(Scope); From dc863aac7edeccbe8362d625b2c1e6eeca885000 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 10 Sep 2018 14:29:19 +0800 Subject: [PATCH 16/85] Add kids exists detection in Scope --- .../fast_threaded_ssa_graph_executor.cc | 3 +- .../fast_threaded_ssa_graph_executor.h | 11 +++--- .../framework/details/fetch_op_handle.cc | 2 +- .../fluid/framework/details/fetch_op_handle.h | 4 +-- .../scope_buffered_ssa_graph_executor.cc | 3 +- .../scope_buffered_ssa_graph_executor.h | 5 ++- .../details/threaded_ssa_graph_executor.cc | 3 +- .../details/threaded_ssa_graph_executor.h | 11 +++--- paddle/fluid/framework/parallel_executor.cc | 34 ++++++++----------- paddle/fluid/framework/parallel_executor.h | 21 ++++++------ paddle/fluid/framework/scope.cc | 17 ++++++---- paddle/fluid/framework/scope.h | 5 ++- .../test_image_classification_resnet.py | 5 +-- .../test_image_classification_vgg.py | 5 +-- .../test_recognize_digits_conv.py | 5 +-- .../test_recognize_digits_mlp.py | 5 +-- 16 files changed, 60 insertions(+), 79 deletions(-) diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index a9b89614ae..7606f2bc06 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -22,8 +22,7 @@ namespace framework { namespace details { FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( - const ExecutionStrategy &strategy, - const std::vector> &local_scopes, + const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, std::unique_ptr &&graph) : strategy_(strategy), diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index fb615d70b7..dad3a231cb 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -29,17 +29,16 @@ namespace details { class OpHandleBase; class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { public: - FastThreadedSSAGraphExecutor( - const ExecutionStrategy &strategy, - const std::vector> &local_scopes, - const std::vector &places, - std::unique_ptr &&graph); + FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, + const std::vector &local_scopes, + const std::vector &places, + std::unique_ptr &&graph); FeedFetchList Run(const std::vector &fetch_tensors) override; const ir::Graph &Graph() const override; private: ExecutionStrategy strategy_; - std::vector> local_scopes_; + std::vector local_scopes_; std::vector places_; std::unique_ptr graph_; diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 2f4aefd39d..fe18b2060c 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -22,7 +22,7 @@ namespace framework { namespace details { FetchOpHandle::FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset, - std::vector> *local_scopes) + std::vector *local_scopes) : OpHandleBase(node), data_(data), offset_(offset), diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h index a207e36b8a..6ce42f92d7 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.h +++ b/paddle/fluid/framework/details/fetch_op_handle.h @@ -29,7 +29,7 @@ namespace details { struct FetchOpHandle : public OpHandleBase { public: FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset, - std::vector> *local_scopes); + std::vector *local_scopes); ~FetchOpHandle(); @@ -47,7 +47,7 @@ struct FetchOpHandle : public OpHandleBase { private: FeedFetchList *data_; size_t offset_; - std::vector> *local_scopes_; + std::vector *local_scopes_; std::vector tensors_; }; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index bf5671c679..5bd974d6b7 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -23,8 +23,7 @@ namespace paddle { namespace framework { namespace details { ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( - ExecutionStrategy strategy, - std::vector> local_scopes, + ExecutionStrategy strategy, std::vector local_scopes, std::vector var_infos, std::vector places, std::unique_ptr &&underlying_executor) : strategy_(std::move(strategy)), diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index ec31755af5..5e87e0bf50 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -37,8 +37,7 @@ struct VariableInfo { class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { public: ScopeBufferedSSAGraphExecutor( - ExecutionStrategy strategy, - std::vector> local_scopes, + ExecutionStrategy strategy, std::vector local_scopes, std::vector var_infos, std::vector places, std::unique_ptr&& underlying_executor); @@ -53,7 +52,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { ExecutionStrategy strategy_; std::unique_ptr underlying_executor_; - std::vector> local_scopes_; + std::vector local_scopes_; std::vector var_infos_; std::vector places_; }; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index cc6f444363..c9e331ef35 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -21,8 +21,7 @@ namespace paddle { namespace framework { namespace details { ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( - const ExecutionStrategy &strategy, - const std::vector> &local_scopes, + const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, std::unique_ptr &&graph) : graph_(std::move(graph)), diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 2a74af6c3d..9135c1f5d4 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -38,11 +38,10 @@ namespace details { class ThreadedSSAGraphExecutor : public SSAGraphExecutor { public: - ThreadedSSAGraphExecutor( - const ExecutionStrategy &strategy, - const std::vector> &local_scopes, - const std::vector &places, - std::unique_ptr &&graph); + ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, + const std::vector &local_scopes, + const std::vector &places, + std::unique_ptr &&graph); const ir::Graph &Graph() const override { return *graph_; } // Run a SSAGraph by a thread pool @@ -58,7 +57,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { private: std::unique_ptr graph_; std::unique_ptr<::ThreadPool> pool_; - std::vector> local_scopes_; + std::vector local_scopes_; std::vector places_; platform::DeviceContextPool fetch_ctxs_; ExceptionHolder exception_holder_; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 93c74deb3e..5b8c75a93d 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -39,8 +39,7 @@ std::unique_ptr ApplyParallelExecutorPass( const ProgramDesc &main_program, const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶m_names, - const std::vector> &local_scopes, - const bool use_cuda, + const std::vector &local_scopes, const bool use_cuda, #ifdef PADDLE_WITH_CUDA const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) { #else @@ -67,8 +66,8 @@ std::unique_ptr ApplyParallelExecutorPass( &loss_var_name); multi_devices_pass->SetNotOwned>( "params", ¶m_names); - multi_devices_pass->SetNotOwned>>( - "local_scopes", &local_scopes); + multi_devices_pass->SetNotOwned>("local_scopes", + &local_scopes); multi_devices_pass->SetNotOwned("strategy", &strategy); #ifdef PADDLE_WITH_CUDA @@ -101,8 +100,8 @@ class ParallelExecutorPrivate { : places_(places) {} std::vector places_; - std::vector> local_scopes_; - std::shared_ptr global_scope_; + std::vector local_scopes_; + Scope *global_scope_; std::unique_ptr executor_; #ifdef PADDLE_WITH_CUDA @@ -113,7 +112,7 @@ class ParallelExecutorPrivate { bool use_all_reduce_; }; -std::vector> &ParallelExecutor::GetLocalScopes() { +std::vector &ParallelExecutor::GetLocalScopes() { return member_->local_scopes_; } @@ -122,8 +121,7 @@ ParallelExecutor::ParallelExecutor( const std::unordered_set ¶ms, const std::unordered_set &bcast_vars, const ProgramDesc &main_program, const std::string &loss_var_name, - const std::shared_ptr &scope, - const std::vector> &local_scopes, + Scope *scope, const std::vector &local_scopes, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, size_t num_trainers, size_t trainer_id) : member_(new ParallelExecutorPrivate(places)) { @@ -144,13 +142,13 @@ ParallelExecutor::ParallelExecutor( member_->own_local_scope_ = true; member_->local_scopes_.emplace_back(member_->global_scope_); for (size_t i = 1; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(scope->NewSharedScope()); + member_->local_scopes_.emplace_back(&scope->NewScope()); } } else { member_->own_local_scope_ = false; PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size()); for (size_t i = 0; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(local_scopes[i]->NewSharedScope()); + member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); } } @@ -323,7 +321,7 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes( for (size_t i = 0; i < tensors.size(); ++i) { auto &map = tensors[i]; - auto &scope = member_->local_scopes_[i]; + auto *scope = member_->local_scopes_[i]; for (auto &pair : map) { auto *trg = scope->Var(pair.first)->GetMutable(); trg->ShareDataWith(pair.second); @@ -353,15 +351,11 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( ParallelExecutor::~ParallelExecutor() { if (member_->own_local_scope_) { - std::vector local_scopes_ptrs; - local_scopes_ptrs.reserve(member_->local_scopes_.size()); for (size_t i = 1; i < member_->local_scopes_.size(); ++i) { - local_scopes_ptrs.emplace_back(member_->local_scopes_[i].get()); - member_->local_scopes_[i].reset(); - } - - for (size_t i = 0; i != local_scopes_ptrs.size(); ++i) { - member_->global_scope_->DeleteScope(local_scopes_ptrs[i]); + Scope *local_scope = member_->local_scopes_[i]; + if (member_->global_scope_->HasKid(local_scope)) { + member_->global_scope_->DeleteScope(local_scope); + } } } } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index ce1076e44b..5fb748fa20 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -39,20 +39,19 @@ class ParallelExecutor { DISABLE_COPY_AND_ASSIGN(ParallelExecutor); public: - explicit ParallelExecutor( - const std::vector &places, - const std::unordered_set ¶ms, - const std::unordered_set &bcast_vars, - const ProgramDesc &main_program, const std::string &loss_var_name, - const std::shared_ptr &scope, - const std::vector> &local_scopes, - const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy, size_t num_trainers = 1, - size_t trainer_id = 0); + explicit ParallelExecutor(const std::vector &places, + const std::unordered_set ¶ms, + const std::unordered_set &bcast_vars, + const ProgramDesc &main_program, + const std::string &loss_var_name, Scope *scope, + const std::vector &local_scopes, + const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy, + size_t num_trainers = 1, size_t trainer_id = 0); ~ParallelExecutor(); - std::vector> &GetLocalScopes(); + std::vector &GetLocalScopes(); /** * Feed tensors to local scopes. The size of tensors should be equal to the diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index fa6bf4429d..2be655b89a 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -38,8 +38,8 @@ Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { std::unique_lock lock(mutex_); - kids_.push_back(std::shared_ptr(new Scope(this))); - return kids_.back().get(); + kids_.push_back(new Scope(this)); + return *kids_.back(); } Variable* Scope::Var(const std::string& name) { @@ -68,9 +68,16 @@ const Scope* Scope::FindScope(const Variable* var) const { void Scope::DropKids() { std::unique_lock lock(mutex_); + for (Scope* s : kids_) delete s; kids_.clear(); } +bool Scope::HasKid(const Scope* scope) const { + std::unique_lock lock(mutex_); + auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); + return it != this->kids_.end(); +} + std::vector Scope::LocalVarNames() const { std::unique_lock lock(mutex_); std::vector known_vars; @@ -83,12 +90,8 @@ std::vector Scope::LocalVarNames() const { void Scope::DeleteScope(Scope* scope) const { std::unique_lock lock(mutex_); - auto it = std::find_if(this->kids_.begin(), this->kids_.end(), - [&scope](const std::shared_ptr& kid) { - return kid.get() == scope; - }); + auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); - it->reset(); this->kids_.erase(it); // When making memory benchmark on Fluid, we have to delete scope sync. if (FLAGS_benchmark || FLAGS_eager_delete_scope) { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 0ba5d34798..b6165a595d 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -71,6 +71,9 @@ class Scope { /// Drop all kids scopes belonged to this scope. void DropKids(); + /// Find if a scope exists in the kid scopes + bool HasKid(const Scope* scope) const; + // enumerate all the variables current contains. std::vector LocalVarNames() const; @@ -105,7 +108,7 @@ class Scope { Variable* FindVarLocally(const std::string& name) const; // Scope in `kids_` are owned by this class. - mutable std::list> kids_; + mutable std::list kids_; Scope const* parent_{nullptr}; DISABLE_COPY_AND_ASSIGN(Scope); diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py index e5ae95e2d9..de276755bb 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py @@ -178,7 +178,4 @@ if __name__ == '__main__': for parallel in (False, True): if use_cuda and not core.is_compiled_with_cuda(): continue - # TODO(minqiyang): remove this line after fixing the deletion - # order problem of Scope in ParallelExecutor in manylinux - if six.PY2: - main(use_cuda=use_cuda, parallel=parallel) + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py index ff91be72c9..dd547f3448 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py @@ -152,7 +152,4 @@ if __name__ == '__main__': for parallel in (False, True): if use_cuda and not core.is_compiled_with_cuda(): continue - # TODO(minqiyang): remove this line after fixing the deletion - # order problem of Scope in ParallelExecutor in manylinux - if six.PY2: - main(use_cuda=use_cuda, parallel=parallel) + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py index fa72c939e5..973308498b 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py @@ -155,7 +155,4 @@ if __name__ == '__main__': for parallel in (False, True): if use_cuda and not core.is_compiled_with_cuda(): continue - # TODO(minqiyang): remove this line after fixing the deletion - # order problem of Scope in ParallelExecutor in manylinux - if six.PY2: - main(use_cuda=use_cuda, parallel=parallel) + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py index 440d2a3083..cb4aeb430e 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py @@ -137,7 +137,4 @@ if __name__ == '__main__': for parallel in (False, True): if use_cuda and not core.is_compiled_with_cuda(): continue - # TODO(minqiyang): remove this line after fixing the deletion - # order problem of Scope in ParallelExecutor in manylinux - if six.PY2: - main(use_cuda=use_cuda, parallel=parallel) + main(use_cuda=use_cuda, parallel=parallel) From 32b94a7d13233aba6f077dac43071e54f43fd489 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 10 Sep 2018 15:09:47 +0800 Subject: [PATCH 17/85] cache var types --- paddle/fluid/operators/listen_and_serv_op.cc | 56 +++++++++++++++----- paddle/fluid/operators/listen_and_serv_op.h | 11 ++-- 2 files changed, 50 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index abbb3d06d1..966d78b841 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -104,8 +104,7 @@ void ListenAndServOp::RunSyncLoop( framework::Executor *executor, framework::ProgramDesc *program, framework::Scope *recv_scope, platform::DeviceContext *dev_ctx, const std::vector &prefetch_block_id_list, - const int checkpoint_point_block_id, - const std::vector &recv_varnames) const { + const int checkpoint_point_block_id) const { VLOG(2) << "RunSyncLoop"; size_t num_blocks = program->Size(); auto optimize_blocks = @@ -130,6 +129,7 @@ void ListenAndServOp::RunSyncLoop( rpc_service_->SetCond(distributed::kRequestGet); rpc_service_->WaitBarrier(distributed::kRequestGet); rpc_service_->ResetBarrierCounter(); + while (true) { rpc_service_->Profiler().OneStep(); // Get from multiple trainers, we don't care about the order in which @@ -167,8 +167,7 @@ void ListenAndServOp::RunSyncLoop( recv_scope); VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; - ResetReceivedVars(recv_varnames, recv_scope, dev_ctx, - rpc_service_->NeedResetAllVars()); + ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars()); rpc_service_->SetCond(distributed::kRequestGet); rpc_service_->WaitBarrier(distributed::kRequestGet); @@ -176,10 +175,10 @@ void ListenAndServOp::RunSyncLoop( } // while(true) } -void ListenAndServOp::ResetReceivedVars( - const std::vector &recv_varnames, framework::Scope *recv_scope, - platform::DeviceContext *dev_ctx, bool reset_all) const { - for (auto &varname : recv_varnames) { +void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope, + platform::DeviceContext *dev_ctx, + bool reset_all) const { + for (auto &varname : sparse_vars_) { auto var = recv_scope->FindVar(varname); if (var == nullptr) { VLOG(2) << "can not find var " << varname << " in received scope"; @@ -188,9 +187,17 @@ void ListenAndServOp::ResetReceivedVars( if (var->IsType()) { VLOG(3) << "reset sparse var: " << varname; var->GetMutable()->mutable_rows()->clear(); + } else { + PADDLE_THROW("The type of sparse var should be SelectedRows"); } - if (UNLIKELY(reset_all)) { - VLOG(3) << "reset dense var: " << varname; + } + if (UNLIKELY(reset_all)) { + for (auto &varname : dense_vars_) { + auto var = recv_scope->FindVar(varname); + if (var == nullptr) { + VLOG(2) << "can not find var " << varname << " in received scope"; + continue; + } if (var->IsType()) { math::set_constant(*dev_ctx, var->GetMutable(), static_cast(0)); @@ -198,8 +205,7 @@ void ListenAndServOp::ResetReceivedVars( math::set_constant(*dev_ctx, var->GetMutable(), static_cast(0)); } else { - PADDLE_THROW( - "received var should be in [SelectedRows, LoDTensor, Tensor]"); + PADDLE_THROW("The type of dense var should be in [LoDTensor, Tensor]"); } } } @@ -278,6 +284,25 @@ static void FillRequestCtx( h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx); } +void ListenAndServOp::CacheVarsType(const std::vector &varnames, + const framework::Scope &scope) const { + for (const auto &varname : varnames) { + auto var = scope.FindVar(varname); + PADDLE_ENFORCE(var != nullptr, + "Received var should be initialized in the received scope."); + if (var->IsType()) { + sparse_vars_.push_back(varname); + } else if (var->IsType() || + var->IsType()) { + dense_vars_.push_back(varname); + } else { + PADDLE_THROW( + "The type of received var should be in [SelectedRows, LoDTensor, " + "Tensor]."); + } + } +} + void ListenAndServOp::RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const { // Mark this as PS that it should decide profiling by listening from trainer. @@ -379,11 +404,16 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, signal(SIGINT, SignalHandler::StopAndExit); signal(SIGTERM, SignalHandler::StopAndExit); + // Cache the type of the received vars as `sparse_vars_` and `dense_vars_` + // so that we can reset them at the end of each iteration. + // NOTE: only used in sync update + CacheVarsType(inputs, recv_scope); + // Write to a file of server selected port for python use. SavePort(); if (sync_mode) { RunSyncLoop(&executor, program, &recv_scope, &dev_ctx, - prefetch_block_id_list, checkpoint_block_id, inputs); + prefetch_block_id_list, checkpoint_block_id); } else { RunAsyncLoop(&executor, program, &recv_scope); } diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 5102c963b9..5f889793ab 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -51,8 +51,7 @@ class ListenAndServOp : public framework::OperatorBase { framework::Scope* recv_scope, platform::DeviceContext* dev_ctx, const std::vector& prefetch_block_id_list, - const int checkpoint_point_block_id, - const std::vector& recv_varnames) const; + const int checkpoint_point_block_id) const; void RunAsyncLoop(framework::Executor* executor, framework::ProgramDesc* program, @@ -67,11 +66,13 @@ class ListenAndServOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override; - void ResetReceivedVars(const std::vector& recv_varnames, - framework::Scope* recv_scope, + void ResetReceivedVars(framework::Scope* recv_scope, platform::DeviceContext* dev_ctx, bool reset_all = false) const; + void CacheVarsType(const std::vector& varnames, + const framework::Scope& scope) const; + protected: mutable std::shared_ptr rpc_service_; mutable std::shared_ptr request_send_handler_; @@ -82,6 +83,8 @@ class ListenAndServOp : public framework::OperatorBase { request_checkpoint_handler_; mutable std::shared_ptr server_thread_; + mutable std::vector sparse_vars_; + mutable std::vector dense_vars_; }; class SignalHandler { From 2fd1bf2ea6b6e0d27fb49461dd2b35c8e2a2b13b Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Mon, 10 Sep 2018 17:31:06 +0800 Subject: [PATCH 18/85] fea/add color log (#13305) --- paddle/fluid/framework/ir/CMakeLists.txt | 2 +- .../framework/ir/graph_pattern_detector.cc | 7 +- .../fluid/inference/analysis/CMakeLists.txt | 2 +- .../inference/analysis/ir_pass_manager.cc | 6 +- .../fluid/inference/analysis/pass_manager.cc | 8 ++- paddle/fluid/string/CMakeLists.txt | 2 + paddle/fluid/string/pretty_log.cc | 22 ++++++ paddle/fluid/string/pretty_log.h | 70 +++++++++++++++++++ 8 files changed, 112 insertions(+), 7 deletions(-) create mode 100644 paddle/fluid/string/pretty_log.cc create mode 100644 paddle/fluid/string/pretty_log.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 78387c4073..ce3ebed00b 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -19,7 +19,7 @@ function(pass_library TARGET DEST) endfunction() cc_library(node SRCS node.cc DEPS proto_desc) -cc_library(graph SRCS graph.cc DEPS node) +cc_library(graph SRCS graph.cc DEPS node pretty_log) cc_library(graph_helper SRCS graph_helper.cc DEPS graph) cc_library(pass SRCS pass.cc DEPS graph node graph_helper) cc_library(graph_traits SRCS graph_traits.cc DEPS graph) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index fc7feca567..5825a129b7 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -21,12 +21,17 @@ #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/pretty_log.h" #include "paddle/fluid/string/printf.h" namespace paddle { namespace framework { namespace ir { +using string::PrettyLogEndl; +using string::PrettyLog; +using string::Style; + size_t PDPattern::id_ = 0UL; PDNode* PDPattern::NewNode(const std::string& name) { @@ -83,7 +88,7 @@ void GraphPatternDetector::operator()(Graph* graph, ValidateByNodeRole(&subgraphs); if (subgraphs.empty()) return; - LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern"; + PrettyLogEndl(Style::detail(), "--- detect %d subgraphs", subgraphs.size()); int id = 0; for (auto& g : subgraphs) { VLOG(3) << "optimizing #" << id++ << " subgraph"; diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 11a7509feb..fecce9c225 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) set(analysis_deps - framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor) + framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log) cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc analyzer.cc diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index ea0f2241d7..30c1e8e93d 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -17,10 +17,14 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace inference { namespace analysis { +using string::PrettyLogEndl; +using string::PrettyLog; +using string::Style; IRPassManager::IRPassManager(const ProgramDesc &program, framework::Scope *scope) @@ -34,7 +38,7 @@ void IRPassManager::Apply(const std::vector &passes) { // Apply all the passes std::string pre_pass; for (const std::string &pass_name : passes) { - LOG(WARNING) << "Running IR pass [" << pass_name << "]"; + PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass_name); auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); if (pass_name == "graph_viz_pass") { std::string dot_file_path = diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc index 759b2b96a1..a6ac0ee49f 100644 --- a/paddle/fluid/inference/analysis/pass_manager.cc +++ b/paddle/fluid/inference/analysis/pass_manager.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace inference { @@ -22,7 +23,7 @@ namespace analysis { bool PassManager::Initialize(Argument* argument) { argument_ = argument; for (auto& pass : data_) { - LOG(WARNING) << "Initializing pass [" << pass->repr() << "]"; + VLOG(3) << "Initializing pass [" << pass->repr() << "]"; if (!pass->Initialize(argument)) { LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]"; return false; @@ -33,9 +34,10 @@ bool PassManager::Initialize(Argument* argument) { void DfgPassManager::RunAll() { PADDLE_ENFORCE(argument_); - LOG(INFO) << "Total " << data_.size() << " Analysys passes"; + VLOG(3) << "Total " << data_.size() << " Analysys passes"; for (auto& pass : data_) { - LOG(WARNING) << "Running Analysis pass [" << pass->repr() << "]"; + string::PrettyLogEndl(string::Style::H1(), "* Running Analysis pass [%s]", + pass->repr()); pass->Run(argument_->main_dfg.get()); } } diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt index 1fe7f42ca1..719411bf66 100644 --- a/paddle/fluid/string/CMakeLists.txt +++ b/paddle/fluid/string/CMakeLists.txt @@ -1,4 +1,6 @@ cc_library(stringpiece SRCS piece.cc) +cc_library(pretty_log SRCS pretty_log.cc) +cc_test(test_pretty_log SRCS pretty_log.cc) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) cc_test(to_string_test SRCS to_string_test.cc) diff --git a/paddle/fluid/string/pretty_log.cc b/paddle/fluid/string/pretty_log.cc new file mode 100644 index 0000000000..4534fdc58b --- /dev/null +++ b/paddle/fluid/string/pretty_log.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/string/pretty_log.h" +#include + +DEFINE_bool(color, true, "Whether to turn on pretty log"); + +namespace paddle { +namespace string {} // namespace string +} // namespace paddle diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h new file mode 100644 index 0000000000..a3b4e38f45 --- /dev/null +++ b/paddle/fluid/string/pretty_log.h @@ -0,0 +1,70 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/string/printf.h" + +DECLARE_bool(color); + +namespace paddle { + +namespace string { + +inline std::string black() { return FLAGS_color ? "\e[30m" : ""; } +inline std::string red() { return FLAGS_color ? "\e[31m" : ""; } +inline std::string b_red() { return FLAGS_color ? "\e[41m" : ""; } +inline std::string green() { return FLAGS_color ? "\e[32m" : ""; } +inline std::string yellow() { return FLAGS_color ? "\e[33m" : ""; } +inline std::string blue() { return FLAGS_color ? "\e[34m" : ""; } +inline std::string purple() { return FLAGS_color ? "\e[35m" : ""; } +inline std::string cyan() { return FLAGS_color ? "\e[36m" : ""; } +inline std::string light_gray() { return FLAGS_color ? "\e[37m" : ""; } +inline std::string white() { return FLAGS_color ? "\e[37m" : ""; } +inline std::string light_red() { return FLAGS_color ? "\e[91m" : ""; } +inline std::string dim() { return FLAGS_color ? "\e[2m" : ""; } +inline std::string bold() { return FLAGS_color ? "\e[1m" : ""; } +inline std::string underline() { return FLAGS_color ? "\e[4m" : ""; } +inline std::string blink() { return FLAGS_color ? "\e[5m" : ""; } +inline std::string reset() { return FLAGS_color ? "\e[0m" : ""; } + +using TextBlock = std::pair; + +struct Style { + static std::string info() { return black(); } + static std::string warn() { return b_red(); } + static std::string suc() { return green(); } + static std::string H1() { return bold() + purple(); } + static std::string H2() { return green(); } + static std::string H3() { return green(); } + static std::string detail() { return light_gray(); } +}; + +template +static void PrettyLogEndl(const std::string& style, const char* fmt, + const Args&... args) { + std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl; +} +template +static void PrettyLog(const std::string& style, const char* fmt, + const Args&... args) { + std::cerr << style << Sprintf(fmt, args...) << reset(); +} + +} // namespace string +} // namespace paddle From b720b3a58f7e5150c2d1c070a80132b349a612cb Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 10 Sep 2018 17:52:03 +0800 Subject: [PATCH 19/85] fix fluid benchmark script --- benchmark/fluid/fluid_benchmark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 11bd75e1d0..25622ee06c 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -91,7 +91,8 @@ def dist_transpile(trainer_id, args, train_prog, startup_prog): program=train_prog, pservers=pserver_endpoints, trainers=trainers, - sync_mode=not args.async_mode) + sync_mode=not args.async_mode, + startup_program=startup_prog) if training_role == "PSERVER": pserver_program = t.get_pserver_program(current_endpoint) pserver_startup_program = t.get_startup_program( From 83af1b3b3e7933c95398365aaec15f1bff0cc7f4 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Mon, 10 Sep 2018 18:52:15 +0800 Subject: [PATCH 20/85] move analyzer_rnn1_test out of analyzer_test --- .../fluid/inference/analysis/CMakeLists.txt | 8 +- .../analysis/analyzer_rnn1_tester.cc | 306 ++++++++++++++++++ .../inference/analysis/analyzer_tester.cc | 282 +--------------- 3 files changed, 314 insertions(+), 282 deletions(-) create mode 100644 paddle/fluid/inference/analysis/analyzer_rnn1_tester.cc diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 11a7509feb..699e16ad97 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -35,11 +35,15 @@ function (inference_analysis_test TARGET) cc_test(${TARGET} SRCS "${analysis_test_SRCS}" DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS} - ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS}) + ARGS ${mem_opt} ${analysis_test_ARGS}) set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) endif(WITH_TESTING) endfunction(inference_analysis_test) +inference_analysis_test(test_analyzer SRCS analyzer_tester.cc + EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor + ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model) + function (inference_download_and_uncompress install_dir url gz_filename) message(STATUS "Download inference test stuff ${gz_filename} from ${url}") execute_process(COMMAND bash -c "mkdir -p ${install_dir}") @@ -56,7 +60,7 @@ if (NOT EXISTS ${RNN1_INSTALL_DIR} AND WITH_TESTING) inference_download_and_uncompress(${RNN1_INSTALL_DIR} ${RNN1_DATA_URL} "rnn1%2Fdata.txt.tar.gz") endif() -inference_analysis_test(test_analyzer SRCS analyzer_tester.cc +inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor ARGS --infer_model=${RNN1_INSTALL_DIR}/model --infer_data=${RNN1_INSTALL_DIR}/data.txt) diff --git a/paddle/fluid/inference/analysis/analyzer_rnn1_tester.cc b/paddle/fluid/inference/analysis/analyzer_rnn1_tester.cc new file mode 100644 index 0000000000..b8ac468b4e --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer_rnn1_tester.cc @@ -0,0 +1,306 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/analyzer.h" + +#include +#include +#include // NOLINT +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" + +DEFINE_string(infer_model, "", "model path"); +DEFINE_string(infer_data, "", "data path"); +DEFINE_int32(batch_size, 10, "batch size."); +DEFINE_int32(repeat, 1, "Running the inference program repeat times."); +DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); + +namespace paddle { +namespace inference { + +using namespace framework; // NOLINT + +struct DataRecord { + std::vector>> link_step_data_all; + std::vector> week_data_all, minute_data_all; + std::vector lod1, lod2, lod3; + std::vector> rnn_link_data, rnn_week_datas, + rnn_minute_datas; + size_t batch_iter{0}; + size_t batch_size{1}; + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. + if (batch_end <= link_step_data_all.size()) { + data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter, + link_step_data_all.begin() + batch_end); + data.week_data_all.assign(week_data_all.begin() + batch_iter, + week_data_all.begin() + batch_end); + data.minute_data_all.assign(minute_data_all.begin() + batch_iter, + minute_data_all.begin() + batch_end); + // Prepare LoDs + data.lod1.push_back(0); + data.lod2.push_back(0); + data.lod3.push_back(0); + CHECK(!data.link_step_data_all.empty()) << "empty"; + CHECK(!data.week_data_all.empty()); + CHECK(!data.minute_data_all.empty()); + CHECK_EQ(data.link_step_data_all.size(), data.week_data_all.size()); + CHECK_EQ(data.minute_data_all.size(), data.link_step_data_all.size()); + for (size_t j = 0; j < data.link_step_data_all.size(); j++) { + for (const auto &d : data.link_step_data_all[j]) { + data.rnn_link_data.push_back(d); + } + data.rnn_week_datas.push_back(data.week_data_all[j]); + data.rnn_minute_datas.push_back(data.minute_data_all[j]); + // calculate lod + data.lod1.push_back(data.lod1.back() + + data.link_step_data_all[j].size()); + data.lod3.push_back(data.lod3.back() + 1); + for (size_t i = 1; i < data.link_step_data_all[j].size() + 1; i++) { + data.lod2.push_back(data.lod2.back() + + data.link_step_data_all[j].size()); + } + } + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, ':', &data); + std::vector> link_step_data; + std::vector link_datas; + split(data[0], '|', &link_datas); + for (auto &step_data : link_datas) { + std::vector tmp; + split_to_float(step_data, ',', &tmp); + link_step_data.push_back(tmp); + } + // load week data + std::vector week_data; + split_to_float(data[2], ',', &week_data); + // load minute data + std::vector minute_data; + split_to_float(data[1], ',', &minute_data); + link_step_data_all.push_back(std::move(link_step_data)); + week_data_all.push_back(std::move(week_data)); + minute_data_all.push_back(std::move(minute_data)); + } + } +}; +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor, + week_tensor, minute_tensor; + lod_attention_tensor.name = "data_lod_attention"; + init_zero_tensor.name = "cell_init"; + lod_tensor_tensor.name = "data"; + week_tensor.name = "week"; + minute_tensor.name = "minute"; + auto one_batch = data->NextBatch(); + std::vector rnn_link_data_shape( + {static_cast(one_batch.rnn_link_data.size()), + static_cast(one_batch.rnn_link_data.front().size())}); + lod_attention_tensor.shape.assign({1, 2}); + lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2}); + init_zero_tensor.shape.assign({batch_size, 15}); + init_zero_tensor.lod.assign({one_batch.lod3}); + lod_tensor_tensor.shape = rnn_link_data_shape; + lod_tensor_tensor.lod.assign({one_batch.lod1}); + // clang-format off + week_tensor.shape.assign( + {static_cast(one_batch.rnn_week_datas.size()), + static_cast(one_batch.rnn_week_datas.front().size())}); + week_tensor.lod.assign({one_batch.lod3}); + minute_tensor.shape.assign( + {static_cast(one_batch.rnn_minute_datas.size()), + static_cast(one_batch.rnn_minute_datas.front().size())}); + minute_tensor.lod.assign({one_batch.lod3}); + // clang-format on + // assign data + TensorAssignData(&lod_attention_tensor, + std::vector>({{0, 0}})); + std::vector tmp_zeros(batch_size * 15, 0.); + TensorAssignData(&init_zero_tensor, {tmp_zeros}); + TensorAssignData(&lod_tensor_tensor, one_batch.rnn_link_data); + TensorAssignData(&week_tensor, one_batch.rnn_week_datas); + TensorAssignData(&minute_tensor, one_batch.rnn_minute_datas); + // Set inputs. + auto init_zero_tensor1 = init_zero_tensor; + init_zero_tensor1.name = "hidden_init"; + input_slots->assign({week_tensor, init_zero_tensor, minute_tensor, + init_zero_tensor1, lod_attention_tensor, + lod_tensor_tensor}); + for (auto &tensor : *input_slots) { + tensor.dtype = PaddleDType::FLOAT32; + } +} + +void CompareResult(const std::vector &outputs, + const std::vector &base_outputs) { + PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size()); + for (size_t i = 0; i < outputs.size(); i++) { + auto &out = outputs[i]; + auto &base_out = base_outputs[i]; + size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, + [](int a, int b) { return a * b; }); + size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(), + 1, [](int a, int b) { return a * b; }); + PADDLE_ENFORCE_EQ(size, size1); + PADDLE_ENFORCE_GT(size, 0); + float *data = static_cast(out.data.data()); + float *base_data = static_cast(base_out.data.data()); + for (size_t i = 0; i < size; i++) { + EXPECT_NEAR(data[i], base_data[i], 1e-3); + } + } +} +// Test with a really complicate model. +void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { + AnalysisConfig config; + config.prog_file = FLAGS_infer_model + "/__model__"; + config.param_file = FLAGS_infer_model + "/param"; + config.use_gpu = false; + config.device = 0; + config.specify_input_name = true; + config.enable_ir_optim = activate_ir; + PADDLE_ENFORCE(config.ir_mode == + AnalysisConfig::IrPassMode::kExclude); // default + config.ir_passes.clear(); // Do not exclude any pass. + + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + + auto base_predictor = + CreatePaddlePredictor(config); + auto predictor = + CreatePaddlePredictor( + config); + std::vector input_slots; + DataRecord data(FLAGS_infer_data, batch_size); + // Prepare inputs. + PrepareInputs(&input_slots, &data, batch_size); + std::vector outputs, base_outputs; + + base_predictor->Run(input_slots, &base_outputs); + + if (num_threads == 1) { + // Prepare inputs. + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + predictor->Run(input_slots, &outputs); + } + PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times); + CompareResult(outputs, base_outputs); + } else { + std::vector threads; + std::vector> predictors; + // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled + // because AttentionLSTM's hard code nodeid will be damanged. + for (int tid = 0; tid < num_threads; ++tid) { + predictors.emplace_back( + CreatePaddlePredictor( + config)); + } + for (int tid = 0; tid < num_threads; ++tid) { + threads.emplace_back([&, tid]() { + // Each thread should have local input_slots and outputs. + std::vector input_slots; + DataRecord data(FLAGS_infer_data, batch_size); + PrepareInputs(&input_slots, &data, batch_size); + std::vector outputs; + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + predictors[tid]->Run(input_slots, &outputs); + } + PrintTime(batch_size, num_times, num_threads, tid, + timer.toc() / num_times); + CompareResult(outputs, base_outputs); + }); + } + for (int i = 0; i < num_threads; ++i) { + threads[i].join(); + } + } + + if (use_analysis && activate_ir) { + AnalysisPredictor *analysis_predictor = + dynamic_cast(predictor.get()); + auto &fuse_statis = analysis_predictor->analysis_argument() + .Get>( + framework::ir::kFuseStatisAttr); + for (auto &item : fuse_statis) { + LOG(INFO) << "fused " << item.first << " " << item.second; + } + + int num_ops = 0; + for (auto &node : + analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { + if (node->IsFunction()) { + ++num_ops; + } + } + LOG(INFO) << "has num ops: " << num_ops; + + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); + EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM + EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1); + EXPECT_EQ(num_ops, + 13); // After graph optimization, only 13 operators exists. + } +} + +// Inference with analysis and IR, easy for profiling independently. +TEST(Analyzer, rnn1) { TestRNN1Prediction(true, true, FLAGS_num_threads); } + +// Other unit-tests of RNN1, test different options of use_analysis, +// activate_ir and multi-threads. +TEST(Analyzer, RNN_tests) { + int num_threads[2] = {1, 4}; + for (auto i : num_threads) { + // Directly infer with the original model. + TestRNN1Prediction(false, false, i); + // Inference with the original model with the analysis turned on, the + // analysis + // module will transform the program to a data flow graph. + TestRNN1Prediction(true, false, i); + // Inference with analysis and IR. The IR module will fuse some large + // kernels. + TestRNN1Prediction(true, true, i); + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index cc4b390495..3b5be7f3ee 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -16,21 +16,9 @@ #include #include -#include // NOLINT -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" -#include "paddle/fluid/inference/utils/singleton.h" - -DEFINE_string(infer_model, "", "model path"); -DEFINE_string(infer_data, "", "data path"); -DEFINE_int32(batch_size, 10, "batch size."); -DEFINE_int32(repeat, 1, "Running the inference program repeat times."); -DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); namespace paddle { namespace inference { @@ -91,274 +79,8 @@ void TestWord2vecPrediction(const std::string &model_path) { } } -namespace { - -struct DataRecord { - std::vector>> link_step_data_all; - std::vector> week_data_all, minute_data_all; - std::vector lod1, lod2, lod3; - std::vector> rnn_link_data, rnn_week_datas, - rnn_minute_datas; - size_t batch_iter{0}; - size_t batch_size{1}; - DataRecord() = default; - explicit DataRecord(const std::string &path, int batch_size = 1) - : batch_size(batch_size) { - Load(path); - } - DataRecord NextBatch() { - DataRecord data; - size_t batch_end = batch_iter + batch_size; - // NOTE skip the final batch, if no enough data is provided. - if (batch_end <= link_step_data_all.size()) { - data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter, - link_step_data_all.begin() + batch_end); - data.week_data_all.assign(week_data_all.begin() + batch_iter, - week_data_all.begin() + batch_end); - data.minute_data_all.assign(minute_data_all.begin() + batch_iter, - minute_data_all.begin() + batch_end); - // Prepare LoDs - data.lod1.push_back(0); - data.lod2.push_back(0); - data.lod3.push_back(0); - CHECK(!data.link_step_data_all.empty()) << "empty"; - CHECK(!data.week_data_all.empty()); - CHECK(!data.minute_data_all.empty()); - CHECK_EQ(data.link_step_data_all.size(), data.week_data_all.size()); - CHECK_EQ(data.minute_data_all.size(), data.link_step_data_all.size()); - for (size_t j = 0; j < data.link_step_data_all.size(); j++) { - for (const auto &d : data.link_step_data_all[j]) { - data.rnn_link_data.push_back(d); - } - data.rnn_week_datas.push_back(data.week_data_all[j]); - data.rnn_minute_datas.push_back(data.minute_data_all[j]); - // calculate lod - data.lod1.push_back(data.lod1.back() + - data.link_step_data_all[j].size()); - data.lod3.push_back(data.lod3.back() + 1); - for (size_t i = 1; i < data.link_step_data_all[j].size() + 1; i++) { - data.lod2.push_back(data.lod2.back() + - data.link_step_data_all[j].size()); - } - } - } - batch_iter += batch_size; - return data; - } - void Load(const std::string &path) { - std::ifstream file(path); - std::string line; - int num_lines = 0; - while (std::getline(file, line)) { - num_lines++; - std::vector data; - split(line, ':', &data); - std::vector> link_step_data; - std::vector link_datas; - split(data[0], '|', &link_datas); - for (auto &step_data : link_datas) { - std::vector tmp; - split_to_float(step_data, ',', &tmp); - link_step_data.push_back(tmp); - } - // load week data - std::vector week_data; - split_to_float(data[2], ',', &week_data); - // load minute data - std::vector minute_data; - split_to_float(data[1], ',', &minute_data); - link_step_data_all.push_back(std::move(link_step_data)); - week_data_all.push_back(std::move(week_data)); - minute_data_all.push_back(std::move(minute_data)); - } - } -}; -void PrepareInputs(std::vector *input_slots, DataRecord *data, - int batch_size) { - PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor, - week_tensor, minute_tensor; - lod_attention_tensor.name = "data_lod_attention"; - init_zero_tensor.name = "cell_init"; - lod_tensor_tensor.name = "data"; - week_tensor.name = "week"; - minute_tensor.name = "minute"; - auto one_batch = data->NextBatch(); - std::vector rnn_link_data_shape( - {static_cast(one_batch.rnn_link_data.size()), - static_cast(one_batch.rnn_link_data.front().size())}); - lod_attention_tensor.shape.assign({1, 2}); - lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2}); - init_zero_tensor.shape.assign({batch_size, 15}); - init_zero_tensor.lod.assign({one_batch.lod3}); - lod_tensor_tensor.shape = rnn_link_data_shape; - lod_tensor_tensor.lod.assign({one_batch.lod1}); - // clang-format off - week_tensor.shape.assign( - {static_cast(one_batch.rnn_week_datas.size()), - static_cast(one_batch.rnn_week_datas.front().size())}); - week_tensor.lod.assign({one_batch.lod3}); - minute_tensor.shape.assign( - {static_cast(one_batch.rnn_minute_datas.size()), - static_cast(one_batch.rnn_minute_datas.front().size())}); - minute_tensor.lod.assign({one_batch.lod3}); - // clang-format on - // assign data - TensorAssignData(&lod_attention_tensor, - std::vector>({{0, 0}})); - std::vector tmp_zeros(batch_size * 15, 0.); - TensorAssignData(&init_zero_tensor, {tmp_zeros}); - TensorAssignData(&lod_tensor_tensor, one_batch.rnn_link_data); - TensorAssignData(&week_tensor, one_batch.rnn_week_datas); - TensorAssignData(&minute_tensor, one_batch.rnn_minute_datas); - // Set inputs. - auto init_zero_tensor1 = init_zero_tensor; - init_zero_tensor1.name = "hidden_init"; - input_slots->assign({week_tensor, init_zero_tensor, minute_tensor, - init_zero_tensor1, lod_attention_tensor, - lod_tensor_tensor}); - for (auto &tensor : *input_slots) { - tensor.dtype = PaddleDType::FLOAT32; - } -} - -} // namespace - -void CompareResult(const std::vector &outputs, - const std::vector &base_outputs) { - PADDLE_ENFORCE_GT(outputs.size(), 0); - PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size()); - for (size_t i = 0; i < outputs.size(); i++) { - auto &out = outputs[i]; - auto &base_out = base_outputs[i]; - size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, - [](int a, int b) { return a * b; }); - size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(), - 1, [](int a, int b) { return a * b; }); - PADDLE_ENFORCE_EQ(size, size1); - PADDLE_ENFORCE_GT(size, 0); - float *data = static_cast(out.data.data()); - float *base_data = static_cast(base_out.data.data()); - for (size_t i = 0; i < size; i++) { - EXPECT_NEAR(data[i], base_data[i], 1e-3); - } - } -} -// Test with a really complicate model. -void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { - AnalysisConfig config; - config.prog_file = FLAGS_infer_model + "/__model__"; - config.param_file = FLAGS_infer_model + "/param"; - config.use_gpu = false; - config.device = 0; - config.specify_input_name = true; - config.enable_ir_optim = activate_ir; - PADDLE_ENFORCE(config.ir_mode == - AnalysisConfig::IrPassMode::kExclude); // default - config.ir_passes.clear(); // Do not exclude any pass. - - int batch_size = FLAGS_batch_size; - int num_times = FLAGS_repeat; - - auto base_predictor = - CreatePaddlePredictor(config); - auto predictor = - CreatePaddlePredictor( - config); - std::vector input_slots; - DataRecord data(FLAGS_infer_data, batch_size); - // Prepare inputs. - PrepareInputs(&input_slots, &data, batch_size); - std::vector outputs, base_outputs; - - base_predictor->Run(input_slots, &base_outputs); - - if (num_threads == 1) { - // Prepare inputs. - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - predictor->Run(input_slots, &outputs); - } - PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times); - CompareResult(outputs, base_outputs); - } else { - std::vector threads; - std::vector> predictors; - // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled - // because AttentionLSTM's hard code nodeid will be damanged. - for (int tid = 0; tid < num_threads; ++tid) { - predictors.emplace_back( - CreatePaddlePredictor( - config)); - } - for (int tid = 0; tid < num_threads; ++tid) { - threads.emplace_back([&, tid]() { - // Each thread should have local input_slots and outputs. - std::vector input_slots; - DataRecord data(FLAGS_infer_data, batch_size); - PrepareInputs(&input_slots, &data, batch_size); - std::vector outputs; - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - predictors[tid]->Run(input_slots, &outputs); - } - PrintTime(batch_size, num_times, num_threads, tid, - timer.toc() / num_times); - CompareResult(outputs, base_outputs); - }); - } - for (int i = 0; i < num_threads; ++i) { - threads[i].join(); - } - } - - if (use_analysis && activate_ir) { - AnalysisPredictor *analysis_predictor = - dynamic_cast(predictor.get()); - auto &fuse_statis = analysis_predictor->analysis_argument() - .Get>( - framework::ir::kFuseStatisAttr); - for (auto &item : fuse_statis) { - LOG(INFO) << "fused " << item.first << " " << item.second; - } - - int num_ops = 0; - for (auto &node : - analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { - if (node->IsFunction()) { - ++num_ops; - } - } - LOG(INFO) << "has num ops: " << num_ops; - - ASSERT_TRUE(fuse_statis.count("fc_fuse")); - EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); - EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM - EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1); - EXPECT_EQ(num_ops, - 13); // After graph optimization, only 13 operators exists. - } -} - -// Inference with analysis and IR, easy for profiling independently. -TEST(Analyzer, rnn1) { TestRNN1Prediction(true, true, FLAGS_num_threads); } - -// Other unit-tests of RNN1, test different options of use_analysis, -// activate_ir and multi-threads. -TEST(Analyzer, RNN_tests) { - int num_threads[2] = {1, 4}; - for (auto i : num_threads) { - // Directly infer with the original model. - TestRNN1Prediction(false, false, i); - // Inference with the original model with the analysis turned on, the - // analysis - // module will transform the program to a data flow graph. - TestRNN1Prediction(true, false, i); - // Inference with analysis and IR. The IR module will fuse some large - // kernels. - TestRNN1Prediction(true, true, i); - } +TEST(Analyzer, word2vec_without_analysis) { + TestWord2vecPrediction(FLAGS_inference_model_dir); } } // namespace analysis From d0fbe780403b42490e90ed38a86c504bbf8f80c5 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Mon, 10 Sep 2018 19:45:05 +0800 Subject: [PATCH 21/85] move analyzer_xxx_tester to inference/tests/api --- paddle/fluid/inference/CMakeLists.txt | 3 +- .../fluid/inference/analysis/CMakeLists.txt | 64 ------------------- .../fluid/inference/tests/api/CMakeLists.txt | 60 +++++++++++++++++ .../api}/analyzer_lac_tester.cc | 0 .../api}/analyzer_ner_tester.cc | 0 .../api}/analyzer_rnn1_tester.cc | 0 .../analyzer_text_classification_tester.cc | 0 7 files changed, 62 insertions(+), 65 deletions(-) create mode 100644 paddle/fluid/inference/tests/api/CMakeLists.txt rename paddle/fluid/inference/{analysis => tests/api}/analyzer_lac_tester.cc (100%) rename paddle/fluid/inference/{analysis => tests/api}/analyzer_ner_tester.cc (100%) rename paddle/fluid/inference/{analysis => tests/api}/analyzer_rnn1_tester.cc (100%) rename paddle/fluid/inference/{analysis => tests/api}/analyzer_text_classification_tester.cc (100%) diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 2006e3b24f..efb91bcf75 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -55,6 +55,7 @@ if(NOT APPLE) endif() if(WITH_TESTING) - # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book + # tests/book depends the models that generated by python/paddle/fluid/tests/book add_subdirectory(tests/book) + add_subdirectory(tests/api) endif() diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 699e16ad97..a36f85bd70 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -44,27 +44,6 @@ inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model) -function (inference_download_and_uncompress install_dir url gz_filename) - message(STATUS "Download inference test stuff ${gz_filename} from ${url}") - execute_process(COMMAND bash -c "mkdir -p ${install_dir}") - execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}") - execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${gz_filename}") - message(STATUS "finish downloading ${gz_filename}") -endfunction(inference_download_and_uncompress) - -set(RNN1_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fmodel.tar.gz") -set(RNN1_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fdata.txt.tar.gz") -set(RNN1_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/rnn1" CACHE PATH "RNN1 model and data root." FORCE) -if (NOT EXISTS ${RNN1_INSTALL_DIR} AND WITH_TESTING) - inference_download_and_uncompress(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} "rnn1%2Fmodel.tar.gz") - inference_download_and_uncompress(${RNN1_INSTALL_DIR} ${RNN1_DATA_URL} "rnn1%2Fdata.txt.tar.gz") -endif() - -inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor - ARGS --infer_model=${RNN1_INSTALL_DIR}/model - --infer_data=${RNN1_INSTALL_DIR}/data.txt) - inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc) @@ -75,46 +54,3 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_ inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc) inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) - -set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz") -set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz") -set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE) -if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE) - inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz") - inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz") -endif() - -inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor - ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model - --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt) - -set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz") -set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz") -set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE) -if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE) - inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz") - inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz") -endif() - -inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor - ARGS --infer_model=${LAC_INSTALL_DIR}/model - --infer_data=${LAC_INSTALL_DIR}/data.txt) - - -set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz") -set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz") -set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification" CACHE PATH "Text Classification model and data root." FORCE) - -if (NOT EXISTS ${TEXT_CLASSIFICATION_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE) - inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text-classification-Senta.tar.gz") - inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_DATA_URL} "text_classification_data.txt.tar.gz") -endif() - -inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor - ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta - --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt - --topn=1 # Just run top 1 batch. - ) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt new file mode 100644 index 0000000000..caf23eef40 --- /dev/null +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -0,0 +1,60 @@ +function (inference_download_and_uncompress install_dir url gz_filename) + message(STATUS "Download inference test stuff ${gz_filename} from ${url}") + execute_process(COMMAND bash -c "mkdir -p ${install_dir}") + execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}") + execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${gz_filename}") + message(STATUS "finish downloading ${gz_filename}") +endfunction(inference_download_and_uncompress) + +function(download_model_and_data install_dir model_url model_gz_filename data_url data_gz_filename) + if (NOT EXISTS ${install_dir} AND WITH_INFERENCE) + inference_download_and_uncompress(${install_dir} ${model_url} ${model_gz_filename}) + inference_download_and_uncompress(${install_dir} ${data_url} ${data_gz_filename}) + endif() +endfunction() + +# RNN1 +set(RNN1_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fmodel.tar.gz") +set(RNN1_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fdata.txt.tar.gz") +set(RNN1_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/rnn1") +download_model_and_data(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} "rnn1%2Fmodel.tar.gz" + ${RNN1_DATA_URL} "rnn1%2Fdata.txt.tar.gz") +inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc + EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor + ARGS --infer_model=${RNN1_INSTALL_DIR}/model + --infer_data=${RNN1_INSTALL_DIR}/data.txt) + +# chinese_ner +set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz") +set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz") +set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner") +download_model_and_data(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz" + ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz") +inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc + EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor + ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model + --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt) + +# lac +set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz") +set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz") +set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac") +download_model_and_data(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz" + ${LAC_DATA_URL} "lac_data.txt.tar.gz") +inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc + EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor + ARGS --infer_model=${LAC_INSTALL_DIR}/model + --infer_data=${LAC_INSTALL_DIR}/data.txt) + +# text_classification +set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz") +set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz") +set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification") +download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text_classification-Senta.tar.gz" + ${TEXT_CLASSIFICATION_DATA_URL} "text_classification_data.txt.tar.gz") +inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc + EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor + ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta + --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt + --topn=1 # Just run top 1 batch. + ) diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc similarity index 100% rename from paddle/fluid/inference/analysis/analyzer_lac_tester.cc rename to paddle/fluid/inference/tests/api/analyzer_lac_tester.cc diff --git a/paddle/fluid/inference/analysis/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc similarity index 100% rename from paddle/fluid/inference/analysis/analyzer_ner_tester.cc rename to paddle/fluid/inference/tests/api/analyzer_ner_tester.cc diff --git a/paddle/fluid/inference/analysis/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc similarity index 100% rename from paddle/fluid/inference/analysis/analyzer_rnn1_tester.cc rename to paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc diff --git a/paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc similarity index 100% rename from paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc rename to paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc From 81c21705b493bd59259f491e3818af8ba09033ab Mon Sep 17 00:00:00 2001 From: luotao1 Date: Mon, 10 Sep 2018 20:24:04 +0800 Subject: [PATCH 22/85] simplify inference/tests/api/CMakeLists.txt --- .../fluid/inference/tests/api/CMakeLists.txt | 27 +++++++++---------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index caf23eef40..d44a2cfa7f 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -1,15 +1,16 @@ -function (inference_download_and_uncompress install_dir url gz_filename) - message(STATUS "Download inference test stuff ${gz_filename} from ${url}") +function (inference_download_and_uncompress install_dir url) + get_filename_component(filename ${url} NAME) + message(STATUS "Download inference test stuff ${filename} from ${url}") execute_process(COMMAND bash -c "mkdir -p ${install_dir}") execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}") - execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${gz_filename}") - message(STATUS "finish downloading ${gz_filename}") + execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") + message(STATUS "finish downloading ${filename}") endfunction(inference_download_and_uncompress) -function(download_model_and_data install_dir model_url model_gz_filename data_url data_gz_filename) +function(download_model_and_data install_dir model_url data_url) if (NOT EXISTS ${install_dir} AND WITH_INFERENCE) - inference_download_and_uncompress(${install_dir} ${model_url} ${model_gz_filename}) - inference_download_and_uncompress(${install_dir} ${data_url} ${data_gz_filename}) + inference_download_and_uncompress(${install_dir} ${model_url}) + inference_download_and_uncompress(${install_dir} ${data_url}) endif() endfunction() @@ -17,8 +18,7 @@ endfunction() set(RNN1_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fmodel.tar.gz") set(RNN1_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fdata.txt.tar.gz") set(RNN1_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/rnn1") -download_model_and_data(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} "rnn1%2Fmodel.tar.gz" - ${RNN1_DATA_URL} "rnn1%2Fdata.txt.tar.gz") +download_model_and_data(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} ${RNN1_DATA_URL}) inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor ARGS --infer_model=${RNN1_INSTALL_DIR}/model @@ -28,8 +28,7 @@ inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz") set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz") set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner") -download_model_and_data(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz" - ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz") +download_model_and_data(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} ${CHINESE_NER_DATA_URL}) inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model @@ -39,8 +38,7 @@ inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz") set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz") set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac") -download_model_and_data(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz" - ${LAC_DATA_URL} "lac_data.txt.tar.gz") +download_model_and_data(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} ${LAC_DATA_URL}) inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor ARGS --infer_model=${LAC_INSTALL_DIR}/model @@ -50,8 +48,7 @@ inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz") set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz") set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification") -download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text_classification-Senta.tar.gz" - ${TEXT_CLASSIFICATION_DATA_URL} "text_classification_data.txt.tar.gz") +download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} ${TEXT_CLASSIFICATION_DATA_URL}) inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta From 1658958fe697f1b7a2c558e8bda06285826b058a Mon Sep 17 00:00:00 2001 From: Krzysztof Binias Date: Mon, 10 Sep 2018 14:57:10 +0200 Subject: [PATCH 23/85] Reusing converted weights --- paddle/fluid/operators/conv_mkldnn_op.cc | 9 ++++++--- paddle/fluid/operators/conv_op.cc | 1 + paddle/fluid/platform/mkldnn_helper.h | 6 +++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index c5cbadc892..1ccf2494f2 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -130,12 +130,13 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr AcquireWeightsMemoryFromPrimitive( const std::shared_ptr user_weights_memory_p, - std::vector& pipeline) { // NOLINT + const std::vector& pipeline, + bool is_test = false) { // NOLINT auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); auto weights_pd = conv_pd_->weights_primitive_desc(); return this->AcquireMemory(weights_pd, user_weights_pd, user_weights_memory_p, "@weights_mem_p", - pipeline); + pipeline, is_test); } std::shared_ptr AcquireBiasMemoryFromPrimitive( @@ -266,6 +267,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); + const bool is_test = ctx.Attr("is_test"); + auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); @@ -371,7 +374,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto src_memory_p = handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive( - user_weights_memory_p, pipeline); + user_weights_memory_p, pipeline, is_test); auto dst_memory_p = handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 61ca80877a..6070173ee2 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -109,6 +109,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( } void Conv2DOpMaker::Make() { + AddAttr("is_test", "").SetDefault(false); AddInput( "Input", "(Tensor) The input tensor of convolution operator. " diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index f6e9a52b27..c64e5dafda 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -191,8 +191,8 @@ class MKLDNNHandler { mkldnn::memory::primitive_desc& mpd, // NOLINT mkldnn::memory::primitive_desc& user_mpd, // NOLINT const std::shared_ptr user_memory_p, - const std::string& suffix, - std::vector& pipeline) { // NOLINT + const std::string& suffix, const std::vector& pipeline, + bool is_test = false) { // NOLINT // create reorder primitive if the input format is not the preferred one auto local_key = key_ + suffix; auto key_reorder_p = key_ + suffix + "reorder_p"; @@ -213,7 +213,7 @@ class MKLDNNHandler { pipeline.push_back(*reorder_p); } dev_ctx_.SetBlob(local_key, target_memory_p); - } else { + } else if (!is_test) { // Make reorder if needed auto reorder_p = std::static_pointer_cast( dev_ctx_.GetBlob(key_reorder_p)); From 9664c53c7cae450dc70459008f1509bffb2d0518 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Mon, 10 Sep 2018 22:50:24 +0800 Subject: [PATCH 24/85] fix cmake error to pass the ci --- paddle/fluid/inference/analysis/CMakeLists.txt | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index a36f85bd70..29e3d7dacd 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -35,15 +35,12 @@ function (inference_analysis_test TARGET) cc_test(${TARGET} SRCS "${analysis_test_SRCS}" DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS} - ARGS ${mem_opt} ${analysis_test_ARGS}) + ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS}) set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) endif(WITH_TESTING) endfunction(inference_analysis_test) -inference_analysis_test(test_analyzer SRCS analyzer_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor - ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model) - +inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api) inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc) From c4d6364060f87d094524115d358c31c23008e2e0 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 11 Sep 2018 13:02:44 +0800 Subject: [PATCH 25/85] update release doc --- doc/fluid/dev/releasing_process_cn.md | 44 ++++++++--------- doc/fluid/dev/releasing_process_en.md | 68 ++++++++++++++++++--------- 2 files changed, 66 insertions(+), 46 deletions(-) diff --git a/doc/fluid/dev/releasing_process_cn.md b/doc/fluid/dev/releasing_process_cn.md index 4c6728fba7..b3ce2b1b02 100644 --- a/doc/fluid/dev/releasing_process_cn.md +++ b/doc/fluid/dev/releasing_process_cn.md @@ -1,24 +1,23 @@ # PaddlePaddle发行规范 -PaddlePaddle使用git-flow branching model做分支管理,使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。 +PaddlePaddle使用Trunk Based Development,使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。 PaddlePaddle每次发新的版本,遵循以下流程: 1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0` -1. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。 -1. 对这个版本的提交,做如下几个操作: - * 使用Regression Test List作为检查列表,测试本次release的正确性。 - * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,到第二步 - * 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。 - * 将这个版本的python wheel包发布到pypi。 - * 更新Docker镜像(参考后面的操作细节)。 -1. 第三步完成后,将`release/版本号`分支合入master分支,将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。 -1. 协同完成Release Note的书写。 +2. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0-rc0`。 +3. 新分支一般不接受新的feature和优化。QA在release分支上进行测试。研发基于最新的develop开发。 +4. QA和研发发现的bug,在develop上修复验证后,cherry-pick到release分支。直到release分支相对稳定。 +5. 如果有需要,在release分支最新代码上打上新的tag,比如`0.10.0-rc1`,让更多的用户加入测试。重复3-4步。 +6. release分支稳定后,打上正式的release tag,比如`0.10.0`。 +7. 将这个版本的python wheel包发布到pypi。 +8. 更新Docker镜像(参考后面的操作细节)。 需要注意的是: -* `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试PaddlePaddle的行为。 -* 在`release/版本号`分支存在的时候,如果有bugfix的行为,需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。 +* bug修复需要先在develop上进行,然后进入release分支。而不是直接在release分支上开发。 + +* release分支原则上只接受修复类的修改,不接受新feature。 ## 发布wheel包到pypi @@ -61,24 +60,21 @@ docker push [镜像]:[version] ## PaddlePaddle 分支规范 -PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。 +PaddlePaddle开发过程使用[Trunk Based Development](https://trunkbaseddevelopment.com/) 开发规范。 -* PaddlePaddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中: - * `master`分支为稳定(stable branch)版本分支。每一个`master`分支的版本都是经过单元测试和回归测试的版本。 - * `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试,但并没有经过回归测试。 - * `release/版本号`分支为每一次Release时建立的临时分支。在这个阶段的代码正在经历回归测试。 +* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试。并且会经过模型回归测试。 +* `release/版本号`分支为每一次Release时建立的临时分支。release分支主要用于测试,bug修复和最终发版。 +* `master`分支因为历史原因,已经废弃。 -* 其他用户的fork版本库并不需要严格遵守[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,但所有fork的版本库的所有分支都相当于特性分支。 +* 其他开发者fork的feature branch。 * 建议,开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支 - * 建议,开发者fork的版本库中,再基于`develop`版本fork出自己的功能分支。 - * 当功能分支开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。 - * 在评审过程中,开发者修改自己的代码,可以继续在自己的功能分支提交代码。 - -* BugFix分支也是在开发者自己的fork版本库维护,与功能分支不同的是,BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支,同时提起`Pull Request`。 + * 建议,开发者fork的版本库中,再基于`develop`版本fork出自己的feature branch。 + * 当feature branch开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。 + * 在评审过程中,开发者修改自己的代码,可以继续在自己的feature branch提交代码。 ## PaddlePaddle回归测试列表 -本列表说明PaddlePaddle发版之前需要测试的功能点。 +TODO ### PaddlePaddle Book中所有章节 diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md index 2c1c30c1ed..e3ca3acd22 100644 --- a/doc/fluid/dev/releasing_process_en.md +++ b/doc/fluid/dev/releasing_process_en.md @@ -17,13 +17,23 @@ Each time we release a new PaddlePaddle version, we should follow the below step * Update the Docker images (see below instructions for detail). 1. After above step, merge `release/[version]` branch to master and push a tag on the master commit, then merge `master` to `develop`. -1. Update the Release Note. +1. Update the Release Note. -***NOTE:*** +1. Create a new release branch from `develop`,named `release/[version]`. E.g.,`release/0.10.0` +2. Create a new tag for the release branch, tag format: `version-rc.Patch`. The first tag is `0.10.0-rc0`。 +3. New release branch normally doesn't accept new features or optimizations. QA will test on the release branch. Developer should develop based on `develop` branch. +4. If QA or Developer find bugs. They should first fix and verity on `develop` branch. Then cherry-pick to the release branch. Wait until the release branch is stable. +5. If necessary, create a new tag on the relese branch, e.g. `0.10.0-rc1`. Involve more users to try it and repeat step 3-4. +6. After release branch is stable,Create the official release tag,such as `0.10.0`. +7. Release the python wheel package to pypi. +8. Update the docker image (More details below). + +NOTE: + +* bug fix should happen on `develop` branch, then cherry-pick to relese branch. Avoid developing directly on release branch. + +* release normally only accept bug fixes. Don't add new features. -* Do ***NOT*** merge commits from develop branch to release branches to keep the release branch contain - features only for current release, so that we can test on that version. -* If we want to fix bugs on release branches, we must merge the fix to master, develop and release branch. ## Publish Wheel Packages to pypi @@ -95,28 +105,42 @@ Tags that need to be updated are: You can then checkout the latest pushed tags at https://hub.docker.com/r/paddlepaddle/paddle/tags/. +## PaddlePaddle 分支规范 + +PaddlePaddle开发过程使用[Trunk Based Development](https://trunkbaseddevelopment.com/) 开发规范。 + +* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试。并且会经过模型回归测试。 +* `release/版本号`分支为每一次Release时建立的临时分支。release分支主要用于测试,bug修复和最终发版。 +* `master`分支因为历史原因,已经废弃。 + +* 其他开发者fork的feature branch。 + * 建议,开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支 + * 建议,开发者fork的版本库中,再基于`develop`版本fork出自己的feature branch。 + * 当feature branch开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。 + * 在评审过程中,开发者修改自己的代码,可以继续在自己的feature branch提交代码。 + +## PaddlePaddle回归测试列表 + +TODO + ## Branching Model -We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model, -with some modifications: - -* `master` branch is the stable branch. Each version on the master branch is tested and guaranteed. -* `develop` branch is for development. Each commit on develop branch has passed CI unit test, but no - regression tests are run. -* `release/[version]` branch is used to publish each release. Latest release version branches have - bugfix only for that version, but no feature updates. -* Developer forks are not required to follow - [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) - branching model, all forks is like a feature branch. - * Advise: developer fork's develop branch is used to sync up with main repo's develop branch. - * Advise: developer use it's fork's develop branch to for new branch to start developing. - * Use that branch on developer's fork to create pull requests and start reviews. - * developer can push new commits to that branch when the pull request is open. -* Bug fixes are also started from developers forked repo. And, bug fixes branch can merge to - `master`, `develop` and `releases`. +PaddlePaddle uses [Trunk Based Development](https://trunkbaseddevelopment.com/) as our branching model. + +* `develop` branch is used for development. Each comment to `develop` branc goes through unit tests and model regression tests. +* `release/[version]` branch is used for each release. Release branch is used for tests, bug fix and evetual release. +* `master` branch as been deprecated for historical reasons + +* Developer's feature branch。 + * Developer's feature branch should sync with upstream `develop` branch. + * Developer's feature branch should be forked from upstream `develop` branch. + * After feature branch is ready, create a `Pull Request` against the Paddle repo and go through code review. + * In the review process, develop modify codes and push to their own feature branch. ## PaddlePaddle Regression Test List +TODO + ### All Chapters of PaddlePaddle Book We need to guarantee that all the chapters of PaddlePaddle Book can run correctly. Including From 5b12eb9294c4e0a109d4cd6224eff4c18948466f Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 11 Sep 2018 13:13:55 +0800 Subject: [PATCH 26/85] clean --- doc/fluid/dev/releasing_process_cn.md | 6 ++--- doc/fluid/dev/releasing_process_en.md | 35 +-------------------------- 2 files changed, 4 insertions(+), 37 deletions(-) diff --git a/doc/fluid/dev/releasing_process_cn.md b/doc/fluid/dev/releasing_process_cn.md index b3ce2b1b02..b1b6595eff 100644 --- a/doc/fluid/dev/releasing_process_cn.md +++ b/doc/fluid/dev/releasing_process_cn.md @@ -5,7 +5,7 @@ PaddlePaddle使用Trunk Based Development,使用[Semantic Versioning](http://s PaddlePaddle每次发新的版本,遵循以下流程: 1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0` -2. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0-rc0`。 +2. 将新分支的版本打上tag,tag为`版本号rc-Patch号`。例如,第一个tag为`0.10.0-rc0`。 3. 新分支一般不接受新的feature和优化。QA在release分支上进行测试。研发基于最新的develop开发。 4. QA和研发发现的bug,在develop上修复验证后,cherry-pick到release分支。直到release分支相对稳定。 5. 如果有需要,在release分支最新代码上打上新的tag,比如`0.10.0-rc1`,让更多的用户加入测试。重复3-4步。 @@ -67,8 +67,8 @@ PaddlePaddle开发过程使用[Trunk Based Development](https://trunkbaseddevelo * `master`分支因为历史原因,已经废弃。 * 其他开发者fork的feature branch。 - * 建议,开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支 - * 建议,开发者fork的版本库中,再基于`develop`版本fork出自己的feature branch。 + * 建议,开发者的feature branch需要同步主版本库的`develop`分支。 + * 建议,开发者的feature branch需要基于朱版本库中的`develop`分支。 * 当feature branch开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。 * 在评审过程中,开发者修改自己的代码,可以继续在自己的feature branch提交代码。 diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md index e3ca3acd22..a4ea4f6fb0 100644 --- a/doc/fluid/dev/releasing_process_en.md +++ b/doc/fluid/dev/releasing_process_en.md @@ -4,23 +4,8 @@ PaddlePaddle manages its branches using "git-flow branching model", and [Semanti Each time we release a new PaddlePaddle version, we should follow the below steps: -1. Fork a new branch from `develop` named `release/[version]`, e.g. `release/0.10.0`. -1. Push a new tag on the release branch, the tag name should be like `[version]rc.patch`. The - first tag should be `0.10.0rc1`, and the second should be `0.10.0.rc2` and so on. -1. After that, we should do: - * Run all regression test on the Regression Test List (see PaddlePaddle TeamCity CI), to confirm - that this release has no major bugs. - * If regression test fails, we must fix those bugs and create a new `release/[version]` - branch from previous release branch. - * Modify `python/setup.py.in`, change the version number and change `ISTAGED` to `True`. - * Publish PaddlePaddle release wheel packages to pypi (see below instructions for detail). - * Update the Docker images (see below instructions for detail). -1. After above step, merge `release/[version]` branch to master and push a tag on the master commit, - then merge `master` to `develop`. -1. Update the Release Note. - 1. Create a new release branch from `develop`,named `release/[version]`. E.g.,`release/0.10.0` -2. Create a new tag for the release branch, tag format: `version-rc.Patch`. The first tag is `0.10.0-rc0`。 +2. Create a new tag for the release branch, tag format: `version-rc.Patch`. E.g. the first tag is `0.10.0-rc0`。 3. New release branch normally doesn't accept new features or optimizations. QA will test on the release branch. Developer should develop based on `develop` branch. 4. If QA or Developer find bugs. They should first fix and verity on `develop` branch. Then cherry-pick to the release branch. Wait until the release branch is stable. 5. If necessary, create a new tag on the relese branch, e.g. `0.10.0-rc1`. Involve more users to try it and repeat step 3-4. @@ -105,24 +90,6 @@ Tags that need to be updated are: You can then checkout the latest pushed tags at https://hub.docker.com/r/paddlepaddle/paddle/tags/. -## PaddlePaddle 分支规范 - -PaddlePaddle开发过程使用[Trunk Based Development](https://trunkbaseddevelopment.com/) 开发规范。 - -* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试。并且会经过模型回归测试。 -* `release/版本号`分支为每一次Release时建立的临时分支。release分支主要用于测试,bug修复和最终发版。 -* `master`分支因为历史原因,已经废弃。 - -* 其他开发者fork的feature branch。 - * 建议,开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支 - * 建议,开发者fork的版本库中,再基于`develop`版本fork出自己的feature branch。 - * 当feature branch开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。 - * 在评审过程中,开发者修改自己的代码,可以继续在自己的feature branch提交代码。 - -## PaddlePaddle回归测试列表 - -TODO - ## Branching Model PaddlePaddle uses [Trunk Based Development](https://trunkbaseddevelopment.com/) as our branching model. From 52122704d351c1339f8728d5dfc91a82f4b2e60d Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 11 Sep 2018 13:19:09 +0800 Subject: [PATCH 27/85] clean --- doc/fluid/dev/releasing_process_cn.md | 4 ++-- doc/fluid/dev/releasing_process_en.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/fluid/dev/releasing_process_cn.md b/doc/fluid/dev/releasing_process_cn.md index b1b6595eff..acea9a2b5d 100644 --- a/doc/fluid/dev/releasing_process_cn.md +++ b/doc/fluid/dev/releasing_process_cn.md @@ -7,7 +7,7 @@ PaddlePaddle每次发新的版本,遵循以下流程: 1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0` 2. 将新分支的版本打上tag,tag为`版本号rc-Patch号`。例如,第一个tag为`0.10.0-rc0`。 3. 新分支一般不接受新的feature和优化。QA在release分支上进行测试。研发基于最新的develop开发。 -4. QA和研发发现的bug,在develop上修复验证后,cherry-pick到release分支。直到release分支相对稳定。 +4. QA和研发发现的bug,在develop上修复验证后,cherry-pick修复到release分支。直到release分支相对稳定。 5. 如果有需要,在release分支最新代码上打上新的tag,比如`0.10.0-rc1`,让更多的用户加入测试。重复3-4步。 6. release分支稳定后,打上正式的release tag,比如`0.10.0`。 7. 将这个版本的python wheel包发布到pypi。 @@ -68,7 +68,7 @@ PaddlePaddle开发过程使用[Trunk Based Development](https://trunkbaseddevelo * 其他开发者fork的feature branch。 * 建议,开发者的feature branch需要同步主版本库的`develop`分支。 - * 建议,开发者的feature branch需要基于朱版本库中的`develop`分支。 + * 建议,开发者的feature branch需要基于主版本库中的`develop`分支。 * 当feature branch开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。 * 在评审过程中,开发者修改自己的代码,可以继续在自己的feature branch提交代码。 diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md index a4ea4f6fb0..b810dc941d 100644 --- a/doc/fluid/dev/releasing_process_en.md +++ b/doc/fluid/dev/releasing_process_en.md @@ -7,7 +7,7 @@ Each time we release a new PaddlePaddle version, we should follow the below step 1. Create a new release branch from `develop`,named `release/[version]`. E.g.,`release/0.10.0` 2. Create a new tag for the release branch, tag format: `version-rc.Patch`. E.g. the first tag is `0.10.0-rc0`。 3. New release branch normally doesn't accept new features or optimizations. QA will test on the release branch. Developer should develop based on `develop` branch. -4. If QA or Developer find bugs. They should first fix and verity on `develop` branch. Then cherry-pick to the release branch. Wait until the release branch is stable. +4. If QA or Developer find bugs. They should first fix and verify on `develop` branch. Then cherry-pick the fix to the release branch. Wait until the release branch is stable. 5. If necessary, create a new tag on the relese branch, e.g. `0.10.0-rc1`. Involve more users to try it and repeat step 3-4. 6. After release branch is stable,Create the official release tag,such as `0.10.0`. 7. Release the python wheel package to pypi. From 5fd5bf9c9644b93ded4737edea84f4ea754b60d4 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Tue, 11 Sep 2018 14:18:50 +0800 Subject: [PATCH 28/85] sync resnet model --- benchmark/fluid/models/resnet.py | 225 +++++++++++++++---------------- 1 file changed, 108 insertions(+), 117 deletions(-) diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index ae1baa48e1..d71b855612 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -20,6 +20,7 @@ import functools import numpy as np import time import os +import math import cProfile, pstats, StringIO @@ -27,128 +28,120 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.profiler as profiler -# from recordio_converter import imagenet_train, imagenet_test from imagenet_reader import train, val +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": 256, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + + +class ResNet(): + def __init__(self, layers=50, is_train=True): + self.params = train_parameters + self.layers = layers + self.is_train = is_train + + def net(self, input, class_dim=1000): + layers = self.layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_filters = [64, 128, 256, 512] + + conv = self.conv_bn_layer( + input=input, num_filters=64, filter_size=7, stride=2, act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + + for block in range(len(depth)): + for i in range(depth[block]): + conv = self.bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1) + + pool = fluid.layers.pool2d( + input=conv, pool_size=7, pool_type='avg', global_pooling=True) + stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) + out = fluid.layers.fc(input=pool, + size=class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, + stdv))) + return out + + def conv_bn_layer(self, + input, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=False) + return fluid.layers.batch_norm( + input=conv, act=act, is_test=not self.is_train) + + def shortcut(self, input, ch_out, stride): + ch_in = input.shape[1] + if ch_in != ch_out or stride != 1: + return self.conv_bn_layer(input, ch_out, 1, stride) + else: + return input -def conv_bn_layer(input, - ch_out, - filter_size, - stride, - padding, - act='relu', - is_train=True): - conv1 = fluid.layers.conv2d( - input=input, - filter_size=filter_size, - num_filters=ch_out, - stride=stride, - padding=padding, - act=None, - bias_attr=False) - return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train) - - -def shortcut(input, ch_out, stride, is_train=True): - ch_in = input.shape[1] # if args.data_format == 'NCHW' else input.shape[-1] - if ch_in != ch_out: - return conv_bn_layer( - input, ch_out, 1, stride, 0, None, is_train=is_train) - else: - return input - - -def basicblock(input, ch_out, stride, is_train=True): - short = shortcut(input, ch_out, stride, is_train=is_train) - conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train) - conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train) - return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') - - -def bottleneck(input, ch_out, stride, is_train=True): - short = shortcut(input, ch_out * 4, stride, is_train=is_train) - conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train) - conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train) - conv3 = conv_bn_layer( - conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train) - return fluid.layers.elementwise_add(x=short, y=conv3, act='relu') - - -def layer_warp(block_func, input, ch_out, count, stride): - res_out = block_func(input, ch_out, stride) - for i in range(1, count): - res_out = block_func(res_out, ch_out, 1) - return res_out - + def bottleneck_block(self, input, num_filters, stride): + conv0 = self.conv_bn_layer( + input=input, num_filters=num_filters, filter_size=1, act='relu') + conv1 = self.conv_bn_layer( + input=conv0, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu') + conv2 = self.conv_bn_layer( + input=conv1, num_filters=num_filters * 4, filter_size=1, act=None) -def resnet_imagenet(input, - class_dim, - depth=50, - data_format='NCHW', - is_train=True): + short = self.shortcut(input, num_filters * 4, stride) - cfg = { - 18: ([2, 2, 2, 1], basicblock), - 34: ([3, 4, 6, 3], basicblock), - 50: ([3, 4, 6, 3], bottleneck), - 101: ([3, 4, 23, 3], bottleneck), - 152: ([3, 8, 36, 3], bottleneck) - } - stages, block_func = cfg[depth] - conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3) - pool1 = fluid.layers.pool2d( - input=conv1, pool_type='avg', pool_size=3, pool_stride=2) - res1 = layer_warp(block_func, pool1, 64, stages[0], 1) - res2 = layer_warp(block_func, res1, 128, stages[1], 2) - res3 = layer_warp(block_func, res2, 256, stages[2], 2) - res4 = layer_warp(block_func, res3, 512, stages[3], 2) - pool2 = fluid.layers.pool2d( - input=res4, - pool_size=7, - pool_type='avg', - pool_stride=1, - global_pooling=True) - out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax') - return out - - -def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): - assert (depth - 2) % 6 == 0 - - n = (depth - 2) // 6 - - conv1 = conv_bn_layer( - input=input, ch_out=16, filter_size=3, stride=1, padding=1) - res1 = layer_warp(basicblock, conv1, 16, n, 1) - res2 = layer_warp(basicblock, res1, 32, n, 2) - res3 = layer_warp(basicblock, res2, 64, n, 2) - pool = fluid.layers.pool2d( - input=res3, pool_size=8, pool_type='avg', pool_stride=1) - out = fluid.layers.fc(input=pool, size=class_dim, act='softmax') - return out + return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') def _model_reader_dshape_classdim(args, is_train): - model = resnet_cifar10 + model = None reader = None - if args.data_set == "cifar10": - class_dim = 10 - if args.data_format == 'NCHW': - dshape = [3, 32, 32] - else: - dshape = [32, 32, 3] - model = resnet_cifar10 - if is_train: - reader = paddle.dataset.cifar.train10() - else: - reader = paddle.dataset.cifar.test10() - elif args.data_set == "flowers": + if args.data_set == "flowers": class_dim = 102 if args.data_format == 'NCHW': dshape = [3, 224, 224] else: dshape = [224, 224, 3] - model = resnet_imagenet if is_train: reader = paddle.dataset.flowers.train() else: @@ -159,7 +152,6 @@ def _model_reader_dshape_classdim(args, is_train): dshape = [3, 224, 224] else: dshape = [224, 224, 3] - model = resnet_imagenet if not args.data_path: raise Exception( "Must specify --data_path when training with imagenet") @@ -173,12 +165,11 @@ def _model_reader_dshape_classdim(args, is_train): reader = train(xmap=False) else: reader = val(xmap=False) - return model, reader, dshape, class_dim + return reader, dshape, class_dim def get_model(args, is_train, main_prog, startup_prog): - model, reader, dshape, class_dim = _model_reader_dshape_classdim(args, - is_train) + reader, dshape, class_dim = _model_reader_dshape_classdim(args, is_train) pyreader = None trainer_count = int(os.getenv("PADDLE_TRAINERS")) @@ -198,7 +189,8 @@ def get_model(args, is_train, main_prog, startup_prog): label = fluid.layers.data( name='label', shape=[1], dtype='int64') - predict = model(input, class_dim, is_train=is_train) + model = ResNet(is_train=is_train) + predict = model.net(input, class_dim=class_dim) cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) @@ -216,15 +208,14 @@ def get_model(args, is_train, main_prog, startup_prog): total_images = 1281167 / trainer_count step = int(total_images / args.batch_size + 1) - epochs = [30, 60, 80, 90] + epochs = [30, 60, 90] bd = [step * e for e in epochs] base_lr = args.learning_rate lr = [] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] optimizer = fluid.optimizer.Momentum( - learning_rate=base_lr, - #learning_rate=fluid.layers.piecewise_decay( - # boundaries=bd, values=lr), + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), momentum=0.9, regularization=fluid.regularizer.L2Decay(1e-4)) optimizer.minimize(avg_cost) From faf8ad2436522576cd1fdf0b783291e519308859 Mon Sep 17 00:00:00 2001 From: Bai Yifan Date: Tue, 11 Sep 2018 15:33:36 +0800 Subject: [PATCH 29/85] Add ignore_index in cross_entropy op (#13217) * add ignore index * update api.spec * enhance softmax_with_cross_entropy --- paddle/fluid/API.spec | 4 +-- paddle/fluid/operators/cross_entropy_op.cc | 5 +++ paddle/fluid/operators/cross_entropy_op.h | 26 +++++++++----- paddle/fluid/operators/math/cross_entropy.cc | 9 +++-- paddle/fluid/operators/math/cross_entropy.cu | 15 +++++--- paddle/fluid/operators/math/cross_entropy.h | 3 +- .../softmax_with_cross_entropy_op.cc | 6 ++++ .../softmax_with_cross_entropy_op.cu | 11 +++--- .../operators/softmax_with_cross_entropy_op.h | 3 +- python/paddle/fluid/layers/nn.py | 22 +++++++++--- .../tests/unittests/test_cross_entropy_op.py | 29 +++++++++++++++ .../fluid/tests/unittests/test_layers.py | 9 +++++ .../test_softmax_with_cross_entropy_op.py | 35 +++++++++++++++++++ 13 files changed, 148 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index ae5f30e431..842fde1ec5 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -100,7 +100,7 @@ paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_att paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None)) @@ -142,7 +142,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) -paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)) diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index 578ab63bc3..66f19fe7ec 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -138,6 +138,11 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false), a flag indicating whether to " "interpretate the given labels as soft labels.") .SetDefault(false); + AddAttr("ignore_index", + "(int, default -100), Specifies a target value that is" + "ignored and does not contribute to the input gradient." + "Only valid if soft_label is set to False") + .SetDefault(-100); AddComment(R"DOC( CrossEntropy Operator. diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index 36b58d8014..03974a7fc5 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -40,7 +40,7 @@ class CrossEntropyOpKernel : public framework::OpKernel { math::CrossEntropyFunctor()( ctx.template device_context(), &y_2d, &x_2d, &labels_2d, - ctx.Attr("soft_label")); + ctx.Attr("soft_label"), ctx.Attr("ignore_index")); } }; @@ -74,16 +74,22 @@ class XeGradFunctor { const T* dy, // NOLINT const T* x, // NOLINT const int64_t* label, // NOLINT - size_t num_classes) - : dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {} + size_t num_classes, size_t ignore_index) + : dx_(dx), + dy_(dy), + x_(x), + label_(label), + num_classes_(num_classes), + ignore_index_(ignore_index) {} HOSTDEVICE void operator()(size_t sample_id) { auto x_is_true_offset = sample_id * num_classes_ + label_[sample_id]; for (size_t x_offset = sample_id * num_classes_; x_offset < (sample_id + 1) * num_classes_; ++x_offset) { - dx_[x_offset] = x_offset != x_is_true_offset - ? static_cast(0) - : -dy_[sample_id] / x_[x_offset]; + dx_[x_offset] = + (x_offset != x_is_true_offset || label_[sample_id] == ignore_index_) + ? static_cast(0) + : -dy_[sample_id] / x_[x_offset]; } } @@ -93,6 +99,7 @@ class XeGradFunctor { const T* x_; const int64_t* label_; size_t num_classes_; + size_t ignore_index_; }; template @@ -109,6 +116,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { // unnecessary to convert tensors to 2-D views. int rank = x->dims().size(); int64_t class_num = x->dims()[rank - 1]; + int64_t ignore_index = ctx.Attr("ignore_index"); if (ctx.Attr("soft_label")) { XeSoftlabelGradFunctor functor(dx_data, dy->data(), x->data(), label->data(), @@ -118,9 +126,9 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { static_cast(dx->numel())); for_range(functor); } else { - XeGradFunctor functor(dx_data, dy->data(), x->data(), - label->data(), - static_cast(class_num)); + XeGradFunctor functor( + dx_data, dy->data(), x->data(), label->data(), + static_cast(class_num), static_cast(ignore_index)); platform::ForRange for_range( ctx.template device_context(), static_cast(dy->numel())); diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc index caff35e03a..18bf1a66f6 100644 --- a/paddle/fluid/operators/math/cross_entropy.cc +++ b/paddle/fluid/operators/math/cross_entropy.cc @@ -28,7 +28,8 @@ class CrossEntropyFunctor { public: void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out, const framework::Tensor* prob, - const framework::Tensor* labels, const bool softLabel) { + const framework::Tensor* labels, const bool softLabel, + const int ignore_index) { const int batch_size = prob->dims()[0]; if (softLabel) { auto in = EigenMatrix::From(*prob); @@ -49,8 +50,12 @@ class CrossEntropyFunctor { int lbl = label_data[i]; PADDLE_ENFORCE_GE(lbl, 0); PADDLE_ENFORCE_LT(lbl, class_num); + PADDLE_ENFORCE((lbl >= 0 && lbl < class_num) || lbl == ignore_index); int index = i * class_num + lbl; - loss_data[i] = -math::TolerableValue()(std::log(prob_data[index])); + loss_data[i] = + lbl == ignore_index + ? 0 + : -math::TolerableValue()(std::log(prob_data[index])); } } } diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index 0de58d5fdd..c92341ea55 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -23,11 +23,14 @@ namespace math { namespace { template __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, - const int N, const int D) { + const int N, const int D, + const int ignore_index) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { - PADDLE_ASSERT(label[i] >= 0 && label[i] < D); - Y[i] = -math::TolerableValue()(log(X[i * D + label[i]])); + PADDLE_ASSERT(label[i] >= 0 && label[i] < D || label[i] == ignore_index); + Y[i] = ignore_index == label[i] + ? 0 + : -math::TolerableValue()(log(X[i * D + label[i]])); } } @@ -57,7 +60,8 @@ class CrossEntropyFunctor { public: void operator()(const platform::CUDADeviceContext& ctx, framework::Tensor* out, const framework::Tensor* prob, - const framework::Tensor* labels, bool softLabel) { + const framework::Tensor* labels, bool softLabel, + const int ignore_index) { const T* prob_data = prob->data(); T* loss_data = out->mutable_data(ctx.GetPlace()); @@ -77,7 +81,8 @@ class CrossEntropyFunctor { int block = 512; int grid = (batch_size + block - 1) / block; CrossEntropyKernel<<>>( - loss_data, prob_data, label_data, batch_size, class_num); + loss_data, prob_data, label_data, batch_size, class_num, + ignore_index); } } }; diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h index adc5b3fe47..e8aeb5d057 100644 --- a/paddle/fluid/operators/math/cross_entropy.h +++ b/paddle/fluid/operators/math/cross_entropy.h @@ -38,7 +38,8 @@ class CrossEntropyFunctor { public: void operator()(const DeviceContext& context, framework::Tensor* out, const framework::Tensor* prob, - const framework::Tensor* labels, const bool softLabel); + const framework::Tensor* labels, const bool softLabel, + const int ignore_index); }; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index 53cb716a97..1a9324ec86 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -44,6 +44,12 @@ class SoftmaxWithCrossEntropyOpMaker "(bool, default: false), A flag to indicate whether to interpretate " "the given labels as soft labels.") .SetDefault(false); + AddAttr( + "ignore_index", + "(int, default -100), Specifies a target value that is ignored and" + "does not contribute to the input gradient. Only valid if soft_label" + "is set to False") + .SetDefault(-100); AddComment(R"DOC( Softmax With Cross Entropy Operator. diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index a559b01ed3..148faec4af 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -26,7 +26,8 @@ using Tensor = framework::Tensor; namespace { template __global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels, - const int batch_size, const int class_num) { + const int batch_size, const int class_num, + const int ignore_index) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch_size; i += blockDim.x * gridDim.x) { int idx = i * class_num + labels[i]; @@ -260,6 +261,7 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { auto* loss_data = loss->mutable_data(context.GetPlace()); auto soft_label = context.Attr("soft_label"); + auto ignore_index = context.Attr("ignore_index"); if (soft_label) { int batch_size = logits->dims()[0]; int feature_size = logits->dims()[1]; @@ -272,7 +274,8 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { math::SoftmaxCUDNNFunctor()(context.cuda_device_context(), logits, softmax); math::CrossEntropyFunctor()( - context.cuda_device_context(), loss, softmax, labels, false); + context.cuda_device_context(), loss, softmax, labels, false, + ignore_index); } } }; @@ -295,7 +298,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { const int class_num = logit_grad->dims()[1]; int block = 512; auto stream = context.cuda_device_context().stream(); - + auto ignore_index = context.Attr("ignore_index"); if (context.Attr("soft_label")) { int grid = (batch_size * class_num + block - 1) / block; const T* label_data = labels->data(); @@ -305,7 +308,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { int grid = (batch_size + block - 1) / block; const int64_t* label_data = labels->data(); CrossEntropyGrad<<>>( - logit_grad_data, label_data, batch_size, class_num); + logit_grad_data, label_data, batch_size, class_num, ignore_index); int num = batch_size * class_num; grid = (num + block - 1) / block; Scale<<>>(logit_grad_data, loss_grad_data, num, diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index dd6f6aca5a..e9aba3b37b 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -45,7 +45,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { math::SoftmaxFunctor()(dev_ctx, logits, softmax); math::CrossEntropyFunctor()( - dev_ctx, loss, softmax, labels, context.Attr("soft_label")); + dev_ctx, loss, softmax, labels, context.Attr("soft_label"), + context.Attr("ignore_index")); } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 8408e6d2a1..3ae0fac4be 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -968,7 +968,7 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None): return out -def cross_entropy(input, label, soft_label=False): +def cross_entropy(input, label, soft_label=False, ignore_index=-100): """ **Cross Entropy Layer** @@ -1012,7 +1012,10 @@ def cross_entropy(input, label, soft_label=False): tensor with shape [N x D]. soft_label (bool): a flag indicating whether to interpretate the given labels as soft - labels, default `False`. + labels. Default: `False`. + ignore_index (int): Specifies a target value that is ignored and does + not contribute to the input gradient. Only valid + if soft_label is set to False. Default: -100 Returns: A 2-D tensor with shape [N x 1], the cross entropy loss. @@ -1037,7 +1040,8 @@ def cross_entropy(input, label, soft_label=False): inputs={'X': [input], 'Label': [label]}, outputs={'Y': [out]}, - attrs={"soft_label": soft_label}) + attrs={"soft_label": soft_label, + "ignore_index": ignore_index}) return out @@ -4242,7 +4246,10 @@ def multiplex(inputs, index): return out -def softmax_with_cross_entropy(logits, label, soft_label=False): +def softmax_with_cross_entropy(logits, + label, + soft_label=False, + ignore_index=-100): """ **Softmax With Cross Entropy Operator.** @@ -4284,6 +4291,10 @@ def softmax_with_cross_entropy(logits, label, soft_label=False): soft_label is set to true, Label is a Tensor with soft_label (bool): A flag to indicate whether to interpretate the given labels as soft labels. By default, `soft_label` is set to False. + ignore_index (int): Specifies a target value that is ignored and does + not contribute to the input gradient. Only valid + if soft_label is set to False. Default: -100 + Returns: Variable: The cross entropy loss is a 2-D tensor with shape [N x 1]. @@ -4305,7 +4316,8 @@ def softmax_with_cross_entropy(logits, label, soft_label=False): 'Label': label}, outputs={'Softmax': softmax, 'Loss': loss}, - attrs={'soft_label': soft_label}) + attrs={'soft_label': soft_label, + 'ignore_index': ignore_index}) return loss diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py index fa367f95fc..f22badbea0 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py @@ -209,5 +209,34 @@ class TestCrossEntropyOp6(OpTest): ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001) +class TestCrossEntropyOp7(OpTest): + """Test cross-entropy with ignore index. + """ + + def setUp(self): + self.op_type = "cross_entropy" + batch_size = 30 + class_num = 10 + ignore_index = 3 + + X = randomize_probability(batch_size, class_num, dtype='float64') + + label = np.random.randint(0, class_num, (batch_size, 1), dtype="int64") + cross_entropy = np.asmatrix( + [[-np.log(X[i][label[i][0]])] + if label[i][0] != ignore_index else [0] + for i in range(X.shape[0])], + dtype="float64") + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": False, "ignore_index": ignore_index} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Y", numeric_grad_delta=0.001) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index bc4d364c74..b04346b052 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -556,6 +556,15 @@ class TestBook(unittest.TestCase): out = layers.sequence_enumerate(input=x, win_size=2, pad_value=0) print(str(program)) + def test_cross_entropy(self): + program = Program() + with program_guard(program): + x = layers.data(name="x", shape=[30, 10], dtype="float32") + label = layers.data(name="label", shape=[30, 1], dtype="int32") + mode = 'channel' + out = layers.cross_entropy(x, label, False, 4) + self.assertIsNotNone(out) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py index b7e5ff6d52..a18941dd31 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py @@ -88,5 +88,40 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest): self.check_grad(["Logits"], "Loss") +class TestSoftmaxWithCrossEntropyOp3(OpTest): + """ + Test softmax with cross entropy operator with ignore_index. + """ + + def setUp(self): + self.op_type = "softmax_with_cross_entropy" + batch_size = 41 + class_num = 37 + + logits = np.random.uniform(0.1, 1.0, + [batch_size, class_num]).astype("float64") + softmax = np.apply_along_axis(stable_softmax, 1, logits) + labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64") + ignore_index = 7 + cross_entropy = np.asmatrix( + [[-np.log(softmax[i][labels[i][0]])] + if labels[i] != ignore_index else [0] + for i in range(softmax.shape[0])], + dtype="float64") + + self.inputs = {"Logits": logits, "Label": labels} + self.outputs = { + "Softmax": softmax.astype("float64"), + "Loss": cross_entropy.astype("float64") + } + self.attrs = {"ignore_index": ignore_index} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["Logits"], "Loss") + + if __name__ == "__main__": unittest.main() From 1ce9e9dc3072f50e7e827fe6b63d59e3eb883196 Mon Sep 17 00:00:00 2001 From: Krzysztof Binias Date: Mon, 10 Sep 2018 15:29:51 +0200 Subject: [PATCH 30/85] Renaming decision variable --- paddle/fluid/operators/conv_mkldnn_op.cc | 4 ++-- paddle/fluid/platform/mkldnn_helper.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 1ccf2494f2..244a578db8 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -131,12 +131,12 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr AcquireWeightsMemoryFromPrimitive( const std::shared_ptr user_weights_memory_p, const std::vector& pipeline, - bool is_test = false) { // NOLINT + bool is_persistent = false) { // NOLINT auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); auto weights_pd = conv_pd_->weights_primitive_desc(); return this->AcquireMemory(weights_pd, user_weights_pd, user_weights_memory_p, "@weights_mem_p", - pipeline, is_test); + pipeline, is_persistent); } std::shared_ptr AcquireBiasMemoryFromPrimitive( diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index c64e5dafda..cf08202ccb 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -192,7 +192,7 @@ class MKLDNNHandler { mkldnn::memory::primitive_desc& user_mpd, // NOLINT const std::shared_ptr user_memory_p, const std::string& suffix, const std::vector& pipeline, - bool is_test = false) { // NOLINT + bool is_persistent = false) { // NOLINT // create reorder primitive if the input format is not the preferred one auto local_key = key_ + suffix; auto key_reorder_p = key_ + suffix + "reorder_p"; @@ -213,7 +213,7 @@ class MKLDNNHandler { pipeline.push_back(*reorder_p); } dev_ctx_.SetBlob(local_key, target_memory_p); - } else if (!is_test) { + } else if (!is_persistent) { // Make reorder if needed auto reorder_p = std::static_pointer_cast( dev_ctx_.GetBlob(key_reorder_p)); From e0436ad8bbaed57b9c2c60f100d1e1f86fe42e07 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 11 Sep 2018 16:07:07 +0800 Subject: [PATCH 31/85] refine fusion lstm infershape --- paddle/fluid/framework/operator.cc | 277 ++++++++----------- paddle/fluid/framework/shape_runtime_infer.h | 86 ++++++ paddle/fluid/operators/fusion_lstm_op.cc | 81 ++++-- 3 files changed, 260 insertions(+), 184 deletions(-) create mode 100644 paddle/fluid/framework/shape_runtime_infer.h diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index d58d6e4f3e..36025db7ba 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/shape_runtime_infer.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/platform/profiler.h" @@ -458,187 +459,147 @@ bool OpSupportGPU(const std::string& op_type) { return false; } -class RuntimeInferShapeContext : public InferShapeContext { - public: - RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope) - : op_(op), scope_(scope) {} - - bool HasInput(const std::string& name) const override { - if (!op_.HasInputs(name)) { - return false; - } - auto& ins = Inputs(name); - size_t length = ins.size(); - if (length == 0) { - return false; - } - PADDLE_ENFORCE_EQ(length, 1UL, - "Input %s should not have more than one inputs", name); - auto ipt = ins[0]; - auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); - return var != nullptr; +bool RuntimeInferShapeContext::HasInput(const std::string& name) const { + if (!op_.HasInputs(name)) { + return false; } - - bool HasOutput(const std::string& name) const override { - if (!op_.HasOutputs(name)) { - return false; - } - auto& outs = Outputs(name); - size_t length = outs.size(); - if (length == 0) { - return false; - } - PADDLE_ENFORCE_EQ(length, 1UL, - "Output %s should not have more than one inputs", name); - auto ipt = outs[0]; - auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); - return var != nullptr; + auto& ins = Inputs(name); + size_t length = ins.size(); + if (length == 0) { + return false; } + PADDLE_ENFORCE_EQ(length, 1UL, + "Input %s should not have more than one inputs", name); + auto ipt = ins[0]; + auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); + return var != nullptr; +} - bool HasInputs(const std::string& name) const override { - if (!op_.HasInputs(name)) { - return false; - } - auto inputs = op_.Inputs(name); - if (inputs.empty()) { - return false; - } - for (auto& input : inputs) { - if (scope_.FindVar(input) == nullptr) { - return false; - } - } - return true; +bool RuntimeInferShapeContext::HasOutput(const std::string& name) const { + if (!op_.HasOutputs(name)) { + return false; } + auto& outs = Outputs(name); + size_t length = outs.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Output %s should not have more than one inputs", name); + auto ipt = outs[0]; + auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); + return var != nullptr; +} - bool HasOutputs(const std::string& name) const override { - if (!op_.HasOutputs(name)) { - return false; - } - auto outputs = op_.Outputs(name); - if (outputs.empty()) { +bool RuntimeInferShapeContext::HasInputs(const std::string& name) const { + if (!op_.HasInputs(name)) { + return false; + } + auto inputs = op_.Inputs(name); + if (inputs.empty()) { + return false; + } + for (auto& input : inputs) { + if (scope_.FindVar(input) == nullptr) { return false; } - for (auto& output : outputs) { - if (scope_.FindVar(output) == nullptr) { - return false; - } - } - return true; } + return true; +} - AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } - - const std::vector& Inputs( - const std::string& name) const override { - return op_.Inputs(name); +bool RuntimeInferShapeContext::HasOutputs(const std::string& name) const { + if (!op_.HasOutputs(name)) { + return false; } - - const std::vector& Outputs( - const std::string& name) const override { - return op_.Outputs(name); + auto outputs = op_.Outputs(name); + if (outputs.empty()) { + return false; } + for (auto& output : outputs) { + if (scope_.FindVar(output) == nullptr) { + return false; + } + } + return true; +} - void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, - size_t j = 0) const override { - PADDLE_ENFORCE_LT(i, Inputs(in).size()); - PADDLE_ENFORCE_LT(j, Outputs(out).size()); - Variable* in_var = scope_.FindVar(Inputs(in)[i]); - Variable* out_var = scope_.FindVar(Outputs(out)[j]); - if (!in_var->IsType()) return; - PADDLE_ENFORCE(out_var->IsType(), - "The %d-th output of Output(%s) must be LoDTensor.", j, out); - auto in_tensor = in_var->Get(); - auto* out_tensor = out_var->GetMutable(); - out_tensor->set_lod(in_tensor.lod()); +void RuntimeInferShapeContext::ShareLoD(const std::string& in, + const std::string& out, size_t i, + size_t j) const { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + Variable* in_var = scope_.FindVar(Inputs(in)[i]); + Variable* out_var = scope_.FindVar(Outputs(out)[j]); + if (!in_var->IsType()) return; + PADDLE_ENFORCE(out_var->IsType(), + "The %d-th output of Output(%s) must be LoDTensor.", j, out); + auto in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_lod(in_tensor.lod()); // TODO(dzhwinter) : reuse ShareLoD in most operators. // Need to call ShareLayout explicitly in sequence related ops. // Shall we have a better method to shared info between in/out Tensor? #ifdef PADDLE_WITH_MKLDNN - // Fix me: ugly workaround below - // Correct solution: - // set_layout() should NOT be called here (i.e. ShareLoD). Instead, - // layout of output tensor should be set "manually" in Compute() - // of each OPKernel. The reason layout should NOT be shared between - // input and output "automatically" (now by InferShape()->ShareLoD()) - // is that layout transform may occur after InferShape(). - // Workaround: - // Skip set_layout() when input layout is kMKLDNN - // This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN - // OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called - // in Compute() - if (in_tensor.layout() != DataLayout::kMKLDNN) + // Fix me: ugly workaround below + // Correct solution: + // set_layout() should NOT be called here (i.e. ShareLoD). Instead, + // layout of output tensor should be set "manually" in Compute() + // of each OPKernel. The reason layout should NOT be shared between + // input and output "automatically" (now by InferShape()->ShareLoD()) + // is that layout transform may occur after InferShape(). + // Workaround: + // Skip set_layout() when input layout is kMKLDNN + // This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN + // OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called + // in Compute() + if (in_tensor.layout() != DataLayout::kMKLDNN) #endif - out_tensor->set_layout(in_tensor.layout()); - } - - void ShareLayout(const std::string& in, const std::string& out, size_t i = 0, - size_t j = 0) const { - PADDLE_ENFORCE_LT(i, Inputs(in).size()); - PADDLE_ENFORCE_LT(j, Outputs(out).size()); - Variable* in_var = scope_.FindVar(Inputs(in)[i]); - Variable* out_var = scope_.FindVar(Outputs(out)[j]); - if (!in_var->IsType()) return; - PADDLE_ENFORCE(out_var->IsType(), - "The %d-th output of Output(%s) must be LoDTensor.", j, out); - auto in_tensor = in_var->Get(); - auto* out_tensor = out_var->GetMutable(); out_tensor->set_layout(in_tensor.layout()); - } - - bool IsRuntime() const override { return true; } - - protected: - DDim GetDim(const std::string& name) const override { - Variable* var = scope_.FindVar(name); - PADDLE_ENFORCE_NOT_NULL(var); - if (var->IsType()) { - return var->Get().dims(); - } else if (var->IsType()) { - return var->Get().GetCompleteDims(); - } else { - PADDLE_THROW( - "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's " - "type_id is %s.", - name, var->Type().name()); - } - } - - std::vector GetRepeatedDims(const std::string& name) const override { - PADDLE_THROW("Only compile time support this method"); - } - - void SetDim(const std::string& name, const DDim& dim) override { - Variable* var = scope_.FindVar(name); - if (var->IsType()) { - var->GetMutable()->Resize(dim); - } else if (var->IsType()) { - var->GetMutable()->set_height(dim[0]); - } else { - PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", - name, var->Type().name()); - } - } - - void SetRepeatedDims(const std::string& name, - const std::vector& dims) override { - PADDLE_THROW("Only compile time support this method"); - } +} - proto::VarType::Type GetVarType(const std::string& name) const override { - auto* var = scope_.FindVar(name); - return ToVarType(var->Type()); +void RuntimeInferShapeContext::ShareLayout(const std::string& in, + const std::string& out, size_t i, + size_t j) const { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + Variable* in_var = scope_.FindVar(Inputs(in)[i]); + Variable* out_var = scope_.FindVar(Outputs(out)[j]); + if (!in_var->IsType()) return; + PADDLE_ENFORCE(out_var->IsType(), + "The %d-th output of Output(%s) must be LoDTensor.", j, out); + auto in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_layout(in_tensor.layout()); +} + +DDim RuntimeInferShapeContext::GetDim(const std::string& name) const { + Variable* var = scope_.FindVar(name); + PADDLE_ENFORCE_NOT_NULL(var); + if (var->IsType()) { + return var->Get().dims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); + } else { + PADDLE_THROW( + "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's " + "type_id is %s.", + name, var->Type().name()); } +} - InferShapeVarPtr GetVarPtr(const std::string& name) override { - return scope_.FindVar(name); +void RuntimeInferShapeContext::SetDim(const std::string& name, + const DDim& dim) { + Variable* var = scope_.FindVar(name); + if (var->IsType()) { + var->GetMutable()->Resize(dim); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); + } else { + PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", name, + var->Type().name()); } - - private: - const OperatorBase& op_; - const Scope& scope_; -}; +} static void CheckTensorNANOrInf(const std::string& name, const framework::Tensor& tensor) { diff --git a/paddle/fluid/framework/shape_runtime_infer.h b/paddle/fluid/framework/shape_runtime_infer.h new file mode 100644 index 0000000000..04d4e33f7a --- /dev/null +++ b/paddle/fluid/framework/shape_runtime_infer.h @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/var_type.h" + +namespace paddle { +namespace framework { + +class RuntimeInferShapeContext : public InferShapeContext { + public: + RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope) + : op_(op), scope_(scope) {} + + bool HasInput(const std::string& name) const override; + bool HasOutput(const std::string& name) const override; + bool HasInputs(const std::string& name) const override; + bool HasOutputs(const std::string& name) const override; + + const OperatorBase& OpBase() const { return op_; } + + const Scope& InferScope() const { return scope_; } + AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } + + const std::vector& Inputs( + const std::string& name) const override { + return op_.Inputs(name); + } + + const std::vector& Outputs( + const std::string& name) const override { + return op_.Outputs(name); + } + + void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const override; + + void ShareLayout(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const; + + bool IsRuntime() const override { return true; } + + protected: + DDim GetDim(const std::string& name) const override; + void SetDim(const std::string& name, const DDim& dim) override; + + std::vector GetRepeatedDims(const std::string& name) const override { + PADDLE_THROW("Only compile time support this method"); + } + void SetRepeatedDims(const std::string& name, + const std::vector& dims) override { + PADDLE_THROW("Only compile time support this method"); + } + + proto::VarType::Type GetVarType(const std::string& name) const override { + auto* var = scope_.FindVar(name); + return ToVarType(var->Type()); + } + + InferShapeVarPtr GetVarPtr(const std::string& name) override { + return scope_.FindVar(name); + } + + private: + const OperatorBase& op_; + const Scope& scope_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index ef23ab3f98..ae9d5d78ae 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fusion_lstm_op.h" #include +#include "paddle/fluid/framework/shape_runtime_infer.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc_compute.h" @@ -24,26 +25,54 @@ namespace paddle { namespace operators { void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("WeightX"), - "Input(WeightX) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("WeightH"), - "Input(WeightH) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Bias"), - "Input(Bias) of LSTM should not be null."); - - PADDLE_ENFORCE(ctx->HasOutput("XX"), - "Output(XX) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Hidden"), - "Output(Hidden) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Cell"), - "Output(Cell) of LSTM should not be null."); + auto* runtime_ctx = dynamic_cast(ctx); + if (runtime_ctx == nullptr) { + LOG(FATAL) << "Should have runtime infer context"; + } + const auto& ins = runtime_ctx->OpBase().Inputs(); + const auto& outs = runtime_ctx->OpBase().Outputs(); + const auto& scope = runtime_ctx->InferScope(); + const auto ins_end = ins.end(); + const auto outs_end = outs.end(); + auto fair_input = [&](const std::string& name) -> bool { + auto it = ins.find(name); + if (it == ins_end) { + return false; + } + const auto& in = it->second; + if (in.size() != 1 || in[0] == framework::kEmptyVarName) { + return false; + } + return scope.FindVar(in[0]) != nullptr; + }; + auto fair_output = [&](const std::string& name) -> bool { + auto it = outs.find(name); + if (it == outs_end) { + return false; + } + const auto& out = it->second; + if (out.size() != 1 || out[0] == framework::kEmptyVarName) { + return false; + } + return scope.FindVar(out[0]) != nullptr; + }; + + PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of LSTM."); + PADDLE_ENFORCE(fair_input("WeightX"), + "Assert only one Input(WeightX) of LSTM."); + PADDLE_ENFORCE(fair_input("WeightH"), + "Assert only one Input(WeightH) of LSTM."); + PADDLE_ENFORCE(fair_input("Bias"), "Assert only one Input(Bias) of LSTM."); + PADDLE_ENFORCE(fair_output("XX"), "Assert only one Output(XX) of LSTM."); + PADDLE_ENFORCE(fair_output("Hidden"), + "Assert only one Output(Hidden) of LSTM."); + PADDLE_ENFORCE(fair_output("Cell"), "Assert only one Output(Cell) of LSTM."); auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); - if (ctx->HasInput("H0")) { - PADDLE_ENFORCE(ctx->HasInput("C0"), + if (fair_input("H0")) { + PADDLE_ENFORCE(fair_input("C0"), "Input(Cell) and Input(Hidden) of LSTM should not " "be null at the same time."); auto h_dims = ctx->GetInputDim("H0"); @@ -95,16 +124,16 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { xx_width = wx_dims[1]; } else { xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; - PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"), - "Output(BatchedInput) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"), - "Output(BatchedHidden) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"), - "Output(BatchedCell) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"), - "Output(ReorderedH0) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"), - "Output(ReorderedC0) of LSTM should not be null."); + PADDLE_ENFORCE(fair_output("BatchedInput"), + "Assert only one Output(BatchedInput) of LSTM."); + PADDLE_ENFORCE(fair_output("BatchedHidden"), + "Assert only one Output(BatchedHidden) of LSTM."); + PADDLE_ENFORCE(fair_output("BatchedCell"), + "Assert only one Output(BatchedCell) of LSTM."); + PADDLE_ENFORCE(fair_output("ReorderedH0"), + "Assert only one Output(ReorderedH0) of LSTM"); + PADDLE_ENFORCE(fair_output("ReorderedC0"), + "Assert only one Output(ReorderedC0) of LSTM."); ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); ctx->SetOutputDim("BatchedHidden", out_dims); ctx->SetOutputDim("BatchedCell", out_dims); From b681537e1a873a08e1b2f5a4bb78772ee0353279 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 11 Sep 2018 16:24:28 +0800 Subject: [PATCH 32/85] Add multiprocess reader (#13311) * add multiprocess_reader * add multiprocess_reader to reader decorator * support piped multi process reader * revert v2 decorator * add comment to multiprocess_reader * optimize code * use ujson to speed up json serialize/deserialize * add assert to multiprocess_reader * update comment of multiprocess_reader * optimize ujson import, handle error case * optimize import ujson * remove ujson from requirements.txt * add import sys to decorator.py --- python/paddle/reader/decorator.py | 99 +++++++++++++++++++- python/paddle/reader/tests/decorator_test.py | 29 ++++++ 2 files changed, 127 insertions(+), 1 deletion(-) diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index 6d7ac876fd..5b9459b670 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -14,11 +14,14 @@ __all__ = [ 'map_readers', 'buffered', 'compose', 'chain', 'shuffle', - 'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader' + 'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader', + 'multiprocess_reader' ] from threading import Thread import subprocess +import multiprocessing +import sys from six.moves.queue import Queue from six.moves import zip_longest @@ -332,6 +335,100 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): return xreader +def multiprocess_reader(readers, use_pipe=True, queue_size=1000): + """ + multiprocess_reader use python multi process to read data from readers + and then use multiprocess.Queue or multiprocess.Pipe to merge all + data. The process number is equal to the number of input readers, each + process call one reader. + + Multiprocess.Queue require the rw access right to /dev/shm, some + platform does not support. + + you need to create multiple readers first, these readers should be independent + to each other so that each process can work independently. + + An example: + + .. code-block:: python + + reader0 = reader(["file01", "file02"]) + reader1 = reader(["file11", "file12"]) + reader1 = reader(["file21", "file22"]) + reader = multiprocess_reader([reader0, reader1, reader2], + queue_size=100, use_pipe=False) + """ + + try: + import ujson as json + except Exception as e: + sys.stderr.write("import ujson error: " + str(e) + " use json\n") + import json + + assert type(readers) is list and len(readers) > 0 + + def _read_into_queue(reader, queue): + for sample in reader(): + if sample is None: + raise ValueError("sample has None") + queue.put(sample) + queue.put(None) + + def queue_reader(): + queue = multiprocessing.Queue(queue_size) + for reader in readers: + p = multiprocessing.Process( + target=_read_into_queue, args=(reader, queue)) + p.start() + + reader_num = len(readers) + finish_num = 0 + while finish_num < reader_num: + sample = queue.get() + if sample is None: + finish_num += 1 + else: + yield sample + + def _read_into_pipe(reader, conn): + for sample in reader(): + if sample is None: + raise ValueError("sample has None!") + conn.send(json.dumps(sample)) + conn.send(json.dumps(None)) + conn.close() + + def pipe_reader(): + conns = [] + for reader in readers: + parent_conn, child_conn = multiprocessing.Pipe() + conns.append(parent_conn) + p = multiprocessing.Process( + target=_read_into_pipe, args=(reader, child_conn)) + p.start() + + reader_num = len(readers) + finish_num = 0 + conn_to_remove = [] + while finish_num < reader_num: + for conn in conn_to_remove: + conns.remove(conn) + conn_to_remove = [] + for conn in conns: + sample = json.loads(conn.recv()) + if sample is None: + finish_num += 1 + conn.close() + conn_to_remove.append(conn) + else: + yield sample + + if use_pipe: + return pipe_reader + else: + return queue_reader + + def _buf2lines(buf, line_break="\n"): # FIXME: line_break should be automatically configured. lines = buf.split(line_break) diff --git a/python/paddle/reader/tests/decorator_test.py b/python/paddle/reader/tests/decorator_test.py index 537df489b9..c324092f88 100644 --- a/python/paddle/reader/tests/decorator_test.py +++ b/python/paddle/reader/tests/decorator_test.py @@ -14,6 +14,7 @@ import time import unittest +import functools import paddle.reader @@ -174,5 +175,33 @@ class TestPipeReader(unittest.TestCase): temp.close() +class TestMultiProcessReader(unittest.TestCase): + def setup(self): + self.samples = [] + for i in range(1000): + self.samples.append([[i], [i + 1, i + 2], i + 3]) + + def reader(index): + for i in range(len(self.samples)): + if i % 3 == index: + yield self.samples[i] + + self.reader0 = functools.partial(reader, 0) + self.reader1 = functools.partial(reader, 1) + self.reader2 = functools.partial(reader, 2) + + def reader_test(self, use_pipe): + self.setup() + results = [] + for data in paddle.reader.multiprocess_reader( + [self.reader0, self.reader1, self.reader2], 100, use_pipe)(): + results.append(data) + self.assertEqual(sorted(self.samples), sorted(results)) + + def test_multi_process_reader(self): + self.reader_test(use_pipe=False) + self.reader_test(use_pipe=True) + + if __name__ == '__main__': unittest.main() From 03ff4f689213a6dc2c469dfd0c2cffe16e6b418d Mon Sep 17 00:00:00 2001 From: nhzlx Date: Tue, 11 Sep 2018 08:27:24 +0000 Subject: [PATCH 33/85] fix subgraph bug! --- .../inference/analysis/data_flow_graph.cc | 39 +--- .../inference/analysis/data_flow_graph.h | 3 - .../analysis/data_flow_graph_to_fluid_pass.cc | 25 ++- .../inference/analysis/subgraph_splitter.cc | 186 +++++++++++++++++- .../analysis/subgraph_splitter_tester.cc | 2 +- paddle/fluid/operators/tensorrt_engine_op.h | 20 +- 6 files changed, 215 insertions(+), 60 deletions(-) diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc index e4f4bbf43c..8c7d58678f 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -440,6 +440,7 @@ ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT } return false; }; + for (auto &node : graph) { for (auto *in : node->inlinks) { // The Value that is written by nodes inside a sub-graph shouldn't be the @@ -459,6 +460,7 @@ ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT std::vector(outputs.begin(), outputs.end())); } +// Filter the Intermediate results of the subgraph node. void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { std::vector op_nodes; for (auto &node : GraphTraits(*graph).nodes_in_TS()) { @@ -484,46 +486,11 @@ void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { out->SetDeleted(); } } - PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL); + // The filtered_subgraph_outlinks may be empty. op_nodes[i]->outlinks = filtered_subgraph_outlinks; } } -void FlexibleDFS(const std::vector &source, bool reverse, - const std::function &enter, - const std::function &leave) { - typedef struct { - const Node *node; - bool leave; - } FNode; - std::vector stack; - for (auto &node : source) { - stack.push_back(FNode{node, false}); - } - std::unordered_set visited; - while (!stack.empty()) { - auto fnode = stack.back(); - stack.pop_back(); - - if (fnode.leave) { - if (leave && !leave(fnode.node)) return; - } - if (visited.count(fnode.node)) continue; - visited.insert(fnode.node); - - if (enter && !enter(fnode.node)) return; - - if (leave) stack.push_back(FNode{fnode.node, true}); - const std::vector iter_nodes = - reverse == true ? fnode.node->inlinks : fnode.node->outlinks; - for (const Node *node : iter_nodes) { - if (!visited.count(node)) { - stack.push_back(FNode{node, false}); - } - } - } -} - } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index 4fefc175f3..437e097acd 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -204,9 +204,6 @@ std::pair, std::vector> ExtractInputAndOutputOfSubGraph(std::vector &graph); // NOLINT void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph); -void FlexibleDFS(const std::vector &source, bool reverse, - const std::function &enter, - const std::function &leave); } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index 80c85555e7..47e9752ff2 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -106,20 +106,23 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, // collect inputs std::unordered_set input_names; + std::unordered_set input_names_with_id; for (auto *x : func->inlinks) { input_names.insert(x->name()); + input_names_with_id.insert(x->name() + std::to_string(x->id())); } desc.SetInput( "Xs", std::vector(input_names.begin(), input_names.end())); std::unordered_set output_names; + std::unordered_set output_names_with_id; for (auto *x : func->outlinks) { output_names.insert(x->name()); + output_names_with_id.insert(x->name() + std::to_string(x->id())); } - std::vector output_temp(output_names.begin(), - output_names.end()); - desc.SetOutput("Ys", output_temp); + desc.SetOutput( + "Ys", std::vector(output_names.begin(), output_names.end())); desc.SetType("tensorrt_engine"); std::unordered_map output_name_map; @@ -153,11 +156,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, std::vector replaced_names; for (int k = 0; k < in_var->arguments_size(); k++) { std::string arg_value = in_var->arguments(k); - if (input_names.count(arg_value)) { + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (input_names_with_id.count(arg_value_with_id)) { replaced_names.push_back(arg_value); } else { - replaced_names.push_back(arg_value + - std::to_string(var2id[arg_value])); + replaced_names.push_back(arg_value_with_id); } } in_var->clear_arguments(); @@ -176,11 +180,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, std::vector replaced_names; for (int k = 0; k < out_var->arguments_size(); k++) { std::string arg_value = out_var->arguments(k); - if (output_names.count(arg_value)) { - output_name_map[arg_value] = - arg_value + std::to_string(var2id[arg_value]); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (output_names_with_id.count(arg_value_with_id)) { + output_name_map[arg_value] = arg_value_with_id; } - replaced_names.push_back(arg_value + std::to_string(var2id[arg_value])); + replaced_names.push_back(arg_value_with_id); } out_var->clear_arguments(); for (size_t k = 0; k < replaced_names.size(); k++) { diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index 670a8de667..857375fc21 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -74,13 +74,126 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) { node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor; } +// This is a simple representation of a graph. +// The BriefNode hold the pointer of the Node. +// This is to avoid changing the original graph +// in the process of trt graph analysis. +struct BriefNode { + explicit BriefNode(Node *n) { node = n; } + Node *node; + std::vector inlinks; + std::vector outlinks; +}; + +void UnionContractedNodes(const std::unordered_map &node_map, + int src_id, int dst_id) { + // merge the two adjacent nodes into one node. + BriefNode *src_node = node_map.at(src_id); + BriefNode *dst_node = node_map.at(dst_id); + + std::unordered_set inputs(src_node->inlinks.begin(), + src_node->inlinks.end()); + std::unordered_set outputs; + + for (auto *n : src_node->outlinks) { + if (n != dst_node) outputs.insert(n); + } + + // Add the inlinks and outlinks of dst node to src node. + std::vector dst_in_nodes = dst_node->inlinks; + for (BriefNode *node : dst_in_nodes) { + if (node != src_node) { + inputs.insert(node); + } + } + + std::vector dst_out_nodes = dst_node->outlinks; + for (BriefNode *node : dst_out_nodes) { + outputs.insert(node); + } + + // update the dst and src node's inlinks and outlinks. + src_node->inlinks = + std::move(std::vector(inputs.begin(), inputs.end())); + src_node->outlinks = + std::move(std::vector(outputs.begin(), outputs.end())); + dst_node->inlinks.clear(); + dst_node->outlinks.clear(); + + auto inlink_or_outlink_cleaner = [&](std::vector &nodes) { + for (auto *&n : nodes) { + if (n == src_node || n == dst_node) { + n = src_node; + } + } + }; + // Change all the dst inputs and outputs corresponding inlink and + // outlink to the src node. + for (auto *node : src_node->inlinks) { + inlink_or_outlink_cleaner(node->outlinks); + } + + for (auto *node : src_node->outlinks) { + inlink_or_outlink_cleaner(node->inlinks); + } +} + +// FlexibleDfS +// If reverse is true, do reverse dfs. +// If enter func is not nullptr, calls enter(node) before visiting any children +// of node. +// If leave func not nullptr, calls leave(node) after visiting all parents of +// node. +void FlexibleDFS(const std::vector &source, bool reverse, + const std::function &enter, + const std::function &leave) { + typedef struct { + const BriefNode *node; + bool leave; + } FNode; + + std::vector stack; + for (auto &node : source) { + stack.push_back(FNode{node, false}); + } + std::unordered_set visited; + while (!stack.empty()) { + auto fnode = stack.back(); + stack.pop_back(); + + if (fnode.leave) { + if (leave && !leave(fnode.node)) return; + } + if (visited.count(fnode.node)) continue; + visited.insert(fnode.node); + + if (enter && !enter(fnode.node)) return; + + if (leave) stack.push_back(FNode{fnode.node, true}); + const std::vector iter_nodes = + reverse == true ? fnode.node->inlinks : fnode.node->outlinks; + for (const BriefNode *node : iter_nodes) { + if (!visited.count(node)) { + stack.push_back(FNode{node, false}); + } + } + } +} + std::vector> SubGraphSplitter::ExtractSubGraphs() { + // Run the Extract algorithm to find all subgraphs. std::vector marked_nodes; + // We use brief_node_map to represent the original graph in order to avoid + // changing the original graph. + std::unordered_map brief_node_map; + for (auto &node : GraphTraits(*graph_).nodes_in_TS()) { + brief_node_map[node.id()] = new BriefNode(&node); if (node.attr(kMarkerAttrName).Bool()) { marked_nodes.push_back(&node); } } + // extract sub-graphs in the marked node set, use Union Find algorithm. node_map_t node_map; // id to ptr for (auto *n : marked_nodes) { @@ -88,11 +201,73 @@ std::vector> SubGraphSplitter::ExtractSubGraphs() { n->attr(kUnionFindParent).Int32() = n->id(); node_map[n->id()] = n; } - std::unordered_set visited; - for (auto *n : marked_nodes) { - for (auto *out : n->outlinks) { - if (node_map.count(out->id())) { - UnionFindCombine(node_map, n->id(), out->id()); + + // create breif node map + for (auto &itr : brief_node_map) { + for (Node *node : itr.second->node->inlinks) { + itr.second->inlinks.push_back(brief_node_map[node->id()]); + } + + for (Node *node : itr.second->node->outlinks) { + itr.second->outlinks.push_back(brief_node_map[node->id()]); + } + } + + for (auto &itr : brief_node_map) { + BriefNode *brief_node = itr.second; + + if (!brief_node->node->attr(kMarkerAttrName).Bool()) { + VLOG(4) << brief_node->node->id() << " node not a trt candicate."; + continue; + } + + // Our algorithm must guarantee that: + // 1. The graph is always directed acyclic graph(DAG). + // 2. If there is a path in the subgraph from X to Y (X and Y are both + // nodes + // in the subgraph), then all paths from X to Y are in the subgraph. + // + // In order to achieve the above guarantee. + // For adjacent nodes src -> dst. + // 1. Get all dst input nodes except src. + // 2. Reverse DFS from those input nodes + // 3. If there is a path from input nodes to src, + // then the src and dst nodes can not be fused into one node, + // otherwise it can be done. + + while (true) { + std::unordered_set contract_nodes; + for (auto *out : brief_node->outlinks) { + // must be an trt candidate + if (!out->node->attr(kMarkerAttrName).Bool()) continue; + // get all dst input nodes except src. + std::vector source_nodes; + for (auto *n : out->inlinks) { + if (n != brief_node) { + source_nodes.push_back(n); + } + } + + // Reverse DFS from the source_nodes. + bool have_excess_path = false; + FlexibleDFS(source_nodes, true, nullptr, + [&have_excess_path, brief_node](const BriefNode *n) { + if (n == brief_node) { + have_excess_path = true; + return false; + } + return true; + }); + if (have_excess_path) continue; + contract_nodes.insert(out); + } + if (contract_nodes.empty()) break; + + for (auto dst_node : contract_nodes) { + UnionFindCombine(node_map, brief_node->node->id(), + dst_node->node->id()); + UnionContractedNodes(brief_node_map, brief_node->node->id(), + dst_node->node->id()); } } } @@ -128,6 +303,7 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() { auto io = ExtractInputAndOutputOfSubGraph(subgraph); block_node->inlinks = std::move(io.first); block_node->outlinks = std::move(io.second); + for (auto *node : subgraph) { // TODO(Superjomn) need a unified mechanism to treat deleted node in each // pass. diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc index 39cc433b40..531a170512 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc @@ -82,7 +82,7 @@ TEST(SubGraphSplitter, Fuse) { // At least one nodes should be deleted. ASSERT_EQ(dfg.nodes.size(), count0 + 1); // added a new FunctionBlock - ASSERT_EQ(6, count1); + ASSERT_EQ(11, count1); } } // namespace analysis diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index bc556ab364..395d8bcc07 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -160,11 +160,21 @@ class TensorRTEngineKernel : public framework::OpKernel { fluid_t->mutable_data(platform::CUDAPlace( boost::get(context.GetPlace()).device)), size * sizeof(float)); - //} else { - // engine->GetOutputInGPU( - // y, fluid_t->mutable_data(platform::CUDAPlace()), - // size * sizeof(float)); - //} + + // TODO(zhaolong) : delete it sometimes + /* THIS CODE JUST FOR TEST + std::cout << output_maps[output_index] << std::endl; + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(framework::make_ddim(ddim)); + auto* temp_data = temp_tensor.mutable_data(cpu_place); + + TensorCopySync(*fluid_t, cpu_place ,&temp_tensor); + for(int i = 0; i < size; i++) { + std::cout << temp_data[i] << " " ; + } + std::cout << std::endl; + */ output_index += 1; } From df161e08f0974b5fc77a62714c94bcdb8f04c412 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Tue, 11 Sep 2018 08:36:29 +0000 Subject: [PATCH 34/85] delete unuse ut --- .../analysis/data_flow_graph_tester.cc | 71 ------------------- .../inference/analysis/subgraph_splitter.cc | 2 +- 2 files changed, 1 insertion(+), 72 deletions(-) diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc index 040ca19514..1682011c3d 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc @@ -160,77 +160,6 @@ TEST(DataFlowGraph, Build_IR_Graph) { ASSERT_EQ(graph.nodes.size(), ir_graph.Nodes().size()); } -// FlexibleDFS -/* - * Graph topology - * inputs: 0 - * 0 -> 1 - * 1 -> 2 - * 1 -> 3 - * 3 -> 4 - * 4 -> 5 - * 5 -> 2 - */ -TEST(DataFlowGraph, flexibledfs) { - DataFlowGraph graph; - - for (int i = 0; i < 6; i++) { - auto* node = graph.nodes.Create(Node::Type::kValue); - node->SetName("node-" + std::to_string(i)); - } - - auto add_link = [&](int i, int j) { - Node* source = graph.nodes.GetMutable(i); - Node* target = graph.nodes.GetMutable(j); - target->inlinks.push_back(source); - source->outlinks.push_back(target); - }; - - add_link(0, 1); - add_link(1, 2); - add_link(1, 3); - add_link(3, 4); - add_link(4, 5); - add_link(5, 2); - graph.Build(); - - std::vector order; - FlexibleDFS(graph.inputs(), false, nullptr, [&order](const Node* n) { - order.push_back(n); - return true; - }); - - ASSERT_EQ(order.size(), 6UL); - - order.clear(); - // reverse dfs - FlexibleDFS(graph.outputs(), true, nullptr, [&order](const Node* n) { - order.push_back(n); - return true; - }); - - ASSERT_EQ(order.size(), 6UL); - - // If we delete - Node* last_node = graph.nodes.GetMutable(2); - Node* direct_node = graph.nodes.GetMutable(1); - std::vector source_nodes; - for (Node* node : last_node->inlinks) { - if (node != direct_node) source_nodes.push_back(node); - } - - bool has_cycle = false; - FlexibleDFS(source_nodes, true, nullptr, - [&has_cycle, direct_node](const Node* n) { - if (n == direct_node) { - has_cycle = true; - return false; - } - return true; - }); - ASSERT_TRUE(has_cycle); -} - } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index 857375fc21..773fceeeb2 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -138,7 +138,7 @@ void UnionContractedNodes(const std::unordered_map &node_map, } } -// FlexibleDfS +// FlexibleDFS // If reverse is true, do reverse dfs. // If enter func is not nullptr, calls enter(node) before visiting any children // of node. From 5d34ef61cbeacb7089f6e28de685c79db324f207 Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Tue, 4 Sep 2018 12:02:57 +0200 Subject: [PATCH 35/85] Fuse MKLDNN's Conv + ReLU --- paddle/fluid/operators/conv_mkldnn_op.cc | 46 +++++++++++++++---- paddle/fluid/operators/conv_op.cc | 2 + .../fluid/transpiler/inference_transpiler.py | 39 ++++++++++++++-- 3 files changed, 75 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index c5cbadc892..53e705c8ca 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -296,6 +296,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); + bool fuse_relu = ctx.Attr("fuse_relu"); int groups = ctx.Attr("groups"); // TODO(pzelazko-intel) add support for group convolution and dilation @@ -348,11 +349,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bias_tz = paddle::framework::vectorize2int(bias->dims()); auto bias_md = platform::MKLDNNMemDesc( bias_tz, platform::MKLDNNGetDataType(), memory::format::x); - conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, - strides, paddings, mkldnn_engine); + conv_pd = + ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, strides, + paddings, mkldnn_engine, fuse_relu); } else { conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, - paddings, mkldnn_engine); + paddings, mkldnn_engine, fuse_relu); } // Save conv_pd/src_memory/weights_memory for backward pass dev_ctx.SetBlob(key_conv_pd, conv_pd); @@ -402,11 +404,26 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } private: + mkldnn::primitive_attr AddRelu() const { + // Fusion with ReLU layer is executed through the PostOps feature. Create a + // PostOps object and configure it to execute an eltwise relu operation. + mkldnn::primitive_attr conv_attr; + constexpr float scale = 1.0f; + constexpr float negative_slope = 0.0f; + constexpr float placeholder = 0.0f; + mkldnn::post_ops post_operations; + post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, + negative_slope, placeholder); + conv_attr.set_post_ops(post_operations); + return conv_attr; + } + std::unique_ptr ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, const memory::desc& dst, const std::vector& strides, const std::vector& paddings, - const mkldnn::engine& engine) const { + const mkldnn::engine& engine, + const bool fuse_relu) const { memory::dims stride_dims = {strides[0], strides[1]}; memory::dims padding_dims = {paddings[0], paddings[1]}; @@ -415,8 +432,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { dst, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - auto p_conv_pd = - new mkldnn::convolution_forward::primitive_desc(conv_desc, engine); + mkldnn::primitive_attr conv_attr; + if (fuse_relu) { + conv_attr = AddRelu(); + } + + auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( + conv_desc, conv_attr, engine); return std::unique_ptr( p_conv_pd); @@ -427,7 +449,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const memory::desc& bias, const memory::desc& dst, const std::vector& strides, const std::vector& paddings, - const mkldnn::engine& engine) const { + const mkldnn::engine& engine, + const bool fuse_relu) const { memory::dims stride_dims = {strides[0], strides[1]}; memory::dims padding_dims = {paddings[0], paddings[1]}; @@ -436,8 +459,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bias, dst, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - auto p_conv_pd = - new mkldnn::convolution_forward::primitive_desc(conv_desc, engine); + mkldnn::primitive_attr conv_attr; + if (fuse_relu) { + conv_attr = AddRelu(); + } + + auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( + conv_desc, conv_attr, engine); return std::unique_ptr( p_conv_pd); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 61ca80877a..3332e64301 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -161,6 +161,8 @@ void Conv2DOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index 02fefe32df..adad2428f7 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -60,12 +60,46 @@ class InferenceTranspiler(object): if not isinstance(scope, core.Scope): raise TypeError("scope should be as Scope type or None") use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False)) + self._fuse_batch_norm(program, place, scope) if use_mkldnn: - self._fuse_relu_mkldnn(program) self._fuse_conv_bias_mkldnn(program) + self._fuse_conv_relu_mkldnn(program) + self._fuse_bn_relu_mkldnn(program) + + def _fuse_conv_relu_mkldnn(self, program): + ''' + Transpile the program by fused relu activation for MKLDNN program. + Relu activation following convolution OP can be fused by adding + 'fuse_relu' attribute to convolution OP. + The result of fuse is: + - before: + - conv->relu->any_other_op + - after: + - conv->any_other_op + :param program: program to transpile + :type program: Program + ''' + self.block = program.block(0) + + i = 0 + while i < len(self.block.ops): + current_op = self.block.ops[i] + if current_op.type in ['conv2d']: + next_op = self.block.ops[i + 1] + if next_op.type == 'relu': + # modify conv OP to include relu + current_op.set_attr("fuse_relu", True) + # remove conv OP + self.block._remove_op(i + 1) + i = i + 1 + + # TODO(luotao): use clone() method to flush the program.desc in force, + # since some large program.desc will not be flushed immediately. + # And a better solution will be considered later. + program = program.clone() - def _fuse_relu_mkldnn(self, program): + def _fuse_bn_relu_mkldnn(self, program): ''' Transpile the program by fused relu activation for MKLDNN program. @@ -159,7 +193,6 @@ class InferenceTranspiler(object): self._fuse_conv_bias(i, current_op, next_op) self.block._remove_op(i + 1) # Remove old conv self.block._remove_op(i + 1) # Remove elementwise_add - i = i + 1 i = i + 1 self._remove_unused_var() From accdecc6814b8070d3e3bdbec77b162d954f21d6 Mon Sep 17 00:00:00 2001 From: Krzysztof Binias Date: Tue, 11 Sep 2018 11:16:05 +0200 Subject: [PATCH 36/85] Correcting Lint errors --- paddle/fluid/operators/conv_mkldnn_op.cc | 4 ++-- paddle/fluid/platform/mkldnn_helper.h | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 244a578db8..fa9ee637c5 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -130,8 +130,8 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr AcquireWeightsMemoryFromPrimitive( const std::shared_ptr user_weights_memory_p, - const std::vector& pipeline, - bool is_persistent = false) { // NOLINT + std::vector& pipeline, // NOLINT + bool is_persistent = false) { auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); auto weights_pd = conv_pd_->weights_primitive_desc(); return this->AcquireMemory(weights_pd, user_weights_pd, diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index cf08202ccb..c0a2543ba5 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -191,8 +191,9 @@ class MKLDNNHandler { mkldnn::memory::primitive_desc& mpd, // NOLINT mkldnn::memory::primitive_desc& user_mpd, // NOLINT const std::shared_ptr user_memory_p, - const std::string& suffix, const std::vector& pipeline, - bool is_persistent = false) { // NOLINT + const std::string& suffix, + std::vector& pipeline, // NOLINT + bool is_persistent = false) { // create reorder primitive if the input format is not the preferred one auto local_key = key_ + suffix; auto key_reorder_p = key_ + suffix + "reorder_p"; From a5556d44175931682bb049451639948c0da7ed6e Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 11 Sep 2018 17:49:54 +0800 Subject: [PATCH 37/85] refine attentionlstm infershape --- paddle/fluid/operators/attention_lstm_op.cc | 88 ++++++++++++++------- 1 file changed, 60 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 39b0c85699..ac4ddb5502 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/attention_lstm_op.h" #include +#include "paddle/fluid/framework/shape_runtime_infer.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc_compute.h" @@ -23,29 +24,60 @@ namespace paddle { namespace operators { void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of AttentionLSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("C0"), - "Input(C0) of AttentionLSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("LSTMWeight"), - "Input(LSTMWeight) of AttentionLSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("LSTMBias"), - "Input(LSTMBias) of AttentionLSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("AttentionWeight"), - "Input(AttentionWeight) of AttentionLSTM should not be null."); - - PADDLE_ENFORCE(ctx->HasOutput("Hidden"), - "Output(Hidden) of AttentionLSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Cell"), - "Output(Cell) of AttentionLSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("AttentionedX"), - "Output(AttentionedX) of AttentionLSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("AttentionFCOut"), - "Output(AttentionFCOut) of AttentionLSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("LSTMX"), - "Output(LSTMX) of AttentionLSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("LSTMOUT"), - "Output(LSTMOUT) of AttentionLSTM should not be null."); + auto* runtime_ctx = dynamic_cast(ctx); + if (runtime_ctx == nullptr) { + LOG(FATAL) << "Should have runtime infer context"; + } + const auto& ins = runtime_ctx->OpBase().Inputs(); + const auto& outs = runtime_ctx->OpBase().Outputs(); + const auto& scope = runtime_ctx->InferScope(); + const auto ins_end = ins.end(); + const auto outs_end = outs.end(); + auto fair_input = [&](const std::string& name) -> bool { + auto it = ins.find(name); + if (it == ins_end) { + return false; + } + const auto& in = it->second; + if (in.size() != 1 || in[0] == framework::kEmptyVarName) { + return false; + } + return scope.FindVar(in[0]) != nullptr; + }; + auto fair_output = [&](const std::string& name) -> bool { + auto it = outs.find(name); + if (it == outs_end) { + return false; + } + const auto& out = it->second; + if (out.size() != 1 || out[0] == framework::kEmptyVarName) { + return false; + } + return scope.FindVar(out[0]) != nullptr; + }; + + PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of AttentionLSTM."); + PADDLE_ENFORCE(fair_input("C0"), + "Assert only one Input(C0) of AttentionLSTM."); + PADDLE_ENFORCE(fair_input("LSTMWeight"), + "Assert only one Input(LSTMWeight) of AttentionLSTM."); + PADDLE_ENFORCE(fair_input("LSTMBias"), + "Assert only one Input(LSTMBias) of AttentionLSTM."); + PADDLE_ENFORCE(fair_input("AttentionWeight"), + "Assert only one Input(AttentionWeight) of AttentionLSTM."); + + PADDLE_ENFORCE(fair_output("Hidden"), + "Assert only one Output(Hidden) of AttentionLSTM."); + PADDLE_ENFORCE(fair_output("Cell"), + "Assert only one Output(Cell) of AttentionLSTM."); + PADDLE_ENFORCE(fair_output("AttentionedX"), + "Assert only one Output(AttentionedX) of AttentionLSTM."); + PADDLE_ENFORCE(fair_output("AttentionFCOut"), + "Assert only one Output(AttentionFCOut) of AttentionLSTM."); + PADDLE_ENFORCE(fair_output("LSTMX"), + "Assert only one Output(LSTMX) of AttentionLSTM."); + PADDLE_ENFORCE(fair_output("LSTMOUT"), + "Assert only one Output(LSTMOUT) of AttentionLSTM."); auto x_dims = ctx->GetInputDim("X"); const int M = x_dims[1]; @@ -65,7 +97,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { auto c_dims = ctx->GetInputDim("C0"); PADDLE_ENFORCE_EQ(c_dims.size(), 2, "Input(C0)'s rank must be 2."); PADDLE_ENFORCE_EQ(c_dims[1], D, "C0 dims should be N x %d.", D); - if (ctx->HasInput("H0")) { + if (fair_input("H0")) { auto h_dims = ctx->GetInputDim("H0"); PADDLE_ENFORCE(h_dims == c_dims, "The dimension of Input(H0) and Input(C0) " @@ -79,7 +111,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { "AttentionWeight shapes must be (%d + %d) * 1.", M, D); PADDLE_ENFORCE_EQ(atten_w_dims[1], 1, "AttentionWeight shapes must be (%d + %d) * 1.", M, D); - if (ctx->HasInput("AttentionBias")) { + if (fair_input("AttentionBias")) { auto atten_b_dims = ctx->GetInputDim("AttentionBias"); PADDLE_ENFORCE_EQ(atten_b_dims.size(), 2, "Input(AttentionBias)'s rank must be 2."); @@ -89,7 +121,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { "AttentionBias shapes must be 1 * 1."); } - if (ctx->HasInput("AttentionScalar")) { + if (fair_input("AttentionScalar")) { auto dims = ctx->GetInputDim("AttentionScalar"); PADDLE_ENFORCE_EQ(dims.size(), 2, "Input(AttentionScalar)'s rank must be 2."); @@ -97,10 +129,10 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalar shapes must be 1 * 1."); } - if (ctx->HasInput("AttentionScalarBias")) { + if (fair_input("AttentionScalarBias")) { auto dims = ctx->GetInputDim("AttentionScalarBias"); PADDLE_ENFORCE( - ctx->HasInput("AttentionScalar"), + fair_input("AttentionScalar"), "AttentionScalar should not be null when have AttentionScalarBias."); PADDLE_ENFORCE_EQ(dims.size(), 2, "Input(AttentionScalarBias)'s rank must be 2."); From 916f42bcbf7bc308f2135be5f341b8628cc883dc Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 11 Sep 2018 18:00:20 +0800 Subject: [PATCH 38/85] refine fusion gru infershape --- paddle/fluid/operators/fusion_gru_op.cc | 65 +++++++++++++++++++------ 1 file changed, 49 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc index 916f84cb4a..bcdcb2ac4d 100644 --- a/paddle/fluid/operators/fusion_gru_op.cc +++ b/paddle/fluid/operators/fusion_gru_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fusion_gru_op.h" #include // for memcpy #include +#include "paddle/fluid/framework/shape_runtime_infer.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc_compute.h" @@ -25,14 +26,46 @@ namespace paddle { namespace operators { void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of GRU should not be null."); - PADDLE_ENFORCE(ctx->HasInput("WeightX"), - "Input(WeightX) of GRU should not be null."); - PADDLE_ENFORCE(ctx->HasInput("WeightH"), - "Input(WeightH) of GRU should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Hidden"), - "Output(Hidden) of GRU should not be null."); + auto* runtime_ctx = dynamic_cast(ctx); + if (runtime_ctx == nullptr) { + LOG(FATAL) << "Should have runtime infer context"; + } + const auto& ins = runtime_ctx->OpBase().Inputs(); + const auto& outs = runtime_ctx->OpBase().Outputs(); + const auto& scope = runtime_ctx->InferScope(); + const auto ins_end = ins.end(); + const auto outs_end = outs.end(); + auto fair_input = [&](const std::string& name) -> bool { + auto it = ins.find(name); + if (it == ins_end) { + return false; + } + const auto& in = it->second; + if (in.size() != 1 || in[0] == framework::kEmptyVarName) { + return false; + } + return scope.FindVar(in[0]) != nullptr; + }; + auto fair_output = [&](const std::string& name) -> bool { + auto it = outs.find(name); + if (it == outs_end) { + return false; + } + const auto& out = it->second; + if (out.size() != 1 || out[0] == framework::kEmptyVarName) { + return false; + } + return scope.FindVar(out[0]) != nullptr; + }; + + PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of GRU."); + PADDLE_ENFORCE(fair_input("WeightX"), + "Assert only one Input(WeightX) of GRU."); + PADDLE_ENFORCE(fair_input("WeightH"), + "Assert only one Input(WeightH) of GRU."); + PADDLE_ENFORCE(fair_output("XX"), "Assert only one Output(XX) of GRU."); + PADDLE_ENFORCE(fair_output("Hidden"), + "Assert only one Output(Hidden) of GRU."); auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); @@ -58,12 +91,12 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { "should be 3 * %d.", frame_size); - if (ctx->HasInput("H0")) { + if (fair_input("H0")) { auto h0_dims = ctx->GetInputDim("H0"); PADDLE_ENFORCE_EQ(h0_dims[1], frame_size, "The width of H0 must be equal to frame_size."); } - if (ctx->HasInput("Bias")) { + if (fair_input("Bias")) { auto b_dims = ctx->GetInputDim("Bias"); PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); PADDLE_ENFORCE_EQ(b_dims[0], 1, @@ -79,12 +112,12 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { xx_width = wx_dims[1]; } else { xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; - PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"), - "Output(ReorderedH0) of GRU should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"), - "Output(BatchedInput) of GRU should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"), - "Output(BatchedOut) of GRU should not be null."); + PADDLE_ENFORCE(fair_output("ReorderedH0"), + "Assert only one Output(ReorderedH0) of GRU."); + PADDLE_ENFORCE(fair_output("BatchedInput"), + "Assert only one Output(BatchedInput) of GRU."); + PADDLE_ENFORCE(fair_output("BatchedOut"), + "Assert only one Output(BatchedOut) of GRU."); ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); ctx->SetOutputDim("BatchedOut", out_dims); } From 8e0fe035d478a8bfb7bea888b986eafa827dcbf1 Mon Sep 17 00:00:00 2001 From: superjomn Date: Tue, 11 Sep 2018 10:16:19 +0000 Subject: [PATCH 39/85] fix ner_test when bs>1 --- paddle/fluid/inference/tests/api/analyzer_ner_tester.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 661b047ed7..6e8e43add7 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -144,8 +144,9 @@ void TestChineseNERPrediction(bool use_analysis) { size_t num_samples; for (int i = 0; i < FLAGS_repeat; i++) { DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + // Just one batch, the num_samples remains the same. num_samples = data.num_samples; - for (size_t bid = 0; bid < num_samples; ++bid) { + for (size_t bid = 0; bid < num_samples / FLAGS_batch_size; ++bid) { PrepareInputs(&input_slots, &data, FLAGS_batch_size); timer.tic(); predictor->Run(input_slots, &outputs); From 8a1abe54d797de7c4f17ab92d2268c3cebf83b66 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 11 Sep 2018 18:30:49 +0800 Subject: [PATCH 40/85] clean fusion infershape code --- paddle/fluid/operators/attention_lstm_op.cc | 35 +---------- paddle/fluid/operators/fusion_gru_op.cc | 35 +---------- .../operators/fusion_infershape_define.h | 60 +++++++++++++++++++ paddle/fluid/operators/fusion_lstm_op.cc | 35 +---------- 4 files changed, 66 insertions(+), 99 deletions(-) create mode 100644 paddle/fluid/operators/fusion_infershape_define.h diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index ac4ddb5502..7531aa9a46 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/attention_lstm_op.h" #include -#include "paddle/fluid/framework/shape_runtime_infer.h" +#include "paddle/fluid/operators/fusion_infershape_define.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc_compute.h" @@ -24,38 +24,7 @@ namespace paddle { namespace operators { void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { - auto* runtime_ctx = dynamic_cast(ctx); - if (runtime_ctx == nullptr) { - LOG(FATAL) << "Should have runtime infer context"; - } - const auto& ins = runtime_ctx->OpBase().Inputs(); - const auto& outs = runtime_ctx->OpBase().Outputs(); - const auto& scope = runtime_ctx->InferScope(); - const auto ins_end = ins.end(); - const auto outs_end = outs.end(); - auto fair_input = [&](const std::string& name) -> bool { - auto it = ins.find(name); - if (it == ins_end) { - return false; - } - const auto& in = it->second; - if (in.size() != 1 || in[0] == framework::kEmptyVarName) { - return false; - } - return scope.FindVar(in[0]) != nullptr; - }; - auto fair_output = [&](const std::string& name) -> bool { - auto it = outs.find(name); - if (it == outs_end) { - return false; - } - const auto& out = it->second; - if (out.size() != 1 || out[0] == framework::kEmptyVarName) { - return false; - } - return scope.FindVar(out[0]) != nullptr; - }; - + FUSION_INFERSHAPE_INIT; PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of AttentionLSTM."); PADDLE_ENFORCE(fair_input("C0"), "Assert only one Input(C0) of AttentionLSTM."); diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc index bcdcb2ac4d..b10d311f05 100644 --- a/paddle/fluid/operators/fusion_gru_op.cc +++ b/paddle/fluid/operators/fusion_gru_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fusion_gru_op.h" #include // for memcpy #include -#include "paddle/fluid/framework/shape_runtime_infer.h" +#include "paddle/fluid/operators/fusion_infershape_define.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc_compute.h" @@ -26,38 +26,7 @@ namespace paddle { namespace operators { void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { - auto* runtime_ctx = dynamic_cast(ctx); - if (runtime_ctx == nullptr) { - LOG(FATAL) << "Should have runtime infer context"; - } - const auto& ins = runtime_ctx->OpBase().Inputs(); - const auto& outs = runtime_ctx->OpBase().Outputs(); - const auto& scope = runtime_ctx->InferScope(); - const auto ins_end = ins.end(); - const auto outs_end = outs.end(); - auto fair_input = [&](const std::string& name) -> bool { - auto it = ins.find(name); - if (it == ins_end) { - return false; - } - const auto& in = it->second; - if (in.size() != 1 || in[0] == framework::kEmptyVarName) { - return false; - } - return scope.FindVar(in[0]) != nullptr; - }; - auto fair_output = [&](const std::string& name) -> bool { - auto it = outs.find(name); - if (it == outs_end) { - return false; - } - const auto& out = it->second; - if (out.size() != 1 || out[0] == framework::kEmptyVarName) { - return false; - } - return scope.FindVar(out[0]) != nullptr; - }; - + FUSION_INFERSHAPE_INIT; PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of GRU."); PADDLE_ENFORCE(fair_input("WeightX"), "Assert only one Input(WeightX) of GRU."); diff --git a/paddle/fluid/operators/fusion_infershape_define.h b/paddle/fluid/operators/fusion_infershape_define.h new file mode 100644 index 0000000000..89521672b0 --- /dev/null +++ b/paddle/fluid/operators/fusion_infershape_define.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef PADDLE_FLUID_OPERATORS_FUSION_INFERSHAPE_DEFINE_H_ +#define PADDLE_FLUID_OPERATORS_FUSION_INFERSHAPE_DEFINE_H_ + +#include +#include "paddle/fluid/framework/shape_runtime_infer.h" + +namespace paddle { +namespace operators { + +#define FUSION_INFERSHAPE_INIT \ + auto* runtime_ctx = dynamic_cast(ctx); \ + if (runtime_ctx == nullptr) { \ + LOG(FATAL) << "Should have runtime infer context"; \ + } \ + const auto& ins = runtime_ctx->OpBase().Inputs(); \ + const auto& outs = runtime_ctx->OpBase().Outputs(); \ + const auto& scope = runtime_ctx->InferScope(); \ + const auto ins_end = ins.end(); \ + const auto outs_end = outs.end(); \ + auto fair_input = [&](const std::string& name) -> bool { \ + auto it = ins.find(name); \ + if (it == ins_end) { \ + return false; \ + } \ + const auto& in = it->second; \ + if (in.size() != 1 || in[0] == framework::kEmptyVarName) { \ + return false; \ + } \ + return scope.FindVar(in[0]) != nullptr; \ + }; \ + auto fair_output = [&](const std::string& name) -> bool { \ + auto it = outs.find(name); \ + if (it == outs_end) { \ + return false; \ + } \ + const auto& out = it->second; \ + if (out.size() != 1 || out[0] == framework::kEmptyVarName) { \ + return false; \ + } \ + return scope.FindVar(out[0]) != nullptr; \ + } + +} // namespace operators +} // namespace paddle + +#endif // PADDLE_FLUID_OPERATORS_FUSION_INFERSHAPE_DEFINE_H_ diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index ae9d5d78ae..08af98f850 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fusion_lstm_op.h" #include -#include "paddle/fluid/framework/shape_runtime_infer.h" +#include "paddle/fluid/operators/fusion_infershape_define.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc_compute.h" @@ -25,38 +25,7 @@ namespace paddle { namespace operators { void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { - auto* runtime_ctx = dynamic_cast(ctx); - if (runtime_ctx == nullptr) { - LOG(FATAL) << "Should have runtime infer context"; - } - const auto& ins = runtime_ctx->OpBase().Inputs(); - const auto& outs = runtime_ctx->OpBase().Outputs(); - const auto& scope = runtime_ctx->InferScope(); - const auto ins_end = ins.end(); - const auto outs_end = outs.end(); - auto fair_input = [&](const std::string& name) -> bool { - auto it = ins.find(name); - if (it == ins_end) { - return false; - } - const auto& in = it->second; - if (in.size() != 1 || in[0] == framework::kEmptyVarName) { - return false; - } - return scope.FindVar(in[0]) != nullptr; - }; - auto fair_output = [&](const std::string& name) -> bool { - auto it = outs.find(name); - if (it == outs_end) { - return false; - } - const auto& out = it->second; - if (out.size() != 1 || out[0] == framework::kEmptyVarName) { - return false; - } - return scope.FindVar(out[0]) != nullptr; - }; - + FUSION_INFERSHAPE_INIT; PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of LSTM."); PADDLE_ENFORCE(fair_input("WeightX"), "Assert only one Input(WeightX) of LSTM."); From 23b12c6f585cd18374810862f98760b80a5ae473 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Tue, 11 Sep 2018 18:32:15 +0800 Subject: [PATCH 41/85] fix invalide bcast in reduce strategy --- paddle/fluid/framework/details/multi_devices_graph_pass.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 7a99169849..d44ebbae4d 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -442,8 +442,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( use_gpu = nccl_ctxs_ != nullptr; #endif - if (use_gpu || - strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + if (use_gpu && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { // Insert BCast Ops for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { auto &to_bcast_set = bcast_var_name_set[dev_id]; From 7dd54afd0c7f328891fbb0df15e434aa9afba216 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Tue, 11 Sep 2018 12:12:46 +0000 Subject: [PATCH 42/85] fix program desc unit test error --- paddle/fluid/framework/program_desc_test.cc | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc index 925ea98dbe..7e689a37da 100644 --- a/paddle/fluid/framework/program_desc_test.cc +++ b/paddle/fluid/framework/program_desc_test.cc @@ -87,8 +87,17 @@ TEST(ProgramDesc, copy_ctor) { ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs()); ASSERT_EQ(op_origin->Outputs(), op_copy->Outputs()); - ASSERT_EQ(op_copy->Proto()->SerializeAsString(), - op_origin->Proto()->SerializeAsString()); + ASSERT_EQ(op_origin->Proto()->attrs().size(), + op_copy->Proto()->attrs().size()); + for (auto it = op_origin->Proto()->attrs().begin(); + it != op_origin->Proto()->attrs().end(); ++it) { + for (auto it_2 = op_copy->Proto()->attrs().begin(); + it_2 != op_copy->Proto()->attrs().end(); ++it_2) { + if (it->name() == it_2->name()) { + ASSERT_TRUE(it_2->SerializeAsString() == it->SerializeAsString()); + } + } + } if (op->Type() == "op_with_subblock") { ASSERT_EQ(1, op->GetBlockAttrId("sub_block")); From 8cbb3c0720ab48abd08b06a49ac2e073b750f22f Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 11 Sep 2018 21:56:17 +0800 Subject: [PATCH 43/85] refine lac ut and fix fetch --- paddle/fluid/inference/api/api_impl.cc | 2 +- .../tests/api/analyzer_lac_tester.cc | 28 ------------------- 2 files changed, 1 insertion(+), 29 deletions(-) diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index bd9b4b1a81..6fe13ed027 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -262,7 +262,7 @@ void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch, if (buffer.empty() || buffer.length() < sizeof(T) * data.size()) { buffer.Resize(sizeof(T) * data.size()); } - std::memcpy(buffer.data(), data.data(), buffer.length()); + std::memcpy(buffer.data(), data.data(), sizeof(T) * data.size()); // copy LoD for (const auto &level : fetch.lod()) { output->lod.emplace_back(level); diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 522d870db8..7e00cb20ad 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -117,34 +117,6 @@ void GetOneBatch(std::vector *input_slots, DataRecord *data, input_slots->assign({input_tensor}); } -void BenchAllData(const std::string &model_path, const std::string &data_file, - const int batch_size, const int repeat) { - NativeConfig config; - config.model_dir = model_path; - config.use_gpu = false; - config.device = 0; - config.specify_input_name = true; - std::vector input_slots, outputs_slots; - DataRecord data(data_file, batch_size); - auto predictor = - CreatePaddlePredictor(config); - GetOneBatch(&input_slots, &data, batch_size); - for (int i = 0; i < FLAGS_burning; i++) { - predictor->Run(input_slots, &outputs_slots); - } - Timer timer; - double sum = 0; - for (int i = 0; i < repeat; i++) { - for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) { - GetOneBatch(&input_slots, &data, batch_size); - timer.tic(); - predictor->Run(input_slots, &outputs_slots); - sum += timer.toc(); - } - } - PrintTime(batch_size, repeat, 1, 0, sum / repeat); -} - const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39, From 392ae69650ea453accdbdd2b5ed84f3764b2d2c6 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 11 Sep 2018 22:46:36 +0800 Subject: [PATCH 44/85] Set parallel executor thread num under nccl2 distributed env (#13207) --- python/paddle/fluid/parallel_executor.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 4790e0f611..bd9f8b3c35 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -128,6 +128,13 @@ class ParallelExecutor(object): os.environ.get('CPU_NUM', multiprocessing.cpu_count())) exec_strategy.num_threads = cpu_num * 2 + # Set 1 thread num under nccl2 distribute + # env to make sure all gpus run ops in same order. + if num_trainers > 1: + assert (use_cuda) + # FIXME(gongwb): avoid this set. + exec_strategy.num_threads = 1 + if build_strategy is None: build_strategy = BuildStrategy() From 8bb824bb93629fbf69d7e93ffc0dca85e726300c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 12 Sep 2018 00:06:58 +0800 Subject: [PATCH 45/85] refine infershape hasinput and hasoutput --- paddle/fluid/framework/operator.cc | 274 ++++++++++-------- paddle/fluid/framework/shape_runtime_infer.h | 86 ------ paddle/fluid/operators/attention_lstm_op.cc | 35 ++- paddle/fluid/operators/fusion_gru_op.cc | 22 +- .../operators/fusion_infershape_define.h | 60 ---- paddle/fluid/operators/fusion_lstm_op.cc | 31 +- 6 files changed, 197 insertions(+), 311 deletions(-) delete mode 100644 paddle/fluid/framework/shape_runtime_infer.h delete mode 100644 paddle/fluid/operators/fusion_infershape_define.h diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 36025db7ba..bbd141cb3b 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -21,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" -#include "paddle/fluid/framework/shape_runtime_infer.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/platform/profiler.h" @@ -459,147 +458,184 @@ bool OpSupportGPU(const std::string& op_type) { return false; } -bool RuntimeInferShapeContext::HasInput(const std::string& name) const { - if (!op_.HasInputs(name)) { - return false; - } - auto& ins = Inputs(name); - size_t length = ins.size(); - if (length == 0) { - return false; - } - PADDLE_ENFORCE_EQ(length, 1UL, - "Input %s should not have more than one inputs", name); - auto ipt = ins[0]; - auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); - return var != nullptr; -} +class RuntimeInferShapeContext : public InferShapeContext { + public: + RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope) + : op_(op), scope_(scope) {} -bool RuntimeInferShapeContext::HasOutput(const std::string& name) const { - if (!op_.HasOutputs(name)) { - return false; - } - auto& outs = Outputs(name); - size_t length = outs.size(); - if (length == 0) { - return false; - } - PADDLE_ENFORCE_EQ(length, 1UL, - "Output %s should not have more than one inputs", name); - auto ipt = outs[0]; - auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); - return var != nullptr; -} + bool HasInput(const std::string& name) const override { + // has only one input + const auto& ins = op_.Inputs(); + auto it = ins.find(name); + if (it == ins.end()) { + return false; + } + const auto& in = it->second; -bool RuntimeInferShapeContext::HasInputs(const std::string& name) const { - if (!op_.HasInputs(name)) { - return false; - } - auto inputs = op_.Inputs(name); - if (inputs.empty()) { - return false; - } - for (auto& input : inputs) { - if (scope_.FindVar(input) == nullptr) { + if (in.size() != 1 || in[0] == kEmptyVarName) { return false; } + return scope_.FindVar(in[0]) != nullptr; } - return true; -} -bool RuntimeInferShapeContext::HasOutputs(const std::string& name) const { - if (!op_.HasOutputs(name)) { - return false; + bool HasOutput(const std::string& name) const override { + // has only one output + const auto& outs = op_.Outputs(); + auto it = outs.find(name); + if (it == outs.end()) { + return false; + } + const auto& out = it->second; + if (out.size() != 1 || out[0] == kEmptyVarName) { + return false; + } + return scope_.FindVar(out[0]) != nullptr; } - auto outputs = op_.Outputs(name); - if (outputs.empty()) { - return false; + + bool HasInputs(const std::string& name) const override { + if (!op_.HasInputs(name)) { + return false; + } + auto inputs = op_.Inputs(name); + if (inputs.empty()) { + return false; + } + for (auto& input : inputs) { + if (scope_.FindVar(input) == nullptr) { + return false; + } + } + return true; } - for (auto& output : outputs) { - if (scope_.FindVar(output) == nullptr) { + + bool HasOutputs(const std::string& name) const override { + if (!op_.HasOutputs(name)) { + return false; + } + auto outputs = op_.Outputs(name); + if (outputs.empty()) { return false; } + for (auto& output : outputs) { + if (scope_.FindVar(output) == nullptr) { + return false; + } + } + return true; } - return true; -} -void RuntimeInferShapeContext::ShareLoD(const std::string& in, - const std::string& out, size_t i, - size_t j) const { - PADDLE_ENFORCE_LT(i, Inputs(in).size()); - PADDLE_ENFORCE_LT(j, Outputs(out).size()); - Variable* in_var = scope_.FindVar(Inputs(in)[i]); - Variable* out_var = scope_.FindVar(Outputs(out)[j]); - if (!in_var->IsType()) return; - PADDLE_ENFORCE(out_var->IsType(), - "The %d-th output of Output(%s) must be LoDTensor.", j, out); - auto in_tensor = in_var->Get(); - auto* out_tensor = out_var->GetMutable(); - out_tensor->set_lod(in_tensor.lod()); + AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } + + const std::vector& Inputs( + const std::string& name) const override { + return op_.Inputs(name); + } + + const std::vector& Outputs( + const std::string& name) const override { + return op_.Outputs(name); + } + + void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const override { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + Variable* in_var = scope_.FindVar(Inputs(in)[i]); + Variable* out_var = scope_.FindVar(Outputs(out)[j]); + if (!in_var->IsType()) return; + PADDLE_ENFORCE(out_var->IsType(), + "The %d-th output of Output(%s) must be LoDTensor.", j, out); + auto in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_lod(in_tensor.lod()); // TODO(dzhwinter) : reuse ShareLoD in most operators. // Need to call ShareLayout explicitly in sequence related ops. // Shall we have a better method to shared info between in/out Tensor? #ifdef PADDLE_WITH_MKLDNN - // Fix me: ugly workaround below - // Correct solution: - // set_layout() should NOT be called here (i.e. ShareLoD). Instead, - // layout of output tensor should be set "manually" in Compute() - // of each OPKernel. The reason layout should NOT be shared between - // input and output "automatically" (now by InferShape()->ShareLoD()) - // is that layout transform may occur after InferShape(). - // Workaround: - // Skip set_layout() when input layout is kMKLDNN - // This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN - // OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called - // in Compute() - if (in_tensor.layout() != DataLayout::kMKLDNN) + // Fix me: ugly workaround below + // Correct solution: + // set_layout() should NOT be called here (i.e. ShareLoD). Instead, + // layout of output tensor should be set "manually" in Compute() + // of each OPKernel. The reason layout should NOT be shared between + // input and output "automatically" (now by InferShape()->ShareLoD()) + // is that layout transform may occur after InferShape(). + // Workaround: + // Skip set_layout() when input layout is kMKLDNN + // This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN + // OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called + // in Compute() + if (in_tensor.layout() != DataLayout::kMKLDNN) #endif + out_tensor->set_layout(in_tensor.layout()); + } + + void ShareLayout(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + Variable* in_var = scope_.FindVar(Inputs(in)[i]); + Variable* out_var = scope_.FindVar(Outputs(out)[j]); + if (!in_var->IsType()) return; + PADDLE_ENFORCE(out_var->IsType(), + "The %d-th output of Output(%s) must be LoDTensor.", j, out); + auto in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); out_tensor->set_layout(in_tensor.layout()); -} + } -void RuntimeInferShapeContext::ShareLayout(const std::string& in, - const std::string& out, size_t i, - size_t j) const { - PADDLE_ENFORCE_LT(i, Inputs(in).size()); - PADDLE_ENFORCE_LT(j, Outputs(out).size()); - Variable* in_var = scope_.FindVar(Inputs(in)[i]); - Variable* out_var = scope_.FindVar(Outputs(out)[j]); - if (!in_var->IsType()) return; - PADDLE_ENFORCE(out_var->IsType(), - "The %d-th output of Output(%s) must be LoDTensor.", j, out); - auto in_tensor = in_var->Get(); - auto* out_tensor = out_var->GetMutable(); - out_tensor->set_layout(in_tensor.layout()); -} - -DDim RuntimeInferShapeContext::GetDim(const std::string& name) const { - Variable* var = scope_.FindVar(name); - PADDLE_ENFORCE_NOT_NULL(var); - if (var->IsType()) { - return var->Get().dims(); - } else if (var->IsType()) { - return var->Get().GetCompleteDims(); - } else { - PADDLE_THROW( - "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's " - "type_id is %s.", - name, var->Type().name()); + bool IsRuntime() const override { return true; } + + protected: + DDim GetDim(const std::string& name) const override { + Variable* var = scope_.FindVar(name); + PADDLE_ENFORCE_NOT_NULL(var); + if (var->IsType()) { + return var->Get().dims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); + } else { + PADDLE_THROW( + "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's " + "type_id is %s.", + name, var->Type().name()); + } } -} -void RuntimeInferShapeContext::SetDim(const std::string& name, - const DDim& dim) { - Variable* var = scope_.FindVar(name); - if (var->IsType()) { - var->GetMutable()->Resize(dim); - } else if (var->IsType()) { - var->GetMutable()->set_height(dim[0]); - } else { - PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", name, - var->Type().name()); + std::vector GetRepeatedDims(const std::string& name) const override { + PADDLE_THROW("Only compile time support this method"); } -} + + void SetDim(const std::string& name, const DDim& dim) override { + Variable* var = scope_.FindVar(name); + if (var->IsType()) { + var->GetMutable()->Resize(dim); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); + } else { + PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", + name, var->Type().name()); + } + } + + void SetRepeatedDims(const std::string& name, + const std::vector& dims) override { + PADDLE_THROW("Only compile time support this method"); + } + + proto::VarType::Type GetVarType(const std::string& name) const override { + auto* var = scope_.FindVar(name); + return ToVarType(var->Type()); + } + + InferShapeVarPtr GetVarPtr(const std::string& name) override { + return scope_.FindVar(name); + } + + private: + const OperatorBase& op_; + const Scope& scope_; +}; static void CheckTensorNANOrInf(const std::string& name, const framework::Tensor& tensor) { diff --git a/paddle/fluid/framework/shape_runtime_infer.h b/paddle/fluid/framework/shape_runtime_infer.h deleted file mode 100644 index 04d4e33f7a..0000000000 --- a/paddle/fluid/framework/shape_runtime_infer.h +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/shape_inference.h" -#include "paddle/fluid/framework/var_type.h" - -namespace paddle { -namespace framework { - -class RuntimeInferShapeContext : public InferShapeContext { - public: - RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope) - : op_(op), scope_(scope) {} - - bool HasInput(const std::string& name) const override; - bool HasOutput(const std::string& name) const override; - bool HasInputs(const std::string& name) const override; - bool HasOutputs(const std::string& name) const override; - - const OperatorBase& OpBase() const { return op_; } - - const Scope& InferScope() const { return scope_; } - AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } - - const std::vector& Inputs( - const std::string& name) const override { - return op_.Inputs(name); - } - - const std::vector& Outputs( - const std::string& name) const override { - return op_.Outputs(name); - } - - void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, - size_t j = 0) const override; - - void ShareLayout(const std::string& in, const std::string& out, size_t i = 0, - size_t j = 0) const; - - bool IsRuntime() const override { return true; } - - protected: - DDim GetDim(const std::string& name) const override; - void SetDim(const std::string& name, const DDim& dim) override; - - std::vector GetRepeatedDims(const std::string& name) const override { - PADDLE_THROW("Only compile time support this method"); - } - void SetRepeatedDims(const std::string& name, - const std::vector& dims) override { - PADDLE_THROW("Only compile time support this method"); - } - - proto::VarType::Type GetVarType(const std::string& name) const override { - auto* var = scope_.FindVar(name); - return ToVarType(var->Type()); - } - - InferShapeVarPtr GetVarPtr(const std::string& name) override { - return scope_.FindVar(name); - } - - private: - const OperatorBase& op_; - const Scope& scope_; -}; - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 7531aa9a46..9b943440a8 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/attention_lstm_op.h" #include -#include "paddle/fluid/operators/fusion_infershape_define.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc_compute.h" @@ -24,28 +23,28 @@ namespace paddle { namespace operators { void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { - FUSION_INFERSHAPE_INIT; - PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of AttentionLSTM."); - PADDLE_ENFORCE(fair_input("C0"), + PADDLE_ENFORCE(ctx->HasInput("X"), + "Assert only one Input(X) of AttentionLSTM."); + PADDLE_ENFORCE(ctx->HasInput("C0"), "Assert only one Input(C0) of AttentionLSTM."); - PADDLE_ENFORCE(fair_input("LSTMWeight"), + PADDLE_ENFORCE(ctx->HasInput("LSTMWeight"), "Assert only one Input(LSTMWeight) of AttentionLSTM."); - PADDLE_ENFORCE(fair_input("LSTMBias"), + PADDLE_ENFORCE(ctx->HasInput("LSTMBias"), "Assert only one Input(LSTMBias) of AttentionLSTM."); - PADDLE_ENFORCE(fair_input("AttentionWeight"), + PADDLE_ENFORCE(ctx->HasInput("AttentionWeight"), "Assert only one Input(AttentionWeight) of AttentionLSTM."); - PADDLE_ENFORCE(fair_output("Hidden"), + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), "Assert only one Output(Hidden) of AttentionLSTM."); - PADDLE_ENFORCE(fair_output("Cell"), + PADDLE_ENFORCE(ctx->HasOutput("Cell"), "Assert only one Output(Cell) of AttentionLSTM."); - PADDLE_ENFORCE(fair_output("AttentionedX"), + PADDLE_ENFORCE(ctx->HasOutput("AttentionedX"), "Assert only one Output(AttentionedX) of AttentionLSTM."); - PADDLE_ENFORCE(fair_output("AttentionFCOut"), + PADDLE_ENFORCE(ctx->HasOutput("AttentionFCOut"), "Assert only one Output(AttentionFCOut) of AttentionLSTM."); - PADDLE_ENFORCE(fair_output("LSTMX"), + PADDLE_ENFORCE(ctx->HasOutput("LSTMX"), "Assert only one Output(LSTMX) of AttentionLSTM."); - PADDLE_ENFORCE(fair_output("LSTMOUT"), + PADDLE_ENFORCE(ctx->HasOutput("LSTMOUT"), "Assert only one Output(LSTMOUT) of AttentionLSTM."); auto x_dims = ctx->GetInputDim("X"); @@ -66,7 +65,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { auto c_dims = ctx->GetInputDim("C0"); PADDLE_ENFORCE_EQ(c_dims.size(), 2, "Input(C0)'s rank must be 2."); PADDLE_ENFORCE_EQ(c_dims[1], D, "C0 dims should be N x %d.", D); - if (fair_input("H0")) { + if (ctx->HasInput("H0")) { auto h_dims = ctx->GetInputDim("H0"); PADDLE_ENFORCE(h_dims == c_dims, "The dimension of Input(H0) and Input(C0) " @@ -80,7 +79,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { "AttentionWeight shapes must be (%d + %d) * 1.", M, D); PADDLE_ENFORCE_EQ(atten_w_dims[1], 1, "AttentionWeight shapes must be (%d + %d) * 1.", M, D); - if (fair_input("AttentionBias")) { + if (ctx->HasInput("AttentionBias")) { auto atten_b_dims = ctx->GetInputDim("AttentionBias"); PADDLE_ENFORCE_EQ(atten_b_dims.size(), 2, "Input(AttentionBias)'s rank must be 2."); @@ -90,7 +89,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { "AttentionBias shapes must be 1 * 1."); } - if (fair_input("AttentionScalar")) { + if (ctx->HasInput("AttentionScalar")) { auto dims = ctx->GetInputDim("AttentionScalar"); PADDLE_ENFORCE_EQ(dims.size(), 2, "Input(AttentionScalar)'s rank must be 2."); @@ -98,10 +97,10 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalar shapes must be 1 * 1."); } - if (fair_input("AttentionScalarBias")) { + if (ctx->HasInput("AttentionScalarBias")) { auto dims = ctx->GetInputDim("AttentionScalarBias"); PADDLE_ENFORCE( - fair_input("AttentionScalar"), + ctx->HasInput("AttentionScalar"), "AttentionScalar should not be null when have AttentionScalarBias."); PADDLE_ENFORCE_EQ(dims.size(), 2, "Input(AttentionScalarBias)'s rank must be 2."); diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc index b10d311f05..31e87d9113 100644 --- a/paddle/fluid/operators/fusion_gru_op.cc +++ b/paddle/fluid/operators/fusion_gru_op.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/fluid/operators/fusion_gru_op.h" #include // for memcpy #include -#include "paddle/fluid/operators/fusion_infershape_define.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc_compute.h" @@ -26,14 +25,13 @@ namespace paddle { namespace operators { void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { - FUSION_INFERSHAPE_INIT; - PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of GRU."); - PADDLE_ENFORCE(fair_input("WeightX"), + PADDLE_ENFORCE(ctx->HasInput("X"), "Assert only one Input(X) of GRU."); + PADDLE_ENFORCE(ctx->HasInput("WeightX"), "Assert only one Input(WeightX) of GRU."); - PADDLE_ENFORCE(fair_input("WeightH"), + PADDLE_ENFORCE(ctx->HasInput("WeightH"), "Assert only one Input(WeightH) of GRU."); - PADDLE_ENFORCE(fair_output("XX"), "Assert only one Output(XX) of GRU."); - PADDLE_ENFORCE(fair_output("Hidden"), + PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of GRU."); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), "Assert only one Output(Hidden) of GRU."); auto x_dims = ctx->GetInputDim("X"); @@ -60,12 +58,12 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { "should be 3 * %d.", frame_size); - if (fair_input("H0")) { + if (ctx->HasInput("H0")) { auto h0_dims = ctx->GetInputDim("H0"); PADDLE_ENFORCE_EQ(h0_dims[1], frame_size, "The width of H0 must be equal to frame_size."); } - if (fair_input("Bias")) { + if (ctx->HasInput("Bias")) { auto b_dims = ctx->GetInputDim("Bias"); PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); PADDLE_ENFORCE_EQ(b_dims[0], 1, @@ -81,11 +79,11 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { xx_width = wx_dims[1]; } else { xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; - PADDLE_ENFORCE(fair_output("ReorderedH0"), + PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"), "Assert only one Output(ReorderedH0) of GRU."); - PADDLE_ENFORCE(fair_output("BatchedInput"), + PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"), "Assert only one Output(BatchedInput) of GRU."); - PADDLE_ENFORCE(fair_output("BatchedOut"), + PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"), "Assert only one Output(BatchedOut) of GRU."); ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); ctx->SetOutputDim("BatchedOut", out_dims); diff --git a/paddle/fluid/operators/fusion_infershape_define.h b/paddle/fluid/operators/fusion_infershape_define.h deleted file mode 100644 index 89521672b0..0000000000 --- a/paddle/fluid/operators/fusion_infershape_define.h +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef PADDLE_FLUID_OPERATORS_FUSION_INFERSHAPE_DEFINE_H_ -#define PADDLE_FLUID_OPERATORS_FUSION_INFERSHAPE_DEFINE_H_ - -#include -#include "paddle/fluid/framework/shape_runtime_infer.h" - -namespace paddle { -namespace operators { - -#define FUSION_INFERSHAPE_INIT \ - auto* runtime_ctx = dynamic_cast(ctx); \ - if (runtime_ctx == nullptr) { \ - LOG(FATAL) << "Should have runtime infer context"; \ - } \ - const auto& ins = runtime_ctx->OpBase().Inputs(); \ - const auto& outs = runtime_ctx->OpBase().Outputs(); \ - const auto& scope = runtime_ctx->InferScope(); \ - const auto ins_end = ins.end(); \ - const auto outs_end = outs.end(); \ - auto fair_input = [&](const std::string& name) -> bool { \ - auto it = ins.find(name); \ - if (it == ins_end) { \ - return false; \ - } \ - const auto& in = it->second; \ - if (in.size() != 1 || in[0] == framework::kEmptyVarName) { \ - return false; \ - } \ - return scope.FindVar(in[0]) != nullptr; \ - }; \ - auto fair_output = [&](const std::string& name) -> bool { \ - auto it = outs.find(name); \ - if (it == outs_end) { \ - return false; \ - } \ - const auto& out = it->second; \ - if (out.size() != 1 || out[0] == framework::kEmptyVarName) { \ - return false; \ - } \ - return scope.FindVar(out[0]) != nullptr; \ - } - -} // namespace operators -} // namespace paddle - -#endif // PADDLE_FLUID_OPERATORS_FUSION_INFERSHAPE_DEFINE_H_ diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index 08af98f850..55e465e3af 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/fusion_lstm_op.h" #include -#include "paddle/fluid/operators/fusion_infershape_define.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc_compute.h" @@ -25,23 +24,23 @@ namespace paddle { namespace operators { void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { - FUSION_INFERSHAPE_INIT; - PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of LSTM."); - PADDLE_ENFORCE(fair_input("WeightX"), + PADDLE_ENFORCE(ctx->HasInput("X"), "Assert only one Input(X) of LSTM."); + PADDLE_ENFORCE(ctx->HasInput("WeightX"), "Assert only one Input(WeightX) of LSTM."); - PADDLE_ENFORCE(fair_input("WeightH"), + PADDLE_ENFORCE(ctx->HasInput("WeightH"), "Assert only one Input(WeightH) of LSTM."); - PADDLE_ENFORCE(fair_input("Bias"), "Assert only one Input(Bias) of LSTM."); - PADDLE_ENFORCE(fair_output("XX"), "Assert only one Output(XX) of LSTM."); - PADDLE_ENFORCE(fair_output("Hidden"), + PADDLE_ENFORCE(ctx->HasInput("Bias"), "Assert only one Input(Bias) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), "Assert only one Output(Hidden) of LSTM."); - PADDLE_ENFORCE(fair_output("Cell"), "Assert only one Output(Cell) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Assert only one Output(Cell) of LSTM."); auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); - if (fair_input("H0")) { - PADDLE_ENFORCE(fair_input("C0"), + if (ctx->HasInput("H0")) { + PADDLE_ENFORCE(ctx->HasInput("C0"), "Input(Cell) and Input(Hidden) of LSTM should not " "be null at the same time."); auto h_dims = ctx->GetInputDim("H0"); @@ -93,15 +92,15 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { xx_width = wx_dims[1]; } else { xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; - PADDLE_ENFORCE(fair_output("BatchedInput"), + PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"), "Assert only one Output(BatchedInput) of LSTM."); - PADDLE_ENFORCE(fair_output("BatchedHidden"), + PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"), "Assert only one Output(BatchedHidden) of LSTM."); - PADDLE_ENFORCE(fair_output("BatchedCell"), + PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"), "Assert only one Output(BatchedCell) of LSTM."); - PADDLE_ENFORCE(fair_output("ReorderedH0"), + PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"), "Assert only one Output(ReorderedH0) of LSTM"); - PADDLE_ENFORCE(fair_output("ReorderedC0"), + PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"), "Assert only one Output(ReorderedC0) of LSTM."); ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); ctx->SetOutputDim("BatchedHidden", out_dims); From 8cee9f6176caa87e21109c665fa95c51d3ab296c Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 12 Sep 2018 09:55:46 +0800 Subject: [PATCH 46/85] Fix rpcclient's wait action in aync env. (#13307) --- .../operators/distributed/CMakeLists.txt | 1 + .../operators/distributed/grpc_client.cc | 142 +++++++++--------- .../fluid/operators/distributed/grpc_client.h | 110 +++++++------- .../operators/distributed/request_handler.h | 75 +++++++-- .../fluid/operators/distributed/rpc_client.h | 63 ++++---- .../operators/distributed/varhandle_test.cc | 55 +++++++ paddle/fluid/operators/prefetch_op.cc | 8 +- paddle/fluid/operators/recv_op.cc | 7 +- paddle/fluid/operators/send_op.cc | 10 +- 9 files changed, 296 insertions(+), 175 deletions(-) create mode 100644 paddle/fluid/operators/distributed/varhandle_test.cc diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index da5d20505e..56734b81e8 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -20,6 +20,7 @@ if(WITH_GRPC) DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) + cc_test(varhandle_test SRCS varhandle_test.cc) return() endif() diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index b4f60c9ff9..07ac20797d 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -59,40 +59,32 @@ GRPCClient::~GRPCClient() { } channels_.clear(); } - client_thread_->join(); } -bool GRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { +VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); + SendProcessor* s = new SendProcessor(ch); + VarHandlePtr h(new VarHandle(ep, "Send", var_name_val, p_ctx, p_scope)); + s->Prepare(h, time_out); - framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, - this] { + framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] { auto* var = p_scope->FindVar(var_name_val); ::grpc::ByteBuffer req; SerializeToByteBuffer(var_name_val, var, *p_ctx, &req); - // varhandle - VarHandle var_h; - var_h.ep = ep_val; - var_h.scope = p_scope; - var_h.name = var_name_val; - var_h.ctx = p_ctx; - var_h.method = "Send"; - - VLOG(3) << var_h.String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; // stub context - SendProcessor* s = new SendProcessor(ch); - s->Prepare(var_h, time_out); s->response_call_back_ = nullptr; auto call = s->stub_g_.PrepareUnaryCall( @@ -102,13 +94,13 @@ bool GRPCClient::AsyncSendVar(const std::string& ep, }); req_count_++; - return true; + return h; } void ProcGetResponse(const VarHandle& var_h, const ::grpc::ByteBuffer& ret_msg) { framework::Variable* outvar = nullptr; - DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); + DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar); } template @@ -119,37 +111,30 @@ void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { result->Swap(&tmp); } -bool GRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { +VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); + GetProcessor* s = new GetProcessor(ch); + VarHandlePtr h(new VarHandle(ep, "Get", var_name_val, p_ctx, p_scope)); + s->Prepare(h, time_out); - framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, - this] { + framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] { // prepare input sendrecv::VariableMessage req; req.set_varname(var_name_val); ::grpc::ByteBuffer buf; RequestToByteBuffer(req, &buf); - // var handle - VarHandle var_h; - var_h.ep = ep_val; - var_h.scope = p_scope; - var_h.name = var_name_val; - var_h.ctx = p_ctx; - var_h.method = "Get"; - - VLOG(3) << var_h.String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; // stub context - GetProcessor* s = new GetProcessor(ch); - s->Prepare(var_h, time_out); s->response_call_back_ = ProcGetResponse; auto call = s->stub_g_.PrepareUnaryCall( @@ -160,42 +145,36 @@ bool GRPCClient::AsyncGetVar(const std::string& ep, req_count_++; - return true; + return h; } -bool GRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out) { +VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string in_var_name_val = in_var_name; const std::string out_var_name_val = out_var_name; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); + GetProcessor* s = new GetProcessor(ch); + VarHandlePtr h( + new VarHandle(ep, "Prefetch", out_var_name_val, p_ctx, p_scope)); + s->Prepare(h, time_out); framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - time_out, ch, this] { + time_out, s, this] { auto* var = p_scope->FindVar(in_var_name_val); ::grpc::ByteBuffer req; SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val); - // var handle - VarHandle var_h; - var_h.ep = ep_val; - var_h.scope = p_scope; - var_h.name = out_var_name_val; - var_h.ctx = p_ctx; - var_h.method = "Prefetch"; - - VLOG(3) << var_h.String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; // stub context - GetProcessor* s = new GetProcessor(ch); - s->Prepare(var_h, time_out); s->response_call_back_ = ProcGetResponse; auto call = s->stub_g_.PrepareUnaryCall( @@ -206,56 +185,68 @@ bool GRPCClient::AsyncPrefetchVar(const std::string& ep, }); req_count_++; - return true; + return h; } -void GRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { +VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out) { const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - s->Prepare(time_out); + VarHandlePtr h(new VarHandle(ep, "BatchBarrier", BATCH_BARRIER_MESSAGE, + nullptr, nullptr)); + s->Prepare(h, time_out); sendrecv::VariableMessage req; req.set_varname(BATCH_BARRIER_MESSAGE); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + return h; } -void GRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { +VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) { const auto ch = GetChannel(ep); FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - s->Prepare(time_out); + VarHandlePtr h(new VarHandle(ep, "FetchBarrier", FETCH_BARRIER_MESSAGE, + nullptr, nullptr)); + s->Prepare(h, time_out); sendrecv::VariableMessage req; req.set_varname(FETCH_BARRIER_MESSAGE); auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + return h; } -void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { +VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, + int64_t time_out) { const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - s->Prepare(time_out); + VarHandlePtr h( + new VarHandle(ep, "SendComplete", COMPLETE_MESSAGE, nullptr, nullptr)); + s->Prepare(h, time_out); sendrecv::VariableMessage req; req.set_varname(COMPLETE_MESSAGE); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + return h; } -void GRPCClient::AsyncCheckpointNotify(const std::string& ep, - const std::string& dir, - int64_t time_out) { +VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, + const std::string& dir, + int64_t time_out) { const auto ch = GetChannel(ep); CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); - s->Prepare(time_out); + VarHandlePtr h(new VarHandle(ep, "CheckPointNotify", CHECKPOINT_SAVE_MESSAGE, + nullptr, nullptr)); + s->Prepare(h, time_out); sendrecv::VariableMessage req; req.set_varname(CHECKPOINT_SAVE_MESSAGE); @@ -264,6 +255,7 @@ void GRPCClient::AsyncCheckpointNotify(const std::string& ep, auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + return h; } bool GRPCClient::Wait() { @@ -276,25 +268,28 @@ void GRPCClient::Proceed() { void* tag = nullptr; bool ok = false; + VLOG(3) << "GRPCClient Proceed begin"; while (!stopped_ && cq_.Next(&tag, &ok)) { BaseProcessor* c = static_cast(tag); GPR_ASSERT(ok); PADDLE_ENFORCE(c); if (c->status_.ok()) { - VLOG(3) << c->var_h_.String() << " process"; + VLOG(3) << c->GetVarHandlePtr()->String() << " process"; c->Process(); } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) { - LOG(ERROR) << c->var_h_.String() + LOG(ERROR) << c->GetVarHandlePtr()->String() << " meets grpc error:" << c->status_.error_message(); { std::lock_guard lk(sync_mutex_); ok_ = false; } - sync_cond_.notify_all(); + c->Finish(false); } else { - LOG(FATAL) << c->var_h_.String() + LOG(FATAL) << c->GetVarHandlePtr()->String() << " meets grpc error:" << c->status_.error_message(); + c->Finish(false); } + delete c; { std::lock_guard lk(sync_mutex_); @@ -302,6 +297,7 @@ void GRPCClient::Proceed() { } sync_cond_.notify_all(); } + VLOG(3) << "GRPCClient Proceed end"; } std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index 0c95ffeb5c..75a3662316 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -53,15 +53,14 @@ void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); class BaseProcessor { public: - explicit BaseProcessor(std::shared_ptr ch) { - context_ = nullptr; - } + BaseProcessor() { context_ = nullptr; } virtual ~BaseProcessor() {} - virtual void Prepare(const VarHandle& var_info, int64_t time_out) { + virtual void Prepare(VarHandlePtr h, int64_t time_out) { + var_h_ = h; + context_.reset(new grpc::ClientContext()); - var_h_ = var_info; context_->set_wait_for_ready(true); if (time_out) { std::chrono::system_clock::time_point deadline = @@ -71,21 +70,21 @@ class BaseProcessor { } } - virtual void Prepare(int64_t time_out) { - context_.reset(new grpc::ClientContext()); - context_->set_wait_for_ready(true); - - std::chrono::system_clock::time_point deadline = - std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); - - context_->set_deadline(deadline); + void Process() { + ProcessImpl(); + var_h_->Finish(true); } - virtual void Process() = 0; + VarHandlePtr GetVarHandlePtr() { return var_h_; } + bool Wait() { return var_h_->Wait(); } + void Finish(bool ok) { return var_h_->Finish(ok); } + virtual void ProcessImpl() = 0; std::unique_ptr context_; grpc::Status status_; - VarHandle var_h_; + + protected: + VarHandlePtr var_h_; }; typedef std::function @@ -94,13 +93,13 @@ typedef std::function class SendProcessor : public BaseProcessor { public: explicit SendProcessor(std::shared_ptr ch) - : BaseProcessor(ch), stub_g_(ch) {} + : BaseProcessor(), stub_g_(ch) {} virtual ~SendProcessor() {} - virtual void Process() { + void ProcessImpl() override { if (response_call_back_) { - response_call_back_(var_h_, reply_); + response_call_back_(*var_h_.get(), reply_); } } @@ -115,13 +114,13 @@ typedef std::function class GetProcessor : public BaseProcessor { public: explicit GetProcessor(std::shared_ptr ch) - : BaseProcessor(ch), stub_g_(ch) {} + : BaseProcessor(), stub_g_(ch) {} virtual ~GetProcessor() {} - virtual void Process() { + void ProcessImpl() override { if (response_call_back_) { - response_call_back_(var_h_, reply_); + response_call_back_(*var_h_.get(), reply_); } } @@ -133,13 +132,13 @@ class GetProcessor : public BaseProcessor { class BatchBarrierProcessor : public BaseProcessor { public: explicit BatchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor(ch) { + : BaseProcessor() { stub_ = sendrecv::SendRecvService::NewStub(ch); } virtual ~BatchBarrierProcessor() {} - virtual void Process() {} + void ProcessImpl() override {} sendrecv::VoidMessage reply_; std::unique_ptr stub_; }; @@ -147,13 +146,13 @@ class BatchBarrierProcessor : public BaseProcessor { class FetchBarrierProcessor : public BaseProcessor { public: explicit FetchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor(ch) { + : BaseProcessor() { stub_ = sendrecv::SendRecvService::NewStub(ch); } virtual ~FetchBarrierProcessor() {} - virtual void Process() {} + void ProcessImpl() override {} sendrecv::VariableMessage reply_; std::unique_ptr stub_; }; @@ -161,13 +160,13 @@ class FetchBarrierProcessor : public BaseProcessor { class CheckpointNotifyProcessor : public BaseProcessor { public: explicit CheckpointNotifyProcessor(std::shared_ptr ch) - : BaseProcessor(ch) { + : BaseProcessor() { stub_ = sendrecv::SendRecvService::NewStub(ch); } virtual ~CheckpointNotifyProcessor() {} - virtual void Process() {} + void ProcessImpl() override {} sendrecv::VoidMessage reply_; std::unique_ptr stub_; }; @@ -177,32 +176,37 @@ class GRPCClient : public RPCClient { GRPCClient() : ok_(true), completed_(false), stopped_(false) {} virtual ~GRPCClient(); - bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - bool AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - void AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; - - void AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; - - void AsyncCheckpointNotify(const std::string& ep, const std::string& dir, - int64_t time_out = FLAGS_rpc_deadline) override; - - void AsyncSendComplete(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncSendBatchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncSendFetchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncCheckpointNotify( + const std::string& ep, const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncSendComplete( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; bool Wait() override; diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 64ac728184..3c3f9d17c8 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -28,6 +28,7 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/macros.h" namespace paddle { namespace operators { @@ -49,23 +50,77 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; class RPCServer; -struct VarHandle { - // RPC endpoint. - std::string ep; - const platform::DeviceContext* ctx; - const framework::Scope* scope; - // Variable name. - std::string name; - // RPC method name. - std::string method; +class VarHandle { + public: + VarHandle(const std::string ep, const std::string& method, + const std::string& name, + const platform::DeviceContext* p_ctx = nullptr, + const framework::Scope* p_scope = nullptr) + : ok_(kVarHandleDefaultState) { + ep_ = ep; + ctx_ = p_ctx; + scope_ = p_scope; + name_ = name; + method_ = method; + } + + virtual ~VarHandle() {} + + public: + bool Wait() { + { + std::unique_lock lk(sync_mutex_); + wait_cond_.wait(lk, [this] { return ok_ != kVarHandleDefaultState; }); + } + VLOG(7) << "VarHandle wait:" << ok_; + return ok_ != 0; + } + + void Finish(bool ok) { + { + std::unique_lock lk(sync_mutex_); + ok_ = ok; + } + VLOG(7) << "VarHandle finish:" << ok; + wait_cond_.notify_all(); + } std::string String() const { std::ostringstream s; - s << method << " name:[" << name << "], ep:[" << ep << "]"; + s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], ok:[" << ok_ + << "]"; return s.str(); } + + std::string ep() const { return ep_; } + const platform::DeviceContext* ctx() const { return ctx_; } + const framework::Scope* scope() const { return scope_; } + std::string name() const { return name_; } + std::string method() const { return method_; } + + protected: + // RPC endpoint. + std::string ep_; + const platform::DeviceContext* ctx_; + const framework::Scope* scope_; + // Variable name. + std::string name_; + // RPC method name. + std::string method_; + + protected: + std::mutex sync_mutex_; + std::condition_variable wait_cond_; + int ok_; + + static const int kVarHandleDefaultState = -1; + + private: + DISABLE_COPY_AND_ASSIGN(VarHandle); }; +typedef std::shared_ptr VarHandlePtr; + class RequestHandler { public: explicit RequestHandler(bool sync_mode) diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index 22a022a5d2..3539ee5e45 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -14,12 +14,14 @@ #pragma once +#include // NOLINT #include #include "gflags/gflags.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/distributed/request_handler.h" DECLARE_int32(rpc_deadline); @@ -31,37 +33,36 @@ class RPCClient { public: RPCClient() {} virtual ~RPCClient() {} - virtual bool AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual bool AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual bool AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual void AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual void AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual void AsyncCheckpointNotify(const std::string& ep, - const std::string& dir, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual void AsyncSendComplete(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncPrefetchVar( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncSendBatchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncSendFetchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncCheckpointNotify( + const std::string& ep, const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncSendComplete( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; // Complete tells all the pserver instances that finishe the training, // the pserver can reduce it's barrier count, and continue to train diff --git a/paddle/fluid/operators/distributed/varhandle_test.cc b/paddle/fluid/operators/distributed/varhandle_test.cc new file mode 100644 index 0000000000..a0fcaf8864 --- /dev/null +++ b/paddle/fluid/operators/distributed/varhandle_test.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include // NOLINT + +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/operators/distributed/request_handler.h" + +using paddle::operators::distributed::VarHandlePtr; +using paddle::operators::distributed::VarHandle; + +void WaitTrue(VarHandlePtr s) { EXPECT_TRUE(s->Wait()); } + +void WaitFalse(VarHandlePtr s) { EXPECT_FALSE(s->Wait()); } + +TEST(VarHandle, Run) { + std::vector a; + for (int i = 0; i < 12; i++) { + VarHandlePtr s(new VarHandle("", "", "", nullptr, nullptr)); + a.push_back(s); + } + + std::vector> t; + for (int i = 0; i < 6; i++) { + t.emplace_back(new std::thread(WaitFalse, a[i])); + } + + for (int i = 0; i < 6; i++) { + a[i]->Finish(false); + t[i]->join(); + } + + for (int i = 6; i < 12; i++) { + t.emplace_back(new std::thread(WaitTrue, a[i])); + } + + for (int i = 6; i < 12; i++) { + a[i]->Finish(true); + t[i]->join(); + } +} diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc index 4b804740a0..0519c15e13 100644 --- a/paddle/fluid/operators/prefetch_op.cc +++ b/paddle/fluid/operators/prefetch_op.cc @@ -44,16 +44,20 @@ class PrefetchOp : public framework::OperatorBase { distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); + std::vector rets; for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get " << outs[i] << " back"; - rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, ins[i], outs[i]); + rets.push_back(rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, + ins[i], outs[i])); } else { VLOG(3) << "don't send no-initialied variable: " << ins[i]; } } - PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } } }; diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index a1f368e869..4d34b8a168 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -44,12 +44,15 @@ class RecvOp : public framework::OperatorBase { distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); + std::vector rets; for (size_t i = 0; i < outs.size(); i++) { VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; - rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]); + rets.push_back(rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i])); } if (sync_mode) { - PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } } } }; diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 82a70e4bf1..48322ac7fd 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include // NOLINT #include +#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" @@ -45,18 +46,19 @@ class SendOp : public framework::OperatorBase { distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); + std::vector rets; for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; - // TODO(Yancey1989): we need to use an IO threadpool which has - // a larger number of threads than the computing threadpool. - rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]); + rets.push_back(rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i])); } else { VLOG(3) << "don't send no-initialied variable: " << ins[i]; } } if (sync_send) { - PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } } } }; From 312e92ab072297dae3bf2baf6479b51bfc9b88e6 Mon Sep 17 00:00:00 2001 From: Shan Yi <35982308+shanyi15@users.noreply.github.com> Date: Wed, 12 Sep 2018 10:43:32 +0800 Subject: [PATCH 47/85] update-readme --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 60ffbe7281..45186ec4ef 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. -### Latest PaddlePaddle Release: [Fluid 0.14.0](https://github.com/PaddlePaddle/Paddle/tree/v0.14.0) +### Latest PaddlePaddle Release: [Fluid 0.15.0](https://github.com/PaddlePaddle/Paddle/tree/v0.15.0) ### Install Latest Stable Release: ``` # Linux CPU @@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85 ## Installation -It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/install/install_doc.html) on our website. +It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/install/install_doc.html) on our website. ## Documentation -We provide [English](http://paddlepaddle.org/documentation/docs/en/0.14.0/getstarted/index_en.html) and -[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/index.html) documentation. +We provide [English](http://paddlepaddle.org/documentation/docs/en/0.15.0/getstarted/index_en.html) and +[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/index.html) documentation. - [Deep Learning 101](https://github.com/PaddlePaddle/book) You might want to start from this online interactive book that can run in a Jupyter Notebook. -- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/user_guides/howto/training/cluster_howto.html) +- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/user_guides/howto/training/cluster_howto.html) You can run distributed training jobs on MPI clusters. -- [Python API](http://paddlepaddle.org/documentation/api/zh/0.14.0/fluid.html) +- [Python API](http://paddlepaddle.org/documentation/api/zh/0.15.0/fluid.html) Our new API enables much shorter programs. -- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/advanced_usage/development/contribute_to_paddle.html) +- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/advanced_usage/development/contribute_to_paddle.html) We appreciate your contributions! From 36d6e44681c3ebe1ff3992b37a981ca468580080 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Wed, 12 Sep 2018 03:41:39 +0000 Subject: [PATCH 48/85] fix test_py_reader_using_executor error --- .../fluid/tests/unittests/test_py_reader_using_executor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py index 931cac409f..0fb9518a45 100644 --- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py +++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py @@ -96,7 +96,8 @@ class TestPyReaderUsingExecutor(unittest.TestCase): self.queue_capacity = 50 def test(self): - for use_cuda in [False, True]: + for use_cuda in ([False, True] + if core.core.is_compiled_with_cuda() else [False]): for use_parallel_executor in [False, True]: for use_double_buffer in [False, True]: print('Test Parameters:'), From 5ce1a960a5dc91459718422379b8bbf398574584 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 12 Sep 2018 12:47:28 +0800 Subject: [PATCH 49/85] move bcast op into pass --- benchmark/fluid/args.py | 6 +++ benchmark/fluid/fluid_benchmark.py | 9 ++++ benchmark/fluid/models/mnist.py | 11 +++-- .../framework/details/all_reduce_op_handle.cc | 7 +++- .../framework/details/broadcast_op_handle.cc | 7 ++++ .../details/data_balance_op_handle.cc | 7 ++++ .../details/multi_devices_graph_pass.cc | 42 ++++++++++++++----- .../details/multi_devices_graph_pass.h | 4 +- .../framework/details/reduce_op_handle.cc | 6 ++- .../details/scale_loss_grad_op_handle.cc | 2 +- paddle/fluid/pybind/pybind.cc | 1 - python/paddle/fluid/parallel_executor.py | 10 ----- 12 files changed, 82 insertions(+), 30 deletions(-) diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py index ed696e82f8..0d5c9652de 100644 --- a/benchmark/fluid/args.py +++ b/benchmark/fluid/args.py @@ -140,5 +140,11 @@ def parse_args(): '--use_lars', action='store_true', help='If set, use lars for optimizers, ONLY support resnet module.') + parser.add_argument( + '--reduce_strategy', + type=str, + choices=['reduce', 'all_reduce'], + default='all_reduce', + help='Specify the reduce strategy, can be reduce, all_reduce') args = parser.parse_args() return args diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 25622ee06c..ddd9fe8098 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -170,6 +170,14 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog, strategy = fluid.ExecutionStrategy() strategy.num_threads = args.cpus strategy.allow_op_delay = False + build_strategy = fluid.BuildStrategy() + if args.reduce_strategy == "reduce": + build_strategy.reduce_strategy = fluid.BuildStrategy( + ).ReduceStrategy.Reduce + else: + build_strategy.reduce_strategy = fluid.BuildStrategy( + ).ReduceStrategy.AllReduce + avg_loss = train_args[0] if args.update_method == "pserver": @@ -184,6 +192,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog, avg_loss.name, main_program=train_prog, exec_strategy=strategy, + build_strategy=build_strategy, num_trainers=num_trainers, trainer_id=trainer_id) diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index cef8657ee6..f123e07fb7 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -67,11 +67,14 @@ def cnn_model(data): def get_model(args, is_train, main_prog, startup_prog): # NOTE: mnist is small, we don't implement data sharding yet. - filelist = [ - os.path.join(args.data_path, f) for f in os.listdir(args.data_path) - ] + opt = None + data_file_handle = None with fluid.program_guard(main_prog, startup_prog): if args.use_reader_op: + filelist = [ + os.path.join(args.data_path, f) + for f in os.listdir(args.data_path) + ] data_file_handle = fluid.layers.open_files( filenames=filelist, shapes=[[-1, 1, 28, 28], (-1, 1)], @@ -100,7 +103,7 @@ def get_model(args, is_train, main_prog, startup_prog): if is_train: opt = fluid.optimizer.AdamOptimizer( learning_rate=0.001, beta1=0.9, beta2=0.999) - opt.minimize() + opt.minimize(avg_cost) if args.memory_optimize: fluid.memory_optimize(main_prog) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index bf493a3fa4..8450d8eb8b 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -46,7 +46,12 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, #endif void AllReduceOpHandle::RunImpl() { - platform::RecordEvent r("all_reduce", nullptr); + if (dev_ctxes_.size() > 0UL) { + platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + } else { + platform::RecordEvent record_event(Name(), nullptr); + } + if (NoDummyInputSize() == 1) { return; // No need to all reduce when GPU count = 1; } else { diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 1d9f1bd6e4..35962ade99 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -15,12 +15,19 @@ #include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { namespace details { void BroadcastOpHandle::RunImpl() { + if (dev_ctxes_.size() > 0UL) { + platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + } else { + platform::RecordEvent record_event(Name(), nullptr); + } + if (places_.size() == 1) return; // The input and output may have dummy vars. diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc index 525d243224..91f6a42e6e 100644 --- a/paddle/fluid/framework/details/data_balance_op_handle.cc +++ b/paddle/fluid/framework/details/data_balance_op_handle.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/details/data_balance_op_handle.h" #include #include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { @@ -86,6 +87,12 @@ std::vector> DataBalanceOpHandle::GetBalancePlan( } void DataBalanceOpHandle::RunImpl() { + if (dev_ctxes_.size() > 0UL) { + platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + } else { + platform::RecordEvent record_event(Name(), nullptr); + } + PADDLE_ENFORCE_GT(places_.size(), 1, "Data balance can only be enabled when the number of " "places to run larger than 1."); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 7a99169849..cd6c8b50a9 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -348,14 +348,31 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( size_t cur_device_id = 0; bool is_forwarding = true; + bool is_dist_train = false; for (ir::Node *node : sorted_ops) { if (boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kRPC)) { - CreateRPCOp(&result, node); + int op_dev_id = CreateRPCOp(&result, node); + PADDLE_ENFORCE(op_dev_id != -1, + "Can not schedule the RPC operator to the right place."); + if (node->Op()->Type() == "recv") { + auto recv_vars_attr = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient] + if (recv_vars_attr[0].find(".block") == std::string::npos) { + bcast_var_name_set[op_dev_id].emplace(recv_vars_attr[0]); + } + } + is_dist_train = true; } else if (IsDistTrainOp(node, send_vars, recv_vars)) { - CreateDistTrainOp(&result, node); + int op_dev_id = CreateDistTrainOp(&result, node); + if (node->Op()->Type() == "concat") { + auto origin_param_name = node->Op()->OutputArgumentNames()[0]; + bcast_var_name_set[op_dev_id].emplace(origin_param_name); + } } else if (IsScaleLossOp(node)) { // user can customize loss@grad if not use_default_grad_scale_ if (strategy_.gradient_scale_ != @@ -414,7 +431,10 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( CreateReduceOp(&result, g_name, cur_device_id); graph->Get(kShardedVarDevice) .emplace(g_name, cur_device_id); - bcast_var_name_set[cur_device_id].emplace(p_name); + if (!is_dist_train) { + // will send gradients directly when distributed training + bcast_var_name_set[cur_device_id].emplace(p_name); + } break; case BuildStrategy::ReduceStrategy::kAllReduce: if (IsSparseGradient(g_name)) { @@ -436,14 +456,14 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } } } - bool use_gpu = false; #ifdef PADDLE_WITH_CUDA use_gpu = nccl_ctxs_ != nullptr; #endif - if (use_gpu || - strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + if ((use_gpu && + strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) || + is_dist_train) { // Insert BCast Ops for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { auto &to_bcast_set = bcast_var_name_set[dev_id]; @@ -676,8 +696,8 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, return var; } -void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, - ir::Node *node) const { +int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, + ir::Node *node) const { int op_dev_id = -1; std::vector input_var_names; std::vector output_var_names; @@ -720,6 +740,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, node->Op()->Type()); CreateComputationalOp(result, node, op_dev_id); + return op_dev_id; } void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { @@ -738,8 +759,8 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { } // Create RPC related op handles that connects its in ops and out ops. -void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, - ir::Node *node) const { +int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, + ir::Node *node) const { int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. @@ -825,6 +846,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id); } } + return op_dev_id; } bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index ac6d9c5a64..1ca8c4b855 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -54,8 +54,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass { bool IsScaleLossOp(ir::Node *node) const; - void CreateRPCOp(ir::Graph *result, ir::Node *node) const; - void CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; + int CreateRPCOp(ir::Graph *result, ir::Node *node) const; + int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; /** * Is this operator as the end-point operator before/after send operator. diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 6c7e5c1fb0..878828693b 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -27,7 +27,11 @@ namespace framework { namespace details { void ReduceOpHandle::RunImpl() { - platform::RecordEvent r("reduce", nullptr); + if (dev_ctxes_.size() > 0UL) { + platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + } else { + platform::RecordEvent record_event(Name(), nullptr); + } if (places_.size() == 1) return; // the input and output may have dummy var. auto in_var_handles = DynamicCast(inputs_); diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 609e185819..ba243979b3 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() { ->stream(); memory::Copy(boost::get(place_), tmp, platform::CPUPlace(), &coeff_, sizeof(float), stream); - VLOG(1) << place_ << "RUN Scale loss grad op"; + VLOG(10) << place_ << "RUN Scale loss grad op"; }); #endif } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 20fc08e21d..8bc30fc123 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -683,7 +683,6 @@ All parameter, weight, gradient are variables in Paddle. const std::string &, Scope *, std::vector &, const ExecutionStrategy &, const BuildStrategy &, size_t, size_t>()) - .def("_bcast_params", &ParallelExecutor::BCastParamsToDevices) // NOTE: even we return a vec* to Python use reference policy. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. We can only return Scope* diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 4790e0f611..058f414e9b 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -279,21 +279,11 @@ class ParallelExecutor(object): self.executor.run(fetch_list, fetch_var_name) arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() - if self.is_dist: - self._bcast_params() - if return_numpy: return executor.as_numpy(arr) return [arr[i] for i in range(len(arr))] - def _bcast_params(self): - """ - Broadcast the parameters to other devices. It is used during - distributed training. - """ - self.executor._bcast_params(set(self.persistable_vars)) - @property def device_count(self): return len(self._act_places) From d61c11764af1249c8acc6937f2c25a8ae6c86c3e Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 12 Sep 2018 12:50:50 +0800 Subject: [PATCH 50/85] follow comment add enforce --- paddle/fluid/framework/operator.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index bbd141cb3b..b7fae7171a 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -471,10 +471,11 @@ class RuntimeInferShapeContext : public InferShapeContext { return false; } const auto& in = it->second; - - if (in.size() != 1 || in[0] == kEmptyVarName) { + if (in.size() == 0 || in[0] == kEmptyVarName) { return false; } + PADDLE_ENFORCE_EQ(in.size(), 1UL, + "Input %s should not have more than one inputs", name); return scope_.FindVar(in[0]) != nullptr; } @@ -486,9 +487,11 @@ class RuntimeInferShapeContext : public InferShapeContext { return false; } const auto& out = it->second; - if (out.size() != 1 || out[0] == kEmptyVarName) { + if (out.size() == 0 || out[0] == kEmptyVarName) { return false; } + PADDLE_ENFORCE_EQ(out.size(), 1UL, + "Output %s should not have more than one outputs", name); return scope_.FindVar(out[0]) != nullptr; } From d41176411fd4f5f06155c7c73264f1145ecccee7 Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Wed, 12 Sep 2018 13:08:02 +0800 Subject: [PATCH 51/85] Update test_py_reader_using_executor.py --- .../fluid/tests/unittests/test_py_reader_using_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py index 0fb9518a45..b7fad9b3a6 100644 --- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py +++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py @@ -97,7 +97,7 @@ class TestPyReaderUsingExecutor(unittest.TestCase): def test(self): for use_cuda in ([False, True] - if core.core.is_compiled_with_cuda() else [False]): + if core.is_compiled_with_cuda() else [False]): for use_parallel_executor in [False, True]: for use_double_buffer in [False, True]: print('Test Parameters:'), From bdd957b4be7a023426f76ae6e3153aa5a0e1686f Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Wed, 12 Sep 2018 05:18:22 +0000 Subject: [PATCH 52/85] fix test_parallel_executor_transformer --- .../tests/unittests/test_parallel_executor_transformer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index 5ad922725a..a55b2002ed 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -20,6 +20,7 @@ import numpy as np from parallel_executor_test_base import TestParallelExecutorBase import unittest import paddle +import paddle.fluid.core as core import paddle.dataset.wmt16 as wmt16 import os @@ -170,7 +171,8 @@ class TestTransformer(TestParallelExecutorBase): writer.complete_append_tensor() def test_main(self): - self.check_network_convergence(transformer, use_cuda=True) + if core.is_compiled_with_cuda(): + self.check_network_convergence(transformer, use_cuda=True) self.check_network_convergence(transformer, use_cuda=False, iter=5) From 670c58bea4c4cd68b575e9d7e3f39da783facb52 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Wed, 12 Sep 2018 07:24:50 +0000 Subject: [PATCH 53/85] fix mac test_data_baalancance --- python/paddle/fluid/tests/unittests/test_data_balance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py index e39eedd282..4bd24510bc 100644 --- a/python/paddle/fluid/tests/unittests/test_data_balance.py +++ b/python/paddle/fluid/tests/unittests/test_data_balance.py @@ -84,7 +84,7 @@ class TestDataBalance(unittest.TestCase): self.data_file_name = './data_balance_test.recordio' self.lod_data_file_name = './data_balance_with_lod_test.recordio' self.total_ins_num = 50 - self.batch_size = 10 + self.batch_size = 12 self.prepare_data() self.prepare_lod_data() From 41de582bb092dfa67bd2a1fa5d3b469db1ae81e2 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Wed, 12 Sep 2018 10:22:11 +0200 Subject: [PATCH 54/85] create conv relu pass for MKLDNN (#13258) --- paddle/fluid/framework/ir/CMakeLists.txt | 6 + .../ir/conv_relu_mkldnn_fuse_pass.cc | 90 +++++++++++++++ .../framework/ir/conv_relu_mkldnn_fuse_pass.h | 39 +++++++ .../ir/conv_relu_mkldnn_fuse_pass_tester.cc | 108 ++++++++++++++++++ .../framework/ir/graph_pattern_detector.cc | 33 ++++++ .../framework/ir/graph_pattern_detector.h | 22 ++++ 6 files changed, 298 insertions(+) create mode 100644 paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index ce3ebed00b..7004f484a9 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -28,6 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap pass_library(graph_to_program_pass base) pass_library(graph_viz_pass base) pass_library(fc_fuse_pass inference) +if(WITH_MKLDNN) + pass_library(conv_relu_mkldnn_fuse_pass inference) +endif() pass_library(attention_lstm_fuse_pass inference) pass_library(infer_clean_graph_pass inference) pass_library(fc_lstm_fuse_pass inference) @@ -42,3 +45,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) +if(WITH_MKLDNN) + cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) +endif() diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc new file mode 100644 index 0000000000..4408cb45ac --- /dev/null +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h" +#include +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr ConvReLUFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get()); + + std::unordered_set nodes2delete; + + GraphPatternDetector gpd; + auto* conv_input = gpd.mutable_pattern() + ->NewNode("conv_relu_mkldnn_fuse/conv_input") + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvReLU conv_relu_pattern(gpd.mutable_pattern(), + "conv_relu_mkldnn_fuse"); + conv_relu_pattern(conv_input); + + int found_conv_relu_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvReLU fuse"; + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, + conv_relu_pattern); // Filter + GET_IR_NODE_FROM_SUBGRAPH(conv_bias, conv_bias, conv_relu_pattern); // Bias + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern); // CONV op + GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern); // Out + GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern); // ReLU op + + // Create an ConvReLU Node. + OpDesc desc; + std::string conv_relu_i_in = subgraph.at(conv_input)->Name(); + std::string conv_relu_w_in = conv_weight->Name(); + std::string conv_relu_b_in = conv_bias->Name(); + std::string conv_relu_out = relu_out->Name(); + desc.SetInput("Input", std::vector({conv_relu_i_in})); + desc.SetInput("Filter", std::vector({conv_relu_w_in})); + desc.SetInput("Bias", std::vector({conv_relu_b_in})); + desc.SetOutput("Out", std::vector({conv_relu_out})); + desc.SetType("conv2d"); + for (auto& attr : conv->Op()->GetAttrMap()) { + desc.SetAttr(attr.first, attr.second); + } + desc.SetAttr("fuse_relu", true); + auto conv_relu_node = g->CreateOpNode(&desc); // OpDesc will be copied. + GraphSafeRemoveNodes(graph.get(), {conv, relu, conv_out}); + + PADDLE_ENFORCE(subgraph.count(conv_input)); + IR_NODE_LINK_TO(subgraph.at(conv_input), conv_relu_node); + IR_NODE_LINK_TO(conv_weight, conv_relu_node); + IR_NODE_LINK_TO(conv_bias, conv_relu_node); + IR_NODE_LINK_TO(conv_relu_node, relu_out); + + found_conv_relu_count++; + }; + + gpd(graph.get(), handler); + + AddStatis(found_conv_relu_count); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_relu_mkldnn_fuse_pass, + paddle::framework::ir::ConvReLUFusePass); diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h new file mode 100644 index 0000000000..b5de0d5487 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the CONV and ReLU to a ConvReLUOp. + */ +class ConvReLUFusePass : public FusePassBase { + public: + virtual ~ConvReLUFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc new file mode 100644 index 0000000000..82b5fa1886 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h" + +#include + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + if (type == "conv2d") { + op->SetAttr("use_mkldnn", true); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + op->SetInput("Bias", {inputs[2]}); + } else if (type == "relu") { + op->SetInput("X", inputs); + } + op->SetOutput("Out", outputs); +} + +// a->OP0->b +// b->OP1->c +// (c, weights, bias)->conv->f +// (f)->relu->g +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + for (auto& v : + std::vector({"a", "b", "c", "weights", "bias", "f", "g"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v == "weights" || v == "bias") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "OP0", std::vector({"a"}), + std::vector({"b"})); + SetOp(&prog, "OP1", std::vector({"b"}), + std::vector({"c"})); + SetOp(&prog, "conv2d", std::vector({"c", "weights", "bias"}), + std::vector({"f"})); + SetOp(&prog, "relu", std::vector({"f"}), + std::vector({"g"})); + + return prog; +} + +TEST(ConvReLUFusePass, basic) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("conv_relu_mkldnn_fuse_pass"); + + int original_nodes_num = graph->Nodes().size(); + + graph = pass->Apply(std::move(graph)); + + int current_nodes_num = graph->Nodes().size(); + + // Remove 3 Nodes: CONV, RELU, conv_out + // Add 1 Node: ConvReLU + EXPECT_EQ(original_nodes_num - 2, current_nodes_num); + + // Assert conv_relu op in newly generated graph + int conv_relu_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "conv2d") { + if (node->Op()->HasAttr("use_mkldnn")) { + bool use_mkldnn = boost::get(node->Op()->GetAttr("use_mkldnn")); + if (use_mkldnn) { + if (node->Op()->HasAttr("fuse_relu")) { + bool fuse_relu = boost::get(node->Op()->GetAttr("fuse_relu")); + if (fuse_relu) { + ++conv_relu_count; + } + } + } + } + } + } + EXPECT_EQ(conv_relu_count, 1); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(conv_relu_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 5825a129b7..11d5998aaf 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -522,6 +522,39 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) { return false; } +PDNode* patterns::ConvReLU::operator()( + paddle::framework::ir::PDNode* conv_input) { + // Create Operators + conv_input->assert_is_op_input("conv2d", "Input"); + auto* conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + auto* relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu"); + // Create variables + // Filter + auto* conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Filter"); + // Bias + auto* conv_bias_var = pattern->NewNode(conv_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Bias"); + // intermediate variable, will be removed in the IR after fuse. + auto* conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("conv2d") + ->assert_is_op_input("relu"); + // output + auto* relu_out_var = pattern->NewNode(relu_out_repr()) + ->AsOutput() + ->assert_is_op_output("relu"); + + conv_op->LinksFrom({conv_input, conv_weight_var, conv_bias_var}) + .LinksTo({conv_out_var}); + relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var}); + return relu_out_var; +} + PDNode* patterns::FC::operator()(paddle::framework::ir::PDNode* x, bool with_bias) { // Create shared nodes. diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 57482a07b6..371384dc56 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -360,6 +360,28 @@ struct PatternBase { size_t id_; }; +// CONV with ReLU +// op: conv + relu +// named nodes: +// conv_input, conv_weight, +// conv_bias, conv_out, conv, +// relu_out, relu +struct ConvReLU : public PatternBase { + ConvReLU(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_relu") {} + + PDNode* operator()(PDNode* conv_input); + + // declare operator node's name + PATTERN_DECL_NODE(conv); + PATTERN_DECL_NODE(relu); + // declare variable node's name + PATTERN_DECL_NODE(conv_weight); + PATTERN_DECL_NODE(conv_bias); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(relu_out); +}; + // FC with bias // op: mul + elementwise_add // named nodes: From b12322ce959a2ab79a1bae4e7aaf9e4b42d56909 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Wed, 12 Sep 2018 19:06:17 +0800 Subject: [PATCH 55/85] fix fusion_lstm unique_name bug --- paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc | 5 ++--- paddle/fluid/inference/analysis/ir_pass_manager.cc | 8 ++++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index f7fda87357..aa95d3e9f6 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -51,7 +51,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, if (with_fc_bias) { // Add FC-bias with LSTM-bias and create a new weight PADDLE_ENFORCE(scope); - const std::string& new_bias_var = name_scope + "_bias.new"; + const std::string& new_bias_var = patterns::UniqueKey("NewBias"); auto* bias_var = scope->Var(new_bias_var); PADDLE_ENFORCE(bias_var); auto* bias_tensor = bias_var->GetMutable(); @@ -120,7 +120,6 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern); GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern); GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern); @@ -136,7 +135,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, fc_bias); // Remove unneeded nodes. std::unordered_set marked_nodes( - {mul, lstm, elementwise_add}); + {mul, lstm, elementwise_add, fc_bias}); GraphSafeRemoveNodes(graph, marked_nodes); } else { GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 30c1e8e93d..e76708baf4 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include +#include #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/scope.h" @@ -37,13 +38,16 @@ IRPassManager::IRPassManager(const ProgramDesc &program, void IRPassManager::Apply(const std::vector &passes) { // Apply all the passes std::string pre_pass; + int pass_num = 0; for (const std::string &pass_name : passes) { PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass_name); auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); if (pass_name == "graph_viz_pass") { - std::string dot_file_path = - "ir_" + (pre_pass.empty() ? "origin" : pre_pass) + ".dot"; + std::string dot_file_path = std::to_string(pass_num) + "_ir_" + + (pre_pass.empty() ? "origin" : pre_pass) + + ".dot"; pass->Set("graph_viz_path", new std::string(std::move(dot_file_path))); + pass_num++; } graph_ = pass->Apply(std::move(graph_)); pre_pass = pass_name; From 415e0eac692d8aef35726deb2915f92ea7442edf Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Wed, 12 Sep 2018 11:29:17 +0000 Subject: [PATCH 56/85] fix mac test_reader_reset --- python/paddle/fluid/tests/unittests/test_reader_reset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py index 8ad11d76f6..a115c37e1d 100644 --- a/python/paddle/fluid/tests/unittests/test_reader_reset.py +++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py @@ -41,6 +41,8 @@ class TestReaderReset(unittest.TestCase): self.data_file_name, reader, feeder) def setUp(self): + # set parallel threads to fit 20 batches in line 49 + os.environ['CPU_NUM'] = str(20) self.use_cuda = fluid.core.is_compiled_with_cuda() self.data_file_name = './reader_reset_test.recordio' self.ins_shape = [3] From 539b3f300ffe7475cec5114cee32949a54d9d768 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 12 Sep 2018 22:29:39 +0800 Subject: [PATCH 57/85] add ocr analysis ut --- .../fluid/inference/tests/api/CMakeLists.txt | 11 ++ .../tests/api/analyzer_vis_tester.cc | 170 ++++++++++++++++++ 2 files changed, 181 insertions(+) create mode 100644 paddle/fluid/inference/tests/api/analyzer_vis_tester.cc diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index d44a2cfa7f..ff6bb662c1 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -55,3 +55,14 @@ inference_analysis_test(test_text_classification SRCS analyzer_text_classificati --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt --topn=1 # Just run top 1 batch. ) + +# ocr +set(OCR_MODEL_URL "http://paddlemodels.cdn.bcebos.com/inference-vis-demos%2Focr.tar.gz") +set(OCR_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/ocr") +if (NOT EXISTS ${OCR_INSTALL_DIR} AND WITH_INFERENCE) + inference_download_and_uncompress(${OCR_INSTALL_DIR} ${OCR_MODEL_URL}) +endif() +inference_analysis_test(test_analyzer_ocr SRCS analyzer_vis_tester.cc + EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor + ARGS --infer_model=${OCR_INSTALL_DIR}/model + --infer_data=${OCR_INSTALL_DIR}/data.txt) diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc new file mode 100644 index 0000000000..7a1bb32a57 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -0,0 +1,170 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/analysis/analyzer.h" +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" + +DEFINE_string(infer_model, "", "model path for LAC"); +DEFINE_string(infer_data, "", "data file for LAC"); +DEFINE_int32(batch_size, 1, "batch size."); +DEFINE_int32(repeat, 1, "Running the inference program repeat times."); + +namespace paddle { +namespace inference { +namespace analysis { + +struct Record { + std::vector data; + std::vector shape; +}; + +Record ProcessALine(const std::string &line) { + VLOG(3) << "process a line"; + std::vector columns; + split(line, '\t', &columns); + CHECK_EQ(columns.size(), 2UL) + << "data format error, should be \t"; + + Record record; + std::vector data_strs; + split(columns[0], ' ', &data_strs); + for (auto &d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector shape_strs; + split(columns[1], ' ', &shape_strs); + for (auto &s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + VLOG(3) << "data size " << record.data.size(); + VLOG(3) << "data shape size " << record.shape.size(); + return record; +} + +/* + * Use the native and analysis fluid engine to inference the demo. + * ocr, mobilenet and se_resnext50 + */ +void TestVisualPrediction() { + std::unique_ptr predictor; + AnalysisConfig cfg; + cfg.param_file = FLAGS_infer_model + "/__params__"; + cfg.prog_file = FLAGS_infer_model + "/__model__"; + cfg.use_gpu = false; + cfg.device = 0; + // cfg.specify_input_name = true; + cfg.enable_ir_optim = true; + predictor = + CreatePaddlePredictor(cfg); + + // Only have single batch of data. + std::string line; + std::ifstream file(FLAGS_infer_data); + std::getline(file, line); + auto record = ProcessALine(line); + file.close(); + + // Inference. + PaddleTensor input; + input.shape = record.shape; + input.data = + PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); + input.dtype = PaddleDType::FLOAT32; + + std::vector outputs_slots; + Timer timer; + timer.tic(); + for (int i = 0; i < FLAGS_repeat; i++) { + predictor->Run({input}, &outputs_slots); + } + PrintTime(/*batch size*/ 1, FLAGS_repeat, /*num threads*/ 1, /*thread id*/ 0, + timer.toc() / FLAGS_repeat); + + VLOG(3) << "output.size " << outputs_slots.size(); + + // run native as reference + NativeConfig config; + config.param_file = FLAGS_infer_model + "/__params__"; + config.prog_file = FLAGS_infer_model + "/__model__"; + config.use_gpu = false; + config.device = 0; + // config.specify_input_name = true; + auto ref_predictor = + CreatePaddlePredictor(config); + std::vector ref_outputs_slots; + ref_predictor->Run({input}, &ref_outputs_slots); + EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size()); + for (size_t i = 0; i < outputs_slots.size(); ++i) { + auto &ref_out = ref_outputs_slots[i]; + auto &out = outputs_slots[i]; + size_t ref_size = + std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1, + [](int a, int b) { return a * b; }); + size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, + [](int a, int b) { return a * b; }); + EXPECT_EQ(size, ref_size); + EXPECT_EQ(out.dtype, ref_out.dtype); + switch (out.dtype) { + case PaddleDType::INT64: { + int64_t *pdata = static_cast(out.data.data()); + int64_t *pdata_ref = static_cast(ref_out.data.data()); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } + case PaddleDType::FLOAT32: { + float *pdata = static_cast(out.data.data()); + float *pdata_ref = static_cast(ref_out.data.data()); + for (size_t j = 0; j < size; ++j) { + EXPECT_NEAR(pdata_ref[j], pdata[j], 1e-3); + } + break; + } + } + // print what are fused + AnalysisPredictor *analysis_predictor = + dynamic_cast(predictor.get()); + auto &fuse_statis = analysis_predictor->analysis_argument() + .Get>( + framework::ir::kFuseStatisAttr); + for (auto &item : fuse_statis) { + LOG(INFO) << "fused " << item.first << " " << item.second; + } + int num_ops = 0; + for (auto &node : + analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { + if (node->IsFunction()) { + ++num_ops; + } + } + LOG(INFO) << "has num ops: " << num_ops; + } +} + +TEST(Analyzer_vis, analysis) { TestVisualPrediction(); } + +} // namespace analysis +} // namespace inference +} // namespace paddle From 65f901b36ff210f0cd440d2378312921c5172936 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 12 Sep 2018 22:40:45 +0800 Subject: [PATCH 58/85] disable fc gru temporarily --- paddle/fluid/inference/tests/api/analyzer_vis_tester.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 7a1bb32a57..67bde72304 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -73,8 +73,8 @@ void TestVisualPrediction() { cfg.prog_file = FLAGS_infer_model + "/__model__"; cfg.use_gpu = false; cfg.device = 0; - // cfg.specify_input_name = true; cfg.enable_ir_optim = true; + cfg.ir_passes.push_back("fc_gru_fuse_pass"); predictor = CreatePaddlePredictor(cfg); From 01f0f16884f3587f2d01a830e55c7c446a0c8cde Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 12 Sep 2018 23:18:30 +0800 Subject: [PATCH 59/85] enable mkldnn in infer api --- paddle/fluid/inference/api/analysis_predictor.cc | 3 +++ paddle/fluid/inference/api/api_impl.cc | 3 +++ paddle/fluid/inference/api/paddle_inference_api.h | 4 +++- paddle/fluid/inference/tests/api/analyzer_vis_tester.cc | 8 ++++++-- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2a9a7aed48..cd52114713 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -77,6 +77,9 @@ bool AnalysisPredictor::Init( OptimizeInferenceProgram(); ctx_ = executor_->Prepare(*inference_program_, 0); + if (config_.use_mkldnn) { + executor_->EnableMKLDNN(*inference_program_); + } VLOG(5) << "to create variables"; PADDLE_ENFORCE(scope_.get()); diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 6fe13ed027..c6cb09667e 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -106,6 +106,9 @@ bool NativePaddlePredictor::Init( } ctx_ = executor_->Prepare(*inference_program_, 0); + if (config_.use_mkldnn) { + executor_->EnableMKLDNN(*inference_program_); + } executor_->CreateVariables(*inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0); diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 995da11e4a..e8d51bb72c 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -45,7 +45,7 @@ class PaddleBuf { PaddleBuf(void* data, size_t length) : data_(data), length_(length), memory_owned_{false} {} // Own memory. - PaddleBuf(size_t length) + explicit PaddleBuf(size_t length) : data_(new char[length]), length_(length), memory_owned_(true) {} // Resize to `length` bytes. void Resize(size_t length); @@ -121,6 +121,8 @@ struct NativeConfig : public PaddlePredictor::Config { bool use_gpu{false}; int device{0}; float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization. + // MKLDNN related fields. + bool use_mkldnn{false}; // Specify the variable's name of each input. bool specify_input_name{false}; diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 67bde72304..135a81a85c 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -66,12 +66,13 @@ Record ProcessALine(const std::string &line) { * Use the native and analysis fluid engine to inference the demo. * ocr, mobilenet and se_resnext50 */ -void TestVisualPrediction() { +void TestVisualPrediction(bool use_mkldnn) { std::unique_ptr predictor; AnalysisConfig cfg; cfg.param_file = FLAGS_infer_model + "/__params__"; cfg.prog_file = FLAGS_infer_model + "/__model__"; cfg.use_gpu = false; + cfg.use_mkldnn = use_mkldnn; cfg.device = 0; cfg.enable_ir_optim = true; cfg.ir_passes.push_back("fc_gru_fuse_pass"); @@ -163,7 +164,10 @@ void TestVisualPrediction() { } } -TEST(Analyzer_vis, analysis) { TestVisualPrediction(); } +TEST(Analyzer_vis, analysis) { TestVisualPrediction(/*use_mkldnn*/ false); } +TEST(Analyzer_vis, analysis_mkldnn) { + TestVisualPrediction(/*use_mkldnn*/ true); +} } // namespace analysis } // namespace inference From dd149d469b4c585d852b6bedc3c2835ee4b5424c Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Wed, 12 Sep 2018 10:22:08 -0700 Subject: [PATCH 60/85] hotfix for conv-relu pass --- paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc | 2 +- paddle/fluid/inference/analysis/analyzer.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc index 4408cb45ac..09c5ec59d6 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc @@ -58,7 +58,7 @@ std::unique_ptr ConvReLUFusePass::ApplyImpl( desc.SetInput("Input", std::vector({conv_relu_i_in})); desc.SetInput("Filter", std::vector({conv_relu_w_in})); desc.SetInput("Bias", std::vector({conv_relu_b_in})); - desc.SetOutput("Out", std::vector({conv_relu_out})); + desc.SetOutput("Output", std::vector({conv_relu_out})); desc.SetType("conv2d"); for (auto& attr : conv->Op()->GetAttrMap()) { desc.SetAttr(attr.first, attr.second); diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 399afbe64a..9bdbefc07c 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -72,6 +72,9 @@ class Analyzer : public OrderedRegistry { "mul_gru_fuse_pass", // "seq_concat_fc_fuse_pass", // "fc_fuse_pass", // +#ifdef PADDLE_WITH_MKLDNN + "conv_relu_mkldnn_fuse_pass", // +#endif }}; std::unordered_set disabled_ir_passes_; From e69d9c845b30d7150f122c41805b1bc5bf75136c Mon Sep 17 00:00:00 2001 From: Bai Yifan Date: Thu, 13 Sep 2018 09:49:22 +0800 Subject: [PATCH 61/85] code fix (#13365) --- paddle/fluid/operators/softmax_with_cross_entropy_op.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 148faec4af..a07c17348e 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -31,7 +31,8 @@ __global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels, for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch_size; i += blockDim.x * gridDim.x) { int idx = i * class_num + labels[i]; - logit_grad[idx] -= static_cast(1.); + logit_grad[idx] -= + ignore_index == labels[i] ? static_cast(0.) : static_cast(1.); } } From 1e1b6622fdce1b704c7753e2c16656bdc97ac24e Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 13 Sep 2018 10:44:39 +0800 Subject: [PATCH 62/85] update by comment --- paddle/fluid/framework/details/all_reduce_op_handle.cc | 6 +----- paddle/fluid/framework/details/broadcast_op_handle.cc | 6 +----- .../fluid/framework/details/data_balance_op_handle.cc | 6 ------ .../framework/details/multi_devices_graph_pass.cc | 10 +++------- 4 files changed, 5 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 8450d8eb8b..7c5f5bd80a 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -46,11 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, #endif void AllReduceOpHandle::RunImpl() { - if (dev_ctxes_.size() > 0UL) { - platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); - } else { - platform::RecordEvent record_event(Name(), nullptr); - } + platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); if (NoDummyInputSize() == 1) { return; // No need to all reduce when GPU count = 1; diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 35962ade99..4fdab5cd94 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -22,11 +22,7 @@ namespace framework { namespace details { void BroadcastOpHandle::RunImpl() { - if (dev_ctxes_.size() > 0UL) { - platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); - } else { - platform::RecordEvent record_event(Name(), nullptr); - } + platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); if (places_.size() == 1) return; diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc index 91f6a42e6e..8eb3568e05 100644 --- a/paddle/fluid/framework/details/data_balance_op_handle.cc +++ b/paddle/fluid/framework/details/data_balance_op_handle.cc @@ -87,12 +87,6 @@ std::vector> DataBalanceOpHandle::GetBalancePlan( } void DataBalanceOpHandle::RunImpl() { - if (dev_ctxes_.size() > 0UL) { - platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); - } else { - platform::RecordEvent record_event(Name(), nullptr); - } - PADDLE_ENFORCE_GT(places_.size(), 1, "Data balance can only be enabled when the number of " "places to run larger than 1."); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index cd6c8b50a9..11b085c5c7 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -431,10 +431,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( CreateReduceOp(&result, g_name, cur_device_id); graph->Get(kShardedVarDevice) .emplace(g_name, cur_device_id); - if (!is_dist_train) { - // will send gradients directly when distributed training - bcast_var_name_set[cur_device_id].emplace(p_name); - } + bcast_var_name_set[cur_device_id].emplace(p_name); break; case BuildStrategy::ReduceStrategy::kAllReduce: if (IsSparseGradient(g_name)) { @@ -461,9 +458,8 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( use_gpu = nccl_ctxs_ != nullptr; #endif - if ((use_gpu && - strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) || - is_dist_train) { + if (use_gpu && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce && + !is_dist_train) { // Insert BCast Ops for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { auto &to_bcast_set = bcast_var_name_set[dev_id]; From 4778c6e21c6918535b648b9c9bc55e9f0ba56e99 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 13 Sep 2018 10:45:27 +0800 Subject: [PATCH 63/85] delete unused py codes --- python/paddle/fluid/parallel_executor.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index a6395d9e20..44af29d339 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -142,11 +142,6 @@ class ParallelExecutor(object): main = main if main else framework.default_main_program() if scope == None: scope = executor.global_scope() - # FIXME(Yancey1989): it's a temporary approach to determinate the distribute - # train program, call self.bcast_param() at the end of each mini-batch. - self.is_dist = True if "recv" in [ - op.type for op in main.global_block().ops - ] else False if share_vars_from and not isinstance(share_vars_from, ParallelExecutor): From 1664899b63ba8175f7ad5616a031a01c1e54ca1a Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 13 Sep 2018 10:53:16 +0800 Subject: [PATCH 64/85] update --- paddle/fluid/framework/details/data_balance_op_handle.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc index 8eb3568e05..525d243224 100644 --- a/paddle/fluid/framework/details/data_balance_op_handle.cc +++ b/paddle/fluid/framework/details/data_balance_op_handle.cc @@ -15,7 +15,6 @@ #include "paddle/fluid/framework/details/data_balance_op_handle.h" #include #include "paddle/fluid/framework/details/container_cast.h" -#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { From 5b5fa37fb98bfa05f23e5ad508f6dbf3e7ec9f93 Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Thu, 13 Sep 2018 10:53:51 +0800 Subject: [PATCH 65/85] Update test_reader_reset.py import os module to use os.environ in setUp() --- python/paddle/fluid/tests/unittests/test_reader_reset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py index a115c37e1d..e97a05b6f9 100644 --- a/python/paddle/fluid/tests/unittests/test_reader_reset.py +++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py @@ -13,7 +13,7 @@ # limitations under the License. from __future__ import print_function - +import os import paddle.fluid as fluid import paddle import numpy as np From 2b10aee52a3f6f23b0243ee64b4f4d722fa41383 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 13 Sep 2018 12:04:16 +0800 Subject: [PATCH 66/85] disable seqexpandconcatfc op test on Mac --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 8ac1cb164e..9d7c528dbd 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -28,6 +28,10 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test +if(APPLE) + # this op is not support on mac + list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op) +endif() function(py_test_modules TARGET_NAME) if(WITH_TESTING) From 49bafc05bf7380874c92bd2954c5c96bca695ee4 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Thu, 13 Sep 2018 05:35:29 +0000 Subject: [PATCH 67/85] fix comments and set name for trt layer and ITensor --- .../fluid/inference/analysis/subgraph_splitter.cc | 12 ++++++++++-- .../inference/tensorrt/convert/activation_op.cc | 2 ++ .../inference/tensorrt/convert/batch_norm_op.cc | 2 ++ .../fluid/inference/tensorrt/convert/concat_op.cc | 4 ++++ .../fluid/inference/tensorrt/convert/conv2d_op.cc | 5 +++++ .../inference/tensorrt/convert/elementwise_op.cc | 4 ++++ paddle/fluid/inference/tensorrt/convert/fc_op.cc | 2 ++ .../fluid/inference/tensorrt/convert/pool2d_op.cc | 2 ++ paddle/fluid/operators/tensorrt_engine_op.h | 14 -------------- 9 files changed, 31 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index 773fceeeb2..c3a2dbf9d1 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -85,6 +85,14 @@ struct BriefNode { std::vector outlinks; }; +// Union two adjacent BriefNode. +// Suppose we have two adjacent nodes src and dst. +// We will perform the following operations: +// 1. add all inputs(except src) of dst to src inlinks. +// 2. add all outputs of dst to src outlinks. +// 3. change all the dst's inputs and outputs +// corresponding inlinks and outlinks to src node. +// 4. delete all dst's inlinks and outlinks. void UnionContractedNodes(const std::unordered_map &node_map, int src_id, int dst_id) { // merge the two adjacent nodes into one node. @@ -224,8 +232,8 @@ std::vector> SubGraphSplitter::ExtractSubGraphs() { // Our algorithm must guarantee that: // 1. The graph is always directed acyclic graph(DAG). // 2. If there is a path in the subgraph from X to Y (X and Y are both - // nodes - // in the subgraph), then all paths from X to Y are in the subgraph. + // nodes in the subgraph), then all paths from X to Y are in the + // subgraph. // // In order to achieve the above guarantee. // For adjacent nodes src -> dst. diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index e1cace9cc1..8168cdff1b 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -35,6 +35,8 @@ class ReluOpConverter : public OpConverter { engine_, Activation, *const_cast(input_tensor), nvinfer1::ActivationType::kRELU); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("relu (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { // the test framework can not determine which is the // output, so place the declaration inside. diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index 94f8b0ae56..3330af2da6 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -116,6 +116,8 @@ class BatchNormOpConverter : public OpConverter { scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Y").front(); + layer->setName(("batch_norm (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->weight_map[op_desc.Input("Bias").front()] = std::move(combile_bias_tensor); engine_->weight_map[op_desc.Input("Scale").front()] = diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc index bb9627bf95..2983e91cb2 100644 --- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc @@ -30,7 +30,9 @@ class ConcatOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs std::vector itensors; + std::cout << "Concat op: " << std::endl; for (auto& input_name : op_desc.Input("X")) { + std::cout << input_name << std::endl; itensors.push_back(engine_->GetITensor(input_name)); } int axis = boost::get(op_desc.GetAttr("axis")); @@ -42,6 +44,8 @@ class ConcatOpConverter : public OpConverter { axis = axis - 1; // Remove batch dim layer->setAxis(axis); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("concat (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { // the test framework can not determine which is the // output, so place the declaration inside. diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 841a95db38..022e43a571 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -26,6 +26,9 @@ class Conv2dOpConverter : public OpConverter { << "convert a fluid conv2d op to tensorrt conv layer without bias"; framework::OpDesc op_desc(op, nullptr); + std::cout << "Conv op: " << std::endl; + std::cout << op_desc.Input("Input").front() << std::endl; + std::cout << op_desc.Output("Output").front() << std::endl; PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1); // Y is a weight PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1); @@ -78,8 +81,10 @@ class Conv2dOpConverter : public OpConverter { layer->setNbGroups(groups); auto output_name = op_desc.Output("Output").front(); + layer->setName(("conv2d (Output: " + output_name + ")").c_str()); engine_->weight_map[op_desc.Input("Filter").front()] = std::move(weight_tensor); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { engine_->DeclareOutput(output_name); diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 60a72b4eb5..0a6ce568f1 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -89,6 +89,8 @@ class ElementwiseWeightOpConverter : public OpConverter { shift_weights.get(), scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("elementwise_add (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { // the test framework can not determine which is the @@ -137,6 +139,8 @@ class ElementwiseTensorOpConverter : public OpConverter { *const_cast(Y), op_pair->second); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("elementwise (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { // the test framework can not determine which is the // output, so place the declaration inside. diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index ad98d85aae..7c21ecd95d 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -107,6 +107,8 @@ class FcOpConverter : public OpConverter { n_output, tmp_weight.get(), bias.get()); auto output_name = op_desc.Output("Out").front(); + layer->setName(("fc (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp); if (test_mode) { diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 73f1b28ddf..f9bb66a6e9 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -72,6 +72,8 @@ class Pool2dOpConverter : public OpConverter { layer->setPadding(nv_paddings); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("pool2d (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { engine_->DeclareOutput(output_name); diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 395d8bcc07..79e75ea9a0 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -161,20 +161,6 @@ class TensorRTEngineKernel : public framework::OpKernel { boost::get(context.GetPlace()).device)), size * sizeof(float)); - // TODO(zhaolong) : delete it sometimes - /* THIS CODE JUST FOR TEST - std::cout << output_maps[output_index] << std::endl; - platform::CPUPlace cpu_place; - framework::LoDTensor temp_tensor; - temp_tensor.Resize(framework::make_ddim(ddim)); - auto* temp_data = temp_tensor.mutable_data(cpu_place); - - TensorCopySync(*fluid_t, cpu_place ,&temp_tensor); - for(int i = 0; i < size; i++) { - std::cout << temp_data[i] << " " ; - } - std::cout << std::endl; - */ output_index += 1; } From bad4ea192e195f7d6f912eb0f8647e29e7ef929e Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 13 Sep 2018 14:15:17 +0800 Subject: [PATCH 68/85] update by comment --- paddle/fluid/framework/details/multi_devices_graph_pass.cc | 5 +++-- paddle/fluid/framework/details/reduce_op_handle.cc | 7 ++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 11b085c5c7..5781936cb3 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -458,8 +458,9 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( use_gpu = nccl_ctxs_ != nullptr; #endif - if (use_gpu && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce && - !is_dist_train) { + if ((use_gpu && + strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) || + is_dist_train) { // Insert BCast Ops for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { auto &to_bcast_set = bcast_var_name_set[dev_id]; diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 878828693b..7fc06f234d 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -27,11 +27,8 @@ namespace framework { namespace details { void ReduceOpHandle::RunImpl() { - if (dev_ctxes_.size() > 0UL) { - platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); - } else { - platform::RecordEvent record_event(Name(), nullptr); - } + platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + if (places_.size() == 1) return; // the input and output may have dummy var. auto in_var_handles = DynamicCast(inputs_); From dd0b2036c68b6601ca6722f510068d0eb162eda9 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 13 Sep 2018 15:01:41 +0800 Subject: [PATCH 69/85] add note for use mkldnn --- paddle/fluid/inference/api/analysis_predictor.cc | 2 +- paddle/fluid/inference/api/api_impl.cc | 2 +- paddle/fluid/inference/api/paddle_inference_api.h | 4 ++-- paddle/fluid/inference/tests/api/analyzer_vis_tester.cc | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index cd52114713..684e0ce0e2 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -77,7 +77,7 @@ bool AnalysisPredictor::Init( OptimizeInferenceProgram(); ctx_ = executor_->Prepare(*inference_program_, 0); - if (config_.use_mkldnn) { + if (config_._use_mkldnn) { executor_->EnableMKLDNN(*inference_program_); } diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index c6cb09667e..2e9e10139f 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -106,7 +106,7 @@ bool NativePaddlePredictor::Init( } ctx_ = executor_->Prepare(*inference_program_, 0); - if (config_.use_mkldnn) { + if (config_._use_mkldnn) { executor_->EnableMKLDNN(*inference_program_); } executor_->CreateVariables(*inference_program_, diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index e8d51bb72c..55a07ca705 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -121,8 +121,8 @@ struct NativeConfig : public PaddlePredictor::Config { bool use_gpu{false}; int device{0}; float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization. - // MKLDNN related fields. - bool use_mkldnn{false}; + // NOTE: NOT use it, just for the internal test, will discard later + bool _use_mkldnn{false}; // Specify the variable's name of each input. bool specify_input_name{false}; diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 135a81a85c..3675c5f7f3 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -72,7 +72,7 @@ void TestVisualPrediction(bool use_mkldnn) { cfg.param_file = FLAGS_infer_model + "/__params__"; cfg.prog_file = FLAGS_infer_model + "/__model__"; cfg.use_gpu = false; - cfg.use_mkldnn = use_mkldnn; + cfg._use_mkldnn = use_mkldnn; cfg.device = 0; cfg.enable_ir_optim = true; cfg.ir_passes.push_back("fc_gru_fuse_pass"); From 26b1704befe6963247b14272046aa5698d2277c3 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Thu, 13 Sep 2018 15:02:10 +0800 Subject: [PATCH 70/85] fix with distribute cmake --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 8ac1cb164e..19e9882ed6 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -46,6 +46,7 @@ function(py_test_modules TARGET_NAME) endfunction() list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_dist_train) +list(REMOVE_ITEM TEST_OPS test_dist_transpiler) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) @@ -61,11 +62,12 @@ if(WITH_DISTRIBUTE) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) + py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) + py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) + py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150) -py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) -py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) From cb4a73be010d2314531173e28022e3b5d163c033 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Thu, 13 Sep 2018 15:45:27 +0800 Subject: [PATCH 71/85] fix fluid_benchmark resnet lr decay --- benchmark/fluid/models/resnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index d71b855612..1b3bfe659c 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -207,7 +207,7 @@ def get_model(args, is_train, main_prog, startup_prog): total_images = 1281167 / trainer_count - step = int(total_images / args.batch_size + 1) + step = int(total_images / (args.batch_size * args.gpus) + 1) epochs = [30, 60, 90] bd = [step * e for e in epochs] base_lr = args.learning_rate From 0092ad32856ea17c494a64b02e51d8bf14a0ad20 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Thu, 13 Sep 2018 08:08:35 +0000 Subject: [PATCH 72/85] delete unused log --- paddle/fluid/inference/tensorrt/convert/concat_op.cc | 2 -- paddle/fluid/inference/tensorrt/convert/conv2d_op.cc | 3 --- 2 files changed, 5 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc index 2983e91cb2..a11dfa1e8f 100644 --- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc @@ -30,9 +30,7 @@ class ConcatOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs std::vector itensors; - std::cout << "Concat op: " << std::endl; for (auto& input_name : op_desc.Input("X")) { - std::cout << input_name << std::endl; itensors.push_back(engine_->GetITensor(input_name)); } int axis = boost::get(op_desc.GetAttr("axis")); diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 022e43a571..0a37d3968c 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -26,9 +26,6 @@ class Conv2dOpConverter : public OpConverter { << "convert a fluid conv2d op to tensorrt conv layer without bias"; framework::OpDesc op_desc(op, nullptr); - std::cout << "Conv op: " << std::endl; - std::cout << op_desc.Input("Input").front() << std::endl; - std::cout << op_desc.Output("Output").front() << std::endl; PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1); // Y is a weight PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1); From 3a3f28f99b87b2626bf872b1cfc4faf631c07443 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 13 Sep 2018 16:35:56 +0800 Subject: [PATCH 73/85] add (#13377) --- paddle/fluid/operators/distributed/grpc_client.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index 07ac20797d..e22bc552f8 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -290,12 +290,18 @@ void GRPCClient::Proceed() { c->Finish(false); } - delete c; + bool notify = false; { std::lock_guard lk(sync_mutex_); req_count_--; + notify = (req_count_ <= 0 || !c->status_.ok()); + } + + delete c; + + if (notify) { + sync_cond_.notify_all(); } - sync_cond_.notify_all(); } VLOG(3) << "GRPCClient Proceed end"; } From 29f5a93b5f5b87b5f7a7f059b1471d373b15e740 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Thu, 13 Sep 2018 17:17:57 +0800 Subject: [PATCH 74/85] add analyzer_rnn2_test --- .../fluid/inference/tests/api/CMakeLists.txt | 58 +++--- .../tests/api/analyzer_rnn2_tester.cc | 181 ++++++++++++++++++ 2 files changed, 211 insertions(+), 28 deletions(-) create mode 100644 paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index d44a2cfa7f..ece0d33399 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -1,56 +1,58 @@ -function (inference_download_and_uncompress install_dir url) - get_filename_component(filename ${url} NAME) - message(STATUS "Download inference test stuff ${filename} from ${url}") +set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com") +set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo") +set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor) +function (inference_download_and_uncompress install_dir filename) + message(STATUS "Download inference test stuff from ${INFERENCE_URL}/${filename}") execute_process(COMMAND bash -c "mkdir -p ${install_dir}") - execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}") + execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${INFERENCE_URL}/${filename}") execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") message(STATUS "finish downloading ${filename}") endfunction(inference_download_and_uncompress) -function(download_model_and_data install_dir model_url data_url) +function(download_model_and_data install_dir model_name data_name) if (NOT EXISTS ${install_dir} AND WITH_INFERENCE) - inference_download_and_uncompress(${install_dir} ${model_url}) - inference_download_and_uncompress(${install_dir} ${data_url}) + inference_download_and_uncompress(${install_dir} ${model_name}) + inference_download_and_uncompress(${install_dir} ${data_name}) endif() endfunction() # RNN1 -set(RNN1_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fmodel.tar.gz") -set(RNN1_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fdata.txt.tar.gz") -set(RNN1_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/rnn1") -download_model_and_data(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} ${RNN1_DATA_URL}) -inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor +set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") +download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz") +inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RNN1_INSTALL_DIR}/model --infer_data=${RNN1_INSTALL_DIR}/data.txt) +# RNN2 +set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") +download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") +inference_analysis_test(test_analyzer_rnn2 SRCS analyzer_rnn2_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${RNN2_INSTALL_DIR}/model + --infer_data=${RNN2_INSTALL_DIR}/data.txt) + # chinese_ner -set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz") -set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz") -set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner") -download_model_and_data(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} ${CHINESE_NER_DATA_URL}) +set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") +download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz") inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt) # lac -set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz") -set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz") -set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac") -download_model_and_data(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} ${LAC_DATA_URL}) +set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac") +download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz") inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${LAC_INSTALL_DIR}/model --infer_data=${LAC_INSTALL_DIR}/data.txt) # text_classification -set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz") -set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz") -set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification") -download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} ${TEXT_CLASSIFICATION_DATA_URL}) +set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") +download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz") inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt --topn=1 # Just run top 1 batch. diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc new file mode 100644 index 0000000000..c40ea58eea --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc @@ -0,0 +1,181 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/analyzer.h" + +#include +#include +#include // NOLINT +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" + +DEFINE_string(infer_model, "", "model path"); +DEFINE_string(infer_data, "", "data path"); +DEFINE_int32(batch_size, 1, "batch size."); +DEFINE_int32(repeat, 1, "Running the inference program repeat times."); +DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); + +namespace paddle { +namespace inference { + +using namespace framework; // NOLINT + +struct DataRecord { + std::vector>> link_step_data_all; + std::vector lod; + std::vector> rnn_link_data; + std::vector result_data; + size_t batch_iter{0}; + size_t batch_size{1}; + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. + if (batch_end <= link_step_data_all.size()) { + data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter, + link_step_data_all.begin() + batch_end); + // Prepare LoDs + data.lod.push_back(0); + CHECK(!data.link_step_data_all.empty()) << "empty"; + for (size_t j = 0; j < data.link_step_data_all.size(); j++) { + for (const auto &d : data.link_step_data_all[j]) { + data.rnn_link_data.push_back(d); + // calculate lod + data.lod.push_back(data.lod.back() + 11); + } + } + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, ':', &data); + if (num_lines % 2) { // feature + std::vector feature_data; + split(data[1], ' ', &feature_data); + std::vector> link_step_data; + int feature_count = 1; + std::vector feature; + for (auto &step_data : feature_data) { + std::vector tmp; + split_to_float(step_data, ',', &tmp); + feature.insert(feature.end(), tmp.begin(), tmp.end()); + if (feature_count % 11 == 0) { // each sample has 11 features + link_step_data.push_back(feature); + feature.clear(); + } + feature_count++; + } + link_step_data_all.push_back(std::move(link_step_data)); + } else { // result + std::vector tmp; + split_to_float(data[1], ',', &tmp); + result_data.insert(result_data.end(), tmp.begin(), tmp.end()); + } + } + } +}; +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor feed_tensor; + feed_tensor.name = "feed"; + auto one_batch = data->NextBatch(); + int token_size = one_batch.rnn_link_data.size(); + // each token has 11 features, each feature's dim is 54. + std::vector rnn_link_data_shape({token_size * 11, 54}); + feed_tensor.shape = rnn_link_data_shape; + feed_tensor.lod.assign({one_batch.lod}); + feed_tensor.dtype = PaddleDType::FLOAT32; + TensorAssignData(&feed_tensor, one_batch.rnn_link_data); + // Set inputs. + input_slots->assign({feed_tensor}); +} + +void CompareResult(const std::vector &outputs, + const std::vector &base_result) { + PADDLE_ENFORCE_GT(outputs.size(), 0); + for (size_t i = 0; i < outputs.size(); i++) { + auto &out = outputs[i]; + size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, + [](int a, int b) { return a * b; }); + PADDLE_ENFORCE_GT(size, 0); + float *data = static_cast(out.data.data()); + for (size_t i = 0; i < size; i++) { + EXPECT_NEAR(data[i], base_result[i], 1e-3); + } + } +} +// Test with a really complicate model. +void TestRNN2Prediction() { + AnalysisConfig config; + config.prog_file = FLAGS_infer_model + "/__model__"; + config.param_file = FLAGS_infer_model + "/param"; + config.use_gpu = false; + config.device = 0; + config.specify_input_name = true; + config.enable_ir_optim = true; + PADDLE_ENFORCE(config.ir_mode == + AnalysisConfig::IrPassMode::kExclude); // default + + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + + auto base_predictor = + CreatePaddlePredictor(config); + auto predictor = + CreatePaddlePredictor( + config); + std::vector input_slots; + DataRecord data(FLAGS_infer_data, batch_size); + PrepareInputs(&input_slots, &data, batch_size); + std::vector outputs, base_outputs; + + Timer timer1; + timer1.tic(); + for (int i = 0; i < num_times; i++) { + base_predictor->Run(input_slots, &base_outputs); + } + PrintTime(batch_size, num_times, 1, 0, timer1.toc() / num_times); + + Timer timer2; + timer2.tic(); + for (int i = 0; i < num_times; i++) { + predictor->Run(input_slots, &outputs); + } + PrintTime(batch_size, num_times, 1, 0, timer2.toc() / num_times); + + CompareResult(base_outputs, data.result_data); + CompareResult(outputs, data.result_data); +} + +TEST(Analyzer, rnn2) { TestRNN2Prediction(); } + +} // namespace inference +} // namespace paddle From 20b40cb06a0e10748328d6925cdbc8759c04249f Mon Sep 17 00:00:00 2001 From: luotao1 Date: Thu, 13 Sep 2018 18:27:59 +0800 Subject: [PATCH 75/85] add multi-thread for nlp unit-tests --- paddle/fluid/inference/api/helper.h | 8 +- .../fluid/inference/tests/api/CMakeLists.txt | 6 +- .../tests/api/analyzer_lac_tester.cc | 70 +++------- .../tests/api/analyzer_ner_tester.cc | 73 +++------- .../tests/api/analyzer_rnn1_tester.cc | 84 +----------- .../analyzer_text_classification_tester.cc | 75 ++++------- .../fluid/inference/tests/api/tester_helper.h | 126 ++++++++++++++++++ 7 files changed, 207 insertions(+), 235 deletions(-) create mode 100644 paddle/fluid/inference/tests/api/tester_helper.h diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index f6893be428..8e359a6773 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -123,10 +123,16 @@ std::string DescribeTensor(const PaddleTensor &tensor) { } void PrintTime(int batch_size, int repeat, int num_threads, int tid, - double latency) { + double latency, int epoch = 1) { LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat << ", threads: " << num_threads << ", thread id: " << tid << ", latency: " << latency << "ms ======"; + if (epoch > 1) { + int samples = batch_size * epoch; + LOG(INFO) << "====== sample number: " << samples + << ", average latency of each sample: " << latency / samples + << "ms ======"; + } } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index d44a2cfa7f..ece25db019 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -49,9 +49,7 @@ set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/te set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz") set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification") download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} ${TEXT_CLASSIFICATION_DATA_URL}) -inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc +inference_analysis_test(test_analyzer_text_classification SRCS analyzer_text_classification_tester.cc EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta - --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt - --topn=1 # Just run top 1 batch. - ) + --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt) diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 7e00cb20ad..45c19af520 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -12,21 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/analyzer.h" -#include -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" -#include "paddle/fluid/platform/profiler.h" - -DEFINE_string(infer_model, "", "model path for LAC"); -DEFINE_string(infer_data, "", "data file for LAC"); -DEFINE_int32(batch_size, 1, "batch size."); -DEFINE_int32(burning, 0, "Burning before repeat."); -DEFINE_int32(repeat, 1, "Running the inference program repeat times."); -DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); +#include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { namespace inference { @@ -126,46 +112,37 @@ void TestLACPrediction(const std::string &model_path, const std::string &data_file, const int batch_size, const int repeat, bool test_all_data, bool use_analysis = false) { - NativeConfig config; - config.model_dir = model_path; - config.use_gpu = false; - config.device = 0; - config.specify_input_name = true; + AnalysisConfig cfg; + cfg.model_dir = model_path; + cfg.use_gpu = false; + cfg.device = 0; + cfg.specify_input_name = true; + cfg.enable_ir_optim = true; + std::vector input_slots, outputs_slots; DataRecord data(data_file, batch_size); GetOneBatch(&input_slots, &data, batch_size); std::unique_ptr predictor; if (use_analysis) { - AnalysisConfig cfg; - cfg.model_dir = model_path; - cfg.use_gpu = false; - cfg.device = 0; - cfg.specify_input_name = true; - cfg.enable_ir_optim = true; predictor = CreatePaddlePredictor(cfg); } else { predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor(cfg); } for (int i = 0; i < FLAGS_burning; i++) { predictor->Run(input_slots, &outputs_slots); } Timer timer; - if (test_all_data) { - double sum = 0; - LOG(INFO) << "Total number of samples: " << data.datasets.size(); - for (int i = 0; i < repeat; i++) { - for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) { - GetOneBatch(&input_slots, &data, batch_size); - timer.tic(); - predictor->Run(input_slots, &outputs_slots); - sum += timer.toc(); - } + if (FLAGS_test_all_data) { + LOG(INFO) << "test all data"; + std::vector> input_slots_all; + for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) { + GetOneBatch(&input_slots, &data, batch_size); + input_slots_all.emplace_back(input_slots); } - PrintTime(batch_size, repeat, 1, 0, sum / repeat); - LOG(INFO) << "Average latency of each sample: " - << sum / repeat / data.datasets.size() << " ms"; + LOG(INFO) << "total number of samples: " << data.datasets.size(); + TestPrediction(cfg, input_slots_all, &outputs_slots, FLAGS_num_threads); return; } timer.tic(); @@ -190,19 +167,10 @@ void TestLACPrediction(const std::string &model_path, if (use_analysis) { // run once for comparion as reference auto ref_predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor(cfg); std::vector ref_outputs_slots; ref_predictor->Run(input_slots, &ref_outputs_slots); - EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size()); - auto &ref_out = ref_outputs_slots[0]; - size_t ref_size = - std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1, - [](int a, int b) { return a * b; }); - EXPECT_EQ(size, ref_size); - int64_t *pdata_ref = static_cast(ref_out.data.data()); - for (size_t i = 0; i < size; ++i) { - EXPECT_EQ(pdata_ref[i], pdata[i]); - } + CompareResult(ref_outputs_slots, outputs_slots); AnalysisPredictor *analysis_predictor = dynamic_cast(predictor.get()); diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 6e8e43add7..f8c651e32f 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -12,20 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/analyzer.h" -#include -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" -#include "paddle/fluid/platform/profiler.h" - -DEFINE_string(infer_model, "", "model path"); -DEFINE_string(infer_data, "", "data path"); -DEFINE_int32(batch_size, 10, "batch size."); -DEFINE_int32(repeat, 1, "Running the inference program repeat times."); -DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); +#include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { namespace inference { @@ -113,50 +100,35 @@ const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26, 48, 39, 38, 16, 25}; void TestChineseNERPrediction(bool use_analysis) { - NativeConfig config; - config.prog_file = FLAGS_infer_model + "/__model__"; - config.param_file = FLAGS_infer_model + "/param"; - config.use_gpu = false; - config.device = 0; - config.specify_input_name = true; + AnalysisConfig cfg; + cfg.prog_file = FLAGS_infer_model + "/__model__"; + cfg.param_file = FLAGS_infer_model + "/param"; + cfg.use_gpu = false; + cfg.device = 0; + cfg.specify_input_name = true; + cfg.enable_ir_optim = true; std::vector input_slots, outputs; std::unique_ptr predictor; Timer timer; if (use_analysis) { - AnalysisConfig cfg; - cfg.prog_file = FLAGS_infer_model + "/__model__"; - cfg.param_file = FLAGS_infer_model + "/param"; - cfg.use_gpu = false; - cfg.device = 0; - cfg.specify_input_name = true; - cfg.enable_ir_optim = true; predictor = CreatePaddlePredictor(cfg); } else { predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor(cfg); } if (FLAGS_test_all_data) { LOG(INFO) << "test all data"; - double sum = 0; - size_t num_samples; - for (int i = 0; i < FLAGS_repeat; i++) { - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); - // Just one batch, the num_samples remains the same. - num_samples = data.num_samples; - for (size_t bid = 0; bid < num_samples / FLAGS_batch_size; ++bid) { - PrepareInputs(&input_slots, &data, FLAGS_batch_size); - timer.tic(); - predictor->Run(input_slots, &outputs); - sum += timer.toc(); - } + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector> input_slots_all; + for (size_t bid = 0; bid < data.num_samples / FLAGS_batch_size; ++bid) { + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + input_slots_all.emplace_back(input_slots); } - LOG(INFO) << "total number of samples: " << num_samples; - PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat); - LOG(INFO) << "average latency of each sample: " - << sum / FLAGS_repeat / num_samples; + LOG(INFO) << "total number of samples: " << data.num_samples; + TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); return; } // Prepare inputs. @@ -182,19 +154,10 @@ void TestChineseNERPrediction(bool use_analysis) { if (use_analysis) { // run once for comparion as reference auto ref_predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor(cfg); std::vector ref_outputs_slots; ref_predictor->Run(input_slots, &ref_outputs_slots); - EXPECT_EQ(ref_outputs_slots.size(), outputs.size()); - auto &ref_out = ref_outputs_slots[0]; - size_t ref_size = - std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1, - [](int a, int b) { return a * b; }); - EXPECT_EQ(size, ref_size); - int64_t *pdata_ref = static_cast(ref_out.data.data()); - for (size_t i = 0; i < size; ++i) { - EXPECT_EQ(pdata_ref[i], result[i]); - } + CompareResult(ref_outputs_slots, outputs); AnalysisPredictor *analysis_predictor = dynamic_cast(predictor.get()); diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index b8ac468b4e..df96be544e 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -12,24 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/analyzer.h" - -#include -#include -#include // NOLINT -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" - -DEFINE_string(infer_model, "", "model path"); -DEFINE_string(infer_data, "", "data path"); -DEFINE_int32(batch_size, 10, "batch size."); -DEFINE_int32(repeat, 1, "Running the inference program repeat times."); -DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); +#include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { namespace inference { @@ -164,26 +147,6 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -void CompareResult(const std::vector &outputs, - const std::vector &base_outputs) { - PADDLE_ENFORCE_GT(outputs.size(), 0); - PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size()); - for (size_t i = 0; i < outputs.size(); i++) { - auto &out = outputs[i]; - auto &base_out = base_outputs[i]; - size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, - [](int a, int b) { return a * b; }); - size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(), - 1, [](int a, int b) { return a * b; }); - PADDLE_ENFORCE_EQ(size, size1); - PADDLE_ENFORCE_GT(size, 0); - float *data = static_cast(out.data.data()); - float *base_data = static_cast(base_out.data.data()); - for (size_t i = 0; i < size; i++) { - EXPECT_NEAR(data[i], base_data[i], 1e-3); - } - } -} // Test with a really complicate model. void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { AnalysisConfig config; @@ -198,7 +161,6 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { config.ir_passes.clear(); // Do not exclude any pass. int batch_size = FLAGS_batch_size; - int num_times = FLAGS_repeat; auto base_predictor = CreatePaddlePredictor(config); @@ -213,45 +175,14 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { base_predictor->Run(input_slots, &base_outputs); + std::vector> input_slots_all; + input_slots_all.emplace_back(input_slots); if (num_threads == 1) { - // Prepare inputs. - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - predictor->Run(input_slots, &outputs); - } - PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times); + TestOneThreadPrediction(config, input_slots_all, &outputs); CompareResult(outputs, base_outputs); } else { - std::vector threads; - std::vector> predictors; - // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled - // because AttentionLSTM's hard code nodeid will be damanged. - for (int tid = 0; tid < num_threads; ++tid) { - predictors.emplace_back( - CreatePaddlePredictor( - config)); - } - for (int tid = 0; tid < num_threads; ++tid) { - threads.emplace_back([&, tid]() { - // Each thread should have local input_slots and outputs. - std::vector input_slots; - DataRecord data(FLAGS_infer_data, batch_size); - PrepareInputs(&input_slots, &data, batch_size); - std::vector outputs; - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - predictors[tid]->Run(input_slots, &outputs); - } - PrintTime(batch_size, num_times, num_threads, tid, - timer.toc() / num_times); - CompareResult(outputs, base_outputs); - }); - } - for (int i = 0; i < num_threads; ++i) { - threads[i].join(); - } + // only return the output of first thread + TestMultiThreadPrediction(config, input_slots_all, &outputs, num_threads); } if (use_analysis && activate_ir) { @@ -293,8 +224,7 @@ TEST(Analyzer, RNN_tests) { // Directly infer with the original model. TestRNN1Prediction(false, false, i); // Inference with the original model with the analysis turned on, the - // analysis - // module will transform the program to a data flow graph. + // analysis module will transform the program to a data flow graph. TestRNN1Prediction(true, false, i); // Inference with analysis and IR. The IR module will fuse some large // kernels. diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 65169f8cfc..1472c475e4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -12,23 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/analyzer.h" -#include -#include // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files. -#include -#include -#include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" -#include "paddle/fluid/inference/api/timer.h" - -DEFINE_string(infer_model, "", "Directory of the inference model."); -DEFINE_string(infer_data, "", "Path of the dataset."); -DEFINE_int32(batch_size, 1, "batch size."); -DEFINE_int32(repeat, 1, "How many times to repeat run."); -DEFINE_int32(topn, -1, "Run top n batches of data to save time"); +#include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { namespace inference { @@ -37,24 +21,25 @@ struct DataReader { explicit DataReader(const std::string &path) : file(new std::ifstream(path)) {} - bool NextBatch(PaddleTensor *tensor, int batch_size) { + bool NextBatch(std::vector *input, int batch_size) { PADDLE_ENFORCE_EQ(batch_size, 1); std::string line; - tensor->lod.clear(); - tensor->lod.emplace_back(std::vector({0})); + PaddleTensor tensor; + tensor.dtype = PaddleDType::INT64; + tensor.lod.emplace_back(std::vector({0})); std::vector data; for (int i = 0; i < batch_size; i++) { if (!std::getline(*file, line)) return false; inference::split_to_int64(line, ' ', &data); } - tensor->lod.front().push_back(data.size()); + tensor.lod.front().push_back(data.size()); - tensor->data.Resize(data.size() * sizeof(int64_t)); - memcpy(tensor->data.data(), data.data(), data.size() * sizeof(int64_t)); - tensor->shape.clear(); - tensor->shape.push_back(data.size()); - tensor->shape.push_back(1); + tensor.data.Resize(data.size() * sizeof(int64_t)); + memcpy(tensor.data.data(), data.data(), data.size() * sizeof(int64_t)); + tensor.shape.push_back(data.size()); + tensor.shape.push_back(1); + input->assign({tensor}); return true; } @@ -68,32 +53,28 @@ void Main(int batch_size) { config.model_dir = FLAGS_infer_model; config.use_gpu = false; config.enable_ir_optim = true; - auto predictor = - CreatePaddlePredictor( - config); - - std::vector input_slots(1); - // one batch starts - // data -- - auto &input = input_slots[0]; - input.dtype = PaddleDType::INT64; - inference::Timer timer; - double sum = 0; - std::vector output_slots; + std::vector input_slots, output_slots; + DataReader reader(FLAGS_infer_data); + std::vector> input_slots_all; - int num_batches = 0; - for (int t = 0; t < FLAGS_repeat; t++) { - DataReader reader(FLAGS_infer_data); - while (reader.NextBatch(&input, FLAGS_batch_size)) { - if (FLAGS_topn > 0 && num_batches > FLAGS_topn) break; - timer.tic(); - CHECK(predictor->Run(input_slots, &output_slots)); - sum += timer.toc(); + if (FLAGS_test_all_data) { + LOG(INFO) << "test all data"; + int num_batches = 0; + while (reader.NextBatch(&input_slots, FLAGS_batch_size)) { + input_slots_all.emplace_back(input_slots); ++num_batches; } + LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size; + TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads); + return; } - PrintTime(batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat); + + // one batch starts + // data -- + reader.NextBatch(&input_slots, FLAGS_batch_size); + input_slots_all.emplace_back(input_slots); + TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads); // Get output LOG(INFO) << "get outputs " << output_slots.size(); diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h new file mode 100644 index 0000000000..44688ad36e --- /dev/null +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -0,0 +1,126 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include // NOLINT +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/platform/profiler.h" + +DEFINE_string(infer_model, "", "model path"); +DEFINE_string(infer_data, "", "data file"); +DEFINE_int32(batch_size, 1, "batch size."); +DEFINE_int32(burning, 0, "Burning before repeat."); +DEFINE_int32(repeat, 1, "Running the inference program repeat times."); +DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); +DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); + +namespace paddle { +namespace inference { + +void CompareResult(const std::vector &outputs, + const std::vector &base_outputs) { + PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size()); + for (size_t i = 0; i < outputs.size(); i++) { + auto &out = outputs[i]; + auto &base_out = base_outputs[i]; + size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, + [](int a, int b) { return a * b; }); + size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(), + 1, [](int a, int b) { return a * b; }); + PADDLE_ENFORCE_EQ(size, size1); + PADDLE_ENFORCE_GT(size, 0); + float *data = static_cast(out.data.data()); + float *base_data = static_cast(base_out.data.data()); + for (size_t i = 0; i < size; i++) { + EXPECT_NEAR(data[i], base_data[i], 1e-3); + } + } +} + +void TestOneThreadPrediction( + AnalysisConfig config, const std::vector> inputs, + std::vector *outputs) { + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + auto predictor = + CreatePaddlePredictor( + config); + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + for (size_t j = 0; j < inputs.size(); j++) { + predictor->Run(inputs[j], outputs); + } + } + PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times, + inputs.size()); +} + +void TestMultiThreadPrediction( + AnalysisConfig config, const std::vector> inputs, + std::vector *outputs, int num_threads) { + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + std::vector threads; + std::vector> predictors; + // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled + // because AttentionLSTM's hard code nodeid will be damanged. + for (int tid = 0; tid < num_threads; ++tid) { + predictors.emplace_back( + CreatePaddlePredictor( + config)); + } + for (int tid = 0; tid < num_threads; ++tid) { + threads.emplace_back([&, tid]() { + // Each thread should have local inputs and outputs. + // The inputs of each thread are all the same. + std::vector> inputs_tid = inputs; + std::vector outputs_tid; + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + for (size_t j = 0; j < inputs_tid.size(); j++) { + predictors[tid]->Run(inputs_tid[j], &outputs_tid); + } + } + PrintTime(batch_size, num_times, num_threads, tid, + timer.toc() / num_times, inputs_tid.size()); + }); + } + for (int i = 0; i < num_threads; ++i) { + threads[i].join(); + } +} + +void TestPrediction(AnalysisConfig config, + const std::vector> inputs, + std::vector *outputs, int num_threads) { + if (num_threads == 1) { + TestOneThreadPrediction(config, inputs, outputs); + } else { + TestMultiThreadPrediction(config, inputs, outputs, num_threads); + } +} + +} // namespace inference +} // namespace paddle From 1052a793bc4eb9cdebd4a16772b4b230f808f2c4 Mon Sep 17 00:00:00 2001 From: chuanqiw Date: Thu, 13 Sep 2018 14:08:13 +0800 Subject: [PATCH 76/85] support group convolution layer with mkldnn. --- paddle/fluid/operators/conv_mkldnn_op.cc | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 3eb02c6b61..5385bcdaec 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -302,8 +302,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bool fuse_relu = ctx.Attr("fuse_relu"); int groups = ctx.Attr("groups"); - // TODO(pzelazko-intel) add support for group convolution and dilation - PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet"); + // TODO: add support for dilation PADDLE_ENFORCE( dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, "dilation in convolution is not implemented yet"); @@ -314,6 +313,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector src_tz = paddle::framework::vectorize2int(input->dims()); std::vector weights_tz = paddle::framework::vectorize2int(filter->dims()); + int g = std::max(groups, 1); + if (g > 1) { + int o = weights_tz[0]; + int i = weights_tz[1]; + int h = weights_tz[2]; + int w = weights_tz[3]; + weights_tz.resize(5); + weights_tz[0] = g; + weights_tz[1] = o / g; + weights_tz[2] = i; + weights_tz[3] = h; + weights_tz[4] = w; + } std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); // Get unique name for storing MKLDNN primitives @@ -327,7 +339,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto user_src_md = platform::MKLDNNMemDesc( {src_tz}, platform::MKLDNNGetDataType(), input->format()); auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), filter->format()); + {weights_tz}, platform::MKLDNNGetDataType(), + (g == 1) ? filter->format() : mkldnn::memory::format::goihw); /* create memory descriptor for convolution without specified format * ('any') which lets a primitive (convolution in this case) choose @@ -340,7 +353,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto src_md = platform::MKLDNNMemDesc( src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto weights_md = platform::MKLDNNMemDesc( - weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + weights_tz, platform::MKLDNNGetDataType(), + (g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw); std::vector bias_tz; // TODO(mgallus): avoid empty vector creation. // Currently used whenever bias is != nullptr. auto dst_md = platform::MKLDNNMemDesc( From 9ee1b7bc045091522c53cc69d174bebda979667e Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 13 Sep 2018 21:17:30 +0800 Subject: [PATCH 77/85] add some comments --- paddle/fluid/framework/details/multi_devices_graph_pass.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 5781936cb3..7e7f1234c2 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -431,7 +431,9 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( CreateReduceOp(&result, g_name, cur_device_id); graph->Get(kShardedVarDevice) .emplace(g_name, cur_device_id); - bcast_var_name_set[cur_device_id].emplace(p_name); + if (!is_dist_train) { + bcast_var_name_set[cur_device_id].emplace(p_name); + } break; case BuildStrategy::ReduceStrategy::kAllReduce: if (IsSparseGradient(g_name)) { @@ -461,7 +463,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( if ((use_gpu && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) || is_dist_train) { - // Insert BCast Ops + // allways broadcast receieved parameters for distributed training for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { auto &to_bcast_set = bcast_var_name_set[dev_id]; for (auto &bcast_name : to_bcast_set) { From 3ab3a7f39241db6ed5698e306d80f8760c63d26f Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Thu, 13 Sep 2018 22:15:27 +0800 Subject: [PATCH 78/85] Trainer auto wait pserver ports (#13341) * trainer auto wait pserver port ready * add file * fix docstring * add option to not wait * update api spec * clean * fix test hang --- paddle/fluid/API.spec | 4 +- .../tests/unittests/test_dist_transpiler.py | 2 +- .../fluid/transpiler/details/__init__.py | 1 + .../fluid/transpiler/details/checkport.py | 50 +++++++++++++++++++ .../fluid/transpiler/distribute_transpiler.py | 5 +- 5 files changed, 58 insertions(+), 4 deletions(-) create mode 100644 python/paddle/fluid/transpiler/details/checkport.py diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 842fde1ec5..1971774527 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -59,7 +59,7 @@ paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], vara paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) paddle.fluid.InferenceTranspiler.__init__ paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) @@ -346,7 +346,7 @@ paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'con paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) paddle.fluid.transpiler.InferenceTranspiler.__init__ paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index b85501ef6b..a198b25520 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -62,7 +62,7 @@ class TranspilerTest(unittest.TestCase): t = self._transpiler_instance(config) - trainer_main = t.get_trainer_program() + trainer_main = t.get_trainer_program(wait_port=False) trainer_startup = fluid.default_startup_program() assert (src.num_blocks == 1) diff --git a/python/paddle/fluid/transpiler/details/__init__.py b/python/paddle/fluid/transpiler/details/__init__.py index 5e98266a76..f33c05ed2f 100644 --- a/python/paddle/fluid/transpiler/details/__init__.py +++ b/python/paddle/fluid/transpiler/details/__init__.py @@ -16,3 +16,4 @@ from __future__ import print_function from .program_utils import * from .ufind import * +from .checkport import * diff --git a/python/paddle/fluid/transpiler/details/checkport.py b/python/paddle/fluid/transpiler/details/checkport.py new file mode 100644 index 0000000000..7bad4b427a --- /dev/null +++ b/python/paddle/fluid/transpiler/details/checkport.py @@ -0,0 +1,50 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import time +import socket +from contextlib import closing + + +def wait_server_ready(endpoints): + """ + Wait until parameter servers are ready, use connext_ex to detect + port readiness. + + Args: + endpoints (list): endpoints string list, like: + ["127.0.0.1:8080", "127.0.0.1:8081"] + + Examples: + .. code-block:: python + + wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"]) + """ + while True: + all_ok = True + for ep in endpoints: + ip_port = ep.split(":") + with closing(socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as sock: + sock.settimeout(2) + result = sock.connect_ex((ip_port[0], int(ip_port[1]))) + if result != 0: + all_ok = False + if not all_ok: + sys.stderr.write("pserver not ready, wait 3 sec to retry...\n") + sys.stderr.flush() + time.sleep(3) + else: + break diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index d4d218d547..53c9cbe23d 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -381,7 +381,7 @@ class DistributeTranspiler(object): pserver_endpoints) self._split_table_grad_and_add_send_vars(program, pserver_endpoints) - def get_trainer_program(self): + def get_trainer_program(self, wait_port=True): """ Get transpiled trainer side program. @@ -393,6 +393,9 @@ class DistributeTranspiler(object): delete_ops(self.origin_program.global_block(), self.optimize_ops) self.origin_program.__str__() + if wait_port: + wait_server_ready(self.pserver_endpoints) + return self.origin_program def _get_trainer_startup_program(self, recv_vars, eplist): From 757f9683abf603e4b3860934d63fce774b65ec37 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 14 Sep 2018 00:46:15 +0800 Subject: [PATCH 79/85] update comment text --- paddle/fluid/framework/details/multi_devices_graph_pass.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 7e7f1234c2..250e093a5f 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -460,10 +460,14 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( use_gpu = nccl_ctxs_ != nullptr; #endif + // Insert broadcast operators principle: + // 1. Broadcast optimized parameters in Reduce strategy; + // 2. No need broadcast optimized parameters in AllReduce strategy because of + // the optimization sub-graph would be run on every GPU; + // 3. Allways broadcast received parameters in Distribute Training. if ((use_gpu && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) || is_dist_train) { - // allways broadcast receieved parameters for distributed training for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { auto &to_bcast_set = bcast_var_name_set[dev_id]; for (auto &bcast_name : to_bcast_set) { From 0c7f883d4faeb6597156f257ec373683b3ce6d66 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Fri, 14 Sep 2018 07:31:25 +0800 Subject: [PATCH 80/85] small fix (#13322) --- paddle/fluid/string/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt index 719411bf66..8572dc1e8e 100644 --- a/paddle/fluid/string/CMakeLists.txt +++ b/paddle/fluid/string/CMakeLists.txt @@ -1,6 +1,5 @@ cc_library(stringpiece SRCS piece.cc) cc_library(pretty_log SRCS pretty_log.cc) -cc_test(test_pretty_log SRCS pretty_log.cc) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) cc_test(to_string_test SRCS to_string_test.cc) From b7a64e8698f61ddd82f6a8718e722d3309fd5aa7 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 14 Sep 2018 10:59:48 +0800 Subject: [PATCH 81/85] fix confilts --- paddle/fluid/inference/tests/api/CMakeLists.txt | 7 ++++++- paddle/fluid/inference/tests/api/analyzer_lac_tester.cc | 7 +++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index f1075ea708..3eba375514 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -60,7 +60,12 @@ inference_analysis_test(test_analyzer_text_classification SRCS analyzer_text_cla set(OCR_MODEL_URL "http://paddlemodels.cdn.bcebos.com/inference-vis-demos%2Focr.tar.gz") set(OCR_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/ocr") if (NOT EXISTS ${OCR_INSTALL_DIR} AND WITH_INFERENCE) - inference_download_and_uncompress(${OCR_INSTALL_DIR} ${OCR_MODEL_URL}) + get_filename_component(filename ${OCR_MODEL_URL} NAME) + message(STATUS "Download inference test stuff ${filename} from ${OCR_MODEL_URL}") + execute_process(COMMAND bash -c "mkdir -p ${OCR_INSTALL_DIR}") + execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && wget -q ${OCR_MODEL_URL}") + execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && tar xzf ${filename}") + message(STATUS "finish downloading ${filename}") endif() inference_analysis_test(test_analyzer_ocr SRCS analyzer_vis_tester.cc EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 45c19af520..bf893e3256 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -110,8 +110,7 @@ const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, void TestLACPrediction(const std::string &model_path, const std::string &data_file, const int batch_size, - const int repeat, bool test_all_data, - bool use_analysis = false) { + const int repeat, bool use_analysis = false) { AnalysisConfig cfg; cfg.model_dir = model_path; cfg.use_gpu = false; @@ -199,13 +198,13 @@ void TestLACPrediction(const std::string &model_path, TEST(Analyzer_LAC, native) { LOG(INFO) << "LAC with native"; TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, - FLAGS_repeat, FLAGS_test_all_data); + FLAGS_repeat); } TEST(Analyzer_LAC, analysis) { LOG(INFO) << "LAC with analysis"; TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, - FLAGS_repeat, FLAGS_test_all_data, true); + FLAGS_repeat, true); } } // namespace analysis From 1a99302c141c8de2cd1202b16205a2ec02fb1b67 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 14 Sep 2018 11:24:03 +0800 Subject: [PATCH 82/85] refine and reuse code --- .../fluid/inference/tests/api/CMakeLists.txt | 2 +- .../tests/api/analyzer_vis_tester.cc | 86 +++++-------------- .../fluid/inference/tests/api/tester_helper.h | 39 ++++++--- 3 files changed, 48 insertions(+), 79 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 3eba375514..e8c34047ab 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -68,6 +68,6 @@ if (NOT EXISTS ${OCR_INSTALL_DIR} AND WITH_INFERENCE) message(STATUS "finish downloading ${filename}") endif() inference_analysis_test(test_analyzer_ocr SRCS analyzer_vis_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model --infer_data=${OCR_INSTALL_DIR}/data.txt) diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 3675c5f7f3..0591869996 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -12,22 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/inference/analysis/analyzer.h" -#include -#include -#include #include #include -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" - -DEFINE_string(infer_model, "", "model path for LAC"); -DEFINE_string(infer_data, "", "data file for LAC"); -DEFINE_int32(batch_size, 1, "batch size."); -DEFINE_int32(repeat, 1, "Running the inference program repeat times."); +#include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { namespace inference { @@ -105,69 +92,36 @@ void TestVisualPrediction(bool use_mkldnn) { VLOG(3) << "output.size " << outputs_slots.size(); // run native as reference - NativeConfig config; - config.param_file = FLAGS_infer_model + "/__params__"; - config.prog_file = FLAGS_infer_model + "/__model__"; - config.use_gpu = false; - config.device = 0; - // config.specify_input_name = true; auto ref_predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor(cfg); std::vector ref_outputs_slots; ref_predictor->Run({input}, &ref_outputs_slots); - EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size()); - for (size_t i = 0; i < outputs_slots.size(); ++i) { - auto &ref_out = ref_outputs_slots[i]; - auto &out = outputs_slots[i]; - size_t ref_size = - std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1, - [](int a, int b) { return a * b; }); - size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, - [](int a, int b) { return a * b; }); - EXPECT_EQ(size, ref_size); - EXPECT_EQ(out.dtype, ref_out.dtype); - switch (out.dtype) { - case PaddleDType::INT64: { - int64_t *pdata = static_cast(out.data.data()); - int64_t *pdata_ref = static_cast(ref_out.data.data()); - for (size_t j = 0; j < size; ++j) { - EXPECT_EQ(pdata_ref[j], pdata[j]); - } - break; - } - case PaddleDType::FLOAT32: { - float *pdata = static_cast(out.data.data()); - float *pdata_ref = static_cast(ref_out.data.data()); - for (size_t j = 0; j < size; ++j) { - EXPECT_NEAR(pdata_ref[j], pdata[j], 1e-3); - } - break; - } - } - // print what are fused - AnalysisPredictor *analysis_predictor = - dynamic_cast(predictor.get()); - auto &fuse_statis = analysis_predictor->analysis_argument() - .Get>( - framework::ir::kFuseStatisAttr); - for (auto &item : fuse_statis) { - LOG(INFO) << "fused " << item.first << " " << item.second; - } - int num_ops = 0; - for (auto &node : - analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { - if (node->IsFunction()) { - ++num_ops; - } + CompareResult(outputs_slots, ref_outputs_slots); + // print what are fused + AnalysisPredictor *analysis_predictor = + dynamic_cast(predictor.get()); + auto &fuse_statis = analysis_predictor->analysis_argument() + .Get>( + framework::ir::kFuseStatisAttr); + for (auto &item : fuse_statis) { + LOG(INFO) << "fused " << item.first << " " << item.second; + } + int num_ops = 0; + for (auto &node : + analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { + if (node->IsFunction()) { + ++num_ops; } - LOG(INFO) << "has num ops: " << num_ops; } + LOG(INFO) << "has num ops: " << num_ops; } TEST(Analyzer_vis, analysis) { TestVisualPrediction(/*use_mkldnn*/ false); } +#ifdef PADDLE_WITH_MKLDNN TEST(Analyzer_vis, analysis_mkldnn) { TestVisualPrediction(/*use_mkldnn*/ true); } +#endif } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 44688ad36e..43e97614e3 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -37,22 +37,37 @@ namespace paddle { namespace inference { void CompareResult(const std::vector &outputs, - const std::vector &base_outputs) { - PADDLE_ENFORCE_GT(outputs.size(), 0); - PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size()); + const std::vector &ref_outputs) { + EXPECT_GT(outputs.size(), 0); + EXPECT_EQ(outputs.size(), ref_outputs.size()); for (size_t i = 0; i < outputs.size(); i++) { auto &out = outputs[i]; - auto &base_out = base_outputs[i]; + auto &ref_out = ref_outputs[i]; size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, [](int a, int b) { return a * b; }); - size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(), - 1, [](int a, int b) { return a * b; }); - PADDLE_ENFORCE_EQ(size, size1); - PADDLE_ENFORCE_GT(size, 0); - float *data = static_cast(out.data.data()); - float *base_data = static_cast(base_out.data.data()); - for (size_t i = 0; i < size; i++) { - EXPECT_NEAR(data[i], base_data[i], 1e-3); + size_t ref_size = + std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1, + [](int a, int b) { return a * b; }); + EXPECT_GT(size, 0); + EXPECT_EQ(size, ref_size); + EXPECT_EQ(out.dtype, ref_out.dtype); + switch (out.dtype) { + case PaddleDType::INT64: { + int64_t *pdata = static_cast(out.data.data()); + int64_t *pdata_ref = static_cast(ref_out.data.data()); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } + case PaddleDType::FLOAT32: { + float *pdata = static_cast(out.data.data()); + float *pdata_ref = static_cast(ref_out.data.data()); + for (size_t j = 0; j < size; ++j) { + EXPECT_NEAR(pdata_ref[j], pdata[j], 1e-3); + } + break; + } } } } From 9a9105018d6acf71bec681d8fdcd3fc6559b80ac Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Fri, 14 Sep 2018 04:01:04 +0000 Subject: [PATCH 83/85] fix mac compile error in subgraph_splitter --- paddle/fluid/inference/analysis/subgraph_splitter.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index c3a2dbf9d1..b879067d2f 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -120,13 +120,20 @@ void UnionContractedNodes(const std::unordered_map &node_map, outputs.insert(node); } - // update the dst and src node's inlinks and outlinks. +// update the dst and src node's inlinks and outlinks. +#ifdef __clang__ + src_node->inlinks = std::vector(inputs.begin(), inputs.end()); + src_node->outlinks = std::vector(outputs.begin(), outputs.end()); + dst_node->inlinks.clear(); + dst_node->outlinks.clear(); +#else src_node->inlinks = std::move(std::vector(inputs.begin(), inputs.end())); src_node->outlinks = std::move(std::vector(outputs.begin(), outputs.end())); dst_node->inlinks.clear(); dst_node->outlinks.clear(); +#endif auto inlink_or_outlink_cleaner = [&](std::vector &nodes) { for (auto *&n : nodes) { From 26fc698f8510873594e7abbd9e64d141f1233887 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 14 Sep 2018 13:11:50 +0800 Subject: [PATCH 84/85] disable mkldnn fuse on ocr test --- paddle/fluid/inference/tests/api/analyzer_vis_tester.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 0591869996..a207c41b71 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -62,7 +62,12 @@ void TestVisualPrediction(bool use_mkldnn) { cfg._use_mkldnn = use_mkldnn; cfg.device = 0; cfg.enable_ir_optim = true; + // TODO(TJ): fix fusion gru cfg.ir_passes.push_back("fc_gru_fuse_pass"); +#ifdef PADDLE_WITH_MKLDNN + // disable mkldnn fuse since it should have some bugs + cfg.ir_passes.push_back("conv_relu_mkldnn_fuse_pass"); +#endif predictor = CreatePaddlePredictor(cfg); From 9e2e893f5975b282fa7a7d0bc599971ab342bef8 Mon Sep 17 00:00:00 2001 From: Xingyuan Bu Date: Fri, 14 Sep 2018 14:53:37 +0800 Subject: [PATCH 85/85] Enhence generate_proposal_labels_op and fix some bug. (#13239) * Enhence generate_proposal_labels_op * Fix bug in generate_proposals_op * Refine rpn_target_assign_op. * by Bu Xingyuan, Wang Guanzhong and Dang Qingqing --- paddle/fluid/API.spec | 4 +- paddle/fluid/operators/detection/bbox_util.h | 33 +- .../detection/generate_proposal_labels_op.cc | 139 ++-- .../detection/generate_proposals_op.cc | 46 +- .../detection/rpn_target_assign_op.cc | 602 ++++++++++++------ python/paddle/fluid/layers/detection.py | 85 ++- python/paddle/fluid/tests/test_detection.py | 143 +++-- ...py => test_generate_proposal_labels_op.py} | 93 +-- ...osals.py => test_generate_proposals_op.py} | 41 +- .../unittests/test_rpn_target_assign_op.py | 214 ++++--- 10 files changed, 874 insertions(+), 526 deletions(-) rename python/paddle/fluid/tests/unittests/{test_generate_proposal_labels.py => test_generate_proposal_labels_op.py} (77%) rename python/paddle/fluid/tests/unittests/{test_generate_proposals.py => test_generate_proposals_op.py} (88%) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 1971774527..e362d34864 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -305,9 +305,9 @@ paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'neg paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)) paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)) paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')) -paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'anchor_var', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3)) +paddle.fluid.layers.rpn_target_assign ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)) paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)) -paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'gt_boxes', 'im_scales', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None)) +paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True)) paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)) paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index 0dee178162..6abeca1da4 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -9,6 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" @@ -21,7 +22,7 @@ namespace operators { */ template inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes, - const framework::Tensor& gt_boxes, const T* weights, + const framework::Tensor& gt_boxes, const float* weights, const bool normalized, framework::Tensor* box_delta) { auto ex_boxes_et = framework::EigenTensor::From(ex_boxes); auto gt_boxes_et = framework::EigenTensor::From(gt_boxes); @@ -62,5 +63,35 @@ void Gather(const T* in, const int in_stride, const int* index, const int num, } } +template +void BboxOverlaps(const framework::Tensor& r_boxes, + const framework::Tensor& c_boxes, + framework::Tensor* overlaps) { + auto r_boxes_et = framework::EigenTensor::From(r_boxes); + auto c_boxes_et = framework::EigenTensor::From(c_boxes); + auto overlaps_et = framework::EigenTensor::From(*overlaps); + int r_num = r_boxes.dims()[0]; + int c_num = c_boxes.dims()[0]; + auto zero = static_cast(0.0); + T r_box_area, c_box_area, x_min, y_min, x_max, y_max, inter_w, inter_h, + inter_area; + for (int i = 0; i < r_num; ++i) { + r_box_area = (r_boxes_et(i, 2) - r_boxes_et(i, 0) + 1) * + (r_boxes_et(i, 3) - r_boxes_et(i, 1) + 1); + for (int j = 0; j < c_num; ++j) { + c_box_area = (c_boxes_et(j, 2) - c_boxes_et(j, 0) + 1) * + (c_boxes_et(j, 3) - c_boxes_et(j, 1) + 1); + x_min = std::max(r_boxes_et(i, 0), c_boxes_et(j, 0)); + y_min = std::max(r_boxes_et(i, 1), c_boxes_et(j, 1)); + x_max = std::min(r_boxes_et(i, 2), c_boxes_et(j, 2)); + y_max = std::min(r_boxes_et(i, 3), c_boxes_et(j, 3)); + inter_w = std::max(x_max - x_min + 1, zero); + inter_h = std::max(y_max - y_min + 1, zero); + inter_area = inter_w * inter_h; + overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area); + } + } +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index be06dc1974..d7a53f1bef 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -42,10 +42,11 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { "Input(RpnRois) shouldn't be null."); PADDLE_ENFORCE(ctx->HasInput("GtClasses"), "Input(GtClasses) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("IsCrowd"), + "Input(IsCrowd) shouldn't be null."); PADDLE_ENFORCE(ctx->HasInput("GtBoxes"), "Input(GtBoxes) shouldn't be null."); - PADDLE_ENFORCE(ctx->HasInput("ImScales"), - "Input(ImScales) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null."); PADDLE_ENFORCE(ctx->HasOutput("Rois"), "Output(Rois) of RpnTargetAssignOp should not be null"); @@ -64,22 +65,21 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { auto rpn_rois_dims = ctx->GetInputDim("RpnRois"); auto gt_classes_dims = ctx->GetInputDim("GtClasses"); + auto is_crowd_dims = ctx->GetInputDim("IsCrowd"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); - auto im_scales_dims = ctx->GetInputDim("ImScales"); + auto im_info_dims = ctx->GetInputDim("ImInfo"); PADDLE_ENFORCE_EQ(rpn_rois_dims.size(), 2, "The rank of Input(RpnRois) must be 2."); - PADDLE_ENFORCE_EQ(gt_classes_dims.size(), 1, - "The rank of Input(GtClasses) must be 1."); PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2, "The rank of Input(GtBoxes) must be 2."); - PADDLE_ENFORCE_EQ(im_scales_dims.size(), 1, - "The rank of Input(ImScales) must be 1."); + PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, + "The rank of Input(ImInfo) must be 2."); int class_nums = ctx->Attrs().Get("class_nums"); ctx->SetOutputDim("Rois", {-1, 4}); - ctx->SetOutputDim("LabelsInt32", {-1}); + ctx->SetOutputDim("LabelsInt32", {-1, 1}); ctx->SetOutputDim("BboxTargets", {-1, 4 * class_nums}); ctx->SetOutputDim("BboxInsideWeights", {-1, 4 * class_nums}); ctx->SetOutputDim("BboxOutsideWeights", {-1, 4 * class_nums}); @@ -105,45 +105,18 @@ void Concat(const platform::CPUDeviceContext& context, concat_functor(context, inputs, axis, out_tensor); } -template -void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes, - Tensor* overlaps) { - auto r_boxes_et = framework::EigenTensor::From(r_boxes); - auto c_boxes_et = framework::EigenTensor::From(c_boxes); - auto overlaps_et = framework::EigenTensor::From(*overlaps); - int r_num = r_boxes.dims()[0]; - int c_num = c_boxes.dims()[0]; - auto zero = static_cast(0.0); - T r_box_area, c_box_area, x_min, y_min, x_max, y_max, inter_w, inter_h, - inter_area; - for (int i = 0; i < r_num; ++i) { - r_box_area = (r_boxes_et(i, 2) - r_boxes_et(i, 0) + 1) * - (r_boxes_et(i, 3) - r_boxes_et(i, 1) + 1); - for (int j = 0; j < c_num; ++j) { - c_box_area = (c_boxes_et(j, 2) - c_boxes_et(j, 0) + 1) * - (c_boxes_et(j, 3) - c_boxes_et(j, 1) + 1); - x_min = std::max(r_boxes_et(i, 0), c_boxes_et(j, 0)); - y_min = std::max(r_boxes_et(i, 1), c_boxes_et(j, 1)); - x_max = std::min(r_boxes_et(i, 2), c_boxes_et(j, 2)); - y_max = std::min(r_boxes_et(i, 3), c_boxes_et(j, 3)); - inter_w = std::max(x_max - x_min + 1, zero); - inter_h = std::max(y_max - y_min + 1, zero); - inter_area = inter_w * inter_h; - overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area); - } - } -} - template std::vector> SampleFgBgGt( const platform::CPUDeviceContext& context, Tensor* iou, - const int batch_size_per_im, const float fg_fraction, const float fg_thresh, - const float bg_thresh_hi, const float bg_thresh_lo, - std::minstd_rand engine) { + const Tensor& is_crowd, const int batch_size_per_im, + const float fg_fraction, const float fg_thresh, const float bg_thresh_hi, + const float bg_thresh_lo, std::minstd_rand engine, const bool use_random) { std::vector fg_inds; std::vector bg_inds; std::vector gt_inds; - T* proposal_to_gt_overlaps = iou->mutable_data(context.GetPlace()); + int64_t gt_num = is_crowd.numel(); + const int* crowd_data = is_crowd.data(); + T* proposal_to_gt_overlaps = iou->data(); int64_t row = iou->dims()[0]; int64_t col = iou->dims()[1]; float epsilon = 0.00001; @@ -152,6 +125,9 @@ std::vector> SampleFgBgGt( for (int64_t i = 0; i < row; ++i) { const T* v = proposal_to_gt_overlaps + i * col; T max_overlap = *std::max_element(v, v + col); + if ((i < gt_num) && (crowd_data[i])) { + max_overlap = -1.0; + } if (max_overlap > fg_thresh) { for (int64_t j = 0; j < col; ++j) { T val = proposal_to_gt_overlaps[i * col + j]; @@ -170,17 +146,19 @@ std::vector> SampleFgBgGt( } // Reservoir Sampling + std::uniform_real_distribution uniform(0, 1); int fg_rois_per_im = std::floor(batch_size_per_im * fg_fraction); int fg_rois_this_image = fg_inds.size(); int fg_rois_per_this_image = std::min(fg_rois_per_im, fg_rois_this_image); - std::uniform_real_distribution uniform(0, 1); - const int64_t fg_size = static_cast(fg_inds.size()); - if (fg_size > fg_rois_per_this_image) { - for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) { - int rng_ind = std::floor(uniform(engine) * i); - if (rng_ind < fg_rois_per_this_image) { - std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i); - std::iter_swap(gt_inds.begin() + rng_ind, gt_inds.begin() + i); + if (use_random) { + const int64_t fg_size = static_cast(fg_inds.size()); + if (fg_size > fg_rois_per_this_image) { + for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < fg_rois_per_this_image) { + std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i); + std::iter_swap(gt_inds.begin() + rng_ind, gt_inds.begin() + i); + } } } } @@ -192,12 +170,14 @@ std::vector> SampleFgBgGt( int bg_rois_per_image = batch_size_per_im - fg_rois_per_this_image; int bg_rois_this_image = bg_inds.size(); int bg_rois_per_this_image = std::min(bg_rois_per_image, bg_rois_this_image); - const int64_t bg_size = static_cast(bg_inds.size()); - if (bg_size > bg_rois_per_this_image) { - for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) { - int rng_ind = std::floor(uniform(engine) * i); - if (rng_ind < fg_rois_per_this_image) - std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i); + if (use_random) { + const int64_t bg_size = static_cast(bg_inds.size()); + if (bg_size > bg_rois_per_this_image) { + for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < fg_rois_per_this_image) + std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i); + } } } std::vector new_bg_inds(bg_inds.begin(), @@ -248,14 +228,14 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, template std::vector SampleRoisForOneImage( const platform::CPUDeviceContext& context, Tensor* rpn_rois, - Tensor* gt_classes, Tensor* gt_boxes, Tensor* im_scale, + Tensor* gt_classes, Tensor* is_crowd, Tensor* gt_boxes, Tensor* im_info, const int batch_size_per_im, const float fg_fraction, const float fg_thresh, const float bg_thresh_hi, const float bg_thresh_lo, const std::vector& bbox_reg_weights, const int class_nums, - std::minstd_rand engine) { + std::minstd_rand engine, bool use_random) { auto rpn_rois_et = framework::EigenTensor::From(*rpn_rois); - auto im_scale_data = im_scale->data()[0]; - rpn_rois_et = rpn_rois_et / im_scale_data; + auto im_scale = im_info->data()[2]; + rpn_rois_et = rpn_rois_et / im_scale; Tensor boxes; int proposals_num = gt_boxes->dims()[0] + rpn_rois->dims()[0]; @@ -270,8 +250,8 @@ std::vector SampleRoisForOneImage( // Generate proposal index std::vector> fg_bg_gt = SampleFgBgGt( - context, &proposal_to_gt_overlaps, batch_size_per_im, fg_fraction, - fg_thresh, bg_thresh_hi, bg_thresh_lo, engine); + context, &proposal_to_gt_overlaps, *is_crowd, batch_size_per_im, + fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, engine, use_random); std::vector fg_inds = fg_bg_gt[0]; std::vector bg_inds = fg_bg_gt[1]; std::vector gt_inds = fg_bg_gt[2]; @@ -291,15 +271,15 @@ std::vector SampleRoisForOneImage( // Compute targets Tensor bbox_targets_single; bbox_targets_single.mutable_data(bbox_dim, context.GetPlace()); - BoxToDelta(fg_num, sampled_boxes, sampled_gts, nullptr, false, - &bbox_targets_single); + BoxToDelta(fg_num, sampled_boxes, sampled_gts, bbox_reg_weights.data(), + false, &bbox_targets_single); // Scale rois Tensor sampled_rois; sampled_rois.mutable_data(sampled_boxes.dims(), context.GetPlace()); auto sampled_rois_et = framework::EigenTensor::From(sampled_rois); auto sampled_boxes_et = framework::EigenTensor::From(sampled_boxes); - sampled_rois_et = sampled_boxes_et * im_scale_data; + sampled_rois_et = sampled_boxes_et * im_scale; // Expand box targets Tensor bbox_targets, bbox_inside_weights, bbox_outside_weights; @@ -351,8 +331,9 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* rpn_rois = context.Input("RpnRois"); auto* gt_classes = context.Input("GtClasses"); + auto* is_crowd = context.Input("IsCrowd"); auto* gt_boxes = context.Input("GtBoxes"); - auto* im_scales = context.Input("ImScales"); + auto* im_info = context.Input("ImInfo"); auto* rois = context.Output("Rois"); auto* labels_int32 = context.Output("LabelsInt32"); @@ -369,18 +350,21 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { std::vector bbox_reg_weights = context.Attr>("bbox_reg_weights"); int class_nums = context.Attr("class_nums"); + bool use_random = context.Attr("use_random"); PADDLE_ENFORCE_EQ(rpn_rois->lod().size(), 1UL, "GenerateProposalLabelsOp rpn_rois needs 1 level of LoD"); PADDLE_ENFORCE_EQ( gt_classes->lod().size(), 1UL, "GenerateProposalLabelsOp gt_classes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL, + "GenerateProposalLabelsOp is_crowd needs 1 level of LoD"); PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL, "GenerateProposalLabelsOp gt_boxes needs 1 level of LoD"); int64_t n = static_cast(rpn_rois->lod().back().size() - 1); rois->mutable_data({n * batch_size_per_im, kBoxDim}, context.GetPlace()); - labels_int32->mutable_data({n * batch_size_per_im}, + labels_int32->mutable_data({n * batch_size_per_im, 1}, context.GetPlace()); bbox_targets->mutable_data({n * batch_size_per_im, kBoxDim * class_nums}, context.GetPlace()); @@ -391,8 +375,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { std::random_device rnd; std::minstd_rand engine; - int seed = - context.Attr("fix_seed") ? context.Attr("seed") : rnd(); + int seed = rnd(); engine.seed(seed); framework::LoD lod; @@ -403,19 +386,23 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { auto rpn_rois_lod = rpn_rois->lod().back(); auto gt_classes_lod = gt_classes->lod().back(); + auto is_crowd_lod = is_crowd->lod().back(); auto gt_boxes_lod = gt_boxes->lod().back(); for (int i = 0; i < n; ++i) { Tensor rpn_rois_slice = rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]); Tensor gt_classes_slice = gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]); + Tensor is_crowd_slice = + is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); Tensor gt_boxes_slice = gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); - Tensor im_scales_slice = im_scales->Slice(i, i + 1); + Tensor im_info_slice = im_info->Slice(i, i + 1); std::vector tensor_output = SampleRoisForOneImage( - dev_ctx, &rpn_rois_slice, >_classes_slice, >_boxes_slice, - &im_scales_slice, batch_size_per_im, fg_fraction, fg_thresh, - bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, engine); + dev_ctx, &rpn_rois_slice, >_classes_slice, &is_crowd_slice, + >_boxes_slice, &im_info_slice, batch_size_per_im, fg_fraction, + fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, + engine, use_random); Tensor sampled_rois = tensor_output[0]; Tensor sampled_labels_int32 = tensor_output[1]; Tensor sampled_bbox_targets = tensor_output[2]; @@ -442,7 +429,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { bbox_inside_weights->set_lod(lod); bbox_outside_weights->set_lod(lod); rois->Resize({num_rois, kBoxDim}); - labels_int32->Resize({num_rois}); + labels_int32->Resize({num_rois, 1}); bbox_targets->Resize({num_rois, kBoxDim * class_nums}); bbox_inside_weights->Resize({num_rois, kBoxDim * class_nums}); bbox_outside_weights->Resize({num_rois, kBoxDim * class_nums}); @@ -455,8 +442,9 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { // TODO(buxingyuan): Add Document AddInput("RpnRois", "RpnRois."); AddInput("GtClasses", "GtClasses."); + AddInput("IsCrowd", "IsCrowd."); AddInput("GtBoxes", "GtBoxes."); - AddInput("ImScales", "ImScales."); + AddInput("ImInfo", "ImInfo."); AddOutput("Rois", "Rois."); AddOutput("LabelsInt32", "LabelsInt32."); @@ -471,8 +459,7 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("bg_thresh_lo", "bg_thresh_lo"); AddAttr>("bbox_reg_weights", "bbox_reg_weights"); AddAttr("class_nums", "class_nums"); - AddAttr("fix_seed", "fix_seed").SetDefault(false); - AddAttr("seed", "seed").SetDefault(0); + AddAttr("use_random", "use_random").SetDefault(true); AddComment(R"DOC( Generate Proposals Labels Operator. diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index ebe6830ecc..c33aa25536 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -89,12 +89,11 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, } for (int64_t i = 0; i < row; ++i) { - T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len]; - T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1]; + T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0; + T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0; - T anchor_center_x = (anchor_data[i * len + 2] + anchor_data[i * len]) / 2; - T anchor_center_y = - (anchor_data[i * len + 3] + anchor_data[i * len + 1]) / 2; + T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width; + T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height; T bbox_center_x = 0, bbox_center_y = 0; T bbox_width = 0, bbox_height = 0; @@ -106,25 +105,31 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, bbox_center_y = variances_data[i * len + 1] * bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; - bbox_width = std::exp(variances_data[i * len + 2] * - bbox_deltas_data[i * len + 2]) * + bbox_width = std::exp(std::min(variances_data[i * len + 2] * + bbox_deltas_data[i * len + 2], + std::log(1000.0 / 16.0))) * anchor_width; - bbox_height = std::exp(variances_data[i * len + 3] * - bbox_deltas_data[i * len + 3]) * + bbox_height = std::exp(std::min(variances_data[i * len + 3] * + bbox_deltas_data[i * len + 3], + std::log(1000.0 / 16.0))) * anchor_height; } else { bbox_center_x = bbox_deltas_data[i * len] * anchor_width + anchor_center_x; bbox_center_y = bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; - bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width; - bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height; + bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], + std::log(1000.0 / 16.0))) * + anchor_width; + bbox_height = std::exp(std::min(bbox_deltas_data[i * len + 3], + std::log(1000.0 / 16.0))) * + anchor_height; } proposals_data[i * len] = bbox_center_x - bbox_width / 2; proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; - proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2; - proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2; + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; } // return proposals; } @@ -156,18 +161,23 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes, float min_size, const Tensor &im_info, Tensor *keep) { const T *im_info_data = im_info.data(); T *boxes_data = boxes->mutable_data(ctx.GetPlace()); - min_size *= im_info_data[2]; + T im_scale = im_info_data[2]; keep->Resize({boxes->dims()[0], 1}); + min_size = std::max(min_size, 1.0f); int *keep_data = keep->mutable_data(ctx.GetPlace()); int keep_len = 0; for (int i = 0; i < boxes->dims()[0]; ++i) { T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1; T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1; + T ws_origin_scale = + (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1; + T hs_origin_scale = + (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1; T x_ctr = boxes_data[4 * i] + ws / 2; T y_ctr = boxes_data[4 * i + 1] + hs / 2; - if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] && - y_ctr <= im_info_data[0]) { + if (ws_origin_scale >= min_size && hs_origin_scale >= min_size && + x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) { keep_data[keep_len++] = i; } } @@ -218,8 +228,8 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) { const T inter_ymin = std::max(box1[1], box2[1]); const T inter_xmax = std::min(box1[2], box2[2]); const T inter_ymax = std::min(box1[3], box2[3]); - const T inter_w = inter_xmax - inter_xmin; - const T inter_h = inter_ymax - inter_ymin; + const T inter_w = std::max(0.0f, inter_xmax - inter_xmin + 1); + const T inter_h = std::max(0.0f, inter_ymax - inter_ymin + 1); const T inter_area = inter_w * inter_h; const T bbox1_area = BBoxArea(box1, normalized); const T bbox2_area = BBoxArea(box2, normalized); diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index 88757f25cd..dda423efd3 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -31,8 +31,14 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("DistMat"), - "Input(DistMat) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Anchor"), + "Input(Anchor) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("GtBoxes"), + "Input(GtBoxes) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("IsCrowd"), + "Input(Anchor) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), + "Input(ImInfo) of RpnTargetAssignOp should not be null"); PADDLE_ENFORCE( ctx->HasOutput("LocationIndex"), @@ -43,10 +49,20 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { PADDLE_ENFORCE( ctx->HasOutput("TargetLabel"), "Output(TargetLabel) of RpnTargetAssignOp should not be null"); - - auto in_dims = ctx->GetInputDim("DistMat"); - PADDLE_ENFORCE_EQ(in_dims.size(), 2, - "The rank of Input(DistMat) must be 2."); + PADDLE_ENFORCE( + ctx->HasOutput("TargetBBox"), + "Output(TargetBBox) of RpnTargetAssignOp should not be null"); + + auto anchor_dims = ctx->GetInputDim("Anchor"); + auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); + auto is_crowd_dims = ctx->GetInputDim("IsCrowd"); + auto im_info_dims = ctx->GetInputDim("ImInfo"); + PADDLE_ENFORCE_EQ(anchor_dims.size(), 2, + "The rank of Input(Anchor) must be 2."); + PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2, + "The rank of Input(GtBoxes) must be 2."); + PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, + "The rank of Input(ImInfo) must be 2."); ctx->SetOutputDim("LocationIndex", {-1}); ctx->SetOutputDim("ScoreIndex", {-1}); @@ -59,198 +75,383 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType( - ctx.Input("DistMat")->type()), + ctx.Input("Anchor")->type()), platform::CPUPlace()); } }; template -class RpnTargetAssignKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* anchor_t = context.Input("Anchor"); // (H*W*A) * 4 - auto* gt_bbox_t = context.Input("GtBox"); - auto* dist_t = context.Input("DistMat"); +void AppendRpns(LoDTensor* out, int64_t offset, Tensor* to_add) { + auto* out_data = out->data(); + auto* to_add_data = to_add->data(); + memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T)); +} + +template +std::vector FilterStraddleAnchor( + const platform::CPUDeviceContext& context, const Tensor* anchor, + const float rpn_straddle_thresh, T im_height, T im_width) { + std::vector inds_inside; + int anchor_num = anchor->dims()[0]; + auto* anchor_data = anchor->data(); + if (rpn_straddle_thresh >= 0) { + int index; + for (int i = 0; i < anchor_num; ++i) { + index = i * 4; + if ((anchor_data[index + 0] >= -rpn_straddle_thresh) && + (anchor_data[index + 1] >= -rpn_straddle_thresh) && + (anchor_data[index + 2] < im_width + rpn_straddle_thresh) && + (anchor_data[index + 3] < im_height + rpn_straddle_thresh)) { + inds_inside.emplace_back(i); + } + } + } else { + for (int i = 0; i < anchor_num; ++i) { + inds_inside.emplace_back(i); + } + } + int inside_num = inds_inside.size(); + Tensor inds_inside_t; + int* inds_inside_data = + inds_inside_t.mutable_data({inside_num}, context.GetPlace()); + std::copy(inds_inside.begin(), inds_inside.end(), inds_inside_data); + Tensor inside_anchor_t; + T* inside_anchor_data = + inside_anchor_t.mutable_data({inside_num, 4}, context.GetPlace()); + Gather(anchor->data(), 4, inds_inside_data, inside_num, + inside_anchor_data); + std::vector res; + res.emplace_back(inds_inside_t); + res.emplace_back(inside_anchor_t); + return res; +} + +template +Tensor FilterCrowdGt(const platform::CPUDeviceContext& context, + Tensor* gt_boxes, Tensor* is_crowd) { + int gt_num = gt_boxes->dims()[0]; + std::vector not_crowd_inds; + auto* is_crowd_data = is_crowd->data(); + for (int i = 0; i < gt_num; ++i) { + if (is_crowd_data[i] == 0) { + not_crowd_inds.emplace_back(i); + } + } + int ncrowd_num = not_crowd_inds.size(); + Tensor ncrowd_gt_boxes; + T* ncrowd_gt_boxes_data = + ncrowd_gt_boxes.mutable_data({ncrowd_num, 4}, context.GetPlace()); + Gather(gt_boxes->data(), 4, not_crowd_inds.data(), ncrowd_num, + ncrowd_gt_boxes_data); + return ncrowd_gt_boxes; +} + +void ReservoirSampling(const int num, std::vector* inds, + std::minstd_rand engine, bool use_random) { + std::uniform_real_distribution uniform(0, 1); + size_t len = inds->size(); + if (len > static_cast(num)) { + if (use_random) { + for (size_t i = num; i < len; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < num) + std::iter_swap(inds->begin() + rng_ind, inds->begin() + i); + } + } + inds->resize(num); + } +} + +template +void ScoreAssign(const T* anchor_by_gt_overlap_data, + const Tensor& anchor_to_gt_max, const Tensor& gt_to_anchor_max, + const int rpn_batch_size_per_im, const float rpn_fg_fraction, + const float rpn_positive_overlap, + const float rpn_negative_overlap, std::vector* fg_inds, + std::vector* bg_inds, std::vector* tgt_lbl, + std::minstd_rand engine, bool use_random) { + float epsilon = 0.00001; + int anchor_num = anchor_to_gt_max.dims()[0]; + int gt_num = gt_to_anchor_max.dims()[0]; + std::vector target_label(anchor_num, -1); + std::vector fg_inds_fake; + std::vector bg_inds_fake; + const T* anchor_to_gt_max_data = anchor_to_gt_max.data(); + const T* gt_to_anchor_max_data = gt_to_anchor_max.data(); + // TODO(buxingyuan): Match with Detectron now + // but it seems here is a bug in two directions assignment + // in which the later one may overwrites the former one. + for (int64_t i = 0; i < anchor_num; ++i) { + bool is_anchors_with_max_overlap = false; + for (int64_t j = 0; j < gt_num; ++j) { + T value = anchor_by_gt_overlap_data[i * gt_num + j]; + T diff = std::abs(value - gt_to_anchor_max_data[j]); + if (diff < epsilon) { + is_anchors_with_max_overlap = true; + break; + } + } + bool is_anchor_great_than_thresh = + (anchor_to_gt_max_data[i] >= rpn_positive_overlap); + if (is_anchors_with_max_overlap || is_anchor_great_than_thresh) { + fg_inds_fake.push_back(i); + } + } - auto* loc_index_t = context.Output("LocationIndex"); - auto* score_index_t = context.Output("ScoreIndex"); - auto* tgt_bbox_t = context.Output("TargetBBox"); - auto* tgt_lbl_t = context.Output("TargetLabel"); + // Reservoir Sampling + int fg_num = static_cast(rpn_fg_fraction * rpn_batch_size_per_im); + ReservoirSampling(fg_num, &fg_inds_fake, engine, use_random); + fg_num = static_cast(fg_inds_fake.size()); + for (int64_t i = 0; i < fg_num; ++i) { + target_label[fg_inds_fake[i]] = 1; + } - auto lod = dist_t->lod().back(); - int64_t batch_num = static_cast(lod.size() - 1); - int64_t anchor_num = dist_t->dims()[1]; - PADDLE_ENFORCE_EQ(anchor_num, anchor_t->dims()[0]); + int bg_num = rpn_batch_size_per_im - fg_num; + for (int64_t i = 0; i < anchor_num; ++i) { + if (anchor_to_gt_max_data[i] < rpn_negative_overlap) { + bg_inds_fake.push_back(i); + } + } + ReservoirSampling(bg_num, &bg_inds_fake, engine, use_random); + bg_num = static_cast(bg_inds_fake.size()); + for (int64_t i = 0; i < bg_num; ++i) { + target_label[bg_inds_fake[i]] = 0; + } - int rpn_batch_size = context.Attr("rpn_batch_size_per_im"); - float pos_threshold = context.Attr("rpn_positive_overlap"); - float neg_threshold = context.Attr("rpn_negative_overlap"); - float fg_fraction = context.Attr("fg_fraction"); + for (int64_t i = 0; i < anchor_num; ++i) { + if (target_label[i] == 1) fg_inds->emplace_back(i); + if (target_label[i] == 0) bg_inds->emplace_back(i); + } + fg_num = fg_inds->size(); + bg_num = bg_inds->size(); + + tgt_lbl->resize(fg_num + bg_num, 0); + std::vector fg_lbl(fg_num, 1); + std::vector bg_lbl(bg_num, 0); + std::copy(fg_lbl.begin(), fg_lbl.end(), tgt_lbl->data()); + std::copy(bg_lbl.begin(), bg_lbl.end(), tgt_lbl->data() + fg_num); +} + +template +std::vector SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx, + const Tensor& anchor_by_gt_overlap, + const int rpn_batch_size_per_im, + const float rpn_positive_overlap, + const float rpn_negative_overlap, + const float rpn_fg_fraction, + std::minstd_rand engine, bool use_random) { + auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data(); + int anchor_num = anchor_by_gt_overlap.dims()[0]; + int gt_num = anchor_by_gt_overlap.dims()[1]; + + std::vector fg_inds; + std::vector bg_inds; + std::vector gt_inds; + std::vector tgt_lbl; + + // Calculate the max IoU between anchors and gt boxes + // Map from anchor to gt box that has highest overlap + auto place = ctx.GetPlace(); + Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; + anchor_to_gt_max.mutable_data({anchor_num}, place); + int* argmax = anchor_to_gt_argmax.mutable_data({anchor_num}, place); + gt_to_anchor_max.mutable_data({gt_num}, place); + + auto anchor_by_gt_overlap_et = + framework::EigenMatrix::From(anchor_by_gt_overlap); + auto anchor_to_gt_max_et = + framework::EigenVector::Flatten(anchor_to_gt_max); + auto gt_to_anchor_max_et = + framework::EigenVector::Flatten(gt_to_anchor_max); + auto anchor_to_gt_argmax_et = + framework::EigenVector::Flatten(anchor_to_gt_argmax); + anchor_to_gt_max_et = + anchor_by_gt_overlap_et.maximum(Eigen::DSizes(1)); + anchor_to_gt_argmax_et = + anchor_by_gt_overlap_et.argmax(1).template cast(); + gt_to_anchor_max_et = + anchor_by_gt_overlap_et.maximum(Eigen::DSizes(0)); + + // Follow the Faster RCNN's implementation + ScoreAssign(anchor_by_gt_overlap_data, anchor_to_gt_max, gt_to_anchor_max, + rpn_batch_size_per_im, rpn_fg_fraction, rpn_positive_overlap, + rpn_negative_overlap, &fg_inds, &bg_inds, &tgt_lbl, engine, + use_random); + + int fg_num = fg_inds.size(); + int bg_num = bg_inds.size(); + gt_inds.reserve(fg_num); + for (int i = 0; i < fg_num; ++i) { + gt_inds.emplace_back(argmax[fg_inds[i]]); + } - int fg_num_per_batch = static_cast(rpn_batch_size * fg_fraction); + Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t; + int* loc_index_data = loc_index_t.mutable_data({fg_num}, place); + int* score_index_data = + score_index_t.mutable_data({fg_num + bg_num}, place); + int* tgt_lbl_data = tgt_lbl_t.mutable_data({fg_num + bg_num}, place); + int* gt_inds_data = gt_inds_t.mutable_data({fg_num}, place); + std::copy(fg_inds.begin(), fg_inds.end(), loc_index_data); + std::copy(fg_inds.begin(), fg_inds.end(), score_index_data); + std::copy(bg_inds.begin(), bg_inds.end(), score_index_data + fg_num); + std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data); + std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data); + std::vector loc_score_tgtlbl_gt; + loc_score_tgtlbl_gt.emplace_back(loc_index_t); + loc_score_tgtlbl_gt.emplace_back(score_index_t); + loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t); + loc_score_tgtlbl_gt.emplace_back(gt_inds_t); + + return loc_score_tgtlbl_gt; +} - int64_t max_num = batch_num * anchor_num; +template +class RpnTargetAssignKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* anchor = context.Input("Anchor"); // (H*W*A) * 4 + auto* gt_boxes = context.Input("GtBoxes"); + auto* is_crowd = context.Input("IsCrowd"); + auto* im_info = context.Input("ImInfo"); + + auto* loc_index = context.Output("LocationIndex"); + auto* score_index = context.Output("ScoreIndex"); + auto* tgt_bbox = context.Output("TargetBBox"); + auto* tgt_lbl = context.Output("TargetLabel"); + + PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL, + "RpnTargetAssignOp gt_boxes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL, + "RpnTargetAssignOp is_crowd needs 1 level of LoD"); + int64_t anchor_num = static_cast(anchor->dims()[0]); + int64_t batch_num = static_cast(gt_boxes->lod().back().size() - 1); + + int rpn_batch_size_per_im = context.Attr("rpn_batch_size_per_im"); + float rpn_straddle_thresh = context.Attr("rpn_straddle_thresh"); + float rpn_positive_overlap = context.Attr("rpn_positive_overlap"); + float rpn_negative_overlap = context.Attr("rpn_negative_overlap"); + float rpn_fg_fraction = context.Attr("rpn_fg_fraction"); + bool use_random = context.Attr("use_random"); + + int64_t max_num = batch_num * rpn_batch_size_per_im; auto place = context.GetPlace(); - tgt_bbox_t->mutable_data({max_num, 4}, place); - auto* loc_index = loc_index_t->mutable_data({max_num}, place); - auto* score_index = score_index_t->mutable_data({max_num}, place); + loc_index->mutable_data({max_num}, place); + score_index->mutable_data({max_num}, place); + tgt_bbox->mutable_data({max_num, 4}, place); + tgt_lbl->mutable_data({max_num, 1}, place); - Tensor tmp_tgt_lbl; - auto* tmp_lbl_data = tmp_tgt_lbl.mutable_data({max_num}, place); auto& dev_ctx = context.device_context(); - math::SetConstant iset; - iset(dev_ctx, &tmp_tgt_lbl, static_cast(-1)); std::random_device rnd; std::minstd_rand engine; - int seed = - context.Attr("fix_seed") ? context.Attr("seed") : rnd(); + int seed = rnd(); engine.seed(seed); - int fg_num = 0; - int bg_num = 0; + framework::LoD lod_loc, loc_score; + std::vector lod0_loc(1, 0); + std::vector lod0_score(1, 0); + + int total_loc_num = 0; + int total_score_num = 0; + auto gt_boxes_lod = gt_boxes->lod().back(); + auto is_crowd_lod = is_crowd->lod().back(); for (int i = 0; i < batch_num; ++i) { - Tensor dist = dist_t->Slice(lod[i], lod[i + 1]); - Tensor gt_bbox = gt_bbox_t->Slice(lod[i], lod[i + 1]); - auto fg_bg_gt = SampleFgBgGt(dev_ctx, dist, pos_threshold, neg_threshold, - rpn_batch_size, fg_num_per_batch, engine, - tmp_lbl_data + i * anchor_num); - - int cur_fg_num = fg_bg_gt[0].size(); - int cur_bg_num = fg_bg_gt[1].size(); - std::transform(fg_bg_gt[0].begin(), fg_bg_gt[0].end(), loc_index, - [i, anchor_num](int d) { return d + i * anchor_num; }); - memcpy(score_index, loc_index, cur_fg_num * sizeof(int)); - std::transform(fg_bg_gt[1].begin(), fg_bg_gt[1].end(), - score_index + cur_fg_num, - [i, anchor_num](int d) { return d + i * anchor_num; }); + Tensor gt_boxes_slice = + gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); + Tensor is_crowd_slice = + is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); + Tensor im_info_slice = im_info->Slice(i, i + 1); + auto* im_info_data = im_info_slice.data(); + auto im_height = im_info_data[0]; + auto im_width = im_info_data[1]; + auto im_scale = im_info_data[2]; + + // Filter straddle anchor + std::vector filter_output = FilterStraddleAnchor( + dev_ctx, anchor, rpn_straddle_thresh, im_height, im_width); + Tensor inds_inside = filter_output[0]; + Tensor inside_anchor = filter_output[1]; + + // Filter crowd gt + Tensor ncrowd_gt_boxes = + FilterCrowdGt(dev_ctx, >_boxes_slice, &is_crowd_slice); + auto ncrowd_gt_boxes_et = + framework::EigenTensor::From(ncrowd_gt_boxes); + ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale; + + Tensor anchor_by_gt_overlap; + anchor_by_gt_overlap.mutable_data( + {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place); + BboxOverlaps(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap); + + auto loc_score_tgtlbl_gt = SampleRpnFgBgGt( + dev_ctx, anchor_by_gt_overlap, rpn_batch_size_per_im, + rpn_positive_overlap, rpn_negative_overlap, rpn_fg_fraction, engine, + use_random); + + Tensor sampled_loc_index = loc_score_tgtlbl_gt[0]; + Tensor sampled_score_index = loc_score_tgtlbl_gt[1]; + Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; + Tensor sampled_gt_index = loc_score_tgtlbl_gt[3]; + + int loc_num = sampled_loc_index.dims()[0]; + int score_num = sampled_score_index.dims()[0]; + // unmap to all anchor + Tensor sampled_loc_index_unmap, sampled_score_index_unmap; + sampled_loc_index_unmap.mutable_data({loc_num}, place); + sampled_score_index_unmap.mutable_data({score_num}, place); + Gather(inds_inside.data(), 1, sampled_loc_index.data(), + loc_num, sampled_loc_index_unmap.data()); + Gather(inds_inside.data(), 1, sampled_score_index.data(), + score_num, sampled_score_index_unmap.data()); // get target bbox deltas - if (cur_fg_num) { - Tensor fg_gt; - T* gt_data = fg_gt.mutable_data({cur_fg_num, 4}, place); - Tensor tgt_bbox = tgt_bbox_t->Slice(fg_num, fg_num + cur_fg_num); - T* tgt_data = tgt_bbox.data(); - Gather(anchor_t->data(), 4, - reinterpret_cast(&fg_bg_gt[0][0]), cur_fg_num, - tgt_data); - Gather(gt_bbox.data(), 4, reinterpret_cast(&fg_bg_gt[2][0]), - cur_fg_num, gt_data); - BoxToDelta(cur_fg_num, tgt_bbox, fg_gt, nullptr, false, &tgt_bbox); - } - - loc_index += cur_fg_num; - score_index += cur_fg_num + cur_bg_num; - fg_num += cur_fg_num; - bg_num += cur_bg_num; - } - - int lbl_num = fg_num + bg_num; - PADDLE_ENFORCE_LE(fg_num, max_num); - PADDLE_ENFORCE_LE(lbl_num, max_num); - - tgt_bbox_t->Resize({fg_num, 4}); - loc_index_t->Resize({fg_num}); - score_index_t->Resize({lbl_num}); - auto* lbl_data = tgt_lbl_t->mutable_data({lbl_num, 1}, place); - Gather(tmp_lbl_data, 1, score_index_t->data(), lbl_num, - lbl_data); - } - - private: - void ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max, - const int row, const int col, const float pos_threshold, - const float neg_threshold, int64_t* target_label, - std::vector* fg_inds, std::vector* bg_inds) const { - float epsilon = 0.0001; - for (int64_t i = 0; i < row; ++i) { - const T* v = dist_data + i * col; - T max = *std::max_element(v, v + col); - for (int64_t j = 0; j < col; ++j) { - if (std::abs(max - v[j]) < epsilon) { - target_label[j] = 1; - } - } - } - - // Pick the fg/bg - const T* anchor_to_gt_max_data = anchor_to_gt_max.data(); - for (int64_t j = 0; j < col; ++j) { - if (anchor_to_gt_max_data[j] >= pos_threshold) { - target_label[j] = 1; - } else if (anchor_to_gt_max_data[j] < neg_threshold) { - target_label[j] = 0; - } - if (target_label[j] == 1) { - fg_inds->push_back(j); - } else if (target_label[j] == 0) { - bg_inds->push_back(j); - } + Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox; + auto* sampled_anchor_data = + sampled_anchor.mutable_data({loc_num, 4}, place); + auto* sampled_gt_data = sampled_gt.mutable_data({loc_num, 4}, place); + Gather(anchor->data(), 4, sampled_loc_index_unmap.data(), + loc_num, sampled_anchor_data); + Gather(ncrowd_gt_boxes.data(), 4, sampled_gt_index.data(), + loc_num, sampled_gt_data); + sampled_tgt_bbox.mutable_data({loc_num, 4}, place); + BoxToDelta(loc_num, sampled_anchor, sampled_gt, nullptr, false, + &sampled_tgt_bbox); + + // Add anchor offset + int anchor_offset = i * anchor_num; + auto sampled_loc_index_unmap_et = + framework::EigenTensor::From(sampled_loc_index_unmap); + sampled_loc_index_unmap_et = sampled_loc_index_unmap_et + anchor_offset; + auto sampled_score_index_unmap_et = + framework::EigenTensor::From(sampled_score_index_unmap); + sampled_score_index_unmap_et = + sampled_score_index_unmap_et + anchor_offset; + AppendRpns(loc_index, total_loc_num, &sampled_loc_index_unmap); + AppendRpns(score_index, total_score_num, &sampled_score_index_unmap); + AppendRpns(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox); + AppendRpns(tgt_lbl, total_score_num, &sampled_tgtlbl); + total_loc_num += loc_num; + + total_score_num += score_num; + lod0_loc.emplace_back(total_loc_num); + lod0_score.emplace_back(total_score_num); } - } - - void ReservoirSampling(const int num, std::minstd_rand engine, - std::vector* inds) const { - std::uniform_real_distribution uniform(0, 1); - size_t len = inds->size(); - if (len > static_cast(num)) { - for (size_t i = num; i < len; ++i) { - int rng_ind = std::floor(uniform(engine) * i); - if (rng_ind < num) - std::iter_swap(inds->begin() + rng_ind, inds->begin() + i); - } - inds->resize(num); - } - } - // std::vector> RpnTargetAssign( - std::vector> SampleFgBgGt( - const platform::CPUDeviceContext& ctx, const Tensor& dist, - const float pos_threshold, const float neg_threshold, - const int rpn_batch_size, const int fg_num, std::minstd_rand engine, - int64_t* target_label) const { - auto* dist_data = dist.data(); - int row = dist.dims()[0]; - int col = dist.dims()[1]; - - std::vector fg_inds; - std::vector bg_inds; - std::vector gt_inds; - - // Calculate the max IoU between anchors and gt boxes - // Map from anchor to gt box that has highest overlap - auto place = ctx.GetPlace(); - Tensor anchor_to_gt_max, anchor_to_gt_argmax; - anchor_to_gt_max.mutable_data({col}, place); - int* argmax = anchor_to_gt_argmax.mutable_data({col}, place); - - auto x = framework::EigenMatrix::From(dist); - auto x_col_max = framework::EigenVector::Flatten(anchor_to_gt_max); - auto x_col_argmax = - framework::EigenVector::Flatten(anchor_to_gt_argmax); - x_col_max = x.maximum(Eigen::DSizes(0)); - x_col_argmax = x.argmax(0).template cast(); - - // Follow the Faster RCNN's implementation - ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold, - neg_threshold, target_label, &fg_inds, &bg_inds); - // Reservoir Sampling - ReservoirSampling(fg_num, engine, &fg_inds); - int fg_num2 = static_cast(fg_inds.size()); - int bg_num = rpn_batch_size - fg_num2; - ReservoirSampling(bg_num, engine, &bg_inds); - - gt_inds.reserve(fg_num2); - for (int i = 0; i < fg_num2; ++i) { - gt_inds.emplace_back(argmax[fg_inds[i]]); - } - std::vector> fg_bg_gt; - fg_bg_gt.emplace_back(fg_inds); - fg_bg_gt.emplace_back(bg_inds); - fg_bg_gt.emplace_back(gt_inds); - - return fg_bg_gt; + PADDLE_ENFORCE_LE(total_loc_num, max_num); + PADDLE_ENFORCE_LE(total_score_num, max_num); + + lod_loc.emplace_back(lod0_loc); + loc_score.emplace_back(lod0_score); + loc_index->set_lod(lod_loc); + score_index->set_lod(loc_score); + tgt_bbox->set_lod(lod_loc); + tgt_lbl->set_lod(loc_score); + loc_index->Resize({total_loc_num}); + score_index->Resize({total_score_num}); + tgt_bbox->Resize({total_loc_num, 4}); + tgt_lbl->Resize({total_score_num, 1}); } }; @@ -259,18 +460,22 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("Anchor", "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4]."); - AddInput("GtBox", "(LoDTensor) input groud-truth bbox with shape [K, 4]."); - AddInput( - "DistMat", - "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " - "[K, M]. It is pair-wise distance matrix between the entities " - "represented by each row and each column. For example, assumed one " - "entity is A with shape [K], another entity is B with shape [M]. The " - "DistMat[i][j] is the distance between A[i] and B[j]. The bigger " - "the distance is, the better macthing the pairs are. Please note, " - "This tensor can contain LoD information to represent a batch of " - "inputs. One instance of this batch can contain different numbers of " - "entities."); + AddInput("GtBoxes", + "(LoDTensor) input groud-truth bbox with shape [K, 4]."); + AddInput("IsCrowd", + "(LoDTensor) input which indicates groud-truth is crowd."); + AddInput("ImInfo", + "(LoDTensor) input image information with shape [N, 3]. " + "N is the batch size, each image information includes height, " + "width and scale."); + AddAttr("rpn_batch_size_per_im", + "Total number of RPN examples per image.") + .SetDefault(256); + AddAttr( + "rpn_straddle_thresh", + "Remove RPN anchors that go outside the image by straddle_thresh " + "pixels, " + "Set to -1 or a large value, e.g. 100000, to disable pruning anchors."); AddAttr( "rpn_positive_overlap", "Minimum overlap required between an anchor and ground-truth " @@ -282,20 +487,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { "box for the (anchor, gt box) pair to be a negative examples.") .SetDefault(0.3); AddAttr( - "fg_fraction", + "rpn_fg_fraction", "Target fraction of RoI minibatch that " "is labeled foreground (i.e. class > 0), 0-th class is background.") .SetDefault(0.25); - AddAttr("rpn_batch_size_per_im", - "Total number of RPN examples per image.") - .SetDefault(256); - AddAttr("fix_seed", - "A flag indicating whether to use a fixed seed to generate " - "random mask. NOTE: DO NOT set this flag to true in " - "training. Setting this flag to true is only useful in " - "unittest.") - .SetDefault(false); - AddAttr("seed", "RpnTargetAssign random seed.").SetDefault(0); + AddAttr("use_random", + "A flag indicating whether to use a ReservoirSampling. " + "NOTE: DO NOT set this flag to false in training. " + "Setting this flag to false is only useful in unittest.") + .SetDefault(true); AddOutput( "LocationIndex", "(Tensor), The indexes of foreground anchors in all RPN anchors, the " @@ -308,16 +508,16 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { "ScoreIndex is [F + B], F and B are sampled foreground and backgroud " " number."); AddOutput("TargetBBox", - "(Tensor), The target bbox deltas with shape " + "(Tensor), The target bbox deltas with shape " "[F, 4], F is the sampled foreground number."); AddOutput( "TargetLabel", - "(Tensor), The target labels of each anchor with shape " + "(Tensor), The target labels of each anchor with shape " "[F + B, 1], F and B are sampled foreground and backgroud number."); AddComment(R"DOC( -This operator can be, for given the IoU between the ground truth bboxes and the +This operator can be, for a given set of ground truth bboxes and the anchors, to assign classification and regression targets to each prediction. -The Score index and LocationIndex will be generated according to the DistMat. +The ScoreIndex and LocationIndex will be generated according to the anchor-groundtruth IOU. The rest anchors would not contibute to the RPN training loss ScoreIndex is composed of foreground anchor indexes(positive labels) and diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 1bc1dbbeca..1c73c837e2 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -55,15 +55,19 @@ for _OP in set(__auto__): globals()[_OP] = generate_layer_fn(_OP) -def rpn_target_assign(loc, - scores, +def rpn_target_assign(bbox_pred, + cls_logits, anchor_box, anchor_var, - gt_box, + gt_boxes, + is_crowd, + im_info, rpn_batch_size_per_im=256, - fg_fraction=0.25, + rpn_straddle_thresh=0.0, + rpn_fg_fraction=0.5, rpn_positive_overlap=0.7, - rpn_negative_overlap=0.3): + rpn_negative_overlap=0.3, + use_random=True): """ ** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. ** @@ -83,14 +87,13 @@ def rpn_target_assign(loc, the positive anchors. Args: - loc(Variable): A 3-D Tensor with shape [N, M, 4] represents the + bbox_pred(Variable): A 3-D Tensor with shape [N, M, 4] represents the predicted locations of M bounding bboxes. N is the batch size, and each bounding box has four coordinate values and the layout is [xmin, ymin, xmax, ymax]. - scores(Variable): A 3-D Tensor with shape [N, M, C] represents the - predicted confidence predictions. N is the batch size, C is the - class number, M is number of bounding boxes. For each category - there are total M scores which corresponding M bounding boxes. + cls_logits(Variable): A 3-D Tensor with shape [N, M, 1] represents the + predicted confidence predictions. N is the batch size, 1 is the + frontground and background sigmoid, M is number of bounding boxes. anchor_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes, each box is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the left top coordinate of the anchor box, @@ -99,11 +102,16 @@ def rpn_target_assign(loc, coordinate of the anchor box. anchor_var(Variable): A 2-D Tensor with shape [M,4] holds expanded variances of anchors. - gt_box (Variable): The ground-truth boudding boxes (bboxes) are a 2D + gt_boxes (Variable): The ground-truth boudding boxes (bboxes) are a 2D LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth bboxes of mini-batch input. + is_crowd (Variable): A 1-D LoDTensor which indicates groud-truth is crowd. + im_info (Variable): A 2-D LoDTensor with shape [N, 3]. N is the batch size, + 3 is the height, width and scale. rpn_batch_size_per_im(int): Total number of RPN examples per image. - fg_fraction(float): Target fraction of RoI minibatch that is labeled + rpn_straddle_thresh(float): Remove RPN anchors that go outside the image + by straddle_thresh pixels. + rpn_fg_fraction(float): Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0), 0-th class is background. rpn_positive_overlap(float): Minimum overlap required between an anchor and ground-truth box for the (anchor, gt box) pair to be a positive @@ -129,45 +137,48 @@ def rpn_target_assign(loc, Examples: .. code-block:: python - loc = layers.data(name='location', shape=[2, 80], + bbox_pred = layers.data(name='bbox_pred', shape=[100, 4], append_batch_size=False, dtype='float32') - scores = layers.data(name='scores', shape=[2, 40], + cls_logits = layers.data(name='cls_logits', shape=[100, 1], append_batch_size=False, dtype='float32') anchor_box = layers.data(name='anchor_box', shape=[20, 4], append_batch_size=False, dtype='float32') - gt_box = layers.data(name='gt_box', shape=[10, 4], + gt_boxes = layers.data(name='gt_boxes', shape=[10, 4], append_batch_size=False, dtype='float32') loc_pred, score_pred, loc_target, score_target = - fluid.layers.detection_output(loc=location, - scores=scores, + fluid.layers.rpn_target_assign(bbox_pred=bbox_pred, + cls_logits=cls_logits, anchor_box=anchor_box, - gt_box=gt_box) + gt_boxes=gt_boxes) """ helper = LayerHelper('rpn_target_assign', **locals()) - # Compute overlaps between the prior boxes and the gt boxes overlaps - iou = iou_similarity(x=gt_box, y=anchor_box) # Assign target label to anchors loc_index = helper.create_tmp_variable(dtype='int32') score_index = helper.create_tmp_variable(dtype='int32') - target_label = helper.create_tmp_variable(dtype='int64') + target_label = helper.create_tmp_variable(dtype='int32') target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype) helper.append_op( type="rpn_target_assign", - inputs={'Anchor': anchor_box, - 'GtBox': gt_box, - 'DistMat': iou}, + inputs={ + 'Anchor': anchor_box, + 'GtBoxes': gt_boxes, + 'IsCrowd': is_crowd, + 'ImInfo': im_info + }, outputs={ 'LocationIndex': loc_index, 'ScoreIndex': score_index, 'TargetLabel': target_label, - 'TargetBBox': target_bbox, + 'TargetBBox': target_bbox }, attrs={ 'rpn_batch_size_per_im': rpn_batch_size_per_im, + 'rpn_straddle_thresh': rpn_straddle_thresh, 'rpn_positive_overlap': rpn_positive_overlap, 'rpn_negative_overlap': rpn_negative_overlap, - 'fg_fraction': fg_fraction + 'rpn_fg_fraction': rpn_fg_fraction, + 'use_random': use_random }) loc_index.stop_gradient = True @@ -175,12 +186,12 @@ def rpn_target_assign(loc, target_label.stop_gradient = True target_bbox.stop_gradient = True - scores = nn.reshape(x=scores, shape=(-1, 1)) - loc = nn.reshape(x=loc, shape=(-1, 4)) - predicted_scores = nn.gather(scores, score_index) - predicted_location = nn.gather(loc, loc_index) + cls_logits = nn.reshape(x=cls_logits, shape=(-1, 1)) + bbox_pred = nn.reshape(x=bbox_pred, shape=(-1, 4)) + predicted_cls_logits = nn.gather(cls_logits, score_index) + predicted_bbox_pred = nn.gather(bbox_pred, loc_index) - return predicted_scores, predicted_location, target_label, target_bbox + return predicted_cls_logits, predicted_bbox_pred, target_label, target_bbox def detection_output(loc, @@ -1258,15 +1269,17 @@ def anchor_generator(input, def generate_proposal_labels(rpn_rois, gt_classes, + is_crowd, gt_boxes, - im_scales, + im_info, batch_size_per_im=256, fg_fraction=0.25, fg_thresh=0.25, bg_thresh_hi=0.5, bg_thresh_lo=0.0, bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], - class_nums=None): + class_nums=None, + use_random=True): """ ** Generate proposal labels Faster-RCNN ** TODO(buxingyuan): Add Document @@ -1285,8 +1298,9 @@ def generate_proposal_labels(rpn_rois, inputs={ 'RpnRois': rpn_rois, 'GtClasses': gt_classes, + 'IsCrowd': is_crowd, 'GtBoxes': gt_boxes, - 'ImScales': im_scales + 'ImInfo': im_info }, outputs={ 'Rois': rois, @@ -1302,7 +1316,8 @@ def generate_proposal_labels(rpn_rois, 'bg_thresh_hi': bg_thresh_hi, 'bg_thresh_lo': bg_thresh_lo, 'bbox_reg_weights': bbox_reg_weights, - 'class_nums': class_nums + 'class_nums': class_nums, + 'use_random': use_random }) rois.stop_gradient = True diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index e2564763d1..56129641ce 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -148,51 +148,60 @@ class TestAnchorGenerator(unittest.TestCase): class TestGenerateProposalLabels(unittest.TestCase): def test_generate_proposal_labels(self): - rpn_rois = layers.data( - name='rpn_rois', - shape=[4, 4], - dtype='float32', - lod_level=1, - append_batch_size=False) - gt_classes = layers.data( - name='gt_classes', - shape=[6], - dtype='int32', - lod_level=1, - append_batch_size=False) - gt_boxes = layers.data( - name='gt_boxes', - shape=[6, 4], - dtype='float32', - lod_level=1, - append_batch_size=False) - im_scales = layers.data( - name='im_scales', - shape=[1], - dtype='float32', - lod_level=1, - append_batch_size=False) - class_nums = 5 - rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels( - rpn_rois=rpn_rois, - gt_classes=gt_classes, - gt_boxes=gt_boxes, - im_scales=im_scales, - batch_size_per_im=2, - fg_fraction=0.5, - fg_thresh=0.5, - bg_thresh_hi=0.5, - bg_thresh_lo=0.0, - bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], - class_nums=class_nums) - assert rois.shape[1] == 4 - assert rois.shape[0] == labels_int32.shape[0] - assert rois.shape[0] == bbox_targets.shape[0] - assert rois.shape[0] == bbox_inside_weights.shape[0] - assert rois.shape[0] == bbox_outside_weights.shape[0] - assert bbox_targets.shape[1] == 4 * class_nums - assert bbox_inside_weights.shape[1] == 4 * class_nums - assert bbox_outside_weights.shape[1] == 4 * class_nums + program = Program() + with program_guard(program): + rpn_rois = layers.data( + name='rpn_rois', + shape=[4, 4], + dtype='float32', + lod_level=1, + append_batch_size=False) + gt_classes = layers.data( + name='gt_classes', + shape=[6], + dtype='int32', + lod_level=1, + append_batch_size=False) + is_crowd = layers.data( + name='is_crowd', + shape=[6], + dtype='int32', + lod_level=1, + append_batch_size=False) + gt_boxes = layers.data( + name='gt_boxes', + shape=[6, 4], + dtype='float32', + lod_level=1, + append_batch_size=False) + im_info = layers.data( + name='im_info', + shape=[1, 3], + dtype='float32', + lod_level=1, + append_batch_size=False) + class_nums = 5 + rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels( + rpn_rois=rpn_rois, + gt_classes=gt_classes, + is_crowd=is_crowd, + gt_boxes=gt_boxes, + im_info=im_info, + batch_size_per_im=2, + fg_fraction=0.5, + fg_thresh=0.5, + bg_thresh_hi=0.5, + bg_thresh_lo=0.0, + bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], + class_nums=class_nums) + assert rois.shape[1] == 4 + assert rois.shape[0] == labels_int32.shape[0] + assert rois.shape[0] == bbox_targets.shape[0] + assert rois.shape[0] == bbox_inside_weights.shape[0] + assert rois.shape[0] == bbox_outside_weights.shape[0] + assert bbox_targets.shape[1] == 4 * class_nums + assert bbox_inside_weights.shape[1] == 4 * class_nums + assert bbox_outside_weights.shape[1] == 4 * class_nums class TestMultiBoxHead(unittest.TestCase): @@ -254,18 +263,18 @@ class TestRpnTargetAssign(unittest.TestCase): def test_rpn_target_assign(self): program = Program() with program_guard(program): - loc_shape = [10, 50, 4] - score_shape = [10, 50, 2] + bbox_pred_shape = [10, 50, 4] + cls_logits_shape = [10, 50, 2] anchor_shape = [50, 4] - loc = layers.data( - name='loc', - shape=loc_shape, + bbox_pred = layers.data( + name='bbox_pred', + shape=bbox_pred_shape, append_batch_size=False, dtype='float32') - scores = layers.data( - name='scores', - shape=score_shape, + cls_logits = layers.data( + name='cls_logits', + shape=cls_logits_shape, append_batch_size=False, dtype='float32') anchor_box = layers.data( @@ -278,17 +287,31 @@ class TestRpnTargetAssign(unittest.TestCase): shape=anchor_shape, append_batch_size=False, dtype='float32') - gt_box = layers.data( - name='gt_box', shape=[4], lod_level=1, dtype='float32') - + gt_boxes = layers.data( + name='gt_boxes', shape=[4], lod_level=1, dtype='float32') + is_crowd = layers.data( + name='is_crowd', + shape=[10], + dtype='int32', + lod_level=1, + append_batch_size=False) + im_info = layers.data( + name='im_info', + shape=[1, 3], + dtype='float32', + lod_level=1, + append_batch_size=False) pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign( - loc=loc, - scores=scores, + bbox_pred=bbox_pred, + cls_logits=cls_logits, anchor_box=anchor_box, anchor_var=anchor_var, - gt_box=gt_box, + gt_boxes=gt_boxes, + is_crowd=is_crowd, + im_info=im_info, rpn_batch_size_per_im=256, - fg_fraction=0.25, + rpn_straddle_thresh=0.0, + rpn_fg_fraction=0.5, rpn_positive_overlap=0.7, rpn_negative_overlap=0.3) diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py similarity index 77% rename from python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py rename to python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py index 6dc101b6da..2d5cd3b24b 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py @@ -20,10 +20,10 @@ import paddle.fluid as fluid from op_test import OpTest -def generate_proposal_labels_in_python( - rpn_rois, gt_classes, gt_boxes, im_scales, batch_size_per_im, - fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, - class_nums): +def generate_proposal_labels_in_python(rpn_rois, gt_classes, is_crowd, gt_boxes, + im_info, batch_size_per_im, fg_fraction, + fg_thresh, bg_thresh_hi, bg_thresh_lo, + bbox_reg_weights, class_nums): rois = [] labels_int32 = [] bbox_targets = [] @@ -31,13 +31,13 @@ def generate_proposal_labels_in_python( bbox_outside_weights = [] lod = [] assert len(rpn_rois) == len( - im_scales), 'batch size of rpn_rois and ground_truth is not matched' + im_info), 'batch size of rpn_rois and ground_truth is not matched' - for im_i in range(len(im_scales)): + for im_i in range(len(im_info)): frcn_blobs = _sample_rois( - rpn_rois[im_i], gt_classes[im_i], gt_boxes[im_i], im_scales[im_i], - batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, - bg_thresh_lo, bbox_reg_weights, class_nums) + rpn_rois[im_i], gt_classes[im_i], is_crowd[im_i], gt_boxes[im_i], + im_info[im_i], batch_size_per_im, fg_fraction, fg_thresh, + bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums) lod.append(frcn_blobs['rois'].shape[0]) @@ -50,13 +50,14 @@ def generate_proposal_labels_in_python( return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights, lod -def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, - fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, - bbox_reg_weights, class_nums): +def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info, + batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, + bg_thresh_lo, bbox_reg_weights, class_nums): rois_per_image = int(batch_size_per_im) fg_rois_per_im = int(np.round(fg_fraction * rois_per_image)) # Roidb + im_scale = im_info[2] inv_im_scale = 1. / im_scale rpn_rois = rpn_rois * inv_im_scale @@ -78,6 +79,9 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, box_to_gt_ind_map[overlapped_boxes_ind] = overlaps_argmax[ overlapped_boxes_ind] + crowd_ind = np.where(is_crowd)[0] + gt_overlaps[crowd_ind] = -1 + max_overlaps = gt_overlaps.max(axis=1) max_classes = gt_overlaps.argmax(axis=1) @@ -85,9 +89,10 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, fg_inds = np.where(max_overlaps >= fg_thresh)[0] fg_rois_per_this_image = np.minimum(fg_rois_per_im, fg_inds.shape[0]) # Sample foreground if there are too many - if fg_inds.shape[0] > fg_rois_per_this_image: - fg_inds = np.random.choice( - fg_inds, size=fg_rois_per_this_image, replace=False) + # if fg_inds.shape[0] > fg_rois_per_this_image: + # fg_inds = np.random.choice( + # fg_inds, size=fg_rois_per_this_image, replace=False) + fg_inds = fg_inds[:fg_rois_per_this_image] # Background bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >= @@ -96,9 +101,10 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.shape[0]) # Sample background if there are too many - if bg_inds.shape[0] > bg_rois_per_this_image: - bg_inds = np.random.choice( - bg_inds, size=bg_rois_per_this_image, replace=False) + # if bg_inds.shape[0] > bg_rois_per_this_image: + # bg_inds = np.random.choice( + # bg_inds, size=bg_rois_per_this_image, replace=False) + bg_inds = bg_inds[:bg_rois_per_this_image] keep_inds = np.append(fg_inds, bg_inds) sampled_labels = max_classes[keep_inds] @@ -208,8 +214,9 @@ class TestGenerateProposalLabelsOp(OpTest): self.inputs = { 'RpnRois': (self.rpn_rois[0], self.rpn_rois_lod), 'GtClasses': (self.gt_classes[0], self.gts_lod), + 'IsCrowd': (self.is_crowd[0], self.gts_lod), 'GtBoxes': (self.gt_boxes[0], self.gts_lod), - 'ImScales': self.im_scales[0] + 'ImInfo': self.im_info } self.attrs = { 'batch_size_per_im': self.batch_size_per_im, @@ -218,14 +225,15 @@ class TestGenerateProposalLabelsOp(OpTest): 'bg_thresh_hi': self.bg_thresh_hi, 'bg_thresh_lo': self.bg_thresh_lo, 'bbox_reg_weights': self.bbox_reg_weights, - 'class_nums': self.class_nums + 'class_nums': self.class_nums, + 'use_random': False } self.outputs = { - 'Rois': (self.rois[0], [self.lod]), - 'LabelsInt32': (self.labels_int32[0], [self.lod]), - 'BboxTargets': (self.bbox_targets[0], [self.lod]), - 'BboxInsideWeights': (self.bbox_inside_weights[0], [self.lod]), - 'BboxOutsideWeights': (self.bbox_outside_weights[0], [self.lod]), + 'Rois': (self.rois, [self.lod]), + 'LabelsInt32': (self.labels_int32, [self.lod]), + 'BboxTargets': (self.bbox_targets, [self.lod]), + 'BboxInsideWeights': (self.bbox_inside_weights, [self.lod]), + 'BboxOutsideWeights': (self.bbox_outside_weights, [self.lod]), } def test_check_output(self): @@ -236,8 +244,8 @@ class TestGenerateProposalLabelsOp(OpTest): self.set_data() def init_test_params(self): - self.batch_size_per_im = 10 - self.fg_fraction = 1.0 + self.batch_size_per_im = 512 + self.fg_fraction = 0.25 self.fg_thresh = 0.5 self.bg_thresh_hi = 0.5 self.bg_thresh_lo = 0.0 @@ -246,14 +254,14 @@ class TestGenerateProposalLabelsOp(OpTest): def init_test_input(self): np.random.seed(0) - image_nums = 1 gt_nums = 6 # Keep same with batch_size_per_im for unittest - proposal_nums = self.batch_size_per_im - gt_nums - images_shape = [] - self.im_scales = [] - for i in range(image_nums): - images_shape.append(np.random.randint(200, size=2)) - self.im_scales.append(np.ones((1)).astype(np.float32)) + proposal_nums = 2000 #self.batch_size_per_im - gt_nums + images_shape = [[64, 64]] + self.im_info = np.ones((len(images_shape), 3)).astype(np.float32) + for i in range(len(images_shape)): + self.im_info[i, 0] = images_shape[i][0] + self.im_info[i, 1] = images_shape[i][1] + self.im_info[i, 2] = 0.8 #scale self.rpn_rois, self.rpn_rois_lod = _generate_proposals(images_shape, proposal_nums) @@ -261,16 +269,23 @@ class TestGenerateProposalLabelsOp(OpTest): images_shape, self.class_nums, gt_nums) self.gt_classes = [gt['gt_classes'] for gt in ground_truth] self.gt_boxes = [gt['boxes'] for gt in ground_truth] + self.is_crowd = [gt['is_crowd'] for gt in ground_truth] def init_test_output(self): self.rois, self.labels_int32, self.bbox_targets, \ self.bbox_inside_weights, self.bbox_outside_weights, \ self.lod = generate_proposal_labels_in_python( - self.rpn_rois, self.gt_classes, self.gt_boxes, self.im_scales, + self.rpn_rois, self.gt_classes, self.is_crowd, self.gt_boxes, self.im_info, self.batch_size_per_im, self.fg_fraction, self.fg_thresh, self.bg_thresh_hi, self.bg_thresh_lo, self.bbox_reg_weights, self.class_nums ) + self.rois = np.vstack(self.rois) + self.labels_int32 = np.hstack(self.labels_int32) + self.labels_int32 = self.labels_int32[:, np.newaxis] + self.bbox_targets = np.vstack(self.bbox_targets) + self.bbox_inside_weights = np.vstack(self.bbox_inside_weights) + self.bbox_outside_weights = np.vstack(self.bbox_outside_weights) def _generate_proposals(images_shape, proposal_nums): @@ -280,7 +295,7 @@ def _generate_proposals(images_shape, proposal_nums): for i, image_shape in enumerate(images_shape): proposals = _generate_boxes(image_shape, proposal_nums) rpn_rois.append(proposals) - num_proposals += len(proposals) + num_proposals = len(proposals) rpn_rois_lod.append(num_proposals) return rpn_rois, [rpn_rois_lod] @@ -294,7 +309,11 @@ def _generate_groundtruth(images_shape, class_nums, gt_nums): gt_classes = np.random.randint( low=1, high=class_nums, size=gt_nums).astype(np.int32) gt_boxes = _generate_boxes(image_shape, gt_nums) - ground_truth.append(dict(gt_classes=gt_classes, boxes=gt_boxes)) + is_crowd = np.zeros((gt_nums), dtype=np.int32) + is_crowd[0] = 1 + ground_truth.append( + dict( + gt_classes=gt_classes, boxes=gt_boxes, is_crowd=is_crowd)) num_gts += len(gt_classes) gts_lod.append(num_gts) return ground_truth, [gts_lod] diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py similarity index 88% rename from python/paddle/fluid/tests/unittests/test_generate_proposals.py rename to python/paddle/fluid/tests/unittests/test_generate_proposals_op.py index 3fbd2ce95a..86e27fe29e 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposals.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py @@ -114,10 +114,10 @@ def box_coder(all_anchors, bbox_deltas, variances): #anchor_loc: width, height, center_x, center_y anchor_loc = np.zeros_like(bbox_deltas, dtype=np.float32) - anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] - anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] - anchor_loc[:, 2] = (all_anchors[:, 2] + all_anchors[:, 0]) / 2 - anchor_loc[:, 3] = (all_anchors[:, 3] + all_anchors[:, 1]) / 2 + anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + 1 + anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + 1 + anchor_loc[:, 2] = all_anchors[:, 0] + 0.5 * anchor_loc[:, 0] + anchor_loc[:, 3] = all_anchors[:, 1] + 0.5 * anchor_loc[:, 1] #predicted bbox: bbox_center_x, bbox_center_y, bbox_width, bbox_height pred_bbox = np.zeros_like(bbox_deltas, dtype=np.float32) @@ -127,23 +127,29 @@ def box_coder(all_anchors, bbox_deltas, variances): i, 0] + anchor_loc[i, 2] pred_bbox[i, 1] = variances[i, 1] * bbox_deltas[i, 1] * anchor_loc[ i, 1] + anchor_loc[i, 3] - pred_bbox[i, 2] = math.exp(variances[i, 2] * - bbox_deltas[i, 2]) * anchor_loc[i, 0] - pred_bbox[i, 3] = math.exp(variances[i, 3] * - bbox_deltas[i, 3]) * anchor_loc[i, 1] + pred_bbox[i, 2] = math.exp( + min(variances[i, 2] * bbox_deltas[i, 2], math.log( + 1000 / 16.0))) * anchor_loc[i, 0] + pred_bbox[i, 3] = math.exp( + min(variances[i, 3] * bbox_deltas[i, 3], math.log( + 1000 / 16.0))) * anchor_loc[i, 1] else: for i in range(bbox_deltas.shape[0]): pred_bbox[i, 0] = bbox_deltas[i, 0] * anchor_loc[i, 0] + anchor_loc[ i, 2] pred_bbox[i, 1] = bbox_deltas[i, 1] * anchor_loc[i, 1] + anchor_loc[ i, 3] - pred_bbox[i, 2] = math.exp(bbox_deltas[i, 2]) * anchor_loc[i, 0] - pred_bbox[i, 3] = math.exp(bbox_deltas[i, 3]) * anchor_loc[i, 1] + pred_bbox[i, 2] = math.exp( + min(bbox_deltas[i, 2], math.log(1000 / 16.0))) * anchor_loc[i, + 0] + pred_bbox[i, 3] = math.exp( + min(bbox_deltas[i, 3], math.log(1000 / 16.0))) * anchor_loc[i, + 1] proposals[:, 0] = pred_bbox[:, 0] - pred_bbox[:, 2] / 2 proposals[:, 1] = pred_bbox[:, 1] - pred_bbox[:, 3] / 2 - proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 + proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - 1 + proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 - 1 return proposals @@ -170,13 +176,16 @@ def filter_boxes(boxes, min_size, im_info): """Only keep boxes with both sides >= min_size and center within the image. """ # Scale min_size to match image scale - min_size *= im_info[2] + im_scale = im_info[2] + min_size = max(min_size, 1.0) ws = boxes[:, 2] - boxes[:, 0] + 1 hs = boxes[:, 3] - boxes[:, 1] + 1 + ws_orig_scale = (boxes[:, 2] - boxes[:, 0]) / im_scale + 1 + hs_orig_scale = (boxes[:, 3] - boxes[:, 1]) / im_scale + 1 x_ctr = boxes[:, 0] + ws / 2. y_ctr = boxes[:, 1] + hs / 2. - keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_info[1]) & - (y_ctr < im_info[0]))[0] + keep = np.where((ws_orig_scale >= min_size) & (hs_orig_scale >= min_size) & + (x_ctr < im_info[1]) & (y_ctr < im_info[0]))[0] return keep @@ -204,7 +213,7 @@ def iou(box_a, box_b): xb = min(xmax_a, xmax_b) yb = min(ymax_a, ymax_b) - inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0) + inter_area = max(xb - xa + 1, 0.0) * max(yb - ya + 1, 0.0) iou_ratio = inter_area / (area_a + area_b - inter_area) diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py index bd548009b3..f63dbcd3d7 100644 --- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py @@ -19,48 +19,58 @@ import numpy as np import paddle.fluid.core as core from op_test import OpTest from test_anchor_generator_op import anchor_generator_in_python -from test_generate_proposal_labels import _generate_groundtruth -from test_generate_proposal_labels import _bbox_overlaps, _box_to_delta - - -def rpn_target_assign(gt_anchor_iou, rpn_batch_size_per_im, - rpn_positive_overlap, rpn_negative_overlap, fg_fraction): - iou = np.transpose(gt_anchor_iou) - anchor_to_gt_max = iou.max(axis=1) - anchor_to_gt_argmax = iou.argmax(axis=1) - - gt_to_anchor_argmax = iou.argmax(axis=0) - gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])] - anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0] - - tgt_lbl = np.ones((iou.shape[0], ), dtype=np.int32) * -1 - tgt_lbl[anchors_with_max_overlap] = 1 - tgt_lbl[anchor_to_gt_max >= rpn_positive_overlap] = 1 - - num_fg = int(fg_fraction * rpn_batch_size_per_im) - fg_inds = np.where(tgt_lbl == 1)[0] - if len(fg_inds) > num_fg: +from test_generate_proposal_labels_op import _generate_groundtruth +from test_generate_proposal_labels_op import _bbox_overlaps, _box_to_delta + + +def rpn_target_assign(anchor_by_gt_overlap, + rpn_batch_size_per_im, + rpn_positive_overlap, + rpn_negative_overlap, + rpn_fg_fraction, + use_random=True): + anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1) + anchor_to_gt_max = anchor_by_gt_overlap[np.arange( + anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax] + + gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0) + gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax, np.arange( + anchor_by_gt_overlap.shape[1])] + anchors_with_max_overlap = np.where( + anchor_by_gt_overlap == gt_to_anchor_max)[0] + + labels = np.ones((anchor_by_gt_overlap.shape[0], ), dtype=np.int32) * -1 + labels[anchors_with_max_overlap] = 1 + labels[anchor_to_gt_max >= rpn_positive_overlap] = 1 + + num_fg = int(rpn_fg_fraction * rpn_batch_size_per_im) + fg_inds = np.where(labels == 1)[0] + if len(fg_inds) > num_fg and use_random: disable_inds = np.random.choice( fg_inds, size=(len(fg_inds) - num_fg), replace=False) - tgt_lbl[disable_inds] = -1 - fg_inds = np.where(tgt_lbl == 1)[0] + else: + disable_inds = fg_inds[num_fg:] + labels[disable_inds] = -1 + fg_inds = np.where(labels == 1)[0] - num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1) + num_bg = rpn_batch_size_per_im - np.sum(labels == 1) bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0] - tgt_lbl[bg_inds] = 0 - if len(bg_inds) > num_bg: + if len(bg_inds) > num_bg and use_random: enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)] - tgt_lbl[enable_inds] = 0 - bg_inds = np.where(tgt_lbl == 0)[0] - tgt_lbl[bg_inds] = 0 + else: + enable_inds = bg_inds[:num_bg] + labels[enable_inds] = 0 + fg_inds = np.where(labels == 1)[0] + bg_inds = np.where(labels == 0)[0] loc_index = fg_inds score_index = np.hstack((fg_inds, bg_inds)) - tgt_lbl = np.expand_dims(tgt_lbl, axis=1) + labels = labels[score_index] + assert not np.any(labels == -1), "Wrong labels with -1" gt_inds = anchor_to_gt_argmax[fg_inds] - return loc_index, score_index, tgt_lbl, gt_inds + return loc_index, score_index, labels, gt_inds def get_anchor(n, c, h, w): @@ -75,85 +85,129 @@ def get_anchor(n, c, h, w): return anchors -def rpn_blob(anchor, gt_boxes, iou, lod, rpn_batch_size_per_im, - rpn_positive_overlap, rpn_negative_overlap, fg_fraction): - - loc_indexes = [] - score_indexes = [] - tmp_tgt_labels = [] - tgt_bboxes = [] - anchor_num = anchor.shape[0] - +def rpn_target_assign_in_python(all_anchors, + gt_boxes, + is_crowd, + im_info, + lod, + rpn_straddle_thresh, + rpn_batch_size_per_im, + rpn_positive_overlap, + rpn_negative_overlap, + rpn_fg_fraction, + use_random=True): + anchor_num = all_anchors.shape[0] batch_size = len(lod) - 1 for i in range(batch_size): + im_height = im_info[i][0] + im_width = im_info[i][1] + im_scale = im_info[i][2] + if rpn_straddle_thresh >= 0: + # Only keep anchors inside the image by a margin of straddle_thresh + inds_inside = np.where( + (all_anchors[:, 0] >= -rpn_straddle_thresh) & + (all_anchors[:, 1] >= -rpn_straddle_thresh) & ( + all_anchors[:, 2] < im_width + rpn_straddle_thresh) & ( + all_anchors[:, 3] < im_height + rpn_straddle_thresh))[0] + # keep only inside anchors + inside_anchors = all_anchors[inds_inside, :] + else: + inds_inside = np.arange(all_anchors.shape[0]) + inside_anchors = all_anchors + b, e = lod[i], lod[i + 1] - iou_slice = iou[b:e, :] - bboxes_slice = gt_boxes[b:e, :] + gt_boxes_slice = gt_boxes[b:e, :] * im_scale + is_crowd_slice = is_crowd[b:e] - loc_idx, score_idx, tgt_lbl, gt_inds = rpn_target_assign( - iou_slice, rpn_batch_size_per_im, rpn_positive_overlap, - rpn_negative_overlap, fg_fraction) + not_crowd_inds = np.where(is_crowd_slice == 0)[0] + gt_boxes_slice = gt_boxes_slice[not_crowd_inds] + iou = _bbox_overlaps(inside_anchors, gt_boxes_slice) - fg_bboxes = bboxes_slice[gt_inds] - fg_anchors = anchor[loc_idx] - box_deltas = _box_to_delta(fg_anchors, fg_bboxes, [1., 1., 1., 1.]) + loc_inds, score_inds, labels, gt_inds = rpn_target_assign( + iou, rpn_batch_size_per_im, rpn_positive_overlap, + rpn_negative_overlap, rpn_fg_fraction, use_random) + # unmap to all anchor + loc_inds = inds_inside[loc_inds] + score_inds = inds_inside[score_inds] + + sampled_gt = gt_boxes_slice[gt_inds] + sampled_anchor = all_anchors[loc_inds] + box_deltas = _box_to_delta(sampled_anchor, sampled_gt, [1., 1., 1., 1.]) if i == 0: - loc_indexes = loc_idx - score_indexes = score_idx - tmp_tgt_labels = tgt_lbl + loc_indexes = loc_inds + score_indexes = score_inds + tgt_labels = labels tgt_bboxes = box_deltas else: loc_indexes = np.concatenate( - [loc_indexes, loc_idx + i * anchor_num]) + [loc_indexes, loc_inds + i * anchor_num]) score_indexes = np.concatenate( - [score_indexes, score_idx + i * anchor_num]) - tmp_tgt_labels = np.concatenate([tmp_tgt_labels, tgt_lbl]) + [score_indexes, score_inds + i * anchor_num]) + tgt_labels = np.concatenate([tgt_labels, labels]) tgt_bboxes = np.vstack([tgt_bboxes, box_deltas]) - tgt_labels = tmp_tgt_labels[score_indexes] return loc_indexes, score_indexes, tgt_bboxes, tgt_labels class TestRpnTargetAssignOp(OpTest): def setUp(self): n, c, h, w = 2, 4, 14, 14 - anchor = get_anchor(n, c, h, w) + all_anchors = get_anchor(n, c, h, w) gt_num = 10 - anchor = anchor.reshape(-1, 4) - anchor_num = anchor.shape[0] - - im_shapes = [[64, 64], [64, 64]] - gt_box, lod = _generate_groundtruth(im_shapes, 3, 4) - bbox = np.vstack([v['boxes'] for v in gt_box]) - - iou = _bbox_overlaps(bbox, anchor) - - anchor = anchor.astype('float32') - bbox = bbox.astype('float32') - iou = iou.astype('float32') - - loc_index, score_index, tgt_bbox, tgt_lbl = rpn_blob( - anchor, bbox, iou, [0, 4, 8], 25600, 0.95, 0.03, 0.25) + all_anchors = all_anchors.reshape(-1, 4) + anchor_num = all_anchors.shape[0] + + images_shape = [[64, 64], [64, 64]] + #images_shape = [[64, 64]] + groundtruth, lod = _generate_groundtruth(images_shape, 3, 4) + lod = [0, 4, 8] + #lod = [0, 4] + + im_info = np.ones((len(images_shape), 3)).astype(np.float32) + for i in range(len(images_shape)): + im_info[i, 0] = images_shape[i][0] + im_info[i, 1] = images_shape[i][1] + im_info[i, 2] = 0.8 #scale + gt_boxes = np.vstack([v['boxes'] for v in groundtruth]) + is_crowd = np.hstack([v['is_crowd'] for v in groundtruth]) + + all_anchors = all_anchors.astype('float32') + gt_boxes = gt_boxes.astype('float32') + + rpn_straddle_thresh = 0.0 + rpn_batch_size_per_im = 256 + rpn_positive_overlap = 0.7 + rpn_negative_overlap = 0.3 + rpn_fg_fraction = 0.5 + use_random = False + + loc_index, score_index, tgt_bbox, labels = rpn_target_assign_in_python( + all_anchors, gt_boxes, is_crowd, im_info, lod, rpn_straddle_thresh, + rpn_batch_size_per_im, rpn_positive_overlap, rpn_negative_overlap, + rpn_fg_fraction, use_random) + labels = labels[:, np.newaxis] self.op_type = "rpn_target_assign" self.inputs = { - 'Anchor': anchor, - 'GtBox': (bbox, [[4, 4]]), - 'DistMat': (iou, [[4, 4]]), + 'Anchor': all_anchors, + 'GtBoxes': (gt_boxes, [[4, 4]]), + 'IsCrowd': (is_crowd, [[4, 4]]), + 'ImInfo': (im_info, [[1, 1]]) } self.attrs = { - 'rpn_batch_size_per_im': 25600, - 'rpn_positive_overlap': 0.95, - 'rpn_negative_overlap': 0.03, - 'fg_fraction': 0.25, - 'fix_seed': True + 'rpn_batch_size_per_im': rpn_batch_size_per_im, + 'rpn_straddle_thresh': rpn_straddle_thresh, + 'rpn_positive_overlap': rpn_positive_overlap, + 'rpn_negative_overlap': rpn_negative_overlap, + 'rpn_fg_fraction': rpn_fg_fraction, + 'use_random': use_random } self.outputs = { 'LocationIndex': loc_index.astype('int32'), 'ScoreIndex': score_index.astype('int32'), 'TargetBBox': tgt_bbox.astype('float32'), - 'TargetLabel': tgt_lbl.astype('int64'), + 'TargetLabel': labels.astype('int32') } def test_check_output(self):