From de5e56bee8cdc92f4a9417c3b91dd6084ac86b79 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 2 Apr 2018 13:06:46 +0800 Subject: [PATCH 01/18] add og has been broadcasted --- .../fluid/framework/details/multi_devices_graph_builder.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index a1b913a863..1aa33768c8 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -55,6 +55,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( const ProgramDesc &program) const { auto graph = new SSAGraph(); SSAGraph &result = *graph; + std::unordered_set og_has_bc; result.vars_.resize(places_.size()); bool is_forwarding = true; @@ -123,8 +124,10 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( if (!is_forwarding) { auto var_names = op->OutputArgumentNames(); for (auto &og : var_names) { - if (grad_names_.count(og) != 0) { // is param grad - // Insert NCCL AllReduce Op + if (grad_names_.count(og) != 0 && + og_has_bc.count(og) == 0) { // is param grad + // Insert NCCL AllReduce Op + og_has_bc.insert(og); #ifdef PADDLE_WITH_CUDA result.ops_.emplace_back( new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_)); From 19b4a2a5169afe597745f9543d6d4e5af45aa2f1 Mon Sep 17 00:00:00 2001 From: weixing02 <564445201@qq.com> Date: Mon, 2 Apr 2018 13:20:23 +0800 Subject: [PATCH 02/18] Fix some dead links for cn version --- doc/fluid/dev/index_cn.rst | 6 +++--- doc/fluid/dev/index_en.rst | 2 +- doc/fluid/dev/{new_op_kernel_en.md => new_op_kernel.md} | 0 3 files changed, 4 insertions(+), 4 deletions(-) rename doc/fluid/dev/{new_op_kernel_en.md => new_op_kernel.md} (100%) diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst index e70bf5dff3..f627437f35 100644 --- a/doc/fluid/dev/index_cn.rst +++ b/doc/fluid/dev/index_cn.rst @@ -4,9 +4,9 @@ .. toctree:: :maxdepth: 1 - new_op_en.md - new_op_kernel_en.md - use_eigen_en.md + new_op_cn.md + new_op_kernel.md + use_eigen_cn.md name_convention.md support_new_device.md releasing_process.md diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst index f0e9afcfcc..0b65fed67a 100644 --- a/doc/fluid/dev/index_en.rst +++ b/doc/fluid/dev/index_en.rst @@ -5,7 +5,7 @@ Development :maxdepth: 1 new_op_en.md - new_op_kernel_en.md + new_op_kernel.md use_eigen_en.md name_convention.md support_new_device.md diff --git a/doc/fluid/dev/new_op_kernel_en.md b/doc/fluid/dev/new_op_kernel.md similarity index 100% rename from doc/fluid/dev/new_op_kernel_en.md rename to doc/fluid/dev/new_op_kernel.md From 9b6c5397c5c39864c453a19bdd2dc6ab21cee26b Mon Sep 17 00:00:00 2001 From: weixing02 <564445201@qq.com> Date: Mon, 2 Apr 2018 13:27:47 +0800 Subject: [PATCH 03/18] Merge branch develop --- .travis.yml | 2 +- paddle/fluid/operators/detail/CMakeLists.txt | 3 +- paddle/fluid/operators/detail/grpc_client.cc | 3 +- paddle/fluid/operators/detail/grpc_server.cc | 61 +++++++- paddle/fluid/operators/detail/grpc_server.h | 15 ++ .../operators/detail/grpc_server_test.cc | 51 +++++++ paddle/fluid/operators/detail/grpc_service.h | 3 + paddle/fluid/operators/detail/send_recv.proto | 2 + paddle/fluid/operators/reshape_op.cc | 130 ++++++++---------- paddle/fluid/operators/reshape_op.h | 127 ++++++++++++++++- python/paddle/fluid/executor.py | 73 +++------- python/paddle/fluid/layers/detection.py | 21 ++- python/paddle/fluid/layers/nn.py | 98 +++++++++++++ python/paddle/fluid/layers/ops.py | 1 - .../paddle/fluid/tests/unittests/op_test.py | 8 +- .../unittests/test_mine_hard_examples_op.py | 0 .../fluid/tests/unittests/test_reshape_op.py | 101 ++++++++++++-- .../tests/unittests/test_target_assign_op.py | 0 18 files changed, 529 insertions(+), 170 deletions(-) create mode 100644 paddle/fluid/operators/detail/grpc_server_test.cc mode change 100755 => 100644 python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py mode change 100755 => 100644 python/paddle/fluid/tests/unittests/test_target_assign_op.py diff --git a/.travis.yml b/.travis.yml index bf6a41d13c..929c847bd3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,7 +34,7 @@ addons: - automake - libtool - ccache - ssh_known_hosts: 52.76.173.135 + ssh_known_hosts: 13.229.163.131 before_install: - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt index d59411dfb9..f8cd2852f3 100644 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ b/paddle/fluid/operators/detail/CMakeLists.txt @@ -2,7 +2,8 @@ if(WITH_DISTRIBUTE) grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - set_source_files_properties(serde_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(serde_test.cc grpc_server_test PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc) + cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) endif() diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index 9652bb888b..ba9882ce24 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -150,7 +150,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep, s->response_call_back_ = ProcGetResponse; auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/GetVariable", req, &cq_); + s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, + &cq_); call->StartCall(); call->Finish(&s->reply_, &s->status_, (void*)s); }); diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 109c762e74..591b3e334a 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -128,6 +128,47 @@ class RequestGet final : public RequestBase { SimpleBlockQueue* queue_; }; +class RequestPrefetch final : public RequestBase { + public: + explicit RequestPrefetch(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + framework::Scope* scope, + const platform::DeviceContext* dev_ctx, + framework::Executor* executor, + framework::ProgramDesc* program, int blkid) + : RequestBase(service, cq, dev_ctx), + responder_(&ctx_), + scope_(scope), + executor_(executor), + program_(program), + blkid_(blkid) { + int method_id = static_cast(detail::GrpcMethod::kPrefetchVariable); + service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_, + cq_, this); + } + + virtual ~RequestPrefetch() {} + + virtual std::string GetReqName() { return request_.varname(); } + + virtual void Process() { + // prefetch process... + ::grpc::ByteBuffer reply; + // TODO(Yancey1989): execute the Block which containers prefetch ops + + responder_.Finish(reply, ::grpc::Status::OK, this); + status_ = FINISH; + } + + protected: + sendrecv::VariableMessage request_; + ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; + framework::Scope* scope_; + framework::Executor* executor_; + framework::ProgramDesc* program_; + int blkid_; +}; + void AsyncGRPCServer::WaitClientGet(int count) { int fetch_barriers = 0; while (fetch_barriers < count) { @@ -147,6 +188,7 @@ void AsyncGRPCServer::RunSyncUpdate() { cq_send_ = builder.AddCompletionQueue(); cq_get_ = builder.AddCompletionQueue(); + cq_prefetch_ = builder.AddCompletionQueue(); server_ = builder.BuildAndStart(); LOG(INFO) << "Server listening on " << address_ << std::endl; @@ -155,6 +197,8 @@ void AsyncGRPCServer::RunSyncUpdate() { std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this); std::function get_register = std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this); + std::function prefetch_register = + std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this); t_send_.reset( new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, @@ -163,11 +207,14 @@ void AsyncGRPCServer::RunSyncUpdate() { t_get_.reset( new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, cq_get_.get(), "cq_get", get_register))); - + t_prefetch_.reset(new std::thread( + std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(), + "cq_prefetch", prefetch_register))); // wait server server_->Wait(); t_send_->join(); t_get_->join(); + t_prefetch_->join(); } void AsyncGRPCServer::ShutdownQueue() { @@ -203,6 +250,18 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() { VLOG(4) << "Create RequestGet status:" << get->Status(); } +void AsyncGRPCServer::TryToRegisterNewPrefetchOne() { + std::unique_lock lock(cq_mutex_); + if (is_shut_down_) { + return; + } + RequestPrefetch* prefetch = + new RequestPrefetch(&service_, cq_prefetch_.get(), scope_, dev_ctx_, + executor_, program_, prefetch_blk_id_); + + VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status(); +} + // FIXME(typhoonzero): change cq_name to enum. void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, std::string cq_name, diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index 10e6dd45a9..dd5cf4b377 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -17,7 +17,9 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" @@ -53,6 +55,12 @@ class AsyncGRPCServer final { void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; } + void SetProgram(framework::ProgramDesc *program) { program_ = program; } + + void SetPrefetchBlkdId(int blkid) { prefetch_blk_id_ = blkid; } + + void SetExecutor(framework::Executor *executor) { executor_ = executor; } + const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); } void Push(const std::string &msg_name) { @@ -66,6 +74,7 @@ class AsyncGRPCServer final { std::function TryToRegisterNewOne); void TryToRegisterNewSendOne(); void TryToRegisterNewGetOne(); + void TryToRegisterNewPrefetchOne(); void ShutdownQueue(); private: @@ -73,6 +82,7 @@ class AsyncGRPCServer final { volatile bool is_shut_down_ = false; std::unique_ptr<::grpc::ServerCompletionQueue> cq_send_; std::unique_ptr<::grpc::ServerCompletionQueue> cq_get_; + std::unique_ptr<::grpc::ServerCompletionQueue> cq_prefetch_; GrpcService::AsyncService service_; std::unique_ptr<::grpc::Server> server_; @@ -92,6 +102,11 @@ class AsyncGRPCServer final { std::unique_ptr t_send_; std::unique_ptr t_get_; + std::unique_ptr t_prefetch_; + + int prefetch_blk_id_; + framework::ProgramDesc *program_; + framework::Executor *executor_; }; }; // namespace detail diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/grpc_server_test.cc new file mode 100644 index 0000000000..5773748106 --- /dev/null +++ b/paddle/fluid/operators/detail/grpc_server_test.cc @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/grpc_server.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace detail = paddle::operators::detail; + +std::unique_ptr rpc_service_; + +void StartServer(const std::string& endpoint) { + rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); +} + +TEST(PREFETCH, CPU) { + // start up a server instance backend + // TODO(Yancey1989): Need to start a server with optimize blocks and + // prefetch blocks. + std::thread server_thread(StartServer, "127.0.0.1:8889"); + framework::Scope scope; + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + // create var on local scope + std::string var_name("tmp_0"); + auto var = scope.Var(var_name); + auto tensor = var->GetMutable(); + tensor->Resize({10, 10}); + + detail::RPCClient client; + client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, var_name, ""); + server_thread.join(); + rpc_service_.reset(nullptr); +} diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/detail/grpc_service.h index ae6f9db3bd..879e21933b 100644 --- a/paddle/fluid/operators/detail/grpc_service.h +++ b/paddle/fluid/operators/detail/grpc_service.h @@ -76,6 +76,7 @@ namespace detail { enum class GrpcMethod { kSendVariable, kGetVariable, + kPrefetchVariable, }; static const int kGrpcNumMethods = @@ -87,6 +88,8 @@ inline const char* GrpcMethodName(GrpcMethod id) { return "/sendrecv.SendRecvService/SendVariable"; case GrpcMethod::kGetVariable: return "/sendrecv.SendRecvService/GetVariable"; + case GrpcMethod::kPrefetchVariable: + return "/sendrecv.SendREcvService/PrefetchVariable"; } // Shouldn't be reached. diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto index 2d33f026e4..fc12e82a7e 100644 --- a/paddle/fluid/operators/detail/send_recv.proto +++ b/paddle/fluid/operators/detail/send_recv.proto @@ -21,6 +21,8 @@ service SendRecvService { rpc SendVariable(VariableMessage) returns (VoidMessage) {} // Argument VariableMessage for GetVariable should only contain varname. rpc GetVariable(VariableMessage) returns (VariableMessage) {} + // Prefetch variable by Ids + rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} } // VariableMessage is serialized paddle variable message. diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 832509641c..b87b8e6b26 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -17,90 +17,66 @@ limitations under the License. */ namespace paddle { namespace operators { -class ReshapeOp : public framework::OperatorWithKernel { - public: - ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const override { - // input check - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of ReshapeOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of ReshapeOp should not be null."); - - auto shape = ctx->Attrs().Get>("shape"); - PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty."); - auto x_dims = ctx->GetInputDim("X"); - - std::vector neg_dims_idx; - // set some dimension to -1 if it is unknown - const int unknown_size = -1; - for (size_t i = 0; i < shape.size(); ++i) { - PADDLE_ENFORCE(shape[i] > 0 || shape[i] == unknown_size, - "Each dimension of Attr(shape) must be positive or %d.", - unknown_size); - if (shape[i] == unknown_size) { - neg_dims_idx.push_back(i); - PADDLE_ENFORCE(neg_dims_idx.size() <= 1, - "Only one dimension of Attr(shape) can be unknown."); - } - } - - int64_t capacity = - std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); - int64_t in_size = framework::product(x_dims); - if (neg_dims_idx.size() == 1) { - // dim infer - shape[neg_dims_idx[0]] = in_size / (-capacity); - // recalculate capacity - capacity = shape[neg_dims_idx[0]] * (-capacity); - } - // capacity check - PADDLE_ENFORCE(capacity == in_size, - "The size of Input(X) mismatches with Attr(shape)."); - // resize output - std::vector shape_int64(shape.size(), 0); - std::transform(shape.begin(), shape.end(), shape_int64.begin(), - [](int a) { return static_cast(a); }); - auto out_dims = framework::make_ddim(shape_int64); - ctx->SetOutputDim("Out", out_dims); - if (shape[0] == x_dims[0]) { - // Only pass LoD when the first dimension is equal between - // output and input. - ctx->ShareLoD("X", /*->*/ "Out"); - } - } -}; - class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { public: ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "The input tensor of reshape operator."); - AddOutput("Out", "The output tensor of reshape operator."); - AddAttr>("shape", - "(vector) " - "Target shape of reshape operator."); + AddInput("X", "(Tensor). The input tensor of reshape operator."); + AddInput("Shape", + "(Tensor, optional). If provided, reshape according to " + "this given shape. That is to say it has a higher priority than " + "the shape attribute, while the shape attribute still should be " + "set correctly to gurantee shape inference in compile time.") + .AsDispensable(); + AddOutput("Out", "(Tensor). The output tensor of reshape operator."); + AddAttr>( + "shape", "(std::vector) Target shape of reshape operator."); AddAttr("inplace", - "Change the source tensor's shape without copy memory.") - .SetDefault(true); + "(default: false) Change the source tensor's shape without " + "memory copy. When Attr(inplace) is set true, the output " + "tensor shares memory with Input(X), otherwise, a new output " + "tensor is created, and its data are copied from Input(x).") + .SetDefault(false); AddComment(R"DOC( Reshape Operator. -Reshape Input(X) into the shape specified by Attr(shape). +Reshape Input(X) into the shape specified by Attr(shape) or Input(Shape). The +data in Input(X) are unchanged. + +Examples: -An example: -Given a 2-D tensor X with 2 rows and 2 columns : [[1, 2], [3, 4]] +1. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape +specified by Attr(shape) is [6, 8], the reshape operator will transform Input(X) +into a 2-D tensor with shape [6, 8] and leaving Input(X)'s data unchanged. -and target shape = [1, 4], the reshape operator will transform -the tensor X into a 2-D tensor: [[1, 2, 3, 4]] +2. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape +specified by Attr(shape) is [2, 3, -1, 2], the reshape operator will transform +Input(X) into a 4-D tensor with shape [2, 3, 4, 2] and leaving Input(X)'s data +unchanged. In this case, one and only dimension of Attr(shape) can be set to -1, +the value of this dimension is inferred from the total element number of +Input(X) and remaining dimensions. + +3. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape +specified by Attr(shape) is [-1, 0, 3, 2], the reshape operator will transform +Input(X) into a 4-D tensor with shape [2, 4, 3, 2] and leaving Input(X)'s data +unchanged. In this case, besides -1, 0 means the actual dimension value is going +to be copied from the corresponding dimension of Input(X). + +Note: + +1. One and only one dimension in Attr(shape) can be set -1. In this case, +the actual dimension value will be infered from the total element number of +Input(X) and remaining dimensions. + +2. More than one dimensions in Attr(shape) can be set to 0, which means the real +dimension value will be copied from Input(X) at runtime. Note that the index of +0 can not exceed Rank(X). For example, Input(X) is a 3-D tensor with shape +[2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input. + +3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while +Attr(shape) still should be set correctly to gurantee shape inference in +compile-time. -One dimension in the target shape can be set -1, representing that its -size is unknown. In this case, the real dimension will be infered from -the original shape of Input(X) and other dimensions in the target shape. )DOC"); } }; @@ -119,6 +95,14 @@ class ReshapeGradOp : public framework::OperatorWithKernel { "Input(Out@GRAD) shouldn't be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } }; } // namespace operators diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h index eacb0a0cf2..871b4d38d5 100644 --- a/paddle/fluid/operators/reshape_op.h +++ b/paddle/fluid/operators/reshape_op.h @@ -20,17 +20,129 @@ limitations under the License. */ namespace paddle { namespace operators { +class ReshapeOp : public framework::OperatorWithKernel { + public: + ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ReshapeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ReshapeOp should not be null."); + + const std::vector &shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE(!shape.empty(), + "The shape information must be set by Attr(shape)."); + + if (ctx->HasInput("Shape") && ctx->IsRuntime()) { + // If true, set the shape of Output(Out) according to Input(Shape) in + // ReshapeKernel with ExecutionContext. Also check LoD in ReshapeKernel. + ctx->ShareLoD("X", /*->*/ "Out"); + return; + } + + auto x_dims = ctx->GetInputDim("X"); + auto out_dims = ValidateShape(shape, x_dims); + ctx->SetOutputDim("Out", out_dims); + if (x_dims[0] == out_dims[0]) { + // Only pass LoD when the first dimension of output and Input(X) + // are the same. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } + + static framework::DDim ValidateShape(const std::vector shape, + const framework::DDim &in_dims) { + const int64_t in_size = framework::product(in_dims); + // only one dimension canbe set to -1, whose size will be automatically + // infered. + const int64_t unk_dim_val = -1; + const int64_t copy_dim_val = 0; + + std::vector output_shape(shape.size(), 0); + int64_t capacity = 1; + int unk_dim_idx = -1; + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i] == unk_dim_val) { + PADDLE_ENFORCE( + unk_dim_idx == -1, + "Only one input dimension of Attr(shape) can be unknown."); + unk_dim_idx = i; + } else if (shape[i] == copy_dim_val) { + PADDLE_ENFORCE( + static_cast(i) < in_dims.size(), + "The index of dimension to copy from input shape must be less " + "than the size of input shape."); + } else { + PADDLE_ENFORCE( + shape[i] > 0, + "Each input dimension of Attr(shape) must not be negtive except " + "one unknown dimension."); + } + + capacity *= (shape[i] ? shape[i] : in_dims[i]); + output_shape[i] = + (shape[i] ? static_cast(shape[i]) : in_dims[i]); + } + + if (unk_dim_idx != -1) { + output_shape[unk_dim_idx] = -in_size / capacity; + PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size, + "Invalid shape is given."); + } else { + PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given."); + } + return framework::make_ddim(output_shape); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + template class ReshapeKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* out = ctx.Output("Out"); - auto* in = ctx.Input("X"); + void Compute(const framework::ExecutionContext &ctx) const { + auto *out = ctx.Output("Out"); + auto *in = ctx.Input("X"); + auto *shape_tensor = ctx.Input("Shape"); + + framework::DDim out_dims = out->dims(); + if (shape_tensor) { + auto *shape_data = shape_tensor->data(); + if (platform::is_gpu_place(ctx.GetPlace())) { + framework::Tensor cpu_shape_tensor; + TensorCopy(*shape_tensor, platform::CPUPlace(), ctx.device_context(), + &cpu_shape_tensor); + shape_data = cpu_shape_tensor.data(); + } + auto shape = + std::vector(shape_data, shape_data + shape_tensor->numel()); + out_dims = ReshapeOp::ValidateShape(shape, in->dims()); + } + if (!in->lod().empty()) { + PADDLE_ENFORCE_EQ( + out_dims[0], in->dims()[0], + "Reshape operator cannot reshape an input sequence batch " + "into an output sequence batch that has a different " + "number of time steps. Please consider using " + "sequence_reshape op."); + } + bool inplace = ctx.Attr("inplace"); - auto out_dims = out->dims(); + out->Resize(out_dims); if (!inplace) { out->mutable_data(ctx.GetPlace()); framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out); + // TensorCopy will resize to in_dims. out->Resize(out_dims); } else { out->ShareDataWith(*in); @@ -42,9 +154,10 @@ class ReshapeKernel : public framework::OpKernel { template class ReshapeGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* d_x = ctx.Output(framework::GradVarName("X")); + void Compute(const framework::ExecutionContext &ctx) const { + auto *d_out = ctx.Input(framework::GradVarName("Out")); + auto *d_x = ctx.Output(framework::GradVarName("X")); + d_x->mutable_data(ctx.GetPlace()); bool inplace = ctx.Attr("inplace"); diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 2612fb1ae4..54d0a12bcd 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -48,8 +48,7 @@ def as_numpy(tensor): assert isinstance(tensor, core.LoDTensor) lod = tensor.lod() if len(lod) > 0: - raise RuntimeError( - "Some of your featched tensors hold LoD information. \ + raise RuntimeError("Some of your fetched tensors hold LoD information. \ They can not be completely cast to Python ndarray. \ Please set the parameter 'return_numpy' as 'False' to \ return LoDTensor itself directly.") @@ -180,60 +179,24 @@ def get_program_cache_key(feed, fetch_list): class Executor(object): - def __init__(self, places): - if not isinstance(places, list) and not isinstance(places, tuple): - places = [places] - - act_places = [] - for each in places: - p = core.Place() - p.set_place(each) - act_places.append(p) - - # TODO(dzhwinter) : only use the first place - self.executor = core.Executor(act_places[0]) - self.places = places + def __init__(self, place): + self.place = place + p = core.Place() + p.set_place(place) + self.executor = core.Executor(p) self.program_caches = dict() - def aslodtensor(self, data): - def accumulate(data): - if not isinstance(data, list): - return 1 - return sum([accumulate(sub) for sub in data]) - - def parselod(data): - seq_lens = [accumulate(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - return lod - - assert len(self.places) != 0 - if not isinstance(data, list): - # pure tensor case - tensor = core.LoDTensor() - tensor.set(data, self.places[0]) - return tensor - else: - raise RuntimeError("Current implementation lacks unittests") - # lodtensor case - lod = [] - if not isinstance(data[0], list): - lod.append(parselod(data)) - flattened_data = np.concatenate(data, axis=0).astype("int64") - else: - while isinstance(data[0], list): - lod.append(parselod(seq)) - flattened_data = [item for seq in data for item in seq] - data = flattened_data - flattened_data = np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - tensor = core.LoDTensor() - tensor.set(flattened_data, self.places[0]) - tensor.set_lod(lod) - return tensor + def as_lodtensor(self, data): + if isinstance(data, list): + raise RuntimeError("Some of your feed data hold LoD information. \ + They can not be completely cast from a list of Python \ + ndarray to LoDTensor. Please convert data to LoDTensor \ + directly before feeding the data.\ + ") + # single tensor case + tensor = core.LoDTensor() + tensor.set(data, self.place) + return tensor def _get_program_cache(self, program_cache_key): return self.program_caches.get(program_cache_key, None) @@ -293,7 +256,7 @@ class Executor(object): feed_target_name = op.desc.output('Out')[0] cur_feed = feed[feed_target_name] if not isinstance(cur_feed, core.LoDTensor): - cur_feed = self.aslodtensor(cur_feed) + cur_feed = self.as_lodtensor(cur_feed) idx = op.desc.attr('col') core.set_feed_variable(scope, cur_feed, feed_var_name, idx) else: diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 3e649dc5fd..a5938fe494 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -19,7 +19,6 @@ from layer_function_generator import generate_layer_fn from layer_function_generator import autodoc from ..layer_helper import LayerHelper import tensor -import ops import nn import math @@ -58,7 +57,7 @@ def detection_output(loc, This operation is to get the detection results by performing following two steps: - + 1. Decode input bounding box predictions according to the prior boxes. 2. Get the final detection results by applying multi-class non maximum suppression (NMS). @@ -130,9 +129,9 @@ def detection_output(loc, target_box=loc, code_type='decode_center_size') old_shape = scores.shape - scores = ops.reshape(x=scores, shape=(-1, old_shape[-1])) + scores = nn.reshape(x=scores, shape=(-1, old_shape[-1])) scores = nn.softmax(input=scores) - scores = ops.reshape(x=scores, shape=old_shape) + scores = nn.reshape(x=scores, shape=old_shape) scores = nn.transpose(scores, perm=[0, 2, 1]) scores.stop_gradient = True nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) @@ -463,7 +462,7 @@ def ssd_loss(location, num, num_prior, num_class = confidence.shape def __reshape_to_2d(var): - return ops.reshape(x=var, shape=[-1, var.shape[-1]]) + return nn.reshape(x=var, shape=[-1, var.shape[-1]]) # 1. Find matched boundding box by prior box. # 1.1 Compute IOU similarity between ground-truth boxes and prior boxes. @@ -474,7 +473,7 @@ def ssd_loss(location, # 2. Compute confidence for mining hard examples # 2.1. Get the target label based on matched indices - gt_label = ops.reshape(x=gt_label, shape=gt_label.shape + (1, )) + gt_label = nn.reshape(x=gt_label, shape=gt_label.shape + (1, )) gt_label.stop_gradient = True target_label, _ = target_assign( gt_label, matched_indices, mismatch_value=background_label) @@ -487,7 +486,7 @@ def ssd_loss(location, conf_loss = nn.softmax_with_cross_entropy(confidence, target_label) # 3. Mining hard examples - conf_loss = ops.reshape(x=conf_loss, shape=(num, num_prior)) + conf_loss = nn.reshape(x=conf_loss, shape=(num, num_prior)) conf_loss.stop_gradient = True neg_indices = helper.create_tmp_variable(dtype='int32') dtype = matched_indices.dtype @@ -556,7 +555,7 @@ def ssd_loss(location, # 5.3 Compute overall weighted loss. loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss # reshape to [N, Np], N is the batch size and Np is the prior box number. - loss = ops.reshape(x=loss, shape=[-1, num_prior]) + loss = nn.reshape(x=loss, shape=[-1, num_prior]) loss = nn.reduce_sum(loss, dim=1, keep_dim=True) if normalize: normalizer = nn.reduce_sum(target_loc_weight) @@ -709,7 +708,7 @@ def multi_box_head(inputs, new_shape = [ -1, reduce(lambda x, y: x * y, input.shape[axis:len(input.shape)]) ] - out = ops.reshape(x=input, shape=new_shape) + out = nn.reshape(x=input, shape=new_shape) return out def _is_list_or_tuple_(data): @@ -803,7 +802,7 @@ def multi_box_head(inputs, mbox_loc.shape[0], mbox_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3] / 4, 4 ] - mbox_loc_flatten = ops.reshape(mbox_loc, shape=new_shape) + mbox_loc_flatten = nn.reshape(mbox_loc, shape=new_shape) mbox_locs.append(mbox_loc_flatten) # get conf @@ -819,7 +818,7 @@ def multi_box_head(inputs, conf_loc.shape[0], conf_loc.shape[1] * conf_loc.shape[2] * conf_loc.shape[3] / num_classes, num_classes ] - conf_loc_flatten = ops.reshape(conf_loc, shape=new_shape) + conf_loc_flatten = nn.reshape(conf_loc, shape=new_shape) mbox_confs.append(conf_loc_flatten) if len(box_results) == 1: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 0332556f62..e59ee25120 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -73,6 +73,7 @@ __all__ = [ 'smooth_l1', 'one_hot', 'autoincreased_step_counter', + 'reshape', 'lod_reset', 'lrn', ] @@ -3265,6 +3266,8 @@ def one_hot(input, depth): The one-hot tensor or LodTensor, same as input. Examples: + .. code-block:: python + X is a LoDTensor: X.lod = [[0, 1, 4]] X.shape = [4, 1] @@ -3319,6 +3322,101 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): return counter +def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): + """ + Gives a new shape to the input Tensor without changing its data. + + The target shape can be given by :attr:`shape` or :attr:`actual_shape`. + :attr:`shape` is a list of integer while :attr:`actual_shape` is a tensor + variable. :attr:`actual_shape` has a higher priority than :attr:`shape` + if it is provided, while :attr:`shape` still should be set correctly to + gurantee shape inference in compile-time. + + Some tricks exist when specifying the target shape. + + 1. -1 means the value of this dimension is inferred from the total element + number of x and remaining dimensions. Thus one and only one dimension can + be set -1. + + 2. 0 means the actual dimension value is going to be copied from the + corresponding dimension of x. The indice of 0s in shape can not exceed + Rank(X). + + Here are some examples to explain it. + + 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape + is [6, 8], the reshape operator will transform x into a 2-D tensor with + shape [6, 8] and leaving x's data unchanged. + + 2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape + specified is [2, 3, -1, 2], the reshape operator will transform x into a + 4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this + case, one dimension of the target shape is set to -1, the value of this + dimension is inferred from the total element number of x and remaining + dimensions. + + 3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape + is [-1, 0, 3, 2], the reshape operator will transform x into a 4-D tensor + with shape [2, 4, 3, 2] and leaving x's data unchanged. In this case, + besides -1, 0 means the actual dimension value is going to be copied from + the corresponding dimension of x. + + Args: + input(variable): The input tensor. + shape(list): The new shape. At most one dimension of the new shape can + be -1. + actual_shape(variable): An optional input. If provided, reshape + according to this given shape rather than + :attr:`shape` specifying shape. That is to + say :attr:`actual_shape` has a higher priority + than :attr:`shape`. + act (str): The non-linear activation to be applied to output variable. + inplace(bool): If this flag is set true, a new output tensor is created + whose data is copied from input x, otherwise the output + shares data with input without copying. + + Returns(variable): The output tensor. + + Examples: + .. code-block:: python + data = fluid.layers.data( + name='data', shape=[2, 4, 6], dtype='float32') + reshaped = fluid.layers.reshape( + x=data, shape=[-1, 0, 3, 2], act='tanh', inplace=True) + """ + + if not (isinstance(shape, list) or isinstance(shape, tuple)): + raise ValueError("Input shape must be a python lsit or tuple.") + + # Validate the shape + unk_dim_idx = -1 + for dim_idx, dim_size in enumerate(shape): + if dim_size == -1: + assert unk_dim_idx == -1, ( + "Only one dimension in shape can be unknown.") + unk_dim_idx = dim_idx + elif dim_size == 0: + assert dim_idx < len(x.shape), ( + "The indice of 0s in shape can not exceed Rank(X).") + else: + assert dim_size > 0, ( + "Each dimension size given in shape must not be negtive " + "except one unknown dimension.") + + helper = LayerHelper("reshape", **locals()) + reshaped = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type="reshape", + inputs={"X": x, + "Shape": actual_shape} + if isinstance(actual_shape, Variable) else {"X": x}, + attrs={"shape": shape, + "inplace": inplace}, + outputs={"Out": reshaped}) + + return helper.append_activation(reshaped) + + def lod_reset(x, y=None, target_lod=None): """ LoD Reset Operator. Set LoD of **x** to a new one specified by **y** or diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 0e5987ee59..a9fe25744c 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -49,7 +49,6 @@ __activations__ = [ __all__ = [ 'mean', 'mul', - 'reshape', 'scale', 'sigmoid_cross_entropy_with_logits', 'elementwise_add', diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 8393f7827b..299ab8e51f 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -334,7 +334,7 @@ class OpTest(unittest.TestCase): np.allclose( actual_t, expect_t, atol=atol), "Output (" + out_name + ") has diff at " + str(place) + - str(actual_t) + str(expect_t)) + str(actual_t) + "\n" + str(expect_t)) if isinstance(expect, tuple): self.assertListEqual(actual.lod(), expect[1], "Output (" + out_name + @@ -568,6 +568,6 @@ class OpTest(unittest.TestCase): fetch_list = [g for p, g in param_grad_list] executor = Executor(place) - return map( - np.array, - executor.run(prog, feed_dict, fetch_list, return_numpy=False)) + return map(np.array, + executor.run(prog, feed_dict, fetch_list, + return_numpy=False)) diff --git a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py old mode 100755 new mode 100644 diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index 11f35c74d4..f51b5a7e99 100644 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -14,15 +14,19 @@ import unittest import numpy as np + from op_test import OpTest class TestReshapeOp(OpTest): def setUp(self): + ori_shape = (2, 25) + new_shape = (5, 10) + self.op_type = "reshape" - self.inputs = {'X': np.random.random((10, 20)).astype("float32")} - self.attrs = {'shape': [10 * 20]} - self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])} + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape, "inplace": False} + self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} def test_check_output(self): self.check_output() @@ -31,12 +35,33 @@ class TestReshapeOp(OpTest): self.check_grad(["X"], "Out") -class TestReshapeOpDimInfer(OpTest): +class TestReshapeOpDimInfer1(OpTest): def setUp(self): + ori_shape = (5, 10) + new_shape = (5, -1, 5) + self.op_type = "reshape" - self.inputs = {'X': np.random.random((10, 20)).astype("float32")} - self.attrs = {'shape': [4, -1, 5]} - self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])} + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape, "inplace": False} + self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestReshapeOpDimInfer2(OpTest): + def setUp(self): + ori_shape = (2, 2, 6) + new_shape = (2, 0, 3, -1) + infered_shape = (2, 2, 3, -1) + + self.op_type = "reshape" + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape, "inplace": False} + self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)} def test_check_output(self): self.check_output() @@ -47,10 +72,30 @@ class TestReshapeOpDimInfer(OpTest): class TestReshapeOpInplace(OpTest): def setUp(self): + ori_shape = (2, 25) + new_shape = (5, 10) + + self.op_type = "reshape" + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape} + self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestReshapeOpDimInferInplace1(OpTest): + def setUp(self): + ori_shape = (5, 10) + new_shape = (5, -1, 5) + self.op_type = "reshape" - self.inputs = {'X': np.random.random((10, 20)).astype("float32")} - self.attrs = {'shape': [10 * 20], 'inplace': True} - self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])} + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape} + self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} def test_check_output(self): self.check_output() @@ -59,12 +104,38 @@ class TestReshapeOpInplace(OpTest): self.check_grad(["X"], "Out") -class TestReshapeOpDimInferInplace(OpTest): +class TestReshapeOpDimInferInplace2(OpTest): def setUp(self): + ori_shape = (2, 2, 6) + new_shape = (2, 0, 3, -1) + infered_shape = (2, 2, 3, -1) + + self.op_type = "reshape" + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape} + self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestReshapeOpWithInputShape(OpTest): + def setUp(self): + ori_shape = (6, 5) + new_shape = (0, -1, 5) + actual_shape = (2, 3, 5) + self.op_type = "reshape" - self.inputs = {'X': np.random.random((10, 20)).astype("float32")} - self.attrs = {'shape': [4, -1, 5], 'inplace': True} - self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])} + self.inputs = { + "X": np.random.random(ori_shape).astype("float32"), + "Shape": np.array( + actual_shape, dtype="int32") + } + self.attrs = {"shape": new_shape} + self.outputs = {"Out": self.inputs["X"].reshape(actual_shape)} def test_check_output(self): self.check_output() @@ -73,5 +144,5 @@ class TestReshapeOpDimInferInplace(OpTest): self.check_grad(["X"], "Out") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_target_assign_op.py old mode 100755 new mode 100644 From 4c8eef5739ecb64295992a5361a5f52f896895d4 Mon Sep 17 00:00:00 2001 From: guosheng Date: Mon, 2 Apr 2018 14:39:33 +0800 Subject: [PATCH 04/18] Add python wrapper for pad_op --- python/paddle/fluid/layers/nn.py | 58 ++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 0332556f62..c6f831c29d 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -75,6 +75,7 @@ __all__ = [ 'autoincreased_step_counter', 'lod_reset', 'lrn', + 'pad', ] @@ -3482,3 +3483,60 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None): "beta": beta}) return lrn_out + + +def pad(x, paddings, pad_value=0., name=None): + """ + Pads a tensor with a constant value given by :attr:pad_value, and the + padded width is specified by :attr:paddings. + + Specifically, the number of values padded before each dimension + :attr:i is indicated by :attr:paddings[i], and the number of values padded + after each dimension :attr:i is indicated by :attr:paddings[i+1]. + + See below for an example. + + .. code-block:: text + + Given: + x = [[1, 2], [3, 4]] + + paddings = [0, 1, 1, 2] + + pad_value = 0 + + Return: + + out = [[0, 1, 2, 0, 0] + [0, 3, 4, 0, 0] + [0, 0, 0, 0, 0]] + + Args: + x (Variable): The input tensor variable. + paddings (list): A list of integers. Its elements specify the padded + width before and after for each dimension in turn. + The length of :attr:paddings must be + :math:`rank(x) \\times 2`. + pad_value (float): The constant value used to pad. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The padded tensor variable. + + Examples: + .. code-block:: python + # x is a rank 2 tensor variable. + out = fluid.layers.pad( + x=x, paddings=[0, 1, 1, 2], pad_value=0.) + """ + helper = LayerHelper('pad', input=x, **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + helper.append_op( + type='pad', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'paddings': paddings, + 'pad_value': float(pad_value)}) + return out From 30adc0b5f867aabc61367d293bd1cabb0216a51a Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 2 Apr 2018 19:06:57 +0800 Subject: [PATCH 05/18] add notation --- .../framework/details/multi_devices_graph_builder.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 1aa33768c8..c277bd7cb6 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -55,7 +55,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( const ProgramDesc &program) const { auto graph = new SSAGraph(); SSAGraph &result = *graph; - std::unordered_set og_has_bc; + std::unordered_set og_has_been_broadcast; result.vars_.resize(places_.size()); bool is_forwarding = true; @@ -123,11 +123,15 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( if (!is_forwarding) { auto var_names = op->OutputArgumentNames(); + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. But there are no + // other cases, for example, we need to adjust the gradient according to + // the input when we get the gradient, which is not considered at present. for (auto &og : var_names) { if (grad_names_.count(og) != 0 && - og_has_bc.count(og) == 0) { // is param grad - // Insert NCCL AllReduce Op - og_has_bc.insert(og); + og_has_been_broadcast.count(og) == 0) { // is param grad + // Insert NCCL AllReduce Op + og_has_been_broadcast.insert(og); #ifdef PADDLE_WITH_CUDA result.ops_.emplace_back( new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_)); From 0b8534f2a456d786fdc7ef2f252409f0ac0bbca3 Mon Sep 17 00:00:00 2001 From: guosheng Date: Mon, 2 Apr 2018 23:35:54 +0800 Subject: [PATCH 06/18] Refine python wrapper for pad_op --- doc/fluid/api/layers.rst | 6 ++++++ python/paddle/fluid/layers/nn.py | 13 ++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst index ae35d8c534..22e6fb13d7 100644 --- a/doc/fluid/api/layers.rst +++ b/doc/fluid/api/layers.rst @@ -494,6 +494,12 @@ reshape .. autofunction:: paddle.fluid.layers.reshape :noindex: +pad +--- + +.. autofunction:: paddle.fluid.layers.pad + :noindex: + scale ----- diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index f96bc6911f..3d13133bf2 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -3380,6 +3380,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): Examples: .. code-block:: python + data = fluid.layers.data( name='data', shape=[2, 4, 6], dtype='float32') reshaped = fluid.layers.reshape( @@ -3585,12 +3586,13 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None): def pad(x, paddings, pad_value=0., name=None): """ - Pads a tensor with a constant value given by :attr:pad_value, and the - padded width is specified by :attr:paddings. + Pads a tensor with a constant value given by :attr:`pad_value`, and the + padded width is specified by :attr:`paddings`. - Specifically, the number of values padded before each dimension - :attr:i is indicated by :attr:paddings[i], and the number of values padded - after each dimension :attr:i is indicated by :attr:paddings[i+1]. + Specifically, the number of values padded before the contents of :attr:`x` + in dimension :attr:`i` is indicated by :attr:`paddings[i]`, and the number + of values padded after the contents of :attr:`x` in dimension :attr:`i` is + indicated by :attr:`paddings[i+1]`. See below for an example. @@ -3624,6 +3626,7 @@ def pad(x, paddings, pad_value=0., name=None): Examples: .. code-block:: python + # x is a rank 2 tensor variable. out = fluid.layers.pad( x=x, paddings=[0, 1, 1, 2], pad_value=0.) From 974908f766748aab52e06d10ea592898be029d82 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Mon, 2 Apr 2018 16:46:10 -0700 Subject: [PATCH 07/18] TeamCity build: handle case when WITH_FLUID_ONLY=OFF --- paddle/scripts/docker/build.sh | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index f916295cd7..4885b74e6c 100755 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -104,7 +104,9 @@ EOF # make install should also be test when unittest make install -j `nproc` pip install /usr/local/opt/paddle/share/wheels/*.whl - paddle version + if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then + paddle version + fi fi } @@ -183,6 +185,14 @@ EOF NCCL_DEPS="" fi + if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then + PADDLE_VERSION="paddle version" + CMD='"paddle", "version"' + else + PADDLE_VERSION="true" + CMD='"true"' + fi + cat >> /paddle/build/Dockerfile < Date: Sun, 1 Apr 2018 02:35:07 -0700 Subject: [PATCH 08/18] Improve ParallelExecutor performance --- .../details/nccl_all_reduce_op_handle.cc | 2 +- .../details/nccl_all_reduce_op_handle.h | 5 ++ .../fluid/framework/details/op_handle_base.h | 4 ++ .../details/threaded_ssa_graph_executor.cc | 49 +++++++++++++++---- .../details/threaded_ssa_graph_executor.h | 12 ++++- paddle/fluid/framework/parallel_executor.cc | 2 + python/paddle/fluid/parallel_executor.py | 3 +- 7 files changed, 63 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc index 5ddf331cfc..55b5f11358 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc @@ -76,7 +76,7 @@ void NCCLAllReduceOpHandle::RunImpl() { } } -std::string NCCLAllReduceOpHandle::Name() const { return "NCCL AllReduce"; } +std::string NCCLAllReduceOpHandle::Name() const { return "nccl_all_reduce"; } } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h index 045070bb6a..3d61fa79f7 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h @@ -14,6 +14,9 @@ #pragma once +#include +#include + #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" @@ -34,6 +37,8 @@ struct NCCLAllReduceOpHandle : public OpHandleBase { std::string Name() const override; + bool IsDelayedOp() override { return true; }; + protected: void RunImpl() override; }; diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 71672fd24c..54c2d627ff 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -13,6 +13,8 @@ // limitations under the License. #pragma once +#include +#include #include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/platform/device_context.h" @@ -53,6 +55,8 @@ class OpHandleBase { void AddOutput(VarHandleBase *out); + virtual bool IsDelayedOp() { return false; } + protected: virtual void RunImpl() = 0; }; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 3f8655147b..075eed4ecc 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -29,17 +29,27 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( local_scopes_(local_scopes), places_(places), fetch_ctxs_(places), - use_event_(use_event) {} + use_event_(use_event), + running_ops_(0) {} + +void ThreadedSSAGraphExecutor::RunDelayedOps( + const std::unordered_set &delayed_ops) { + for (auto op : delayed_ops) { + op->Run(use_event_); + } +} FeedFetchList ThreadedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { std::unordered_map pending_ops; std::unordered_set pending_vars; - BlockingQueue ready_vars; - std::unordered_set ready_ops; + std::unordered_set delayed_ops; + std::unordered_set after_delayed_ops; + std::unordered_set delayed_vars; + auto InsertPendingVar = [&pending_vars, &ready_vars](VarHandleBase &var) { pending_vars.insert(&var); if (var.generated_op_ == nullptr) { @@ -106,7 +116,14 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto run_all_ready_ops = [&] { for (auto *op : ready_ops) { - RunOp(ready_vars, op); + if (op->IsDelayedOp()) { + delayed_ops.insert(op); + delayed_vars.insert(op->outputs_.begin(), op->outputs_.end()); + ready_vars.Extend(op->outputs_); + continue; + } + running_ops_++; + RunOp(&ready_vars, op); } ready_ops.clear(); }; @@ -124,7 +141,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // 2. Find ready variable bool timeout; - auto cur_ready_vars = ready_vars.PopAll(1000, &timeout); + auto cur_ready_vars = ready_vars.PopAll(1, &timeout); if (timeout) { if (exception_) { @@ -141,13 +158,24 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto &deps = pending_ops[op]; --deps; if (deps == 0) { - ready_ops.insert(op); + if (delayed_vars.find(ready_var) != delayed_vars.end()) { + after_delayed_ops.insert(op); + } else { + ready_ops.insert(op); + } } } } + if (ready_ops.empty() && !delayed_ops.empty() && running_ops_ == 0) { + RunDelayedOps(delayed_ops); + delayed_ops.clear(); + for (auto *op : after_delayed_ops) { + ready_ops.insert(op); + } + after_delayed_ops.clear(); + } // Keep loop until all vars are ready. } - ++computation_count_; auto sync_computation = [&] { @@ -182,12 +210,13 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( } void ThreadedSSAGraphExecutor::RunOp( - BlockingQueue &ready_var_q, details::OpHandleBase *op) { - auto op_run = [&ready_var_q, op, this] { + BlockingQueue *ready_var_q, details::OpHandleBase *op) { + auto op_run = [ready_var_q, op, this] { try { VLOG(10) << op->Name() << " : " << op->DebugString(); op->Run(use_event_); - ready_var_q.Extend(op->outputs_); + running_ops_--; + ready_var_q->Extend(op->outputs_); } catch (platform::EnforceNotMet ex) { exception_.reset(new platform::EnforceNotMet(ex)); } catch (...) { diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 2ea57ac8f9..6193b897e4 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -14,7 +14,12 @@ #pragma once -#include +#include +#include +#include +#include +#include + #include #include "ThreadPool.h" // ThreadPool in thrird party #include "paddle/fluid/framework/details/ssa_graph_executor.h" @@ -79,9 +84,11 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ~ThreadedSSAGraphExecutor() {} private: - void RunOp(BlockingQueue &ready_var_q, + void RunOp(BlockingQueue *ready_var_q, details::OpHandleBase *op); + void RunDelayedOps(const std::unordered_set &delayed_ops); + private: std::unique_ptr<::ThreadPool> pool_; std::vector local_scopes_; @@ -89,6 +96,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { platform::DeviceContextPool fetch_ctxs_; const bool use_event_; std::unique_ptr exception_; + std::atomic running_ops_; size_t computation_count_{0}; size_t max_async_computation{100}; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 577eea92d2..002a6d362f 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/platform/profiler.h" #include #include @@ -151,6 +152,7 @@ void ParallelExecutor::BCastParamsToGPUs( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { + platform::RecordBlock b(0); auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetch_data; diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 5e0588fa73..33e8d3bf21 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -16,6 +16,7 @@ import core import multiprocessing import framework import executor +import sys __all__ = ['ParallelExecutor'] @@ -35,7 +36,7 @@ class ParallelExecutor(object): places.append(p) if num_threads is None: - num_threads = min(len(places) * 2, multiprocessing.cpu_count()) + num_threads = len(places) startup = framework.default_startup_program() main = framework.default_main_program() From 46f3a39e91fd422f1b6d5cbaadad9a35456eb36a Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 1 Apr 2018 18:05:59 -0700 Subject: [PATCH 09/18] polish and add comments. --- .../fluid/framework/details/nccl_all_reduce_op_handle.h | 2 ++ .../framework/details/threaded_ssa_graph_executor.cc | 5 ++++- python/paddle/fluid/parallel_executor.py | 8 ++++++-- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h index 3d61fa79f7..bb92625667 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h @@ -37,6 +37,8 @@ struct NCCLAllReduceOpHandle : public OpHandleBase { std::string Name() const override; + // Delay and buffer nccl_all_reduce together can significantly increase + // performance. Disable this feature by returning false. bool IsDelayedOp() override { return true; }; protected: diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 075eed4ecc..32fc9100ab 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -45,7 +45,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( std::unordered_set pending_vars; BlockingQueue ready_vars; std::unordered_set ready_ops; - + // For ops (e.g. nccl_all_reduce) that need to coordinate multiple + // streams from multiple GPUs, it's faster to buffer them and schedule + // together since we currently cannot overlap computation and memcpy streams. + // Should revisit it if overlapping is available. std::unordered_set delayed_ops; std::unordered_set after_delayed_ops; std::unordered_set delayed_vars; diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 33e8d3bf21..fec7d6899c 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -16,7 +16,6 @@ import core import multiprocessing import framework import executor -import sys __all__ = ['ParallelExecutor'] @@ -36,7 +35,12 @@ class ParallelExecutor(object): places.append(p) if num_threads is None: - num_threads = len(places) + if use_cuda: + # Experiments on se-resnext shows that too many threads hurt + # performance. Worth tunning for other models in the future. + num_threads = len(places) + else: + min(len(places) * 2, multiprocessing.cpu_count()) startup = framework.default_startup_program() main = framework.default_main_program() From be1373dcf9c233b6a0c870232adb0e66df64f80c Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 1 Apr 2018 18:11:52 -0700 Subject: [PATCH 10/18] Polish --- .../framework/details/nccl_all_reduce_op_handle.h | 2 +- paddle/fluid/framework/details/op_handle_base.h | 4 +++- .../framework/details/threaded_ssa_graph_executor.cc | 12 +++++++----- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h index bb92625667..ad14a3c5cb 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h @@ -39,7 +39,7 @@ struct NCCLAllReduceOpHandle : public OpHandleBase { // Delay and buffer nccl_all_reduce together can significantly increase // performance. Disable this feature by returning false. - bool IsDelayedOp() override { return true; }; + bool IsMultiDeviceTransfer() override { return true; }; protected: void RunImpl() override; diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 54c2d627ff..d7a541ac4b 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -55,7 +55,9 @@ class OpHandleBase { void AddOutput(VarHandleBase *out); - virtual bool IsDelayedOp() { return false; } + // If the Op involves data transfer of multiple devices that + // will likely block other computations. + virtual bool IsMultiDeviceTransfer() { return false; } protected: virtual void RunImpl() = 0; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 32fc9100ab..65fbfb65e1 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -50,7 +50,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // together since we currently cannot overlap computation and memcpy streams. // Should revisit it if overlapping is available. std::unordered_set delayed_ops; - std::unordered_set after_delayed_ops; + std::unordered_set blocked_by_delayed_ops; std::unordered_set delayed_vars; auto InsertPendingVar = [&pending_vars, &ready_vars](VarHandleBase &var) { @@ -119,7 +119,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto run_all_ready_ops = [&] { for (auto *op : ready_ops) { - if (op->IsDelayedOp()) { + if (op->IsMultiDeviceTransfer()) { delayed_ops.insert(op); delayed_vars.insert(op->outputs_.begin(), op->outputs_.end()); ready_vars.Extend(op->outputs_); @@ -162,20 +162,22 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( --deps; if (deps == 0) { if (delayed_vars.find(ready_var) != delayed_vars.end()) { - after_delayed_ops.insert(op); + blocked_by_delayed_ops.insert(op); } else { ready_ops.insert(op); } } } } + // When there are no other ops to schedule, schedule buffered delayed + // ops and unblock other ops. if (ready_ops.empty() && !delayed_ops.empty() && running_ops_ == 0) { RunDelayedOps(delayed_ops); delayed_ops.clear(); - for (auto *op : after_delayed_ops) { + for (auto *op : blocked_by_delayed_ops) { ready_ops.insert(op); } - after_delayed_ops.clear(); + blocked_by_delayed_ops.clear(); } // Keep loop until all vars are ready. } From b123ce88a17ac18dd24ec396d18c1eac7c832442 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 2 Apr 2018 01:10:00 -0700 Subject: [PATCH 11/18] Add enable/disable for delayed ops --- .../details/threaded_ssa_graph_executor.cc | 12 ++++++++---- .../details/threaded_ssa_graph_executor.h | 4 +++- paddle/fluid/framework/parallel_executor.cc | 6 +++--- paddle/fluid/framework/parallel_executor.h | 6 ++++-- paddle/fluid/pybind/pybind.cc | 4 ++-- python/paddle/fluid/parallel_executor.py | 9 +++++++-- .../tests/unittests/test_parallel_executor.py | 16 ++++++++++++++-- 7 files changed, 41 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 65fbfb65e1..1f96b9dc62 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -23,14 +23,15 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( size_t num_threads, bool use_event, const std::vector &local_scopes, const std::vector &places, - std::unique_ptr &&graph) + std::unique_ptr &&graph, bool allow_op_delay) : SSAGraphExecutor(std::move(graph)), pool_(num_threads >= 2 ? new ::ThreadPool(num_threads) : nullptr), local_scopes_(local_scopes), places_(places), fetch_ctxs_(places), use_event_(use_event), - running_ops_(0) {} + running_ops_(0), + allow_op_delay_(allow_op_delay) {} void ThreadedSSAGraphExecutor::RunDelayedOps( const std::unordered_set &delayed_ops) { @@ -119,7 +120,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto run_all_ready_ops = [&] { for (auto *op : ready_ops) { - if (op->IsMultiDeviceTransfer()) { + if (op->IsMultiDeviceTransfer() && allow_op_delay_) { delayed_ops.insert(op); delayed_vars.insert(op->outputs_.begin(), op->outputs_.end()); ready_vars.Extend(op->outputs_); @@ -138,7 +139,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( } // Step 3. Execution - while (!pending_vars.empty()) { + while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) { // 1. Run All Ready ops run_all_ready_ops(); @@ -181,6 +182,9 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( } // Keep loop until all vars are ready. } + PADDLE_ENFORCE(ready_ops.empty()); + PADDLE_ENFORCE(delayed_ops.empty()); + PADDLE_ENFORCE(blocked_by_delayed_ops.empty()); ++computation_count_; auto sync_computation = [&] { diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 6193b897e4..79cfc26b46 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -75,7 +75,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ThreadedSSAGraphExecutor(size_t num_threads, bool use_event, const std::vector &local_scopes, const std::vector &places, - std::unique_ptr &&graph); + std::unique_ptr &&graph, + bool allow_op_delay); // Run a SSAGraph by a thread pool // Use topological sort algorithm @@ -97,6 +98,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { const bool use_event_; std::unique_ptr exception_; std::atomic running_ops_; + bool allow_op_delay_; size_t computation_count_{0}; size_t max_async_computation{100}; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 002a6d362f..1788514324 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -48,7 +48,7 @@ ParallelExecutor::ParallelExecutor( const std::vector &places, const std::unordered_set ¶ms, const ProgramDesc &startup_program, const ProgramDesc &main_program, - const std::string &loss_var_name, Scope *scope) + const std::string &loss_var_name, Scope *scope, bool allow_op_delay) : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; @@ -83,8 +83,8 @@ ParallelExecutor::ParallelExecutor( auto graph = builder.Build(main_program); member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - num_threads, use_event, member_->local_scopes_, places, - std::move(graph))); + num_threads, use_event, member_->local_scopes_, places, std::move(graph), + allow_op_delay)); // Step 3. Create vars in each scope; for (auto *scope : member_->local_scopes_) { diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 503efa2e44..964b476234 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -14,8 +14,9 @@ limitations under the License. */ #pragma once -#include +#include #include +#include #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" @@ -37,7 +38,8 @@ class ParallelExecutor { const std::unordered_set& params, const ProgramDesc& startup_program, const ProgramDesc& main_program, - const std::string& loss_var_name, Scope* scope); + const std::string& loss_var_name, Scope* scope, + bool allow_op_delay); void Run(const std::vector& fetch_tensors, const std::string& fetched_var_name = "fetched_var"); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index e1b1bbec97..b0a3f06a88 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -504,10 +504,10 @@ All parameter, weight, gradient are variables in Paddle. const std::unordered_set ¶ms, const ProgramDesc &startup_program, const ProgramDesc &main_program, const std::string &loss_var_name, - Scope *scope) { + Scope *scope, bool allow_op_delay) { new (&self) ParallelExecutor(num_threads, use_event, places, params, startup_program, main_program, - loss_var_name, scope); + loss_var_name, scope, allow_op_delay); }) .def("run", &ParallelExecutor::Run); diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index fec7d6899c..a2c830b3c9 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -21,7 +21,11 @@ __all__ = ['ParallelExecutor'] class ParallelExecutor(object): - def __init__(self, loss_name, use_cuda, num_threads=None): + def __init__(self, + loss_name, + use_cuda, + num_threads=None, + allow_op_delay=False): places = [] if use_cuda: for i in xrange(core.get_cuda_device_count()): @@ -57,7 +61,8 @@ class ParallelExecutor(object): startup.desc, main.desc, loss_name, - scope) + scope, + allow_op_delay) self.scope = scope def run(self, fetch_list): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 95d0f9da47..60130298af 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -184,7 +184,8 @@ class TestParallelExecutorBase(unittest.TestCase): method, memory_opt=True, iter=10, - batch_size=None): + batch_size=None, + allow_op_delay=False): main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): @@ -194,7 +195,10 @@ class TestParallelExecutorBase(unittest.TestCase): if memory_opt: fluid.memory_optimize(main) - exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True) + exe = fluid.ParallelExecutor( + loss_name=loss.name, + use_cuda=True, + allow_op_delay=allow_op_delay) if batch_size is not None: batch_size *= fluid.core.get_cuda_device_count() begin = time.time() @@ -236,9 +240,11 @@ class TestMNIST(TestParallelExecutorBase): def test_simple_fc(self): self.check_network_convergence(simple_fc_net) + self.check_network_convergence(simple_fc_net, allow_op_delay=True) def test_batchnorm_fc(self): self.check_network_convergence(fc_with_batchnorm) + self.check_network_convergence(fc_with_batchnorm, allow_op_delay=True) class TestResnet(TestParallelExecutorBase): @@ -268,6 +274,12 @@ class TestResnet(TestParallelExecutorBase): SE_ResNeXt152, batch_size=batch_size), iter=20, batch_size=batch_size) + self.check_network_convergence( + functools.partial( + SE_ResNeXt152, batch_size=batch_size), + iter=20, + batch_size=batch_size, + allow_op_delay=True) class ModelHyperParams(object): From b25a9c488d3d6945157adeeac2c504ca3be977da Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 3 Apr 2018 00:16:39 -0700 Subject: [PATCH 12/18] Reduce test size to avoid GPU memory issue. --- .../paddle/fluid/tests/unittests/CMakeLists.txt | 2 ++ .../tests/unittests/test_parallel_executor.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 0ad273c716..1b2d29a47f 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -29,6 +29,7 @@ function(py_test_modules TARGET_NAME) endfunction() # test time consuming OPs in a separate process for expliot parallism +list(REMOVE_ITEM TEST_OPS test_parallel_executor) list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_dyn_rnn) list(REMOVE_ITEM TEST_OPS test_mul_op) @@ -64,6 +65,7 @@ else() endif(WITH_FAST_BUNDLE_TEST) # tests with high overhead +py_test_modules(test_parallel_executor MODULES test_parallel_executor) py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR}) py_test_modules(test_train_dyn_rnn MODULES test_dyn_rnn) py_test_modules(test_mul_op MODULES test_mul_op) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 60130298af..f132a754a2 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -135,18 +135,18 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): return fluid.layers.elementwise_add(x=short, y=scale, act='relu') -def SE_ResNeXt152(batch_size=4): +def SE_ResNeXt152Small(batch_size=2): img = fluid.layers.fill_constant( shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0) label = fluid.layers.fill_constant( shape=[batch_size, 1], dtype='int64', value=0.0) conv = conv_bn_layer( - input=img, num_filters=64, filter_size=3, stride=2, act='relu') + input=img, num_filters=16, filter_size=3, stride=2, act='relu') conv = conv_bn_layer( - input=conv, num_filters=64, filter_size=3, stride=1, act='relu') + input=conv, num_filters=16, filter_size=3, stride=1, act='relu') conv = conv_bn_layer( - input=conv, num_filters=128, filter_size=3, stride=1, act='relu') + input=conv, num_filters=16, filter_size=3, stride=1, act='relu') conv = fluid.layers.pool2d( input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') @@ -226,7 +226,7 @@ class TestMNIST(TestParallelExecutorBase): def setUpClass(cls): # Convert mnist to recordio file with fluid.program_guard(fluid.Program(), fluid.Program()): - reader = paddle.batch(mnist.train(), batch_size=32) + reader = paddle.batch(mnist.train(), batch_size=4) feeder = fluid.DataFeeder( feed_list=[ # order is image and label fluid.layers.data( @@ -268,15 +268,15 @@ class TestResnet(TestParallelExecutorBase): def test_resnet(self): import functools - batch_size = 4 + batch_size = 2 self.check_network_convergence( functools.partial( - SE_ResNeXt152, batch_size=batch_size), + SE_ResNeXt152Small, batch_size=batch_size), iter=20, batch_size=batch_size) self.check_network_convergence( functools.partial( - SE_ResNeXt152, batch_size=batch_size), + SE_ResNeXt152Small, batch_size=batch_size), iter=20, batch_size=batch_size, allow_op_delay=True) From cf251eb8cf3051f21306d73c73a48b0f2443ef8d Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 3 Apr 2018 01:07:19 -0700 Subject: [PATCH 13/18] shrink test size --- .../paddle/fluid/tests/unittests/test_parallel_executor.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index f132a754a2..a79e4b3e18 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -274,12 +274,6 @@ class TestResnet(TestParallelExecutorBase): SE_ResNeXt152Small, batch_size=batch_size), iter=20, batch_size=batch_size) - self.check_network_convergence( - functools.partial( - SE_ResNeXt152Small, batch_size=batch_size), - iter=20, - batch_size=batch_size, - allow_op_delay=True) class ModelHyperParams(object): From 103407aa6027c676cd1bcc13e6f35b2a51f1cc6a Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 3 Apr 2018 16:09:27 +0800 Subject: [PATCH 14/18] refine sync_with_cpp when remove ops or remove vars --- python/paddle/fluid/framework.py | 18 +++++++++++++++ .../tests/unittests/test_protobuf_descs.py | 23 +++++++++++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 3e78788f47..e15456bfc0 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -847,6 +847,11 @@ class Block(object): if not self.has_var(var.name()): self.create_var(name=var.name(), desc=var, type=var.type()) + # sync variables removed from c++ end + for var in self.vars.keys(): + if not self.desc.find_var(var): + self.vars.pop(var) + # sync operators from cpp ops_in_cpp = [] for op_idx in range(0, self.desc.op_size()): @@ -881,6 +886,19 @@ class Block(object): op = Operator(self, op_desc) self.ops.append(op) + # sync ops removed from c++ end + if end_index != -1 and end_index < len(self.ops): + ops_in_cpp_index = 0 + ops_in_python_index = 0 + while ops_in_python_index < len( + self.ops) and ops_in_cpp_index < len(ops_in_cpp): + if self.ops[ops_in_python_index].desc != ops_in_cpp[ + ops_in_cpp_index]: + del self.ops[ops_in_python_index] + else: + ops_in_cpp_index += 1 + ops_in_python_index += 1 + assert len(self.ops) == len(ops_in_cpp) for index in range(len(self.ops)): assert self.ops[index].desc == ops_in_cpp[index] diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py index da85786d0c..e4cf4a8bce 100644 --- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py +++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py @@ -14,6 +14,7 @@ import unittest import paddle.fluid.core as core +from paddle.fluid.framework import Program class TestOpDesc(unittest.TestCase): @@ -187,32 +188,46 @@ class TestBlockDesc(unittest.TestCase): self.assertEqual(all_ops, [op0, op1, op2]) def test_remove_op(self): - prog = core.ProgramDesc() + program = Program() + prog = program.desc self.assertIsNotNone(prog) block = prog.block(0) self.assertIsNotNone(block) + + op0 = block.append_op() op1 = block.append_op() op2 = block.append_op() + op0.set_type("test") + op1.set_type("test") + op2.set_type("test") + + var0 = block.var("var0") var1 = block.var("var1") var2 = block.var("var2") var3 = block.var("var3") var4 = block.var("var4") var5 = block.var("var5") + + op0.set_input("X", ["var0"]) + op0.set_output("Y", ["var0"]) op1.set_input("X", ["var1", "var2"]) op1.set_output("Y", ["var3", "var4"]) op2.set_input("X", ["var1"]) op2.set_output("Y", ["var4", "var5"]) + program.sync_with_cpp() + # remove op1, its input var2 and output var3 will be removed at the same time, # but its input var1 and output var4 will not be removed since they are used for op2. - block.remove_op(0, 1) + block.remove_op(1, 2) + program.sync_with_cpp() all_ops = [] for idx in xrange(0, block.op_size()): all_ops.append(block.op(idx)) - self.assertEqual(all_ops, [op2]) + self.assertEqual(all_ops, [op0, op2]) all_vars = block.all_vars() - self.assertEqual(set(all_vars), {var1, var4, var5}) + self.assertEqual(set(all_vars), {var0, var1, var4, var5}) if __name__ == '__main__': From 3f3ecae164c13fafc8f5066c53d00eda2a925d45 Mon Sep 17 00:00:00 2001 From: weixing02 <564445201@qq.com> Date: Tue, 3 Apr 2018 17:27:05 +0800 Subject: [PATCH 15/18] Fix tables display error --- benchmark/cluster/README.md | 148 ++++++++++++++-- benchmark/cluster/vgg16/README.md | 160 +++++++++++++++--- .../design/algorithm/parameter_average.md | 2 +- doc/fluid/design/concepts/README.md | 32 +++- doc/fluid/design/concepts/block.md | 61 +++++-- .../concepts/functions_operators_layers.md | 40 ++++- doc/fluid/design/concepts/lod_tensor.md | 38 ++++- doc/fluid/design/concepts/var_desc.md | 25 ++- .../concurrent/concurrent_programming.md | 46 +++-- doc/fluid/design/concurrent/csp.md | 47 +++-- doc/fluid/design/modules/python_api.md | 33 +++- doc/fluid/design/motivation/fluid.md | 36 +++- .../design/motivation/refactorization.md | 36 +++- doc/fluid/design/network/deep_speech_2.md | 123 +++++++++++--- doc/fluid/dev/new_op_cn.md | 33 +++- doc/fluid/dev/new_op_en.md | 29 +++- doc/fluid/dev/releasing_process.md | 126 +++++++++++++- .../concepts/save_model/model_format.md | 68 ++++++-- .../howto/cluster/fluid_cluster_train_cn.md | 58 +++++-- .../howto/optimization/cpu_profiling_cn.md | 42 ++++- .../howto/optimization/cpu_profiling_en.md | 35 ++++ 21 files changed, 1043 insertions(+), 175 deletions(-) diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md index b619613ea7..64816098a5 100644 --- a/benchmark/cluster/README.md +++ b/benchmark/cluster/README.md @@ -36,11 +36,41 @@ - Trainer Count: 100 - Metrics: mini-batch / sec -| Batch Size | 32 | 64 | 128 | 256 | -| -- | -- | -- | -- | -- | -| PaddlePaddle Fluid | - | - | - | - | -| PaddlePaddle v2 | - | - | - | - | -| TensorFlow | - | - | - | - | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Batch Size 3264128 256
PaddlePaddle Fluid-- - -
PaddlePaddle v2 - - - -
TensorFlow - - - -
### Measure the Performance for Different PServer Count @@ -48,11 +78,41 @@ - Batch Size: 64 - Metrics: mini-batch / sec -| PServer Count | 10 | 20 | 40 | 60 | -| -- | -- | -- | -- | -- | -| PaddlePaddle Fluid | - | - | - | - | -| PaddlePaddle v2 | - | - | - | - | -| TensorFlow | - | - | - | - | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
PServer Count 102040 60
PaddlePaddle Fluid-- - -
PaddlePaddle v2 - - - -
TensorFlow - - - -
### Measure Parallel Efficiency By Increasing Trainer Count @@ -67,11 +127,69 @@ The parallel efficiency is: $E = \div(S, N)$ -| Trainer Counter | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 | -| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - | -| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - | - | -| TensorFlow | - | - | - | - | - | - | - | - | - | - | - | - | - | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Trainer Counter 11020 30405060 708090100
PaddlePaddle Fluid-- - - -- - - -- -
PaddlePaddle v2 - - - - -- - - -- -
TensorFlow - - - - -- - - -- -
+ ## Reproduce the benchmark diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md index cd681a1a28..d56a912b9b 100644 --- a/benchmark/cluster/vgg16/README.md +++ b/benchmark/cluster/vgg16/README.md @@ -16,11 +16,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`. - Metrics: samples / sec -| Batch Size | 32 | 64 | 128 | 256 | -| -- | -- | -- | -- | -- | -| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 | -| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 | -| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Batch Size 3264128 256
PaddlePaddle Fluid 15.44 16.32 16.74 16.79
PaddlePaddle v2 15.97 17.04 17.60 17.83
TensorFlow 9.09 9.10 9.24 8.66
+ ### Different Batch Size @@ -28,12 +58,40 @@ Setting environment variable: `MKL_NUM_THREADS=1`. - Trainer Count: 20 - Metrics: samples / sec -| Batch Size | 32 | 64 | 128 | 256 | -| -- | -- | -- | -- | -- | -| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 | -| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 | -| TensorFlow | - | - | - | - | - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Batch Size 3264128 256
PaddlePaddle Fluid 190.20 222.15 247.40 258.18
PaddlePaddle v2 170.96 233.71 256.14 329.23
TensorFlow - - - -
### Accelerate Rate @@ -41,11 +99,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`. - Batch Size: 128 - Metrics: samples / sec -| Trainer Count | 20 | 40 | 80 | 100 | -| -- | -- | -- | -- | -- | -| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) | -| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) | -| TensorFlow | - | - | - | - | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Trainer Count 204080100
PaddlePaddle Fluid 263.29 (78.64%) 518.80 (77.47%) 836.26 (62.44%) 1019.29 (60.89%)
PaddlePaddle v2 (need more tests) 326.85 (92.85%) 534.58 (75.93%) 853.30 (60.60%) 1041.99 (59.20%)
TensorFlow - - - -
+ ### Different Pserver Count @@ -53,11 +141,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`. - Batch Size: 128 - Metrics: samples/ sec -| PServer Count | 3 | 6 |10 | 20 | -| -- | -- | -- | -- | -- | -| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 | -| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 | -| TensorFlow | - | - | - | - | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
PServer Count 361020
PaddlePaddle Fluid(should fix in next PR) 589.1 592.6 656.4 655.8
PaddlePaddle v2 (need more tests) 593.4 791.3 729.7 821.7
TensorFlow - - - -
+ *The performance gap between Fuild and v2 comes from the network interference.* diff --git a/doc/fluid/design/algorithm/parameter_average.md b/doc/fluid/design/algorithm/parameter_average.md index 2c4edee9fe..53d601d3a9 100644 --- a/doc/fluid/design/algorithm/parameter_average.md +++ b/doc/fluid/design/algorithm/parameter_average.md @@ -7,7 +7,7 @@ Polyak and Juditsky (1992) showed that the test performance of simple average of Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for
. The averaging is done as follows: -
+![](./images/asgd.gif) We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above. diff --git a/doc/fluid/design/concepts/README.md b/doc/fluid/design/concepts/README.md index ed3f5aab28..8ded0ad22f 100644 --- a/doc/fluid/design/concepts/README.md +++ b/doc/fluid/design/concepts/README.md @@ -6,11 +6,33 @@ Here are some initial thoughts. Your comments are welcome! I think we need only the following few CMake functions to make a project description mean and clean: -| C++ | CUDA C++ | Go | -|---|---|---| -| cc_library | nv_library | go_library | -| cc_binary | nv_binary | go_binary | -| cc_test | nv_test | go_test | + + + + + + + + + + + + + + + + + + + + + + + + + +
C++CUDA C++Go
cc_library nv_library go_library
cc_binary nv_binary go_binary
cc_test nv_test go_test
+ - The `_library` functions generate .a files from source code. - The `_binary` functions generate executable binary files. diff --git a/doc/fluid/design/concepts/block.md b/doc/fluid/design/concepts/block.md index 907a2def55..3b626bd89c 100644 --- a/doc/fluid/design/concepts/block.md +++ b/doc/fluid/design/concepts/block.md @@ -14,11 +14,29 @@ In programming languages, a block is a pair of curly braces that includes local Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning: -| programming languages | PaddlePaddle | -|-----------------------|-----------------------| -| for, while loop | RNN, WhileOp | -| if, if-else, switch | IfElseOp, SwitchOp | -| sequential execution | a sequence of layers | + + + + + + + + + + + + + + + + + + + + + +
programming languagesPaddlePaddle
for, while loop RNN, WhileOp
if, if-else, switch IfElseOp, SwitchOp
sequential execution a sequence of layers
+ A key difference is that a C++ program describes a one pass computation, whereas a deep learning program describes both the forward and backward passes. @@ -26,12 +44,33 @@ A key difference is that a C++ program describes a one pass computation, whereas The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs: -| programming languages | PaddlePaddle | -|-----------------------|---------------------------------| -| stack | scope hierarchy | -| stack frame | scope | -| push at entering block| push at entering block | -| pop at leaving block | destroy when minibatch completes| + + + + + + + + + + + + + + + + + + + + + + + + + +
programming languagesPaddlePaddle
stack scope hierarchy
stack frame scope
push at entering block push at entering block
pop at leaving block destroy when minibatch completes
+ 1. In traditional programs: diff --git a/doc/fluid/design/concepts/functions_operators_layers.md b/doc/fluid/design/concepts/functions_operators_layers.md index 984b59f4c6..30bc488a18 100644 --- a/doc/fluid/design/concepts/functions_operators_layers.md +++ b/doc/fluid/design/concepts/functions_operators_layers.md @@ -86,12 +86,40 @@ def layer.fc(X): We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`. So we have the following concepts in above illustrative example: - -| C++ functions/functors | mul | add | | | -|------------------------|--------------|--------------|-------------|----------| -| C++ operator class | mulOp | addOp | FCOp | | -| Python binding | operator.mul | operator.add | operator.fc | | -| Python function | | | | layer.fc | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
C++ functions/functorsmuladd
C++ operator class mulOpaddOp FCOp
Python binding operator.mul operator.add operator.fc
Python function layer.fc
This is how we differentiate layer and operators in PaddlePaddle: diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md index 10a8a7867f..a88292e788 100644 --- a/doc/fluid/design/concepts/lod_tensor.md +++ b/doc/fluid/design/concepts/lod_tensor.md @@ -2,12 +2,38 @@ Like other deep learning systems, PaddlePaddle supports training models from sequence data. Also, like other systems, PaddlePaddle represent a mini-batch of sequences as a Tensor. What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length. Thus no need for padding zeros. -| | TensorFlow | PaddlePaddle | -|-----------------------|------------|--------------| -| RNN | Support | Support | -| recursive RNN | Support | Support | -| padding zeros | Must | No need | -| blob data type | Tensor | LoDTensor | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TensorFlowPaddlePaddle
RNN Support Support
recursive RNN Support Support
padding zeros Must No need
blob data type Tensor LoDTensor
+ PaddlePaddle achieves this flexibility by passing through a new data type, *LoD Tensor*, which is a Tensor attached with segmentation index known as *LoD*, between operators. The LoD index doesn't only segment a tensor, but also recursively segments sub-sequences. This document presents the design of LoD and LoDTensor. diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md index fcba08c07f..6750323c01 100644 --- a/doc/fluid/design/concepts/var_desc.md +++ b/doc/fluid/design/concepts/var_desc.md @@ -10,10 +10,27 @@ PaddlePaddle uses proto message to describe compile time program because : The computation `Program` consists of nested `Blocks`. Each `Block` will consist of data(i.e. `Variable`) and `Operations`. The concept to represent them is in the table below. -| |compile time|runtime| -|---|---|---| -|Data|VarDesc(proto)|Variable(cpp)| -|Operation|OpDesc(proto)|Operator(cpp)| + + + + + + + + + + + + + + + + + + + + +
compile timeruntime
Data VarDesc(proto) Variable(cpp)
Operation OpDesc(proto) Operator(cpp)
## Definition of VarType diff --git a/doc/fluid/design/concurrent/concurrent_programming.md b/doc/fluid/design/concurrent/concurrent_programming.md index f022e67fd3..6460216606 100644 --- a/doc/fluid/design/concurrent/concurrent_programming.md +++ b/doc/fluid/design/concurrent/concurrent_programming.md @@ -10,12 +10,38 @@ The answer relies on the fact that a `ProgramDesc` is similar to an abstract syn The following table compares concepts in Fluid and Go -| Go | Fluid | -|----|-------| -|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid) | -| control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) | -| goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) | -| runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) | + + + + + + + + + + + + + + + + + + + + + + + + + + +
GoFluid
user-defined functions +layers
control-flow and built-in functions +intrinsics/operators
goroutines, channels +class ThreadPool
runtime +class Executor
+ ## An Example Concurrent Program @@ -77,11 +103,11 @@ message ProgramDesc { read(output = X) kube_get_workers_addrs(output = L) Y = tensor_array(len(L)) - parallel_for(input = X, output = Y, + parallel_for(input = X, output = Y, attrs = {L, block_id(1)}) # referring to block 1 ] } - + block[1] = Block { parent = 0, vars = [x, y, index], @@ -102,7 +128,7 @@ func main() { //// block 0 X = fluid.read(...) L = fluid.k8s.get_worker_addrs() Y = fluid.tensor_array(len(L)) - fluid.parallel_for(X, L, + fluid.parallel_for(X, L, func(index int) { //// block 1 x = X[index] fluid.send(L[index], x) @@ -116,7 +142,7 @@ An explanation of the above program: - `fluid.k8s` is a package that provides access to Kubernetes API. - `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod). -- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed, +- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed, 1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and 2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread diff --git a/doc/fluid/design/concurrent/csp.md b/doc/fluid/design/concurrent/csp.md index 10d936860f..66d19f44ba 100644 --- a/doc/fluid/design/concurrent/csp.md +++ b/doc/fluid/design/concurrent/csp.md @@ -13,14 +13,41 @@ Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously exe There were many concurrent programming models, implemented in various forms: -| concurrent programming model | implementation | -|-----|-----| -| mutex | types and functions in standard libraries | -| semaphore | types and functions in standard libraries | -| communicating sequential processes (CSP) | Go programming language | -| actor model | Erlang programming language | -| message passing | MPI | -| bulk synchronous parallel (BSP) | Pregel distributed programming framework | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
concurrent programming modelimplementation
mutex types and functions in standard libraries
semaphore types and functions in standard libraries
communicating sequential processes (CSP) Go programming language
actor model Erlang programming language
message passing MPI
bulk synchronous parallel (BSP) Pregel distributed programming framework
+ Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid. @@ -118,9 +145,9 @@ There are four types of actions with a channel: ```go close(ch) ``` - + Please be aware that a closed channel is not a nil channel, which is `var ch chan int`. - + There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms): 1. A send to a nil channel blocks forever diff --git a/doc/fluid/design/modules/python_api.md b/doc/fluid/design/modules/python_api.md index 73f6d7b90c..f83ad3b6a4 100644 --- a/doc/fluid/design/modules/python_api.md +++ b/doc/fluid/design/modules/python_api.md @@ -2,12 +2,33 @@ Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program. -| Python classes | Protobuf messages | -| --- | --- | -| Program | ProgramDesc | -| Block | BlockDesc | -| Operator | OpDesc | -| Variable | VarDesc | + + + + + + + + + + + + + + + + + + + + + + + + + +
Python classesProtobuf messages
Program ProgramDesc
Block BlockDesc
Operator OpDesc
Variable VarDesc
+ Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages. diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md index 110b7d78bf..5e147f8263 100644 --- a/doc/fluid/design/motivation/fluid.md +++ b/doc/fluid/design/motivation/fluid.md @@ -10,11 +10,37 @@ Fluid is the answer. Fluid is similar to PyTorch and TensorFlow Eager Execution Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented. -| Existed since | model as sequence of layers | model as graph of operators | No model | -|--|--|--|--| -| 2013 | Caffe, Theano, Torch, PaddlePaddle | | | -| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | | -| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Existed sincemodel as sequence of layersmodel as graph of operatorsNo model
2013 Caffe, Theano, Torch, PaddlePaddle
2015 TensorFlow, MxNet, Caffe2, ONNX, n-graph
2016 PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid
+ From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model. To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these. diff --git a/doc/fluid/design/motivation/refactorization.md b/doc/fluid/design/motivation/refactorization.md index 7c39fabcc6..f199cc892f 100644 --- a/doc/fluid/design/motivation/refactorization.md +++ b/doc/fluid/design/motivation/refactorization.md @@ -36,11 +36,37 @@ At compile time, the Python program generates a protobuf message representation At runtime, the C++ program realizes the graph and runs it. -| | Representation (protobuf messages) | Realization (C++ class objects) | -|---|---|---| -|Data|[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107)|[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24)| -|Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)| -|Block|BlockDesc|Block| + + + + + + + + + + + + + + + + + + + + + + + + + +
Representation (protobuf messages)Realization (C++ class objects)
Data +VarDesc +Variable
Operation +OpDesc +Operator
Block BlockDesc Block
+ The word *graph* is interchangeable with *block* in this document. A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`). diff --git a/doc/fluid/design/network/deep_speech_2.md b/doc/fluid/design/network/deep_speech_2.md index af0c6ef36f..7f5dcf55f9 100644 --- a/doc/fluid/design/network/deep_speech_2.md +++ b/doc/fluid/design/network/deep_speech_2.md @@ -1,4 +1,4 @@ -# DeepSpeech2 on PaddlePaddle: Design Doc +# DeepSpeech2 on PaddlePaddle: Design Doc We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine, on PaddlePaddle. For the first-stage plan, we have the following short-term goals: @@ -68,11 +68,33 @@ We roughly break down the project into 14 tasks: Tasks parallelizable within phases: -Roadmap | Description | Parallelizable Tasks ------------ | :------------------------------------ | :-------------------- -Phase I | Simplified model & components | *Task 1* ~ *Task 8* -Phase II | Standard model & benchmarking & profiling | *Task 9* ~ *Task 12* -Phase III | Documentations | *Task13* ~ *Task14* + + + + + + + + + + + + + + + + + + + + + + + + + +
RoadmapDescription Parallelizable Tasks
Phase I Simplified model & components Task 1 ~ Task 8
Phase II Standard model & benchmarking & profilingTask 9 ~ Task 12
Phase III Documentations Task13 ~ Task14
+ Issue for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed! @@ -102,37 +124,82 @@ We don't have to persist on this 2-3-7-1-1-1 depth \[[2](#references)\]. Similar Key ingredients about the layers: -- **Data Layers**: +- **Data Layers**: - Frame sequences data of audio **spectrogram** (with FFT). - - Token sequences data of **transcription** text (labels). + - Token sequences data of **transcription** text (labels). - These two type of sequences do not have the same lengthes, thus a CTC-loss layer is required. -- **2D Convolution Layers**: +- **2D Convolution Layers**: - Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. temporal dimension). - With striding for only the first convlution layer. - No pooling for all convolution layers. -- **Uni-directional RNNs** +- **Uni-directional RNNs** - Uni-directional + row convolution: for low-latency inference. - Bi-direcitional + without row convolution: if we don't care about the inference latency. - **Row convolution**: - For looking only a few steps ahead into the feature, instead of looking into a whole sequence in bi-directional RNNs. - - Not nessesary if with bi-direcitional RNNs. + - Not nessesary if with bi-direcitional RNNs. - "**Row**" means convolutions are done within each frequency dimension (row), and no convolution kernels shared across. - **Batch Normalization Layers**: - Added to all above layers (except for data and loss layer). - Sequence-wise normalization for RNNs: BatchNorm only performed on input-state projection and not state-state projection, for efficiency consideration. - - -Required Components | PaddlePaddle Support | Need to Develop -:------------------------------------- | :-------------------------------------- | :----------------------- -Data Layer I (Spectrogram) | Not supported yet. | TBD (Task 3) -Data Layer II (Transcription) | `paddle.data_type.integer_value_sequence` | - -2D Convolution Layer | `paddle.layer.image_conv_layer` | - -DataType Converter (vec2seq) | `paddle.layer.block_expand` | - -Bi-/Uni-directional RNNs | `paddle.layer.recurrent_group` | - -Row Convolution Layer | Not supported yet. | TBD (Task 4) -CTC-loss Layer | `paddle.layer.warp_ctc` | - -Batch Normalization Layer | `paddle.layer.batch_norm` | - -CTC-Beam search | Not supported yet. | TBD (Task 6) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Required Components PaddlePaddle Support Need to Develop
Data Layer I (Spectrogram) Not supported yet.TBD (Task 3)
Data Layer II (Transcription) paddle.data_type.integer_value_sequence -
2D Convolution Layer paddle.layer.image_conv_layer -
DataType Converter (vec2seq) paddle.layer.block_expand -
Bi-/Uni-directional RNNs paddle.layer.recurrent_group -
Row Convolution Layer Not supported yet.TBD (Task 4)
CTC-loss Layer paddle.layer.warp_ctc -
Batch Normalization Layer paddle.layer.batch_norm -
CTC-Beam search Not supported yet. TBD (Task 6)
+ ### Row Convolution @@ -145,14 +212,14 @@ TODO by Assignees Figure 2. Algorithm for CTC Beam Search Decoder. -- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts: - - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths; +- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts: + - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths; - 2) the if condition ```if l^+ not in A_prev then``` after probabilities' computation is deprecated for it is hard to understand and seems unnecessary. - An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding whenever a white space appended in English decoding and any character appended in Mandarin decoding. - Such external scorer consists of language model, word count or any other custom scorers. - The **language model** is built from Task 5, with parameters should be carefully tuned to achieve minimum WER/CER (c.f. Task 7) -- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality. - +- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality. + ## Future Work diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md index 9299658567..0c3f88d9c3 100644 --- a/doc/fluid/dev/new_op_cn.md +++ b/doc/fluid/dev/new_op_cn.md @@ -26,13 +26,32 @@ 依据是否包含kernel,可以将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorWithKernel`,后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下: - - 内容 | 定义位置 --------------- | :---------------------- -OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake -Op定义 | `.cc`文件 -Kernel实现 | CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。 -注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中 + + + + + + + + + + + + + + + + + + + + + + + + + +
内容定义位置
OpProtoMake定义 `.cc`文件,Backward Op不需要定义OpProtoMake
Op定义 `.cc`文件
Kernel实现 CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。
注册Op Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中
实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。** diff --git a/doc/fluid/dev/new_op_en.md b/doc/fluid/dev/new_op_en.md index da8b1bdd10..a566a09131 100644 --- a/doc/fluid/dev/new_op_en.md +++ b/doc/fluid/dev/new_op_en.md @@ -33,6 +33,33 @@ Op definition | `.cc` files Kernel implementation | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files. Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation. + + + + + + + + + + + + + + + + + + + + + + + + + +
Information Where is it defined
OpProtoMake definition `.cc`files, Backward Op does not need an OpProtoMake interface.
Op definition `.cc` files
Kernel implementation The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.
Registering the Op Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
+ New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions.** @@ -279,7 +306,7 @@ A forward operator unit test inherits `unittest.TestCase` and defines metaclass def test_check_output(self): self.check_output() - + def test_check_grad_normal(self): self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) diff --git a/doc/fluid/dev/releasing_process.md b/doc/fluid/dev/releasing_process.md index b978726109..addd474b69 100644 --- a/doc/fluid/dev/releasing_process.md +++ b/doc/fluid/dev/releasing_process.md @@ -66,7 +66,7 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git- * 建议,开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支 * 建议,开发者fork的版本库中,再基于`develop`版本fork出自己的功能分支。 * 当功能分支开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。 - * 在评审过程中,开发者修改自己的代码,可以继续在自己的功能分支提交代码。 + * 在评审过程中,开发者修改自己的代码,可以继续在自己的功能分支提交代码。 * BugFix分支也是在开发者自己的fork版本库维护,与功能分支不同的是,BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支,同时提起`Pull Request`。 @@ -78,13 +78,137 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git- PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。 + | | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语意角色标注 | 机器翻译 | 个性化推荐 | + | --- | --- | --- | --- | --- | --- | --- | --- | --- | + | API.V2 + Docker + GPU | | | | | | | | | + | API.V2 + Docker + CPU | | | | | | | | | + | `paddle_trainer` + Docker + GPU | | | | | | | | | + | `paddle_trainer` + Docker + CPU | | | | | | | | | + | API.V2 + Ubuntu + GPU | | | | | | | | | + | API.V2 + Ubuntu + CPU | | | | | | | | | + | `paddle_trainer` + Ubuntu + GPU | | | | | | | | | + | `paddle_trainer` + Ubuntu + CPU | | | | | | | | | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
新手入门章节 识别数字 图像分类词向量 情感分析语意角色标注 机器翻译个性化推荐
API.V2 + Docker + GPU
API.V2 + Docker + CPU
`paddle_trainer` + Docker + GPU
`paddle_trainer` + Docker + CPU
API.V2 + Ubuntu + GPU
API.V2 + Ubuntu + CPU
`paddle_trainer` + Ubuntu + GPU
`paddle_trainer` + Ubuntu + CPU
diff --git a/doc/fluid/getstarted/concepts/save_model/model_format.md b/doc/fluid/getstarted/concepts/save_model/model_format.md index e29129fddf..1f12ba0497 100644 --- a/doc/fluid/getstarted/concepts/save_model/model_format.md +++ b/doc/fluid/getstarted/concepts/save_model/model_format.md @@ -4,30 +4,70 @@ A model is an output of the training process. One complete model consists of two parts, the **topology** and the **parameters**. In order to support industrial deployment, the model format must be self-complete and must not expose any training source code. -As a result, In PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters. +As a result, In PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters. ## Implementation -The topology is saved as a plain text in a detailed self-contain protobuf file. +The topology is saved as a plain text in a detailed self-contain protobuf file. The parameters are saved as a binary file. As we all know, the protobuf message has a limit of [64M size](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We have done a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), which shows that protobuf is not fit for the task. -As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is, +As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is, The table below shows a tensor's byte view in detail. Note that all the signed values are written in the little-endian format. -|field name | type | description | -| --- | --- | --- | -| version | uint32_t | Version of saved file. Always 0 now. | -| tensor desc length | uint32_t | TensorDesc(Protobuf message) length in bytes. | -| tensor desc | void* | TensorDesc protobuf binary message | -| tensor data | void* | Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` | -| lod_level | uint64_t | Level of LoD | -| length of lod[0] | uint64_t | [Optional] length of lod[0] in bytes. | -| data of lod[0] | uint64_t* | [Optional] lod[0].data() | -| ... | ... | ... | - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
field nametype description
version uint32_t Version of saved file. Always 0 now.
tensor desc length uint32_t TensorDesc(Protobuf message) length in bytes.
tensor desc void* TensorDesc protobuf binary message
tensor data void* Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()`
lod_level uint64_t Level of LoD
length of lod[0] uint64_t [Optional] length of lod[0] in bytes.
data of lod[0] uint64_t* [Optional] lod[0].data()
... ... ...
## Summary diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md index 1b6f767869..b99b90056b 100644 --- a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md +++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md @@ -65,10 +65,10 @@ exit(1) **因此,在分布式的Fluid环境中,我们有两个角色需要创建,分别是Parameter Server和Trainer。** -### 分布式训练 +### 分布式训练 Fliud专门提供了工具[Distributed Transpiler](https://github.com/PaddlePaddle/Paddle/blob/ba65d54d9d3b41cd3c5171b00f476d4e60133ddb/doc/fluid/design/dist_train/distributed_architecture.md#distributed-transpiler)用于将单机版的训练程序转换为分布式版本的训练程序。工具背后的理念是找出程序的优化算子和梯度参数,将他们分隔为两部分,通过send/recv 操作算子进行连接,优化算子和梯度参数可以在优化器的minimize函数的返回值中获取到。 ```python -optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) +optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) ``` 将Distributed Transpiler、优化算子和梯度函数放在一个代码中如下: ```python @@ -99,15 +99,51 @@ for pass_id in range(100): ### 分布式训练脚本运行说明 分布式任务的运行需要将表格中说明的多个参数进行赋值: -| 参数名 | 值类型 | 说明 | 示例 | -|:-------------|:------|:---------------------------------------|:-------------| -| trainer_id | int | 当前训练节点的ID,训练节点ID编号为0 - n-1, n为trainers的值 | 0/1/2/3 | -| pservers | str | parameter server 列表 | 127.0.0.1:6710,127.0.0.1:6711 | -| trainers | int | 训练节点的总个数,>0的数字 | 4 | -| server_endpoint | str | 当前所起的服务节点的IP:PORT | 127.0.0.1:8789 | -| training_role | str | 节点角色, TRAINER/PSERVER | PSERVER | - -**注意:** ```training_role```是用来区分当前所起服务的角色的,用于训练程序中,用户可根据需要自行定义,其他参数为fluid.DistributeTranspiler的transpile函数所需要,需要在调用函数前进行定义,样例如下: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
参数名 值类型说明 示例
trainer_id int 当前训练节点的ID,训练节点ID编号为0 - n-1, n为trainers的值 0/1/2/3
pservers str parameter server 列表 127.0.0.1:6710,127.0.0.1:6711
trainers int 训练节点的总个数,>0的数字 4
server_endpoint str 当前所起的服务节点的IP:PORT 127.0.0.1:8789
training_rolestr 节点角色, TRAINER/PSERVER PSERVER
+ + +**注意:** ```training_role```是用来区分当前所起服务的角色的,用于训练程序中,用户可根据需要自行定义,其他参数为fluid.DistributeTranspiler的transpile函数所需要,需要在调用函数前进行定义,样例如下: ```python t = fluid.DistributeTranspiler() diff --git a/doc/fluid/howto/optimization/cpu_profiling_cn.md b/doc/fluid/howto/optimization/cpu_profiling_cn.md index 17f895573a..8266dec3c6 100644 --- a/doc/fluid/howto/optimization/cpu_profiling_cn.md +++ b/doc/fluid/howto/optimization/cpu_profiling_cn.md @@ -42,14 +42,40 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py 每一列的含义是: -| 列名 | 含义 | -| --- | --- | -| ncalls | 函数的调用次数 | -| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 | -| percall | tottime的每次调用平均时间 | -| cumtime | 函数总时间。包含这个函数调用其他函数的时间 | -| percall | cumtime的每次调用平均时间 | -| filename:lineno(function) | 文件名, 行号,函数名 | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
列名含义
ncalls 函数的调用次数
tottime 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间
percall tottime的每次调用平均时间
cumtime 函数总时间。包含这个函数调用其他函数的时间
percall cumtime的每次调用平均时间
filename:lineno(function) 文件名, 行号,函数名
### 寻找性能瓶颈 diff --git a/doc/fluid/howto/optimization/cpu_profiling_en.md b/doc/fluid/howto/optimization/cpu_profiling_en.md index abe4493c17..4447db252f 100644 --- a/doc/fluid/howto/optimization/cpu_profiling_en.md +++ b/doc/fluid/howto/optimization/cpu_profiling_en.md @@ -66,6 +66,41 @@ each column is as follows: | percall | cumtime divided by ncalls | | filename:lineno(function) | where the function is defined | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
columnmeaning
ncalls the number of calls into a function
tottime the total execution time of the function, not including the execution time of other functions called by the function
percall tottime divided by ncalls
cumtime the total execution time of the function, including the execution time of other functions being called
percall cumtime divided by ncalls
filename:lineno(function) where the function is define
+ ### Identify Performance Bottlenecks Usually, `tottime` and the related `percall` time is what we want to From 6a7cba417432a750391c832ad18e805bbf03ab3e Mon Sep 17 00:00:00 2001 From: weixing Date: Tue, 3 Apr 2018 17:31:33 +0800 Subject: [PATCH 16/18] Update cpu_profiling_en.md --- doc/fluid/howto/optimization/cpu_profiling_en.md | 9 --------- 1 file changed, 9 deletions(-) diff --git a/doc/fluid/howto/optimization/cpu_profiling_en.md b/doc/fluid/howto/optimization/cpu_profiling_en.md index 4447db252f..e95556dd60 100644 --- a/doc/fluid/howto/optimization/cpu_profiling_en.md +++ b/doc/fluid/howto/optimization/cpu_profiling_en.md @@ -57,15 +57,6 @@ port, we will see the output like the following: where each line corresponds to Python function, and the meaning of each column is as follows: -| column | meaning | -| --- | --- | -| ncalls | the number of calls into a function | -| tottime | the total execution time of the function, not including the execution time of other functions called by the function | -| percall | tottime divided by ncalls | -| cumtime | the total execution time of the function, including the execution time of other functions being called | -| percall | cumtime divided by ncalls | -| filename:lineno(function) | where the function is defined | - From e81b140437395c6d137adb2a5664ea4390d3f3a0 Mon Sep 17 00:00:00 2001 From: weixing Date: Tue, 3 Apr 2018 17:32:10 +0800 Subject: [PATCH 17/18] Update releasing_process.md --- doc/fluid/dev/releasing_process.md | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/doc/fluid/dev/releasing_process.md b/doc/fluid/dev/releasing_process.md index addd474b69..0810765b85 100644 --- a/doc/fluid/dev/releasing_process.md +++ b/doc/fluid/dev/releasing_process.md @@ -78,27 +78,6 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git- PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。 - -| | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语意角色标注 | 机器翻译 | 个性化推荐 | - -| --- | --- | --- | --- | --- | --- | --- | --- | --- | - -| API.V2 + Docker + GPU | | | | | | | | | - -| API.V2 + Docker + CPU | | | | | | | | | - -| `paddle_trainer` + Docker + GPU | | | | | | | | | - -| `paddle_trainer` + Docker + CPU | | | | | | | | | - -| API.V2 + Ubuntu + GPU | | | | | | | | | - -| API.V2 + Ubuntu + CPU | | | | | | | | | - -| `paddle_trainer` + Ubuntu + GPU | | | | | | | | | - -| `paddle_trainer` + Ubuntu + CPU | | | | | | | | | -
From 3ceff3057b5ca1803ee64679fe24a0bf5512707a Mon Sep 17 00:00:00 2001 From: Kevin Zhan Date: Wed, 4 Apr 2018 00:28:49 +0800 Subject: [PATCH 18/18] Update recurrent_group_en.md (#9445) * Update recurrent_group_en.md Fix https://github.com/PaddlePaddle/Paddle/issues/9013 * Update recurrent_group_en.md --- doc/v2/howto/rnn/recurrent_group_en.md | 95 +++++++++++++++++++++++++- 1 file changed, 94 insertions(+), 1 deletion(-) diff --git a/doc/v2/howto/rnn/recurrent_group_en.md b/doc/v2/howto/rnn/recurrent_group_en.md index d264b0a9f8..de6b60f29e 100644 --- a/doc/v2/howto/rnn/recurrent_group_en.md +++ b/doc/v2/howto/rnn/recurrent_group_en.md @@ -1,3 +1,96 @@ # Recurrent Group Tutorial -TBD +## Overview + +Sequential data is common in natural language processing. + +A sentence is a sequence of words and many sentences form a paragraph further. Therefore, a paragraph can be viewed as a nested sequence with two level, where each element of the sequence is another sequence. That is to say, sequential data could be recursive. An example of two-level recursive sequential data is that an article is composed of a sequence of sentences, and each sentence a sequence of words. + +PaddlePaddle and PaddlePaddle v2 support two-level recursive sequential data. The two-level sequence is a very flexible data, which helps us to better describe more complex language data such as discribing paragraphs and several rounds of dialogues. Based on two-level sequence input, we can design and build a flexible, hierarchical RNN model that encodes input data from the word and sentence level. For the support of arbitrary levels, please refer to PaddlePaddle Fluid. + +In PaddlePaddle, `recurrent_group` is an arbitrarily complex RNN unit. The user only needs to define the calculation that the RNN will complete in one time step. PaddlePaddle is responsible for the propagation of information and error in time series. + +Furthermore, `recurrent_group` can also be extended to handle two-level sequence. By defining two nested `recurrent_group` operations at the clause level and the word level respectively, a hierarchical and complex RNN is finally achieved. + +Currently, in the PaddlePaddle, there are `recurrent_group` and some Layers that can process bidirectional sequences. For details, refer to the document: Layers for supporting double-layer sequences as input. + +## Related Concepts + +### Basic Principle +`recurrent_group` is an arbitrarily complex RNN unit supported by PaddlePaddle. The user only needs to focus on the calculations that the RNN is designed to complete within a single time step. The PaddlePaddle is responsible for completing the propagation of information and gradients over time. + +In PaddlePaddle, a simple call to `recurrent_group` is as follows: + +``` python +recurrent_group(step, input, reverse) +``` +- step: A callable function that defines the calculations completed by the RNN unit within a time step +- input: The input must be a single-layer sequence or a double-layer sequence +- reverse: Whether to process the input sequence in reverse order + +The core of using `recurrent_group` is to design the logic of the step function. The step function can be freely combined with various layers supported by PaddlePaddle to complete arbitrary arithmetic logic. The input of `recurrent_group` (input) becomes the input of the step function. Since the step function only focuses on the calculation within one time step of RNN, here `recurrent_group` completes the splitting of the original input data for us. + +### Input +The input sequence processed by `recurrent_group` is mainly divided into the following three types: + +- **Input Data**: When putting a two-level sequence into `recurrent_group`, it will be disassembled into a single-level sequence. When putting a single-level sequence into `recurrent_group`, it will be disassembled into a non-sequence and then passed to the step function. This process is completely transparent to the user. There are two possible types: 1) User input via data_layer; 2) Output from other layers. + +- **Read-only Memory Input**: `StaticInput` defines a read-only Memory. The input specified by `StaticInput` will not be disassembled by `recurrent_group`, and each time step of the `recurrent_group` loop will always be able to reference all inputs. It may be a non-sequence or a single-layer sequence. + +- **Input of Sequence Generation Task**: `GeneratedInput` is only used to specify input data in a sequence generation task. + +### Input Example + +Sequence generation tasks mostly follow the encoder-decoer architecture. The encoder and decoder can be arbitrary neural network units capable of processing sequences and RNN is the most popular choice. + +Given the encoder output and the current word, the decoder predicts the next most likely word each time. In this structure, the decoder accepts two inputs: + +- Target sequence to be generated: a input of the decoder and the basis of the decoder loop. `recurrent_group` will disassemble this input type. + +- Encoder output, an non-sequencce or single-sequence: a unbounded memory. Each time step in the decoder loop will reference the entire result and should not be disassembled. This type of input must be specified via `StaticInput`. For more discussion on Unbounded Memory, please refer to the paper [Neural Turning Machine](https://arxiv.org/abs/1410.5401). + +In a sequence generation task, the decoder RNN always refers to the word vector of the word predicted at the previous moment as the current time input. `GeneratedInput` will automate this process. + +### Output +The `step` function must return the output of one or more Layers. The output of this Layer will be the final output of the entire `recurrent_group`. In the output process, `recurrent_group` will concatenate the output of each time step, which is also transparent to the user. + +### Memory +Memory can only be defined and used in `recurrent_group`. Memory cannot exist independently and must point to a layer defined by PaddlePaddle. Memory is referenced to get a momentary output from this layer, so memory can be interpreted as a delay operation. + +The user can explicitly specify the output of a layer to initialize the memory. When not specified, memory is initialized to 0 by default. + +## Sequence-level RNN Introduction + +`recurrent_group` helps us to split the input sequence, merge the output, and loop through the sequence of computational logic. + +Using this feature, the two nested `recurrent_group` can handle the nested two-level sequences, implementing sequence-level RNN structures at both the word and sentence levels. + +- Word-level RNN: each state corresponds to a word. +- Sequence-level RNN: a sequence-layer RNN consists of multiple word-layer RNNs. Each word-layer RNN (ie, each state of a sequence-layer RNN) has a subsequence. + +For convenience of description, the following takes the NLP task as an example. A paragraph containing a subsequence is defined as a two-level sequence, and a sentence containing a word is defined as a single-layer sequence. Then, the zero-level sequence is a word. + +## Usage of Sequence-level RNN + +### Usage of Training Process +Using `recurrent_group` requires the following conventions: + +- **Single-input Single-output**: Both input and output are single layer sequences. + - If there are multiple inputs, the number of words in different input sequences must be exactly equal. + - A single-layer sequence is output, and the number of words in the output sequence is the same as the input sequence. + - memory: define memory to point to a layer in the step function, get a moment output from this layer by referencing memory to form a recurrent connection. The is_seq parameter of memory must be false. If memory is not defined, the operations within each time step are independent. + - boot_layer: the initial state of memory, set 0 by default. is_seq in memory must be false. + +- **Double-input Double-output**: Both input and output are two-level sequence. + - If there are multiple input sequences, the number of subsequence contained in different inputs must be strictly equal, but the number of words in the subsequence may not be equal. + - output a two-level sequence. The number of subsequence and the number of words are the same as the specified input sequence and the first input is default. + - memory: defining memory in the step function, pointing to a layer, by referring to the memory to get the output of this layer at a time, forming a recurrent connection. The memory defined in the outer `recurrent_group` step function can record the state of the previous subsequence, either as a single-level sequence (only as read-only memory) or as a word. If memory is not defined, the operations between subsequence are independent. + - boot_layer: the initial state of memory. It is either a single-level sequence (only as read-only memory) or a vector. The default is not set, that is, the initial state is 0. + +- **Double-input Single-output**: not support for now, and output the error with "In hierachical RNN, all out links should be from sequences now". + +### Usage of Generation Process +Using `beam_search` need follow those conventions: + +- Word-level RNN: generate the next word from a word. +- Sequence-level RNN: the single-layer RNN generated subsequence is concatenated into a new double-layer sequence. Semantically, there is no case where a subsequence generates the next subseq directly.