From c69d2bbeddea61acfb382ea53c40e6ebdfa5c85d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 26 Oct 2018 19:20:27 +0800 Subject: [PATCH 001/197] Add base impl --- .../operators/fused_embedding_seq_pool_op.cc | 158 +++++++++++++ .../operators/fused_embedding_seq_pool_op.h | 207 ++++++++++++++++++ 2 files changed, 365 insertions(+) create mode 100644 paddle/fluid/operators/fused_embedding_seq_pool_op.cc create mode 100644 paddle/fluid/operators/fused_embedding_seq_pool_op.h diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc new file mode 100644 index 0000000000..ea96078291 --- /dev/null +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc @@ -0,0 +1,158 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused_embedding_seq_pool_op.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { + +class LookupTableOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input(W) of LookupTableOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Ids"), + "Input(Ids) of LookupTableOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LookupTableOp should not be null."); + + auto table_dims = ctx->GetInputDim("W"); + auto ids_dims = ctx->GetInputDim("Ids"); + int ids_rank = ids_dims.size(); + + PADDLE_ENFORCE_EQ(table_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, + "The last dimension of the 'Ids' tensor must be 1."); + + auto output_dims = + framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1)); + output_dims.push_back(table_dims[1]); + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); + + if (ctx->GetOutputsVarType("Out")[0] == + framework::proto::VarType::LOD_TENSOR) { + ctx->ShareLoD("Ids", /*->*/ "Out"); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("W", + "(Tensor) The input represents embedding tensors, " + "which is a learnable parameter."); + AddInput("Ids", + "An input with type int32 or int64 " + "contains the ids to be looked up in W. 
" + "The last dimension size must be 1."); + AddOutput("Out", "The lookup results, which have the same type as W."); + AddAttr("is_sparse", + "(boolean, default false) " + "Sparse update.") + .SetDefault(false); + AddAttr("is_distributed", + "(boolean, default false) distributed lookup table.") + .SetDefault(false); + AddAttr("padding_idx", + "(int64, default -1) " + "If the value is -1, it makes no effect to lookup. " + "Otherwise the given value indicates padding the output " + "with zeros whenever lookup encounters it in Ids.") + .SetDefault(kNoPadding); + AddComment(R"DOC( +Lookup Table Operator. + +This operator is used to perform lookups on the parameter W, +then concatenated into a dense tensor. + +The input Ids can carry the LoD (Level of Details) information, +or not. And the output only shares the LoD information with input Ids. + +)DOC"); + } +}; + +class LookupTableOpGradDescMaker + : public framework::DefaultGradOpDescMaker { + using ::paddle::framework::DefaultGradOpDescMaker< + true>::DefaultGradOpDescMaker; + + protected: + virtual std::string GradOpType() const { return "lookup_table_grad"; } +}; + +class LookupTableOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto table_dims = ctx->GetInputDim("W"); + ctx->SetOutputDim(framework::GradVarName("W"), table_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); + auto attr = op_desc.GetAttr("is_sparse"); + bool is_sparse = boost::get(attr); + if (is_sparse) { + VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") + << " is set to SelectedRows"; + block->Var(out_var_name) + ->SetType(framework::proto::VarType::SELECTED_ROWS); + } else { + VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") + << " is set to LoDTensor"; + block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR); + } + block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(lookup_table, ops::LookupTableOp, + ops::LookupTableOpGradDescMaker, ops::LookupTableOpMaker); +REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad, + ops::LookupTableOpGradVarTypeInference); + +// REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel, +// ops::LookupTableKernel); +// REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel, +// ops::LookupTableGradKernel); +REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel); +REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel); diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused_embedding_seq_pool_op.h new file mode 100644 index 0000000000..6dcf4f44a7 --- /dev/null +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.h @@ -0,0 +1,207 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +constexpr int64_t kNoPadding = -1; + +template +class LookupTableKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *ids_t = context.Input("Ids"); // int tensor + auto *output_t = context.Output("Out"); // float tensor + auto *table_var = context.InputVar("W"); + + int64_t padding_idx = context.Attr("padding_idx"); + int64_t *ids = const_cast(ids_t->data()); + int64_t ids_numel = ids_t->numel(); + + if (table_var->IsType()) { + auto *table_t = context.Input("W"); + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_LT(ids[i], row_number); + PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i); + memcpy(output + i * row_width, table + ids[i] * row_width, + row_width * sizeof(T)); + } + } + } else if (table_var->IsType()) { + const auto &table_t = table_var->Get(); + int64_t row_width = table_t.value().dims()[1]; + const auto *table = table_t.value().data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + auto blas = math::GetBlas(context); + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_GE(ids[i], 0); + auto id_index = table_t.Index(ids[i]); + PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists."); + // memcpy(output + i * row_width, table + id_index * row_width, + // row_width * sizeof(T)); + blas.VCOPY(row_width, table + id_index * row_width, + output + i * row_width); + } + } + } + } +}; + +template +class LookupTableGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *table_var = context.InputVar("W"); + DDim table_dim; + if (table_var->IsType()) { + table_dim = context.Input("W")->dims(); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); + table_dim = table_t->value().dims(); + } else { + PADDLE_THROW( + "The parameter W of a LookupTable " + "must be either LoDTensor or SelectedRows"); + } + + bool is_sparse = context.Attr("is_sparse"); + // Since paddings are not trainable and fixed in forward, the gradient of + // paddings 
makes no sense and we don't deal with it in backward. + if (is_sparse) { + // auto start = std::chrono::system_clock::now(); + auto *ids = context.Input("Ids"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); + + auto *ids_data = ids->data(); + int64_t ids_num = ids->numel(); + // auto end = std::chrono::system_clock::now(); + // std::chrono::duration diff = end - start; + + // auto copy_start = std::chrono::system_clock::now(); + std::vector new_rows; + new_rows.resize(ids_num); + std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t)); + // for (int64_t i = 0; i < ids_num; i++) { + // new_rows.push_back(ids_data[i]); + // } + // auto copy_end = std::chrono::system_clock::now(); + // std::chrono::duration copy_diff = copy_end - copy_start; + // diff += copy_diff; + // LOG(ERROR) << "run emb_grad copy end, cost: " << copy_diff.count() << " + // " << ids_num; + + // copy_start = std::chrono::system_clock::now(); + d_table->set_rows(new_rows); + + auto *d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_num, table_dim[1]}); + d_table_value->ShareDataWith(*d_output); + // d_table_value->mutable_data(context.GetPlace()); + + // // copy_end = std::chrono::system_clock::now(); + // // copy_diff = copy_end - copy_start; + // // diff += copy_diff; + // // LOG(ERROR) << "run emb_grad resize table end, cost: " << + // // copy_diff.count() << " " << ids_num; + + // // copy_start = std::chrono::system_clock::now(); + // d_table->set_height(table_dim[0]); + + // auto *d_output_data = d_output->data(); + // auto *d_table_data = d_table_value->data(); + + // // copy_end = std::chrono::system_clock::now(); + // // copy_diff = copy_end - copy_start; + // // diff += copy_diff; + // // LOG(ERROR) << "run emb_grad set height end, cost: " << + // // copy_diff.count() << " " << ids_num; + + // auto d_output_dims = d_output->dims(); + // PADDLE_ENFORCE_EQ( + // d_table_value->dims(), + // framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1)); + // // copy_start = std::chrono::system_clock::now(); + // auto blas = math::GetBlas(context); + // blas.VCOPY(d_output->numel(), d_output_data, d_table_data); + // cblas_scopy(d_output->numel(), d_output_data, 1, d_table_data, 1); + // // for (int i = 0; i != d_output->numel(), ++i) { + // // *(d_table_data++) = *(d_output_data++); + // // } + // // memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); + // // copy_end = std::chrono::system_clock::now(); + // // copy_diff = copy_end - copy_start; + // // diff += copy_diff; + // // LOG(ERROR) << "run emb_grad core end, cost: " << copy_diff.count() + // << " + // // " << ids_num << " " << d_output->numel(); + + // // LOG(ERROR) << "run emb_grad end, cost: " << diff.count(); + } else { + auto *ids = context.Input("Ids"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); + + auto *ids_data = ids->data(); + + int N = table_dim[0]; + int D = table_dim[1]; + + auto *d_output_data = d_output->data(); + auto *d_table_data = d_table->mutable_data(context.GetPlace()); + + memset(d_table_data, 0, d_table->numel() * sizeof(T)); + + for (int64_t i = 0; i < ids->numel(); ++i) { + PADDLE_ENFORCE_LT(ids_data[i], N); + PADDLE_ENFORCE_GE(ids_data[i], 0); + for (int j = 0; j < D; ++j) { + d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j]; + } + } + } + } +}; + +} // namespace operators +} // namespace paddle From 
6db8c3bfeafca8b1522de32f56c450db473bd3e9 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 5 Nov 2018 15:31:19 +0800 Subject: [PATCH 002/197] Implement the infer shape and infer var type --- .../operators/fused_embedding_seq_pool_op.cc | 116 +++++++++++------- .../operators/fused_embedding_seq_pool_op.h | 2 - 2 files changed, 70 insertions(+), 48 deletions(-) diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc index ea96078291..5ebaf865fc 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc @@ -18,34 +18,53 @@ limitations under the License. */ namespace paddle { namespace operators { -class LookupTableOp : public framework::OperatorWithKernel { +class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("W"), - "Input(W) of LookupTableOp should not be null."); + "Input W of FusedEmbeddingSeqPoolOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Ids"), - "Input(Ids) of LookupTableOp should not be null."); + "Input Ids of FusedEmbeddingSeqPoolOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of LookupTableOp should not be null."); + "Output of FusedEmbeddingSeqPoolOp should not be null."); auto table_dims = ctx->GetInputDim("W"); auto ids_dims = ctx->GetInputDim("Ids"); - int ids_rank = ids_dims.size(); + const std::string& combiner = ctx->Attrs().Get("combiner"); PADDLE_ENFORCE_EQ(table_dims.size(), 2); - PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, + PADDLE_ENFORCE_GE(ids_dims.size(), 1u, + "The dim size of the 'Ids' tensor must greater than 1."); + PADDLE_ENFORCE_EQ(ids_dims[ids_dims.size() - 1], 1, "The last dimension of the 'Ids' tensor must be 1."); + // we only support sum now + PADDLE_ENFORCE_EQ(combiner, "sum"); - auto output_dims = - framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1)); - output_dims.push_back(table_dims[1]); - ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); + if (ctx->IsRuntime()) { + Variable* ids_var = boost::get(ctx->GetInputVarPtrs("Ids")[0]); + const auto& ids_lod = ids_var->Get().lod(); - if (ctx->GetOutputsVarType("Out")[0] == - framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("Ids", /*->*/ "Out"); + // in run time, the LoD of ids must be 1 + PADDLE_ENFORCE(ids_lod.size(), 1u, + "The LoD level of Input(Ids) must be 1"); + PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty"); + + size_t batch_size = ids_lod[0].size() - 1; + + // in run time, the shape from Ids -> output + // should be [seq_length, 1] -> [batch_size, embedding_size] + ctx->SetOutputDim("Out", + framework::make_ddim({batch_size, table_dims[1]})); + } else { + // in compile time, the lod level of ids must be 1 + VarDesc* ids_desc = boost::get(ctx->GetInputVarPtrs("Ids")[0]); + PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1); + + // in compile time, the shape from Ids -> output + // should be [-1, 1] -> [-1, embedding_size] + ctx->SetOutputDim("Out", framework::make_ddim({-1, table_dims[1]})); } } @@ -57,7 +76,7 @@ class LookupTableOp : public framework::OperatorWithKernel { } }; -class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { +class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("W", @@ -68,42 +87,44 @@ 
class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { "contains the ids to be looked up in W. " "The last dimension size must be 1."); AddOutput("Out", "The lookup results, which have the same type as W."); + AddAttr("combiner", + "(string, default sum) " + "A string specifying the reduction op. Currently sum " + "are supported, sum computes the weighted sum of the " + "embedding results for each row.") + .SetDefault("sum"); AddAttr("is_sparse", "(boolean, default false) " "Sparse update.") .SetDefault(false); - AddAttr("is_distributed", - "(boolean, default false) distributed lookup table.") - .SetDefault(false); - AddAttr("padding_idx", - "(int64, default -1) " - "If the value is -1, it makes no effect to lookup. " - "Otherwise the given value indicates padding the output " - "with zeros whenever lookup encounters it in Ids.") - .SetDefault(kNoPadding); AddComment(R"DOC( -Lookup Table Operator. +FusedEmbeddingSeqPool Operator. + +Computes embeddings for the given ids and weights. This operator is used to perform lookups on the parameter W, -then concatenated into a dense tensor. +then computes the weighted sum of the lookups results for each row +and concatenated into a dense tensor. -The input Ids can carry the LoD (Level of Details) information, -or not. And the output only shares the LoD information with input Ids. +The input Ids should carry the LoD (Level of Details) information. +And the output will change the LoD information with input Ids. )DOC"); } }; -class LookupTableOpGradDescMaker +class FusedEmbeddingSeqPoolOpGradDescMaker : public framework::DefaultGradOpDescMaker { using ::paddle::framework::DefaultGradOpDescMaker< true>::DefaultGradOpDescMaker; protected: - virtual std::string GradOpType() const { return "lookup_table_grad"; } + virtual std::string GradOpType() const { + return "fused_embedding_seq_pool_grad"; + } }; -class LookupTableOpGrad : public framework::OperatorWithKernel { +class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -120,7 +141,8 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { } }; -class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { +class FusedEmbeddingSeqPoolOpGradVarTypeInference + : public framework::VarTypeInference { public: void operator()(const framework::OpDesc& op_desc, framework::BlockDesc* block) const override { @@ -128,13 +150,13 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { auto attr = op_desc.GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { - VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") - << " is set to SelectedRows"; + VLOG(3) << "fused_embedding_seq_pool_grad op " + << framework::GradVarName("W") << " is set to SelectedRows"; block->Var(out_var_name) ->SetType(framework::proto::VarType::SELECTED_ROWS); } else { - VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") - << " is set to LoDTensor"; + VLOG(3) << "fused_embedding_seq_pool_grad op " + << framework::GradVarName("W") << " is set to LoDTensor"; block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR); } block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType()); @@ -145,14 +167,16 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(lookup_table, ops::LookupTableOp, - ops::LookupTableOpGradDescMaker, 
ops::LookupTableOpMaker); -REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad, - ops::LookupTableOpGradVarTypeInference); - -// REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel, -// ops::LookupTableKernel); -// REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel, -// ops::LookupTableGradKernel); -REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel); -REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel); +REGISTER_OPERATOR(fused_embedding_seq_pool, ops::FusedEmbeddingSeqPoolOp, + ops::FusedEmbeddingSeqPoolOpGradDescMaker, + ops::FusedEmbeddingSeqPoolOpMaker); +REGISTER_OPERATOR(fused_embedding_seq_pool_grad, + ops::FusedEmbeddingSeqPoolOpGrad, + ops::FusedEmbeddingSeqPoolOpGradVarTypeInference); + +REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool, + ops::FusedEmbeddingSeqPoolKernel, + ops::FusedEmbeddingSeqPoolKernel); +REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool_grad, + ops::FusedEmbeddingSeqPoolGradKernel, + ops::FusedEmbeddingSeqPoolGradKernel); diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused_embedding_seq_pool_op.h index 6dcf4f44a7..24cffc60a8 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.h @@ -31,8 +31,6 @@ using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; -constexpr int64_t kNoPadding = -1; - template class LookupTableKernel : public framework::OpKernel { public: From 17c8014fcd2071920a605f12951d4f6ae1ddcab9 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 6 Nov 2018 17:42:43 +0800 Subject: [PATCH 003/197] Complete implementation test=develop --- .../operators/fused_embedding_seq_pool_op.cc | 6 + .../operators/fused_embedding_seq_pool_op.h | 182 ++++++------------ 2 files changed, 63 insertions(+), 125 deletions(-) diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc index 5ebaf865fc..e862769051 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc @@ -93,6 +93,12 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker { "are supported, sum computes the weighted sum of the " "embedding results for each row.") .SetDefault("sum"); + // NOTE(minqiyang): grad_inplace is an temporal attribute, + // please do NOT set this attribute in python layer. 
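+  // Presumably (inferred from the attribute doc string below), a true value
+  // lets the grad kernel reuse the memory of Out@GRAD for W@GRAD instead of
+  // allocating and filling a separate buffer.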
+ AddAttr("grad_inplace", + "(boolean, default false) " + "If the grad op reuse the input's variable.") + .SetDefault(false); AddAttr("is_sparse", "(boolean, default false) " "Sparse update.") diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused_embedding_seq_pool_op.h index 24cffc60a8..5af234b937 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.h @@ -31,62 +31,54 @@ using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; +template +struct EmbeddingVSumFunctor { + void operator()(const DeviceContext &context, LoDTensor *table_t, + LoDTensor *ids_t, LoDTensor *output_t) { + auto *table = table_t->data(); + int64_t row_number = table->dims()[0]; + int64_t row_width = table->dims()[1]; + int64_t *ids = const_cast(ids_t->data()); + auto ids_lod = ids_t->LoD()[0]; + auto *output = output_t->mutable_data(context.GetPlace()); + + auto blas = math::GetBlas(context); + for (int64_t i = 0; i != ids_lod.size() - 1; ++i) { + size_t begin = ids_lod[i]; + + PADDLE_ENFORCE_LT(ids[begin], row_number); + PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); + blas.VCOPY(row_width, table + ids[begin] * row_width, + output + i * row_width); + + for (int64_t r = ids_lod[i] + 1; r < ids_lod[i + 1]; ++r) { + PADDLE_ENFORCE_LT(ids[r], row_number); + PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i); + blas.AXPY(row_width, 1., table + ids[r] * row_width, + output + i * row_width); + } + } + } +}; + template -class LookupTableKernel : public framework::OpKernel { +class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto *ids_t = context.Input("Ids"); // int tensor - auto *output_t = context.Output("Out"); // float tensor - auto *table_var = context.InputVar("W"); - - int64_t padding_idx = context.Attr("padding_idx"); - int64_t *ids = const_cast(ids_t->data()); - int64_t ids_numel = ids_t->numel(); - - if (table_var->IsType()) { - auto *table_t = context.Input("W"); - int64_t row_number = table_t->dims()[0]; - int64_t row_width = table_t->dims()[1]; - - auto *table = table_t->data(); - auto *output = output_t->mutable_data(context.GetPlace()); - - for (int64_t i = 0; i < ids_numel; ++i) { - if (padding_idx != kNoPadding && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(T)); - } else { - PADDLE_ENFORCE_LT(ids[i], row_number); - PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i); - memcpy(output + i * row_width, table + ids[i] * row_width, - row_width * sizeof(T)); - } - } - } else if (table_var->IsType()) { - const auto &table_t = table_var->Get(); - int64_t row_width = table_t.value().dims()[1]; - const auto *table = table_t.value().data(); - auto *output = output_t->mutable_data(context.GetPlace()); - - auto blas = math::GetBlas(context); - for (int64_t i = 0; i < ids_numel; ++i) { - if (padding_idx != kNoPadding && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(T)); - } else { - PADDLE_ENFORCE_GE(ids[i], 0); - auto id_index = table_t.Index(ids[i]); - PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists."); - // memcpy(output + i * row_width, table + id_index * row_width, - // row_width * sizeof(T)); - blas.VCOPY(row_width, table + id_index * row_width, - output + i * row_width); - } - } + LoDTensor *ids_t = context.Input("Ids"); // int tensor + LoDTensor *output_t = context.Output("Out"); // 
float tensor + LoDTensor *table_var = context.Input("W"); + const std::string &combiner_type = context.Attr("combiner"); + + if (combiner_type == "sum") { + EmbeddingVSumFunctor functor; + functor(context.template device_context(), ids_t, output_t, table_var); } } }; template -class LookupTableGradKernel : public framework::OpKernel { +class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *table_var = context.InputVar("W"); @@ -106,97 +98,37 @@ class LookupTableGradKernel : public framework::OpKernel { // Since paddings are not trainable and fixed in forward, the gradient of // paddings makes no sense and we don't deal with it in backward. if (is_sparse) { - // auto start = std::chrono::system_clock::now(); auto *ids = context.Input("Ids"); auto *d_output = context.Input(framework::GradVarName("Out")); auto *d_table = context.Output(framework::GradVarName("W")); auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); - // auto end = std::chrono::system_clock::now(); - // std::chrono::duration diff = end - start; + auto lod = ids->lod()[0]; + int64_t row_width = table_dim[1]; - // auto copy_start = std::chrono::system_clock::now(); - std::vector new_rows; + framework::Vector new_rows; new_rows.resize(ids_num); std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t)); - // for (int64_t i = 0; i < ids_num; i++) { - // new_rows.push_back(ids_data[i]); - // } - // auto copy_end = std::chrono::system_clock::now(); - // std::chrono::duration copy_diff = copy_end - copy_start; - // diff += copy_diff; - // LOG(ERROR) << "run emb_grad copy end, cost: " << copy_diff.count() << " - // " << ids_num; - - // copy_start = std::chrono::system_clock::now(); d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); - d_table_value->Resize({ids_num, table_dim[1]}); - d_table_value->ShareDataWith(*d_output); - // d_table_value->mutable_data(context.GetPlace()); - - // // copy_end = std::chrono::system_clock::now(); - // // copy_diff = copy_end - copy_start; - // // diff += copy_diff; - // // LOG(ERROR) << "run emb_grad resize table end, cost: " << - // // copy_diff.count() << " " << ids_num; - - // // copy_start = std::chrono::system_clock::now(); - // d_table->set_height(table_dim[0]); - - // auto *d_output_data = d_output->data(); - // auto *d_table_data = d_table_value->data(); - - // // copy_end = std::chrono::system_clock::now(); - // // copy_diff = copy_end - copy_start; - // // diff += copy_diff; - // // LOG(ERROR) << "run emb_grad set height end, cost: " << - // // copy_diff.count() << " " << ids_num; - - // auto d_output_dims = d_output->dims(); - // PADDLE_ENFORCE_EQ( - // d_table_value->dims(), - // framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1)); - // // copy_start = std::chrono::system_clock::now(); - // auto blas = math::GetBlas(context); - // blas.VCOPY(d_output->numel(), d_output_data, d_table_data); - // cblas_scopy(d_output->numel(), d_output_data, 1, d_table_data, 1); - // // for (int i = 0; i != d_output->numel(), ++i) { - // // *(d_table_data++) = *(d_output_data++); - // // } - // // memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); - // // copy_end = std::chrono::system_clock::now(); - // // copy_diff = copy_end - copy_start; - // // diff += copy_diff; - // // LOG(ERROR) << "run emb_grad core end, cost: " << copy_diff.count() - // << " - // // " << ids_num << " " << d_output->numel(); - - // // LOG(ERROR) << "run 
emb_grad end, cost: " << diff.count(); - } else { - auto *ids = context.Input("Ids"); - auto *d_output = context.Input(framework::GradVarName("Out")); - auto *d_table = context.Output(framework::GradVarName("W")); - - auto *ids_data = ids->data(); - - int N = table_dim[0]; - int D = table_dim[1]; - - auto *d_output_data = d_output->data(); - auto *d_table_data = d_table->mutable_data(context.GetPlace()); - - memset(d_table_data, 0, d_table->numel() * sizeof(T)); - - for (int64_t i = 0; i < ids->numel(); ++i) { - PADDLE_ENFORCE_LT(ids_data[i], N); - PADDLE_ENFORCE_GE(ids_data[i], 0); - for (int j = 0; j < D; ++j) { - d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j]; + d_table_value->Resize({ids_num, row_width}); + T *d_table_data = d_table_value->mutable_data(context.GetPlace()); + const T *d_output_data = d_output->data(); + + auto blas = math::GetBlas(context); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { + int64_t h = static_cast(lod[i + 1] - lod[i]); + int64_t in_offset = lod[i] * row_width; + const T *out_pos = d_output_data + i * row_width; + T *in_pos = d_table_data + in_offset; + for (int r = 0; r != h; ++r) { + blas.VCOPY(row_width, out_pos, in_pos + r * row_width); } } + } else { + LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now"; } } }; From 8a412c0d3308a0c9b90e8e7295ac117b6735b533 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 6 Nov 2018 20:04:05 +0800 Subject: [PATCH 004/197] Complete impl --- .../operators/fused_embedding_seq_pool_op.cc | 18 ++++--- .../operators/fused_embedding_seq_pool_op.h | 49 +++++++++++-------- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc index e862769051..6b6b898d4c 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc @@ -42,8 +42,14 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { // we only support sum now PADDLE_ENFORCE_EQ(combiner, "sum"); + int64_t last_dim = table_dims[1]; + for (int i = 1; i != ids_dims.size(); ++i) { + last_dim *= ids_dims[i]; + } + if (ctx->IsRuntime()) { - Variable* ids_var = boost::get(ctx->GetInputVarPtrs("Ids")[0]); + framework::Variable* ids_var = + boost::get(ctx->GetInputVarPtrs("Ids")[0]); const auto& ids_lod = ids_var->Get().lod(); // in run time, the LoD of ids must be 1 @@ -51,20 +57,20 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { "The LoD level of Input(Ids) must be 1"); PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty"); - size_t batch_size = ids_lod[0].size() - 1; + int64_t batch_size = ids_lod[0].size() - 1; // in run time, the shape from Ids -> output // should be [seq_length, 1] -> [batch_size, embedding_size] - ctx->SetOutputDim("Out", - framework::make_ddim({batch_size, table_dims[1]})); + ctx->SetOutputDim("Out", framework::make_ddim({batch_size, last_dim})); } else { // in compile time, the lod level of ids must be 1 - VarDesc* ids_desc = boost::get(ctx->GetInputVarPtrs("Ids")[0]); + framework::VarDesc* ids_desc = + boost::get(ctx->GetInputVarPtrs("Ids")[0]); PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1); // in compile time, the shape from Ids -> output // should be [-1, 1] -> [-1, embedding_size] - ctx->SetOutputDim("Out", framework::make_ddim({-1, table_dims[1]})); + ctx->SetOutputDim("Out", framework::make_ddim({-1, last_dim})); } } diff --git 
a/paddle/fluid/operators/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused_embedding_seq_pool_op.h index 5af234b937..7385c8da33 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.h @@ -31,31 +31,38 @@ using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; -template +template struct EmbeddingVSumFunctor { - void operator()(const DeviceContext &context, LoDTensor *table_t, - LoDTensor *ids_t, LoDTensor *output_t) { + void operator()(const framework::ExecutionContext &context, + const LoDTensor *table_t, const LoDTensor *ids_t, + LoDTensor *output_t) { auto *table = table_t->data(); - int64_t row_number = table->dims()[0]; - int64_t row_width = table->dims()[1]; + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + int64_t last_dim = output_t->dims()[1]; int64_t *ids = const_cast(ids_t->data()); - auto ids_lod = ids_t->LoD()[0]; + auto ids_lod = ids_t->lod()[0]; + int64_t ids_count = ids_t->numel() / ids_lod.back(); + auto *output = output_t->mutable_data(context.GetPlace()); - auto blas = math::GetBlas(context); + auto blas = math::GetBlas(context); for (int64_t i = 0; i != ids_lod.size() - 1; ++i) { - size_t begin = ids_lod[i]; + for (int64_t j = 0; j != ids_count; ++j) { + size_t begin = ids_lod[i] * ids_count; - PADDLE_ENFORCE_LT(ids[begin], row_number); - PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); - blas.VCOPY(row_width, table + ids[begin] * row_width, - output + i * row_width); + PADDLE_ENFORCE_LT(ids[begin], row_number); + PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); + blas.VCOPY(row_width, table + ids[begin] * row_width, + output + i * last_dim + j * row_width); + } - for (int64_t r = ids_lod[i] + 1; r < ids_lod[i + 1]; ++r) { + for (int64_t r = (ids_lod[i] + 1) * ids_count; + r < ids_lod[i + 1] * ids_count; ++r) { PADDLE_ENFORCE_LT(ids[r], row_number); PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i); blas.AXPY(row_width, 1., table + ids[r] * row_width, - output + i * row_width); + output + i * row_width + (r % ids_count) * row_width); } } } @@ -65,14 +72,14 @@ template class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - LoDTensor *ids_t = context.Input("Ids"); // int tensor - LoDTensor *output_t = context.Output("Out"); // float tensor - LoDTensor *table_var = context.Input("W"); + const LoDTensor *ids_t = context.Input("Ids"); // int tensor + LoDTensor *output_t = context.Output("Out"); // float tensor + const LoDTensor *table_var = context.Input("W"); const std::string &combiner_type = context.Attr("combiner"); if (combiner_type == "sum") { EmbeddingVSumFunctor functor; - functor(context.template device_context(), ids_t, output_t, table_var); + functor(context, table_var, ids_t, output_t); } } }; @@ -105,7 +112,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); auto lod = ids->lod()[0]; - int64_t row_width = table_dim[1]; + int64_t row_width = d_output->dims()[1]; framework::Vector new_rows; new_rows.resize(ids_num); @@ -113,11 +120,11 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); - d_table_value->Resize({ids_num, row_width}); + d_table_value->Resize({ids_num, table_dim[1]}); T *d_table_data = 
d_table_value->mutable_data(context.GetPlace()); const T *d_output_data = d_output->data(); - auto blas = math::GetBlas(context); + auto blas = math::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { int64_t h = static_cast(lod[i + 1] - lod[i]); int64_t in_offset = lod[i] * row_width; From 3d784c27011a127de3c5730d8ee121102fadba6f Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 6 Nov 2018 20:05:18 +0800 Subject: [PATCH 005/197] Polish code --- paddle/fluid/operators/fused_embedding_seq_pool_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc index 6b6b898d4c..966bdb4df5 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc @@ -35,7 +35,7 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { const std::string& combiner = ctx->Attrs().Get("combiner"); PADDLE_ENFORCE_EQ(table_dims.size(), 2); - PADDLE_ENFORCE_GE(ids_dims.size(), 1u, + PADDLE_ENFORCE_GE(ids_dims.size(), 1, "The dim size of the 'Ids' tensor must greater than 1."); PADDLE_ENFORCE_EQ(ids_dims[ids_dims.size() - 1], 1, "The last dimension of the 'Ids' tensor must be 1."); From 0f91beefd1f70b1596e657ab4cbf77c3d2c9a574 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 6 Nov 2018 23:23:09 +0800 Subject: [PATCH 006/197] Fix bug test=develop --- paddle/fluid/operators/fused_embedding_seq_pool_op.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused_embedding_seq_pool_op.h index 7385c8da33..f37c688395 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.h @@ -53,7 +53,7 @@ struct EmbeddingVSumFunctor { PADDLE_ENFORCE_LT(ids[begin], row_number); PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); - blas.VCOPY(row_width, table + ids[begin] * row_width, + blas.VCOPY(row_width, table + ids[begin + j] * row_width, output + i * last_dim + j * row_width); } @@ -62,7 +62,7 @@ struct EmbeddingVSumFunctor { PADDLE_ENFORCE_LT(ids[r], row_number); PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i); blas.AXPY(row_width, 1., table + ids[r] * row_width, - output + i * row_width + (r % ids_count) * row_width); + output + i * last_dim + (r % ids_count) * row_width); } } } From 849fbc7327935cfbe43f85744e71db515efa760d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 6 Nov 2018 23:23:33 +0800 Subject: [PATCH 007/197] Add unittest test=develop --- .../unittests/test_fused_emb_seq_pool_op.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py diff --git a/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py new file mode 100644 index 0000000000..584e309bef --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py @@ -0,0 +1,51 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid.op import Operator +import paddle.compat as cpt + + +class TestFusedEmbeddingSeqPoolOp(OpTest): + def setUp(self): + self.op_type = "fused_embedding_seq_pool" + self.emb_size = 2 + table = np.random.random((17, self.emb_size)).astype("float32") + ids = np.array([[[4], [3]], [[4], [3]], [[2], [1]], + [[16], [1]]]).astype("int64") + merged_ids = np.array([4, 2, 16]).astype("int64") + ids_expand = np.expand_dims(ids, axis=1) + self.lod = [[3, 1]] + self.attrs = {'is_sparse': True} + self.inputs = {'W': table, 'Ids': (ids_expand, self.lod)} + self.outputs = { + 'Out': np.reshape( + np.array([ + table[[4, 3]] + table[[4, 3]] + table[[2, 1]], + table[[16, 1]] + ]), [len(self.lod[0]), 2 * self.emb_size]) + } + + def test_check_output(self): + self.check_output() + + +if __name__ == "__main__": + unittest.main() From b0afdc4e7d57b2122da6484421fde65a10e4c783 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 13 Nov 2018 15:59:34 +0800 Subject: [PATCH 008/197] Add CMake deps --- paddle/fluid/operators/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 919ad96f7a..5e421803c3 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -269,6 +269,7 @@ else() set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) endif() op_library(hash_op DEPS xxhash) +op_library(fused_hash_embedding_seq_pool DEPS xxhash) op_library(clip_by_norm_op DEPS selected_rows_functor selected_rows) op_library(sum_op DEPS selected_rows_functor) op_library(sgd_op DEPS selected_rows_functor) From 32ebee9f077956046a310d6fe3ad194650f579fa Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 13 Nov 2018 16:05:06 +0800 Subject: [PATCH 009/197] Polish code --- paddle/fluid/operators/fused_embedding_seq_pool_op.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused_embedding_seq_pool_op.h index f37c688395..38dfae8ad6 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.h @@ -48,9 +48,8 @@ struct EmbeddingVSumFunctor { auto blas = math::GetBlas(context); for (int64_t i = 0; i != ids_lod.size() - 1; ++i) { + size_t begin = ids_lod[i] * ids_count; for (int64_t j = 0; j != ids_count; ++j) { - size_t begin = ids_lod[i] * ids_count; - PADDLE_ENFORCE_LT(ids[begin], row_number); PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); blas.VCOPY(row_width, table + ids[begin + j] * row_width, @@ -114,10 +113,9 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { auto lod = ids->lod()[0]; int64_t row_width = d_output->dims()[1]; - framework::Vector new_rows; - new_rows.resize(ids_num); - std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t)); - d_table->set_rows(new_rows); + framework::Vector *new_rows = d_table->mutable_rows(); + 
new_rows->resize(ids_num); + std::memcpy(&(*new_rows)[0], ids_data, ids_num * sizeof(int64_t)); auto *d_table_value = d_table->mutable_value(); d_table_value->Resize({ids_num, table_dim[1]}); From 4cb0100c8ea714e4ce7f8c0cd3c9ebc50aff9e35 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 6 Dec 2018 16:59:53 +0800 Subject: [PATCH 010/197] add prefetch in nce --- paddle/fluid/operators/nce_op.cc | 18 +++++ paddle/fluid/operators/nce_op.h | 67 ++++++++++++++++--- .../fluid/transpiler/distribute_transpiler.py | 2 +- 3 files changed, 78 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 9f97f7821d..06ff825fde 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -155,6 +155,24 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("is_sparse", "(boolean, default false) Sparse update.") .SetDefault(false); + // for parameter prefetch + AddAttr("remote_prefetch", "").SetDefault(false); + AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + AddAttr>( + "epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input variables for mapping") + .SetDefault({}); + AddAttr>( + "table_names", + "(string vector, the splited table names that will be fetched from " + "parameter server)" + "in the order of input variables for mapping") + .SetDefault({}); + AddAttr>("custom_neg_classes", "This attribute only be used in unitest. Classes " "in this list wiil be used as negative classes " diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index f2ca6ec247..8f82f77f50 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -15,8 +15,10 @@ limitations under the License. */ #pragma once #include +#include #include #include +#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -144,15 +146,64 @@ class NCEKernel : public framework::OpKernel { } // forward mul auto input_mat = EigenMatrix::From(*(context.Input("Input"))); - auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); - for (int64_t i = 0; i < sample_labels->numel(); ++i) { - Eigen::Tensor result = - (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * - weight_mat.chip(sample_labels_data[i], 0)) - .sum(); - sample_out_data[i] += result(0); - sample_out_data[i] = (1. / (1. 
+ exp(-sample_out_data[i]))); + + // for remote prefetch + auto epmap = context.Attr>("epmap"); + + if (!epmap.empty()) { + // if epmap is not empty, then the parameter will be fetched from remote + // parameter + // server + + std::vector labels; + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + labels.push_back(sample_labels_data[i]); + } + std::set st(labels.begin(), labels.end()); + labels.assign(st.begin(), st.end()); + + auto &local_scope = context.scope().NewScope(); + auto height_sections = context.Attr>("height_sections"); + auto table_names = context.Attr>("table_names"); + + framework::Variable *ids = local_scope.Var("Ids"); + framework::Variable *weight = local_scope.Var("Weight"); + +#ifdef PADDLE_WITH_DISTRIBUTE + operators::distributed::prefetch("Ids", "Weight", table_names, epmap, + height_sections, context); +#else + PADDLE_THROW( + "paddle is not compiled with distribute support, can not do " + "parameter prefetch!"); + + auto weight_mat = EigenMatrix::From(*(weight->Get())); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + std::vector::iterator it = + std::find(labels.begin(), labels.end(), sample_labels_data[i]); + int idx = std::distance(labels.begin(), it); + + Eigen::Tensor result = + (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * + weight_mat.chip(idx, 0)) + .sum(); + sample_out_data[i] += result(0); + sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); + } +#endif + } else { + auto weight_mat = + EigenMatrix::From(*(context.Input("Weight"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + Eigen::Tensor result = + (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * + weight_mat.chip(sample_labels_data[i], 0)) + .sum(); + sample_out_data[i] += result(0); + sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); + } } + // forward cost for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) { out_data[i] = 0; diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 1d867d9194..817af602bd 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -239,7 +239,7 @@ class DistributeTranspiler(object): def _get_all_remote_sparse_update_op(self, main_program): sparse_update_ops = [] - sparse_update_op_types = ["lookup_table"] + sparse_update_op_types = ["lookup_table", "nce"] for op in main_program.global_block().ops: if op.type in sparse_update_op_types and op.attr( 'remote_prefetch') is True and not op.attr( From 627a6b8bacc5f4898c1c3c9018fd8e70ef95d8dc Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 6 Dec 2018 17:14:59 +0800 Subject: [PATCH 011/197] add prefetch in nce --- paddle/fluid/operators/nce_op.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 8f82f77f50..7397d9f473 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -26,6 +26,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/sampler.h" #include "unsupported/Eigen/CXX11/Tensor" +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" +#endif + namespace paddle { namespace operators { @@ -166,8 +170,8 @@ class NCEKernel : public framework::OpKernel { auto height_sections = context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); - framework::Variable *ids = local_scope.Var("Ids"); - framework::Variable *weight = local_scope.Var("Weight"); + local_scope.Var("Ids"); + local_scope.Var("Weight"); #ifdef PADDLE_WITH_DISTRIBUTE operators::distributed::prefetch("Ids", "Weight", table_names, epmap, From 7fa2e821e470411b75ba0f53a3759fa007391745 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 6 Dec 2018 17:53:05 +0800 Subject: [PATCH 012/197] add local scope in nce --- paddle/fluid/operators/nce_op.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 7397d9f473..afb14c3071 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -194,6 +194,8 @@ class NCEKernel : public framework::OpKernel { sample_out_data[i] += result(0); sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); } + + context.scope().DeleteScope(&local_scope); #endif } else { auto weight_mat = From c9de6f1b05aa428d5e6ad9c16db5c2ca8c12cdc7 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 6 Dec 2018 21:16:10 +0800 Subject: [PATCH 013/197] init parallel graph mode --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 2 + .../framework/details/all_reduce_op_handle.cc | 28 +++- .../fluid/framework/details/build_strategy.cc | 1 + .../details/computation_op_handle.cc | 12 +- .../framework/details/computation_op_handle.h | 1 + .../framework/details/execution_strategy.h | 2 +- .../details/multi_devices_graph_pass.cc | 8 +- .../fluid/framework/details/op_handle_base.cc | 3 +- .../fluid/framework/details/op_handle_base.h | 1 - .../details/parallel_ssa_graph_executor.cc | 66 ++++++++++ .../details/parallel_ssa_graph_executor.h | 51 +++++++ .../scope_buffered_ssa_graph_executor.cc | 41 +++--- .../scope_buffered_ssa_graph_executor.h | 5 +- .../details/threaded_ssa_graph_executor.h | 1 + paddle/fluid/framework/details/var_handle.cc | 2 +- paddle/fluid/framework/parallel_executor.cc | 124 +++++++++++++----- paddle/fluid/framework/parallel_executor.h | 2 + paddle/fluid/framework/scope.cc | 5 +- paddle/fluid/framework/threadpool.cc | 16 ++- paddle/fluid/framework/threadpool.h | 4 +- paddle/fluid/framework/threadpool_test.cc | 44 +++++++ .../fluid/operators/reader/blocking_queue.h | 3 + .../fluid/operators/reader/buffered_reader.cc | 5 + .../reader/create_double_buffer_reader_op.cc | 14 +- .../operators/reader/create_py_reader_op.cc | 2 + .../fluid/operators/reader/open_files_op.cc | 2 + paddle/fluid/platform/nccl_helper.h | 7 +- paddle/fluid/platform/profiler.cc | 12 +- paddle/fluid/pybind/pybind.cc | 24 ++-- 30 files changed, 399 insertions(+), 91 deletions(-) create mode 100644 paddle/fluid/framework/details/parallel_ssa_graph_executor.cc create mode 100644 paddle/fluid/framework/details/parallel_ssa_graph_executor.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index c701a2ad63..b419c8c292 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -177,7 +177,7 @@ else() endif() cc_library(parallel_executor SRCS 
parallel_executor.cc DEPS - threaded_ssa_graph_executor scope_buffered_ssa_graph_executor + threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor graph build_strategy fast_threaded_ssa_graph_executor variable_helper) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 93288936fe..6524753322 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -54,6 +54,8 @@ cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUT cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) +cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor) + cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context broadcast_op_handle) cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index e8bf53e160..ae17ea8a15 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -46,20 +46,27 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, #endif void AllReduceOpHandle::RunImpl() { + int64_t start_ts = GetTS(); + int64_t func_ts = GetTS(); + VLOG(5) << "all_reduce_op_handle::RunImpl start"; platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, // this is a distributed or inter-process call, find a better way. #ifdef PADDLE_WITH_CUDA if (NoDummyInputSize() == 1 && - local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) { + local_scopes_[0]->FindVar(NCCL_ID_VARNAME) == nullptr) { #else if (NoDummyInputSize() == 1) { #endif return; // No need to all reduce when GPU count = 1; } else { // Wait input done + start_ts = GetTS(); WaitInputVarGenerated(); + VLOG(5) << "all_reduce_op_handle wait input var spent: " + << GetTS() - start_ts << " (ns)."; + start_ts = GetTS(); auto in_var_handles = DynamicCast(this->Inputs()); auto out_var_handles = DynamicCast(this->Outputs()); PADDLE_ENFORCE_EQ( @@ -100,6 +107,8 @@ void AllReduceOpHandle::RunImpl() { } int dev_id = boost::get(p).device; + VLOG(5) << "call allreduce: " << in_var_handles[i]->name_ + << " on dev: " << dev_id; auto &nccl_ctx = nccl_ctxs_->at(dev_id); auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; @@ -110,11 +119,20 @@ void AllReduceOpHandle::RunImpl() { }); } this->RunAndRecordEvent([&] { - platform::NCCLGroupGuard guard; - for (auto &call : all_reduce_calls) { - call(); + // TODO(Yancey1989): need allreduce operator to avoid this flag + if (nccl_ctxs_->need_group_call_) { + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); + } + } else { + // only used in executor_type == ParallalGraph, one thread one GPU + // TODO(Yancey1989): use allreduce operator to avoid this tricky. 
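+      // Each ParallelGraph thread drives exactly one GPU, so a single
+      // allreduce call is issued here and no NCCLGroupGuard is needed.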
+ PADDLE_ENFORCE(all_reduce_calls.size() == 1UL); + all_reduce_calls[0](); } }); + #else PADDLE_THROW("Not compiled with CUDA"); #endif @@ -144,6 +162,8 @@ void AllReduceOpHandle::RunImpl() { } } } + VLOG(5) << "all_reduce_op_handle Impl spent: " << GetTS() - func_ts + << " (ns)."; } std::string AllReduceOpHandle::Name() const { return "all_reduce"; } diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 1e1b945f63..04c1061536 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -118,6 +118,7 @@ std::unique_ptr BuildStrategy::Apply( std::unique_ptr graph(new ir::Graph(main_program)); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { + VLOG(5) << "run pass: " << pass->Type(); if (pass->Type() == "multi_devices_pass") { pass->Erase("places"); pass->SetNotOwned>("places", &places); diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7ad1e40c60..35ba99a879 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -33,10 +33,18 @@ void ComputationOpHandle::RunImpl() { op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); }; - if (is_lock_and_record_event_free_) { + if (Name().compare("conv2d") || Name().compare("conv2d_grad")) { + int64_t start_ts = GetTS(); + auto varname = DynamicCast(this->Outputs())[0]->name_; run_func(); + VLOG(5) << Name() << "_op_handle: " << varname + << " spent: " << GetTS() - start_ts << " (ns)."; } else { - this->RunAndRecordEvent(run_func); + if (is_lock_and_record_event_free_) { + run_func(); + } else { + this->RunAndRecordEvent(run_func); + } } } diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 662a91d6b4..5346b56dd6 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -17,6 +17,7 @@ #include #include +#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 15c496130c..d3d5b6bf54 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -20,7 +20,7 @@ namespace framework { namespace details { struct ExecutionStrategy { - enum ExecutorType { kDefault = 0, kExperimental = 1 }; + enum ExecutorType { kDefault = 0, kExperimental = 1, kParallelGraph = 2 }; size_t num_threads_{0}; bool use_cuda_{true}; diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index cbae5321d9..1bd238357a 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -300,7 +300,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; - int num_trainers = Get(kNumTrainers); + // int num_trainers = Get(kNumTrainers); for (auto &node : nodes) { if (node->IsVar() && node->Var()) { @@ -329,6 +329,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( std::unordered_map 
sharded_var_device; for (ir::Node *node : sorted_ops) { + VLOG(5) << "op name: " << node->Op()->Type(); if (boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kRPC)) { @@ -365,9 +366,11 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( // is true only for the op that scale the final scalar loss. // It also assumes backward op will always follow the forward op in // the block. + VLOG(5) << "this is loss scale op!"; is_forwarding = false; } else { int op_dev_id = GetOpDeviceID(result, node, sharded_var_device); + VLOG(5) << "on device id: " << op_dev_id; if (op_dev_id != -1) { // This op only runs on one specific device. CreateComputationalOp(&result, node, op_dev_id); for (ir::Node *n : node->outputs) { @@ -386,7 +389,8 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( CreateComputationalOps(&result, node, places_.size()); } - if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { + // if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { + if (!is_forwarding && nccl_ctxs_->contexts_.size() > 1) { // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. if (static_cast(boost::get(node->Op()->GetAttr( diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 4822627ac3..d68d1ce71d 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -41,6 +41,7 @@ OpHandleBase::~OpHandleBase() { void OpHandleBase::Run(bool use_cuda) { #ifdef PADDLE_WITH_CUDA + int64_t start_ts = 0; if (events_.empty() && use_cuda) { for (auto &p : dev_ctxes_) { int dev_id = boost::get(p.first).device; @@ -52,7 +53,6 @@ void OpHandleBase::Run(bool use_cuda) { #else PADDLE_ENFORCE(!use_cuda); #endif - RunImpl(); } @@ -125,6 +125,7 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) { void OpHandleBase::RunAndRecordEvent(const std::function &callback) { #ifdef PADDLE_WITH_CUDA if (!events_.empty()) { // Use event + VLOG(5) << "events not empty"; std::function method = callback; for (auto &p : dev_ctxes_) { method = [method, p, this]() { diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index ba12ca3c61..88c78e0678 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -26,7 +26,6 @@ namespace framework { namespace details { constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@"; - // Wraps ir::Node and provide helper utilities. // It's responsible for populating necessary fields of ir::Node. class OpHandleBase { diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc new file mode 100644 index 0000000000..72beb74aa4 --- /dev/null +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { + +ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( + const ExecutionStrategy &strategy, const std::vector &local_scopes, + const std::vector &places, + std::vector> graphs) + : strategy_(std::move(strategy)), + local_scopes_(std::move(local_scopes)), + places_(std::move(places)), + graphs_(std::move(graphs)), + pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr) { + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + for (size_t i = 0; i < places.size(); ++i) { + std::vector scopes = {local_scopes_[i]}; + std::vector places = {places_[i]}; + executors_.emplace_back(new details::ThreadedSSAGraphExecutor( + strategy_, scopes, places, std::move(graphs_[i]))); + } +} + +FeedFetchList ParallelSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + std::vector> run_futures; + FeedFetchList fetch_data; + + for (size_t i = 0; i < places_.size(); ++i) { + auto call = [this, i] { + // FIXME(Yancey1989): need to fix fetch data failed. + std::vector empty; + executors_[i]->Run(empty); + }; + if (pool_) { + run_futures.emplace_back(pool_->enqueue(std::move(call))); + } else { + call(); + } + } + if (pool_) { + for (auto &f : run_futures) { + f.wait(); + } + } + return fetch_data; +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h new file mode 100644 index 0000000000..c0ba1577f7 --- /dev/null +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "ThreadPool.h" +#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { + +class ParallelSSAGraphExecutor : public SSAGraphExecutor { + public: + ParallelSSAGraphExecutor(const ExecutionStrategy &strategy, + const std::vector &local_scopes, + const std::vector &places, + std::vector> graphs); + ~ParallelSSAGraphExecutor() final = default; + const ir::Graph &Graph() const override { return *graphs_[0]; } + + FeedFetchList Run(const std::vector &fetch_tensors) override; + + private: + ExecutionStrategy strategy_; + std::vector local_scopes_; + std::vector places_; + std::vector> graphs_; + std::unique_ptr<::ThreadPool> pool_; + + std::vector> executors_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 499246a985..abc6b9f559 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -27,39 +27,40 @@ namespace framework { namespace details { ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( ExecutionStrategy strategy, std::vector local_scopes, - std::vector var_infos, std::vector places, + std::vector> var_infos_list, + std::vector places, std::unique_ptr &&underlying_executor) : strategy_(std::move(strategy)), underlying_executor_(std::move(underlying_executor)), local_scopes_(std::move(local_scopes)), - var_infos_(std::move(var_infos)), + var_infos_list_(std::move(var_infos_list)), places_(std::move(places)) {} FeedFetchList ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { if (drop_scope_counter_ == 0) { // Create local scopes. - for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) { - auto &scope = *it; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &scope = local_scopes_[i]; Scope &local_scope = scope->NewScope(); *scope->Var(details::kLocalExecScopeName)->GetMutable() = &local_scope; - - for (auto &info : var_infos_) { - if (scope->FindVar(info.name_) != nullptr) { - continue; - } - - if (info.persistable_) { // Persistable - InitializeVariable(scope->Var(info.name_), info.type_); - } else { - InitializeVariable(local_scope.Var(info.name_), info.type_); + for (auto &var_infos : var_infos_list_) { + for (auto &info : var_infos) { + if (scope->FindVar(info.name_) != nullptr) { + continue; + } + if (info.persistable_) { // Persistable + InitializeVariable(scope->Var(info.name_), info.type_); + } else { + InitializeVariable(local_scope.Var(info.name_), info.type_); + } } } } } std::vector fetch_data; - std::exception_ptr eptr; + std::exception_ptr eptr = nullptr; try { fetch_data = underlying_executor_->Run(fetch_tensors); } catch (...) { @@ -71,9 +72,13 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( #ifdef PADDLE_WITH_CUDA const std::string gc_name = "garbage_collector"; - DeviceGarbageCollectorMap *gc = - Graph().Has(gc_name) ? &(Graph().Get(gc_name)) - : nullptr; + DeviceGarbageCollectorMap *gc = nullptr; + // FIXME(Yancey1989): need to fix gc failed on parallel graph mode + if (strategy_.type_ != ExecutionStrategy::kParallelGraph) { + gc = Graph().Has(gc_name) + ? 
&(Graph().Get(gc_name)) + : nullptr; + } #endif if (!fetch_tensors.empty() || diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 5e87e0bf50..51230d4a42 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -38,7 +38,8 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { public: ScopeBufferedSSAGraphExecutor( ExecutionStrategy strategy, std::vector local_scopes, - std::vector var_infos, std::vector places, + std::vector> var_info_list, + std::vector places, std::unique_ptr&& underlying_executor); const ir::Graph& Graph() const override { @@ -53,7 +54,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { ExecutionStrategy strategy_; std::unique_ptr underlying_executor_; std::vector local_scopes_; - std::vector var_infos_; + std::vector> var_infos_list_; std::vector places_; }; } // namespace details diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 24da56c09e..b45afbc046 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -24,6 +24,7 @@ #include #include "ThreadPool.h" // ThreadPool in thrird party #include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc index 30da029ca2..7de6025a28 100644 --- a/paddle/fluid/framework/details/var_handle.cc +++ b/paddle/fluid/framework/details/var_handle.cc @@ -20,7 +20,7 @@ namespace details { VarHandleBase::~VarHandleBase() {} -VarHandle::~VarHandle() { VLOG(4) << "deleting var handle " << DebugString(); } +VarHandle::~VarHandle() { VLOG(5) << "deleting var handle " << DebugString(); } std::string VarHandle::DebugString() const { std::stringstream ss; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b98408ee77..ff3d76fb01 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -26,6 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" @@ -53,6 +54,7 @@ class ParallelExecutorPrivate { std::vector local_scopes_; Scope *global_scope_; // not owned std::unique_ptr executor_; + std::vector> executors_; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr nccl_ctxs_; @@ -84,6 +86,9 @@ ParallelExecutor::ParallelExecutor( PADDLE_ENFORCE(places.size() > 1, "If you set build_strategy.reduce with 'Reduce'," "the number of places must be greater than 1."); + PADDLE_ENFORCE(exec_strategy.type_ != ExecutionStrategy::kParallelGraph, + "You should set build_strategy.reduce with 'AllReduce' for " + "ParallelGraph executor type"); } // Step 1. Bcast the params to devs. @@ -106,31 +111,55 @@ ParallelExecutor::ParallelExecutor( // Bcast Parameters to all GPUs #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); - ncclUniqueId *nccl_id = nullptr; + std::unique_ptr nccl_id = nullptr; + bool need_group_call = true; if (nccl_id_var != nullptr) { - nccl_id = nccl_id_var->GetMutable(); + nccl_id.reset(nccl_id_var->GetMutable()); + } else if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + nccl_id.reset(new ncclUniqueId()); + PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(nccl_id.get())); + *member_->global_scope_->Var(NCCL_ID_VARNAME) + ->GetMutable() = *nccl_id.get(); + need_group_call = false; + } else { + // init nccl_id in NCCLContextMap } + member_->nccl_ctxs_.reset(new platform::NCCLContextMap( - member_->places_, nccl_id, num_trainers, trainer_id)); + member_->places_, nccl_id.get(), num_trainers, trainer_id, + need_group_call)); #else PADDLE_THROW("Not compiled with CUDA"); #endif } - if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { BCastParamsToDevices(bcast_vars); } -// Startup Program has been run. All local scopes has correct parameters. + // Startup Program has been run. All local scopes has correct parameters. -// Step 2. Convert main_program to SSA form and dependency graph. Also, insert -// ncclOp + // Step 2. Convert main_program to SSA form and dependency graph. 
Also, insert
+  // ncclOp
+  std::vector<std::unique_ptr<ir::Graph>> graphs;
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
-      main_program, member_->places_, loss_var_name, params,
-      member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());
+  if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) {
+    for (size_t i = 0; i < member_->places_.size(); ++i) {
+      std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
+          main_program, {member_->places_[i]}, loss_var_name, params,
+          {member_->local_scopes_[i]}, member_->use_cuda_,
+          member_->nccl_ctxs_.get());
+      graphs.push_back(std::move(graph));
+    }
+  } else {
+    std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
+        main_program, member_->places_, loss_var_name, params,
+        member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());
+    graphs.push_back(std::move(graph));
+  }

   auto max_memory_size = GetEagerDeletionThreshold();
-  if (max_memory_size >= 0) {
+  // FIXME(Yancey1989): need to fix on parallel graph mode
+  if (max_memory_size >= 0 &&
+      exec_strategy.type_ != ExecutionStrategy::kParallelGraph) {
     for (auto &place : member_->places_) {
       if (!platform::is_gpu_place(place)) continue;
       auto gpu_place = boost::get<platform::CUDAPlace>(place);
@@ -143,40 +172,48 @@ ParallelExecutor::ParallelExecutor(
       }
     }
     if (!gcs_.empty()) {
-      auto ref_cnt_pass =
-          ir::PassRegistry::Instance().Get("reference_count_pass");
-      ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_);
-      ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_);
-      ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
-      graph = ref_cnt_pass->Apply(std::move(graph));
-      graph->SetNotOwned("garbage_collector", &gcs_);
+      for (size_t i = 0; i < graphs.size(); ++i) {
+        auto ref_cnt_pass =
+            ir::PassRegistry::Instance().Get("reference_count_pass");
+        ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_);
+        ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_);
+        ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
+        graphs[i] = ref_cnt_pass->Apply(std::move(graphs[i]));
+        graphs[i]->SetNotOwned("garbage_collector", &gcs_);
+      }
     }
   }
 #else
   std::unique_ptr<ir::Graph> graph =
       build_strategy.Apply(main_program, member_->places_, loss_var_name,
                            params, member_->local_scopes_, member_->use_cuda_);
+  graphs.push_back(std::move(graph));
 #endif

   // Step 3. Create vars in each scope. Passes may also create new vars.
   // skip control vars and empty vars
-  std::vector<details::VariableInfo> var_infos;
-  for (auto &node : graph->Nodes()) {
-    if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
-      var_infos.emplace_back();
-      var_infos.back().name_ = node->Var()->Name();
-      var_infos.back().type_ = node->Var()->GetType();
-      var_infos.back().persistable_ = node->Var()->Persistable();
+  std::vector<std::vector<details::VariableInfo>> var_infos_list;
+  for (size_t i = 0; i < graphs.size(); ++i) {
+    std::vector<details::VariableInfo> var_infos;
+    for (auto &node : graphs[i]->Nodes()) {
+      if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
+        var_infos.emplace_back();
+        var_infos.back().name_ = node->Var()->Name();
+        var_infos.back().type_ = node->Var()->GetType();
+        var_infos.back().persistable_ = node->Var()->Persistable();
+      }
     }
+    var_infos_list.emplace_back(std::move(var_infos));
   }

+  // If the loss_var_name is given, the number of graphs should be only one.
if (loss_var_name.size()) { - size_t graph_num = ir::GraphNum(*graph); + size_t graph_num = ir::GraphNum(*graphs[0]); if (graph_num > 1) { LOG(WARNING) << "The number of graph should be only one, " "but the current graph has " - << ir::GraphNum(*graph) + << ir::GraphNum(*graphs[0]) << " sub_graphs. If you want to see the nodes of the " "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' " "to specify the output dir. NOTES: if you not do training, " @@ -185,15 +222,42 @@ ParallelExecutor::ParallelExecutor( } if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + /** + for (size_t i = 0; i < member_->places_.size(); ++i) { + std::vector var_infos; + for (auto &node : graphs[i]->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos.emplace_back(); + var_infos.back().name_ = node->Var()->Name(); + var_infos.back().type_ = node->Var()->GetType(); + var_infos.back().persistable_ = node->Var()->Persistable(); + } + } + + std::vector places = {member_->places_[i]}; + std::vector scopes = {member_->local_scopes_[i]}; + std::unique_ptr p(new + details::ThreadedSSAGraphExecutor( + exec_strategy, scopes, places, std::move(graphs[i]))); + + member_->executors_.push_back(std::move(p)); + + member_->executors_[i].reset(new details::ScopeBufferedSSAGraphExecutor( + exec_strategy, scopes, std::move(var_infos), places, + std::move(member_->executors_[i]))); + }**/ member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + exec_strategy, member_->local_scopes_, places, std::move(graphs[0]))); + } else if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + member_->executor_.reset(new details::ParallelSSAGraphExecutor( + exec_strategy, member_->local_scopes_, places, graphs)); } else { member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + exec_strategy, member_->local_scopes_, places, std::move(graphs[0]))); } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, std::move(var_infos), + exec_strategy, member_->local_scopes_, std::move(var_infos_list), member_->places_, std::move(member_->executor_))); } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index ef09b98b2a..319701f1eb 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -20,6 +20,8 @@ limitations under the License. 
*/ #include #include +#include "ThreadPool.h" + #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/executor.h" diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 0d261dd7cc..873f68e42e 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -58,7 +58,10 @@ int64_t GetEagerDeletionThreshold() { (static_cast(1) << 30)); } -Scope::~Scope() { DropKids(); } +Scope::~Scope() { + VLOG(5) << "~Scope()"; + DropKids(); +} Scope& Scope::NewScope() const { SCOPE_LOCK_GUARD diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index fcec955360..7dc7430c55 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -48,9 +48,18 @@ void ThreadPool::Init() { ThreadPool::ThreadPool(int num_threads) : running_(true) { threads_.resize(num_threads); - for (auto& thread : threads_) { + for (int i = 0; i < num_threads; ++i) { + // for (auto& thread : threads_) { // TODO(Yancey1989): binding the thread on the specify CPU number - thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this))); + threads_[i].reset( + new std::thread(std::bind(&ThreadPool::TaskLoop, this, i))); + /** + sched_param sch; + int policy; + pthread_getschedparam(threads_[i]->native_handle(), &policy, &sch); + if (pthread_setschedparam(threads_[i]->native_handle(), SCHED_FIFO, &sch)) { + VLOG(1) << "Failed to setschedparam: " << errno; + }**/ } } @@ -68,7 +77,7 @@ ThreadPool::~ThreadPool() { } } -void ThreadPool::TaskLoop() { +void ThreadPool::TaskLoop(int i) { while (true) { Task task; @@ -89,7 +98,6 @@ void ThreadPool::TaskLoop() { task = std::move(tasks_.front()); tasks_.pop(); } - // run the task task(); } diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 7a51d18fbb..bd8c3cdee8 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include // NOLINT #include #include // NOLINT @@ -27,7 +28,6 @@ limitations under the License. */ namespace paddle { namespace framework { - struct ExceptionHandler { mutable std::future> future_; explicit ExceptionHandler( @@ -99,7 +99,7 @@ class ThreadPool { // The constructor starts threads to run TaskLoop, which retrieves // and runs tasks from the queue. - void TaskLoop(); + void TaskLoop(int i); // Init is called by GetInstance. 
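  // The index now passed to TaskLoop is what a per-worker CPU binding would
  // key on. A minimal sketch of such a binding, assuming Linux pthreads
  // (the helper name PinToCpu and the 1:1 worker-to-core mapping are
  // illustrative, not part of this patch):
  //
  //   #include <pthread.h>  // pthread_setaffinity_np
  //   #include <sched.h>    // cpu_set_t, CPU_ZERO, CPU_SET
  //   #include <thread>
  //
  //   static bool PinToCpu(std::thread* t, int cpu) {
  //     cpu_set_t cpuset;
  //     CPU_ZERO(&cpuset);
  //     CPU_SET(cpu, &cpuset);
  //     return pthread_setaffinity_np(t->native_handle(), sizeof(cpu_set_t),
  //                                   &cpuset) == 0;  // 0 means success
  //   }
  //
  // A caller would run PinToCpu(threads_[i].get(), i) right after creating
  // worker i, which is what the TODO about binding threads hints at.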
static void Init(); diff --git a/paddle/fluid/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc index 884d61e234..1257a76e3e 100644 --- a/paddle/fluid/framework/threadpool_test.cc +++ b/paddle/fluid/framework/threadpool_test.cc @@ -59,3 +59,47 @@ TEST(ThreadPool, ConcurrentRun) { } EXPECT_EQ(sum, ((n + 1) * n) / 2); } +static int64_t GetTS() { + struct timeval tp; + gettimeofday(&tp, NULL); + return tp.tv_sec * 1000000 + tp.tv_usec; +} + +void multi_call(std::function call) { + for (int i = 0; i < 500; ++i) { + call(); + } +} + +TEST(ThreadPool, PERFORMANCE) { + auto sum = [] { + int a = 0; + for (int i = 0; i < 1000; ++i) { + a += i; + } + }; + // framework::ThreadPool *pool = new framework::ThreadPool(2); + int64_t start = GetTS(); + for (int i = 0; i < 1000; ++i) { + // int64_t s = GetTS(); + framework::Async(std::move(sum)); + // pool->Run(std::move(sum)); + // VLOG(5) << "push to pool spent : " << GetTS() - s << " (us)."; + } + VLOG(5) << "pool spent: " << GetTS() - start << " (us)."; + start = GetTS(); + for (int i = 0; i < 1000; ++i) { + sum(); + } + VLOG(5) << "sequence call spent: " << GetTS() - start << " (us)."; + std::vector threads; + start = GetTS(); + for (int i = 0; i < 2; ++i) { + std::thread t(multi_call, std::ref(sum)); + threads.push_back(std::move(t)); + } + for (auto& thread : threads) { + thread.join(); + } + VLOG(5) << "two threads spent: " << GetTS() - start << " (us)."; +} diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 51b980acb5..10de11bfa5 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -67,9 +67,12 @@ class BlockingQueue { } bool Receive(T* elem) { + VLOG(1) << "blocking queue::Receive ..."; std::unique_lock lock(mutex_); receive_cv_.wait(lock, [&] { return !queue_.empty() || closed_; }); + VLOG(1) << "queue_.empty()=" << queue_.empty(); if (!queue_.empty()) { + if (elem == nullptr) VLOG(1) << "elem is nullptr"; PADDLE_ENFORCE_NOT_NULL(elem); *elem = queue_.front(); if (LIKELY(!speed_test_mode_)) { diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 26ff221dfa..2d66000f1f 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -58,7 +58,9 @@ void BufferedReader::ReadAsync(size_t i) { TensorVec &gpu = gpu_buffer_[i]; gpu.resize(cpu.size()); for (size_t i = 0; i < cpu.size(); ++i) { + VLOG(1) << "launch tensor copy from cpu to cpu, idx: " << i; framework::TensorCopySync(cpu[i], place_, &gpu[i]); + VLOG(1) << "done " << i; gpu[i].set_lod(cpu[i].lod()); } } @@ -80,11 +82,13 @@ void BufferedReader::StartImpl() { } void BufferedReader::ReadNextImpl(std::vector *out) { + VLOG(1) << "ReadNextImpl start on place: " << place_; if (position_.empty()) { out->clear(); return; } size_t i = position_.front().get(); + VLOG(1) << "position front: " << i; position_.pop(); if (i == -1UL) { @@ -101,6 +105,7 @@ void BufferedReader::ReadNextImpl(std::vector *out) { ReadAsync(prev_pos_); } prev_pos_ = i; + VLOG(1) << "success ReadNextImpl"; } } // namespace reader diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index ed719f91d0..924c92e0bf 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -25,9 +25,15 @@ 
class CreateDoubleBufferReaderOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { - auto* out = scope.FindVar(Output("Out")) - ->template GetMutable(); + VLOG(1) << "find var in scope: " << &scope; + auto* out_var = scope.FindVar(Output("Out")); + VLOG(1) << "var " << Output("Out") << " -> " << out_var; + auto* out = out_var->GetMutable(); + + // auto* out = scope.Var(Output("Out")) + // ->template GetMutable(); if (out->Get() != nullptr) { + VLOG(1) << Output("Out") << " is not nullptr."; return; } const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) @@ -46,9 +52,11 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { sin >> num; place = platform::CUDAPlace(static_cast(num)); } - + VLOG(1) << "create buffered reader on " << place; out->Reset(framework::MakeDecoratedReader(underlying_reader, place, 2)); + VLOG(1) << "Reset Buffered Reader in var: " + << scope.FindVar(Input("UnderlyingReader")); } }; diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index 901a92ab5b..093b0e56b3 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -28,8 +28,10 @@ class PyReader : public framework::FileReader { } void ReadNext(std::vector* out) override { + VLOG(1) << "come in PyReader::ReadNext function, out: " << out; bool success; *out = queue_->Pop(&success); + VLOG(1) << "call PyReader::ReadNext " << success; if (!success) out->clear(); } diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc index 38223e0699..ae37a18725 100644 --- a/paddle/fluid/operators/reader/open_files_op.cc +++ b/paddle/fluid/operators/reader/open_files_op.cc @@ -115,10 +115,12 @@ class PreemptiveReaderContainer : public IReaderContainer { } void ReadNext(std::vector* out) override { + VLOG(1) << "flag"; if (!pending_.empty()) { auto future_it = complete_queue_.Pop(); FutureItem item = future_it->get(); if (item.exception_) { + VLOG(1) << "item has exception!!!"; for (auto it = futures_.begin(); it != futures_.end(); ++it) { if (it != future_it) { it->wait(); // Wait all other threads complete. diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 7c539d25f6..53de53f43d 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -82,12 +82,15 @@ struct NCCLContext { struct NCCLContextMap { std::unordered_map contexts_; std::vector order_; + bool need_group_call_; explicit NCCLContextMap(const std::vector &places, ncclUniqueId *nccl_id = nullptr, - size_t num_trainers = 1, size_t trainer_id = 0) { + size_t num_trainers = 1, size_t trainer_id = 0, + bool need_group_call = true) { PADDLE_ENFORCE(!places.empty()); order_.reserve(places.size()); + need_group_call_ = need_group_call; for (auto &p : places) { int dev_id = boost::get(p).device; order_.emplace_back(dev_id); @@ -102,7 +105,7 @@ struct NCCLContextMap { } std::unique_ptr comms(new ncclComm_t[order_.size()]); // if num_trainers == 1, should create a new nccl id for local comms. 
- if (num_trainers == 1) { + if (num_trainers == 1 && nccl_id != nullptr) { std::lock_guard guard(NCCLGroupGuard::NCCLMutex()); PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( comms.get(), static_cast(order_.size()), order_.data())); diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 998242fb4a..040a68f672 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -12,9 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/platform/port.h" - #include #include #include @@ -25,9 +22,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif // PADDLE_WITH_CUDA + #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/platform/device_tracer.h" +#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); @@ -173,8 +173,9 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) : is_enabled_(false), start_ns_(PosixInNsec()) { - std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled) return; + std::lock_guard l(profiler_mu); + is_enabled_ = true; dev_ctx_ = dev_ctx; name_ = name; @@ -184,8 +185,9 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) } RecordEvent::~RecordEvent() { - std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled || !is_enabled_) return; + VLOG(5) << "call ~RecordEvent"; + std::lock_guard l(profiler_mu); DeviceTracer* tracer = GetDeviceTracer(); if (tracer) { tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(), diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index fc7991d297..c313ed2a8b 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -720,6 +720,11 @@ All parameter, weight, gradient are variables in Paddle. )DOC"); + py::enum_(exec_strategy, "ExecutorType") + .value("Default", ExecutionStrategy::ExecutorType::kDefault) + .value("Experimental", ExecutionStrategy::ExecutorType::kExperimental) + .value("ParallelGraph", ExecutionStrategy::ExecutorType::kParallelGraph); + exec_strategy.def(py::init()) .def_property( "num_threads", @@ -777,17 +782,14 @@ All parameter, weight, gradient are variables in Paddle. [](const ExecutionStrategy &self) { return self.dry_run_; }, [](ExecutionStrategy &self, bool dry_run) { self.dry_run_ = dry_run; - }); - - exec_strategy.def_property( - "use_experimental_executor", - [](const ExecutionStrategy &self) { - return self.type_ == ExecutionStrategy::kExperimental; - }, - [](ExecutionStrategy &self, bool experimental) { - self.type_ = experimental ? 
ExecutionStrategy::kExperimental
-                                      : ExecutionStrategy::kDefault;
-      });
+      })
+      .def_property(
+          "executor_type",
+          [](const ExecutionStrategy &self) { return self.type_; },
+          [](ExecutionStrategy &self, ExecutionStrategy::ExecutorType type) {
+            self.type_ = type;
+          },
+          R"DOC()DOC");

   py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy", R"DOC(
 BuildStrategy allows the user to more precisely control how to

From b653ed05163e9f6d47208d5f46bee18ec57a2645 Mon Sep 17 00:00:00 2001
From: tangwei12
Date: Fri, 7 Dec 2018 13:53:31 +0800
Subject: [PATCH 014/197] add prefetch and remove selectedrows of bias

---
 paddle/fluid/operators/nce_op.cc              |  8 +--
 paddle/fluid/operators/nce_op.h               | 47 ++++-----------
 python/paddle/fluid/layers/nn.py              |  9 ++-
 .../tests/unittests/test_dist_transpiler.py   | 59 +++++++++++++++++--
 .../fluid/transpiler/distribute_transpiler.py |  3 +-
 5 files changed, 75 insertions(+), 51 deletions(-)

diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc
index 06ff825fde..0a0be24a54 100644
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
@@ -243,24 +243,20 @@ class NCEOpGradVarTypeInference : public framework::VarTypeInference {
   void operator()(const framework::OpDesc &op_desc,
                   framework::BlockDesc *block) const override {
     auto weight_grad = op_desc.Output(framework::GradVarName("Weight")).front();
-    auto bias_grad = op_desc.Output(framework::GradVarName("Bias")).front();

     auto attr = op_desc.GetAttr("is_sparse");
     bool is_sparse = boost::get<bool>(attr);
     if (is_sparse) {
-      VLOG(3) << "nce_op_grad op " << weight_grad << " and " << bias_grad
+      VLOG(3) << "nce_op_grad op " << weight_grad << " and "
              << " is set to SelectedRows";
       block->Var(weight_grad)
           ->SetType(framework::proto::VarType::SELECTED_ROWS);
-      block->Var(bias_grad)->SetType(framework::proto::VarType::SELECTED_ROWS);
     } else {
-      VLOG(3) << "nce_op_grad op " << weight_grad << " and " << bias_grad
+      VLOG(3) << "nce_op_grad op " << weight_grad << " and "
              << " is set to LoDTensor";
       block->Var(weight_grad)->SetType(framework::proto::VarType::LOD_TENSOR);
-      block->Var(bias_grad)->SetType(framework::proto::VarType::LOD_TENSOR);
     }
     block->Var(weight_grad)->SetDataType(block->Var("Input")->GetDataType());
-    block->Var(bias_grad)->SetDataType(block->Var("Input")->GetDataType());
   }
 };

diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h
index afb14c3071..6567b6534a 100644
--- a/paddle/fluid/operators/nce_op.h
+++ b/paddle/fluid/operators/nce_op.h
@@ -297,18 +297,19 @@ class NCEGradKernel : public framework::OpKernel<T> {
       sample_grad_data[i] *= d_out_data[sample_idx];
     }

+    // get d_bias
+    auto d_bias = context.Output<Tensor>(framework::GradVarName("Bias"));
+    if (d_bias != nullptr) {
+      T *d_bias_data = d_bias->mutable_data<T>(context.GetPlace());
+      std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0);
+      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
+        d_bias_data[sample_labels_data[i]] += sample_grad_data[i];
+      }
+    }
+
     bool is_sparse = context.Attr<bool>("is_sparse");

     if (!is_sparse) {
-      // get d_bias
-      auto d_bias = context.Output<Tensor>(framework::GradVarName("Bias"));
-      if (d_bias != nullptr) {
-        T *d_bias_data = d_bias->mutable_data<T>(context.GetPlace());
-        std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0);
-        for (int64_t i = 0; i < sample_labels->numel(); ++i) {
-          d_bias_data[sample_labels_data[i]] += sample_grad_data[i];
-        }
-      }
       // get d_w
       auto d_w = context.Output<Tensor>(framework::GradVarName("Weight"));
       if (d_w != nullptr) {
@@ -330,34 +331,6 @@ class NCEGradKernel : public
framework::OpKernel { std::set st(labels.begin(), labels.end()); labels.assign(st.begin(), st.end()); - auto *bias_var = context.InputVar("Bias"); - DDim bias_dim; - if (bias_var->IsType()) { - bias_dim = context.Input("Bias")->dims(); - } else if (bias_var->IsType()) { - auto *table_t = context.Input("Bias"); - bias_dim = table_t->value().dims(); - } else { - PADDLE_THROW( - "The parameter Bias of a NCE_OP " - "must be either LoDTensor or SelectedRows"); - } - - auto d_bias = - context.Output(framework::GradVarName("Bias")); - d_bias->set_rows(labels); - d_bias->set_height(bias_dim[0]); - - d_bias->mutable_value()->Resize( - {static_cast(labels.size()), bias_dim[1]}); - T *d_bias_data = - d_bias->mutable_value()->mutable_data(context.GetPlace()); - std::fill(d_bias_data, d_bias_data + labels.size(), 0.0); - for (int64_t i = 0; i < sample_labels->numel(); ++i) { - d_bias_data[d_bias->Index(sample_labels_data[i])] += - sample_grad_data[i]; - } - auto *table_var = context.InputVar("Weight"); DDim table_dim; if (table_var->IsType()) { diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 28b8ae895a..9401ffc2b1 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -24,7 +24,7 @@ from ..initializer import Normal, Constant from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ -from .tensor import concat +from .tensor import concat, assign from . import utils from .. import unique_name from functools import reduce @@ -4770,12 +4770,17 @@ def nce(input, else: num_neg_samples = int(num_neg_samples) + remote_prefetch = False + if os.environ.get('PADDLE_ENABLE_REMOTE_PREFETCH'): + remote_prefetch = True + attrs = { 'num_total_classes': int(num_total_classes), 'num_neg_samples': num_neg_samples, 'seed': seed, 'sampler': sampler, - 'is_sparse': is_sparse + 'is_sparse': is_sparse, + 'remote_prefetch': remote_prefetch } helper.append_op( diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 194387bc98..48bac52654 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -14,14 +14,15 @@ from __future__ import print_function +import traceback import math +import collections +import six import unittest +import numpy as np + import paddle.fluid as fluid -from paddle.fluid.transpiler.distribute_transpiler import delete_ops -import traceback -import collections -import six class TranspilerTest(unittest.TestCase): @@ -823,5 +824,55 @@ class TestRemoteLookupTable(TestDistLookupTableBase): self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) +# test for remote prefetch +class TestRemoteNce(TestDistLookupTableBase): + def network_with_table(self, is_sparse, is_distributed): + + num_total_classes = 20 + sampler = "uniform" + nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32') + + input = fluid.layers.data(name="input", shape=[10], dtype="float32") + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + w_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 10], + dtype='float32', + name='nce_w', + initializer=fluid.initializer.ConstantInitializer()) + b_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 1], + dtype='float32', + 
name='nce_b', + initializer=fluid.initializer.ConstantInitializer()) + + cost = fluid.layers.nce(input=input, + label=label, + num_total_classes=num_total_classes, + sampler=sampler, + custom_dist=nid_freq_arr.tolist(), + sample_weight=None, + param_attr='nce_w', + bias_attr='nce_b', + seed=1, + num_neg_samples=5, + is_sparse=is_sparse) + avg_cost = fluid.layers.mean(cost) + # optimizer + optimizer = fluid.optimizer.Adam(learning_rate=0.003) + optimizer.minimize(avg_cost) + + def net_conf(self): + import os + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + self.network_with_table(is_sparse=True, is_distributed=False) + + def transpiler_test_impl(self): + trainer, _ = self.get_trainer() + for op in trainer.blocks[0].ops: + if op.type == "recv": + pass + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 817af602bd..9c526a0d8e 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -242,8 +242,7 @@ class DistributeTranspiler(object): sparse_update_op_types = ["lookup_table", "nce"] for op in main_program.global_block().ops: if op.type in sparse_update_op_types and op.attr( - 'remote_prefetch') is True and not op.attr( - 'is_distributed'): + 'remote_prefetch') is True: sparse_update_ops.append(op) return sparse_update_ops From cb8a24be14f04c23fbc206d8c8537ff365b4e6bc Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 7 Dec 2018 16:11:16 +0800 Subject: [PATCH 015/197] clean code --- .../framework/details/all_reduce_op_handle.cc | 12 +------ .../details/computation_op_handle.cc | 12 ++----- .../fluid/framework/details/op_handle_base.cc | 2 -- .../details/parallel_ssa_graph_executor.cc | 13 +++---- .../details/parallel_ssa_graph_executor.h | 4 +-- paddle/fluid/framework/parallel_executor.cc | 35 ++++--------------- paddle/fluid/platform/nccl_helper.h | 2 +- 7 files changed, 20 insertions(+), 60 deletions(-) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index ae17ea8a15..ae20338746 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -46,9 +46,6 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, #endif void AllReduceOpHandle::RunImpl() { - int64_t start_ts = GetTS(); - int64_t func_ts = GetTS(); - VLOG(5) << "all_reduce_op_handle::RunImpl start"; platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, @@ -62,11 +59,7 @@ void AllReduceOpHandle::RunImpl() { return; // No need to all reduce when GPU count = 1; } else { // Wait input done - start_ts = GetTS(); WaitInputVarGenerated(); - VLOG(5) << "all_reduce_op_handle wait input var spent: " - << GetTS() - start_ts << " (ns)."; - start_ts = GetTS(); auto in_var_handles = DynamicCast(this->Inputs()); auto out_var_handles = DynamicCast(this->Outputs()); PADDLE_ENFORCE_EQ( @@ -107,8 +100,6 @@ void AllReduceOpHandle::RunImpl() { } int dev_id = boost::get(p).device; - VLOG(5) << "call allreduce: " << in_var_handles[i]->name_ - << " on dev: " << dev_id; auto &nccl_ctx = nccl_ctxs_->at(dev_id); auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; @@ -118,6 +109,7 @@ void AllReduceOpHandle::RunImpl() { ncclSum, comm, stream)); }); } + this->RunAndRecordEvent([&] { // TODO(Yancey1989): need allreduce 
operator to avoid this flag if (nccl_ctxs_->need_group_call_) { @@ -162,8 +154,6 @@ void AllReduceOpHandle::RunImpl() { } } } - VLOG(5) << "all_reduce_op_handle Impl spent: " << GetTS() - func_ts - << " (ns)."; } std::string AllReduceOpHandle::Name() const { return "all_reduce"; } diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 35ba99a879..7ad1e40c60 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -33,18 +33,10 @@ void ComputationOpHandle::RunImpl() { op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); }; - if (Name().compare("conv2d") || Name().compare("conv2d_grad")) { - int64_t start_ts = GetTS(); - auto varname = DynamicCast(this->Outputs())[0]->name_; + if (is_lock_and_record_event_free_) { run_func(); - VLOG(5) << Name() << "_op_handle: " << varname - << " spent: " << GetTS() - start_ts << " (ns)."; } else { - if (is_lock_and_record_event_free_) { - run_func(); - } else { - this->RunAndRecordEvent(run_func); - } + this->RunAndRecordEvent(run_func); } } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index d68d1ce71d..4914e0a5ad 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -41,7 +41,6 @@ OpHandleBase::~OpHandleBase() { void OpHandleBase::Run(bool use_cuda) { #ifdef PADDLE_WITH_CUDA - int64_t start_ts = 0; if (events_.empty() && use_cuda) { for (auto &p : dev_ctxes_) { int dev_id = boost::get(p.first).device; @@ -125,7 +124,6 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) { void OpHandleBase::RunAndRecordEvent(const std::function &callback) { #ifdef PADDLE_WITH_CUDA if (!events_.empty()) { // Use event - VLOG(5) << "events not empty"; std::function method = callback; for (auto &p : dev_ctxes_) { method = [method, p, this]() { diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 72beb74aa4..dfb40721d8 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -21,19 +21,20 @@ namespace details { ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::vector> graphs) + std::vector> &&graphs) : strategy_(std::move(strategy)), local_scopes_(std::move(local_scopes)), + pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), places_(std::move(places)), - graphs_(std::move(graphs)), - pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr) { + graphs_(std::move(graphs)) { PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + // do not use threadpool for each graph execution. 
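  // The fan-out itself happens in Run(): one task per place goes into the
  // shared pool_ and the caller joins on futures before returning. A
  // self-contained sketch of that pattern, with std::async standing in for
  // the ThreadPool dependency and RunOneGraph as an illustrative name:
  //
  //   std::vector<std::future<void>> futures;
  //   futures.reserve(num_places);
  //   for (size_t i = 0; i < num_places; ++i) {
  //     futures.emplace_back(std::async(std::launch::async, RunOneGraph, i));
  //   }
  //   for (auto &f : futures) f.wait();  // join every device's executor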
+ strategy_.num_threads_ = 1UL; for (size_t i = 0; i < places.size(); ++i) { - std::vector scopes = {local_scopes_[i]}; - std::vector places = {places_[i]}; executors_.emplace_back(new details::ThreadedSSAGraphExecutor( - strategy_, scopes, places, std::move(graphs_[i]))); + strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); } + VLOG(1) << "pool size: " << places_.size(); } FeedFetchList ParallelSSAGraphExecutor::Run( diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h index c0ba1577f7..37784775f0 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -30,7 +30,7 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { ParallelSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::vector> graphs); + std::vector> &&graphs); ~ParallelSSAGraphExecutor() final = default; const ir::Graph &Graph() const override { return *graphs_[0]; } @@ -39,9 +39,9 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { private: ExecutionStrategy strategy_; std::vector local_scopes_; + std::unique_ptr<::ThreadPool> pool_{nullptr}; std::vector places_; std::vector> graphs_; - std::unique_ptr<::ThreadPool> pool_; std::vector> executors_; }; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ff3d76fb01..186f0cb803 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -54,7 +54,6 @@ class ParallelExecutorPrivate { std::vector local_scopes_; Scope *global_scope_; // not owned std::unique_ptr executor_; - std::vector> executors_; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr nccl_ctxs_; @@ -142,6 +141,7 @@ ParallelExecutor::ParallelExecutor( std::vector> graphs; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + VLOG(1) << "kParallelGraph mode!!"; for (size_t i = 0; i < member_->places_.size(); ++i) { std::unique_ptr graph = build_strategy.Apply( main_program, {member_->places_[i]}, loss_var_name, params, @@ -222,38 +222,17 @@ ParallelExecutor::ParallelExecutor( } if (exec_strategy.type_ == ExecutionStrategy::kDefault) { - /** - for (size_t i = 0; i < member_->places_.size(); ++i) { - std::vector var_infos; - for (auto &node : graphs[i]->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); - } - } - - std::vector places = {member_->places_[i]}; - std::vector scopes = {member_->local_scopes_[i]}; - std::unique_ptr p(new - details::ThreadedSSAGraphExecutor( - exec_strategy, scopes, places, std::move(graphs[i]))); - - member_->executors_.push_back(std::move(p)); - - member_->executors_[i].reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, scopes, std::move(var_infos), places, - std::move(member_->executors_[i]))); - }**/ member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graphs[0]))); + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs[0]))); } else if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { member_->executor_.reset(new 
details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, graphs)); + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs))); } else { member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graphs[0]))); + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs[0]))); } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 53de53f43d..23a0222239 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -105,7 +105,7 @@ struct NCCLContextMap { } std::unique_ptr comms(new ncclComm_t[order_.size()]); // if num_trainers == 1, should create a new nccl id for local comms. - if (num_trainers == 1 && nccl_id != nullptr) { + if (num_trainers == 1 && nccl_id == nullptr) { std::lock_guard guard(NCCLGroupGuard::NCCLMutex()); PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( comms.get(), static_cast(order_.size()), order_.data())); From 220db4f334a06bda1b9967740d9fd96806fc461b Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 7 Dec 2018 16:38:08 +0800 Subject: [PATCH 016/197] clean code --- .../fluid/framework/details/build_strategy.cc | 1 - .../details/multi_devices_graph_pass.cc | 3 -- paddle/fluid/framework/parallel_executor.h | 2 - paddle/fluid/framework/scope.cc | 5 +-- paddle/fluid/framework/threadpool.cc | 15 ++----- paddle/fluid/framework/threadpool.h | 2 +- paddle/fluid/framework/threadpool_test.cc | 44 ------------------- .../fluid/operators/reader/blocking_queue.h | 3 -- .../fluid/operators/reader/buffered_reader.cc | 3 -- .../reader/create_double_buffer_reader_op.cc | 13 +----- 10 files changed, 7 insertions(+), 84 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 04c1061536..1e1b945f63 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -118,7 +118,6 @@ std::unique_ptr BuildStrategy::Apply( std::unique_ptr graph(new ir::Graph(main_program)); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { - VLOG(5) << "run pass: " << pass->Type(); if (pass->Type() == "multi_devices_pass") { pass->Erase("places"); pass->SetNotOwned>("places", &places); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 1bd238357a..c16e3006d7 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -329,7 +329,6 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( std::unordered_map sharded_var_device; for (ir::Node *node : sorted_ops) { - VLOG(5) << "op name: " << node->Op()->Type(); if (boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kRPC)) { @@ -366,11 +365,9 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( // is true only for the op that scale the final scalar loss. // It also assumes backward op will always follow the forward op in // the block. - VLOG(5) << "this is loss scale op!"; is_forwarding = false; } else { int op_dev_id = GetOpDeviceID(result, node, sharded_var_device); - VLOG(5) << "on device id: " << op_dev_id; if (op_dev_id != -1) { // This op only runs on one specific device. 
CreateComputationalOp(&result, node, op_dev_id); for (ir::Node *n : node->outputs) { diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 319701f1eb..ef09b98b2a 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -20,8 +20,6 @@ limitations under the License. */ #include #include -#include "ThreadPool.h" - #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/executor.h" diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 873f68e42e..0d261dd7cc 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -58,10 +58,7 @@ int64_t GetEagerDeletionThreshold() { (static_cast(1) << 30)); } -Scope::~Scope() { - VLOG(5) << "~Scope()"; - DropKids(); -} +Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { SCOPE_LOCK_GUARD diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index 7dc7430c55..d34f826c1a 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -48,18 +48,9 @@ void ThreadPool::Init() { ThreadPool::ThreadPool(int num_threads) : running_(true) { threads_.resize(num_threads); - for (int i = 0; i < num_threads; ++i) { - // for (auto& thread : threads_) { + for (auto& thread : threads_) { // TODO(Yancey1989): binding the thread on the specify CPU number - threads_[i].reset( - new std::thread(std::bind(&ThreadPool::TaskLoop, this, i))); - /** - sched_param sch; - int policy; - pthread_getschedparam(threads_[i]->native_handle(), &policy, &sch); - if (pthread_setschedparam(threads_[i]->native_handle(), SCHED_FIFO, &sch)) { - VLOG(1) << "Failed to setschedparam: " << errno; - }**/ + thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this))); } } @@ -77,7 +68,7 @@ ThreadPool::~ThreadPool() { } } -void ThreadPool::TaskLoop(int i) { +void ThreadPool::TaskLoop() { while (true) { Task task; diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index bd8c3cdee8..5177b7ee02 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -99,7 +99,7 @@ class ThreadPool { // The constructor starts threads to run TaskLoop, which retrieves // and runs tasks from the queue. - void TaskLoop(int i); + void TaskLoop(); // Init is called by GetInstance. 
static void Init(); diff --git a/paddle/fluid/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc index 1257a76e3e..884d61e234 100644 --- a/paddle/fluid/framework/threadpool_test.cc +++ b/paddle/fluid/framework/threadpool_test.cc @@ -59,47 +59,3 @@ TEST(ThreadPool, ConcurrentRun) { } EXPECT_EQ(sum, ((n + 1) * n) / 2); } -static int64_t GetTS() { - struct timeval tp; - gettimeofday(&tp, NULL); - return tp.tv_sec * 1000000 + tp.tv_usec; -} - -void multi_call(std::function call) { - for (int i = 0; i < 500; ++i) { - call(); - } -} - -TEST(ThreadPool, PERFORMANCE) { - auto sum = [] { - int a = 0; - for (int i = 0; i < 1000; ++i) { - a += i; - } - }; - // framework::ThreadPool *pool = new framework::ThreadPool(2); - int64_t start = GetTS(); - for (int i = 0; i < 1000; ++i) { - // int64_t s = GetTS(); - framework::Async(std::move(sum)); - // pool->Run(std::move(sum)); - // VLOG(5) << "push to pool spent : " << GetTS() - s << " (us)."; - } - VLOG(5) << "pool spent: " << GetTS() - start << " (us)."; - start = GetTS(); - for (int i = 0; i < 1000; ++i) { - sum(); - } - VLOG(5) << "sequence call spent: " << GetTS() - start << " (us)."; - std::vector threads; - start = GetTS(); - for (int i = 0; i < 2; ++i) { - std::thread t(multi_call, std::ref(sum)); - threads.push_back(std::move(t)); - } - for (auto& thread : threads) { - thread.join(); - } - VLOG(5) << "two threads spent: " << GetTS() - start << " (us)."; -} diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 10de11bfa5..51b980acb5 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -67,12 +67,9 @@ class BlockingQueue { } bool Receive(T* elem) { - VLOG(1) << "blocking queue::Receive ..."; std::unique_lock lock(mutex_); receive_cv_.wait(lock, [&] { return !queue_.empty() || closed_; }); - VLOG(1) << "queue_.empty()=" << queue_.empty(); if (!queue_.empty()) { - if (elem == nullptr) VLOG(1) << "elem is nullptr"; PADDLE_ENFORCE_NOT_NULL(elem); *elem = queue_.front(); if (LIKELY(!speed_test_mode_)) { diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 2d66000f1f..cfa192f8e1 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -82,13 +82,11 @@ void BufferedReader::StartImpl() { } void BufferedReader::ReadNextImpl(std::vector *out) { - VLOG(1) << "ReadNextImpl start on place: " << place_; if (position_.empty()) { out->clear(); return; } size_t i = position_.front().get(); - VLOG(1) << "position front: " << i; position_.pop(); if (i == -1UL) { @@ -105,7 +103,6 @@ void BufferedReader::ReadNextImpl(std::vector *out) { ReadAsync(prev_pos_); } prev_pos_ = i; - VLOG(1) << "success ReadNextImpl"; } } // namespace reader diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 924c92e0bf..954fec0fbc 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -25,15 +25,9 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { - VLOG(1) << "find var in scope: " << &scope; - auto* out_var = scope.FindVar(Output("Out")); - VLOG(1) << "var " << Output("Out") << " -> " << out_var; - auto* out = 
out_var->GetMutable(); - - // auto* out = scope.Var(Output("Out")) - // ->template GetMutable(); + auto* out = scope.Var(Output("Out")) + ->template GetMutable(); if (out->Get() != nullptr) { - VLOG(1) << Output("Out") << " is not nullptr."; return; } const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) @@ -52,11 +46,8 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { sin >> num; place = platform::CUDAPlace(static_cast(num)); } - VLOG(1) << "create buffered reader on " << place; out->Reset(framework::MakeDecoratedReader(underlying_reader, place, 2)); - VLOG(1) << "Reset Buffered Reader in var: " - << scope.FindVar(Input("UnderlyingReader")); } }; From 73edf1376758b753ca7226cc22c442ef2f6c575d Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 7 Dec 2018 16:41:53 +0800 Subject: [PATCH 017/197] update --- paddle/fluid/operators/reader/create_double_buffer_reader_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 954fec0fbc..440b16cf91 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -25,7 +25,7 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { - auto* out = scope.Var(Output("Out")) + auto* out = scope.FindVar(Output("Out")) ->template GetMutable(); if (out->Get() != nullptr) { return; From 47740ace289721e61489f6b2b5c196f26250aa3f Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 7 Dec 2018 17:18:45 +0800 Subject: [PATCH 018/197] fix performance --- paddle/fluid/framework/details/all_reduce_op_handle.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index ae20338746..6b7bbf9003 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -107,6 +107,7 @@ void AllReduceOpHandle::RunImpl() { PADDLE_ENFORCE(platform::dynload::ncclAllReduce( buffer, buffer, numel, static_cast(dtype), ncclSum, comm, stream)); + if (!nccl_ctxs_->need_group_call_) cudaStreamSynchronize(stream); }); } From 527946df490df1ad80152ffdc973178b9ae308f6 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 7 Dec 2018 18:08:29 +0800 Subject: [PATCH 019/197] add scope in prefetch --- .../distributed/parameter_prefetch.cc | 19 +++++++------- paddle/fluid/operators/lookup_table_op.h | 3 ++- paddle/fluid/operators/nce_op.h | 25 ++++++++++++++----- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index cf14538b1c..67b56bd218 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -102,8 +102,9 @@ static void MergeMultipleVarsIntoOneBySection( const std::string& out_name, const std::vector& out_var_names, const std::vector& height_section, const std::vector>& splited_ids, - const framework::ExecutionContext& context, framework::Scope* scope, - platform::DeviceContext* actual_ctx) { + const framework::ExecutionContext& context, + const framework::Scope& actual_scope, framework::Scope* scope, + platform::DeviceContext* 
actual_ctx, ) { PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), ""); auto cpu_place = platform::CPUPlace(); @@ -114,9 +115,9 @@ static void MergeMultipleVarsIntoOneBySection( id_to_offset[ids_vector[i]].push_back(i); } - auto& id_tensor = scope->FindVar(id_name)->Get(); + auto& id_tensor = actual_scope.FindVar(id_name)->Get(); auto* out_tensor = - scope->FindVar(out_name)->GetMutable(); + actual_scope.FindVar(out_name)->GetMutable(); auto* out_tensor_data = out_tensor->mutable_data(id_tensor.place()); bool is_on_cpu_place = true; @@ -172,8 +173,9 @@ void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, const std::vector& height_sections, - const framework::ExecutionContext& context) { - auto& local_scope = context.scope().NewScope(); + const framework::ExecutionContext& context, + const framework::Scope& scope) { + auto& local_scope = scope.NewScope(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -245,9 +247,8 @@ void prefetch(const std::string& id_name, const std::string& out_name, MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, out_var_names, height_sections, splited_ids, - context, &local_scope, &actual_ctx); - - context.scope().DeleteScope(&local_scope); + context, scope, &local_scope, &actual_ctx); + scope.DeleteScope(&local_scope); } }; // namespace distributed diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 3a73a7637c..a7d0fd4856 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -59,7 +59,8 @@ class LookupTableKernel : public framework::OpKernel { // server #ifdef PADDLE_WITH_DISTRIBUTE operators::distributed::prefetch(id_name, out_name, table_names, epmap, - height_sections, context); + height_sections, context, + context.scope()); #else PADDLE_THROW( "paddle is not compiled with distribute support, can not do " diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 6567b6534a..9789e30388 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -170,18 +170,31 @@ class NCEKernel : public framework::OpKernel { auto height_sections = context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); - local_scope.Var("Ids"); - local_scope.Var("Weight"); + auto *ids = local_scope.Var("Ids"); + auto *x_tensor = ids->GetMutable(); + x_tensor->mutable_data( + framework::make_ddim({static_cast(labels.size()), 1}), + context.GetPlace()); + // copy. 
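// A hedged standalone sketch of the staging this hunk builds up to: the
// deduplicated label ids are packed into a local-scope tensor so that
// distributed::prefetch can ship them to the parameter servers. Template
// arguments were lost in the diff rendering above and are assumed to be
// int64_t / framework::LoDTensor:
//   auto* x_tensor = ids->GetMutable<framework::LoDTensor>();
//   int64_t* dst = x_tensor->mutable_data<int64_t>(
//       framework::make_ddim({static_cast<int64_t>(labels.size()), 1}),
//       context.GetPlace());
//   std::memcpy(dst, labels.data(), labels.size() * sizeof(int64_t));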
+ std::memcpy(x_tensor->data(), labels.data(), + labels.size() * sizeof(int64_t)); + + local_scope.Var("Weight@Local") + ->GetMutable() + ->mutable_data(context.GetPlace()); #ifdef PADDLE_WITH_DISTRIBUTE - operators::distributed::prefetch("Ids", "Weight", table_names, epmap, - height_sections, context); + operators::distributed::prefetch("Ids", "Weight@Local", table_names, + epmap, height_sections, context, + &local_scope); #else PADDLE_THROW( "paddle is not compiled with distribute support, can not do " "parameter prefetch!"); +#endif - auto weight_mat = EigenMatrix::From(*(weight->Get())); + auto weight_mat = EigenMatrix::From( + (local_scope.Var("Weight@Local")->Get())); for (int64_t i = 0; i < sample_labels->numel(); ++i) { std::vector::iterator it = std::find(labels.begin(), labels.end(), sample_labels_data[i]); @@ -196,7 +209,7 @@ class NCEKernel : public framework::OpKernel { } context.scope().DeleteScope(&local_scope); -#endif + } else { auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); From bb2e7f0bbed1cfcf47b5b8e90bc9e35b46c13b50 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Sat, 8 Dec 2018 12:31:33 +0800 Subject: [PATCH 020/197] add scope in prefetch --- paddle/fluid/operators/distributed/parameter_prefetch.cc | 8 ++++---- paddle/fluid/operators/distributed/parameter_prefetch.h | 3 ++- paddle/fluid/operators/nce_op.h | 9 +++++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index 67b56bd218..f6a2d5bbe5 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -104,7 +104,7 @@ static void MergeMultipleVarsIntoOneBySection( const std::vector>& splited_ids, const framework::ExecutionContext& context, const framework::Scope& actual_scope, framework::Scope* scope, - platform::DeviceContext* actual_ctx, ) { + platform::DeviceContext* actual_ctx) { PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), ""); auto cpu_place = platform::CPUPlace(); @@ -175,7 +175,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope) { - auto& local_scope = scope.NewScope(); + auto& local_scope = context.scope().NewScope(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -192,7 +192,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, out_var_names.push_back(out_name + "@" + epmap[i]); } - auto& id_tensor = local_scope.FindVar(id_name)->Get(); + auto& id_tensor = scope.FindVar(id_name)->Get(); std::vector ids_vector; if (platform::is_cpu_place(id_tensor.place())) { auto* id_data = id_tensor.data(); @@ -248,7 +248,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, out_var_names, height_sections, splited_ids, context, scope, &local_scope, &actual_ctx); - scope.DeleteScope(&local_scope); + context.scope().DeleteScope(&local_scope); } }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h index 53b0fbfb51..53482c4c40 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -27,7 
+27,8 @@ void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, const std::vector& height_sections, - const framework::ExecutionContext& context); + const framework::ExecutionContext& context, + const framework::Scope& scope); }; // namespace distributed }; // namespace operators diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 9789e30388..2e51c67401 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -180,7 +180,7 @@ class NCEKernel : public framework::OpKernel { labels.size() * sizeof(int64_t)); local_scope.Var("Weight@Local") - ->GetMutable() + ->GetMutable() ->mutable_data(context.GetPlace()); #ifdef PADDLE_WITH_DISTRIBUTE @@ -194,7 +194,7 @@ class NCEKernel : public framework::OpKernel { #endif auto weight_mat = EigenMatrix::From( - (local_scope.Var("Weight@Local")->Get())); + (local_scope.Var("Weight@Local")->Get())); for (int64_t i = 0; i < sample_labels->numel(); ++i) { std::vector::iterator it = std::find(labels.begin(), labels.end(), sample_labels_data[i]); @@ -208,8 +208,9 @@ class NCEKernel : public framework::OpKernel { sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); } - context.scope().DeleteScope(&local_scope); - + if (context.scope().HasKid(&local_scope)) { + context.scope().DeleteScope(&local_scope); + } } else { auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); From 57557f677476d75a7b251081e97606499255a0c7 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 10 Dec 2018 11:33:00 +0800 Subject: [PATCH 021/197] fix scope in nce and prefetch --- .../operators/distributed/parameter_prefetch.cc | 13 ++++++------- paddle/fluid/operators/nce_op.h | 13 ++++--------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index f6a2d5bbe5..4cdeae81a1 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -102,8 +102,7 @@ static void MergeMultipleVarsIntoOneBySection( const std::string& out_name, const std::vector& out_var_names, const std::vector& height_section, const std::vector>& splited_ids, - const framework::ExecutionContext& context, - const framework::Scope& actual_scope, framework::Scope* scope, + const framework::ExecutionContext& context, framework::Scope* scope, platform::DeviceContext* actual_ctx) { PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), ""); @@ -115,9 +114,9 @@ static void MergeMultipleVarsIntoOneBySection( id_to_offset[ids_vector[i]].push_back(i); } - auto& id_tensor = actual_scope.FindVar(id_name)->Get(); + auto& id_tensor = scope.FindVar(id_name)->Get(); auto* out_tensor = - actual_scope.FindVar(out_name)->GetMutable(); + scope.FindVar(out_name)->GetMutable(); auto* out_tensor_data = out_tensor->mutable_data(id_tensor.place()); bool is_on_cpu_place = true; @@ -175,7 +174,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope) { - auto& local_scope = context.scope().NewScope(); + auto& local_scope = scope.NewScope(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -247,8 +246,8 @@ void prefetch(const std::string& id_name, const std::string& out_name, 
MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, out_var_names, height_sections, splited_ids, - context, scope, &local_scope, &actual_ctx); - context.scope().DeleteScope(&local_scope); + context, &local_scope, &actual_ctx); + scope.DeleteScope(&local_scope); } }; // namespace distributed diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 2e51c67401..862064be18 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -170,7 +170,7 @@ class NCEKernel : public framework::OpKernel { auto height_sections = context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); - auto *ids = local_scope.Var("Ids"); + auto *ids = local_scope.Var("Ids@Local"); auto *x_tensor = ids->GetMutable(); x_tensor->mutable_data( framework::make_ddim({static_cast(labels.size()), 1}), @@ -179,12 +179,10 @@ class NCEKernel : public framework::OpKernel { std::memcpy(x_tensor->data(), labels.data(), labels.size() * sizeof(int64_t)); - local_scope.Var("Weight@Local") - ->GetMutable() - ->mutable_data(context.GetPlace()); + local_scope.Var("Weight@Local"); #ifdef PADDLE_WITH_DISTRIBUTE - operators::distributed::prefetch("Ids", "Weight@Local", table_names, + operators::distributed::prefetch("Ids@Local", "Weight@Local", table_names, epmap, height_sections, context, &local_scope); #else @@ -207,10 +205,7 @@ class NCEKernel : public framework::OpKernel { sample_out_data[i] += result(0); sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); } - - if (context.scope().HasKid(&local_scope)) { - context.scope().DeleteScope(&local_scope); - } + context.scope().DeleteScope(&local_scope); } else { auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); From 79082c94594adaf4765e950151da51c84ec137b8 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 10 Dec 2018 17:31:52 +0800 Subject: [PATCH 022/197] fix pyreader failed --- .../scope_buffered_ssa_graph_executor.cc | 27 +++++++++---------- .../scope_buffered_ssa_graph_executor.h | 5 ++-- .../details/threaded_ssa_graph_executor.cc | 1 - paddle/fluid/framework/parallel_executor.cc | 22 +++++++++++---- .../fluid/operators/reader/buffered_reader.cc | 2 -- .../operators/reader/create_py_reader_op.cc | 2 -- .../fluid/operators/reader/open_files_op.cc | 2 -- 7 files changed, 31 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index abc6b9f559..85898af417 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -27,34 +27,31 @@ namespace framework { namespace details { ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( ExecutionStrategy strategy, std::vector local_scopes, - std::vector> var_infos_list, - std::vector places, + std::vector var_infos, std::vector places, std::unique_ptr &&underlying_executor) : strategy_(std::move(strategy)), underlying_executor_(std::move(underlying_executor)), local_scopes_(std::move(local_scopes)), - var_infos_list_(std::move(var_infos_list)), + var_infos_(std::move(var_infos)), places_(std::move(places)) {} FeedFetchList ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { if (drop_scope_counter_ == 0) { // Create local scopes. 
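// For orientation: the rewritten loop below gives every parent scope a
// fresh kid scope per drop-scope window and publishes it under
// kLocalExecScopeName so operators can locate it; in essence:
//   Scope &local_scope = scope->NewScope();
//   *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
//       &local_scope;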
- for (size_t i = 0; i < local_scopes_.size(); ++i) { - auto &scope = local_scopes_[i]; + for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) { + auto &scope = *it; Scope &local_scope = scope->NewScope(); *scope->Var(details::kLocalExecScopeName)->GetMutable() = &local_scope; - for (auto &var_infos : var_infos_list_) { - for (auto &info : var_infos) { - if (scope->FindVar(info.name_) != nullptr) { - continue; - } - if (info.persistable_) { // Persistable - InitializeVariable(scope->Var(info.name_), info.type_); - } else { - InitializeVariable(local_scope.Var(info.name_), info.type_); - } + for (auto &info : var_infos_) { + if (scope->FindVar(info.name_) != nullptr) { + continue; + } + if (info.persistable_) { // Persistable + InitializeVariable(scope->Var(info.name_), info.type_); + } else { + InitializeVariable(local_scope.Var(info.name_), info.type_); } } } diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 51230d4a42..5e87e0bf50 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -38,8 +38,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { public: ScopeBufferedSSAGraphExecutor( ExecutionStrategy strategy, std::vector local_scopes, - std::vector> var_info_list, - std::vector places, + std::vector var_infos, std::vector places, std::unique_ptr&& underlying_executor); const ir::Graph& Graph() const override { @@ -54,7 +53,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { ExecutionStrategy strategy_; std::unique_ptr underlying_executor_; std::vector local_scopes_; - std::vector> var_infos_list_; + std::vector var_infos_; std::vector places_; }; } // namespace details diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 677a293794..cebf63364d 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -216,7 +216,6 @@ void ThreadedSSAGraphExecutor::RunOp( if (LIKELY(!strategy_.dry_run_)) { op->Run(strategy_.use_cuda_); } - VLOG(10) << op << " " << op->Name() << " Done "; running_ops_--; ready_var_q->Extend(op->Outputs()); VLOG(10) << op << " " << op->Name() << "Signal posted"; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 186f0cb803..2a9ca3e815 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -141,7 +141,6 @@ ParallelExecutor::ParallelExecutor( std::vector> graphs; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { - VLOG(1) << "kParallelGraph mode!!"; for (size_t i = 0; i < member_->places_.size(); ++i) { std::unique_ptr graph = build_strategy.Apply( main_program, {member_->places_[i]}, loss_var_name, params, @@ -178,8 +177,8 @@ ParallelExecutor::ParallelExecutor( ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_); ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_); ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_); - graphs[0] = ref_cnt_pass->Apply(std::move(graphs[i])); - graphs[0]->SetNotOwned("garbage_collector", &gcs_); + graphs[i] = ref_cnt_pass->Apply(std::move(graphs[i])); + 
graphs[i]->SetNotOwned("garbage_collector", &gcs_); } } } @@ -192,6 +191,18 @@ ParallelExecutor::ParallelExecutor( // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars + std::vector var_infos; + for (auto &graph : graphs) { + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos.emplace_back(); + var_infos.back().name_ = node->Var()->Name(); + var_infos.back().type_ = node->Var()->GetType(); + var_infos.back().persistable_ = node->Var()->Persistable(); + } + } + } + /** std::vector> var_infos_list; for (size_t i = 0; i < graphs.size(); ++i) { std::vector var_infos; @@ -203,8 +214,9 @@ ParallelExecutor::ParallelExecutor( var_infos.back().persistable_ = node->Var()->Persistable(); } } - var_infos_list.emplace_back(std::move(var_infos)); + var_infos_list.push_back(std::move(var_infos)); } + **/ // If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) { @@ -236,7 +248,7 @@ ParallelExecutor::ParallelExecutor( } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, std::move(var_infos_list), + exec_strategy, member_->local_scopes_, std::move(var_infos), member_->places_, std::move(member_->executor_))); } diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index cfa192f8e1..26ff221dfa 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -58,9 +58,7 @@ void BufferedReader::ReadAsync(size_t i) { TensorVec &gpu = gpu_buffer_[i]; gpu.resize(cpu.size()); for (size_t i = 0; i < cpu.size(); ++i) { - VLOG(1) << "launch tensor copy from cpu to cpu, idx: " << i; framework::TensorCopySync(cpu[i], place_, &gpu[i]); - VLOG(1) << "done " << i; gpu[i].set_lod(cpu[i].lod()); } } diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index 093b0e56b3..901a92ab5b 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -28,10 +28,8 @@ class PyReader : public framework::FileReader { } void ReadNext(std::vector* out) override { - VLOG(1) << "come in PyReader::ReadNext function, out: " << out; bool success; *out = queue_->Pop(&success); - VLOG(1) << "call PyReader::ReadNext " << success; if (!success) out->clear(); } diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc index ae37a18725..38223e0699 100644 --- a/paddle/fluid/operators/reader/open_files_op.cc +++ b/paddle/fluid/operators/reader/open_files_op.cc @@ -115,12 +115,10 @@ class PreemptiveReaderContainer : public IReaderContainer { } void ReadNext(std::vector* out) override { - VLOG(1) << "flag"; if (!pending_.empty()) { auto future_it = complete_queue_.Pop(); FutureItem item = future_it->get(); if (item.exception_) { - VLOG(1) << "item has exception!!!"; for (auto it = futures_.begin(); it != futures_.end(); ++it) { if (it != future_it) { it->wait(); // Wait all other threads complete. 
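A note on the var_infos change in the patch above: concatenating node lists from several graphs can record the same variable more than once, which is tolerated because ScopeBufferedSSAGraphExecutor guards creation with a FindVar check. An explicit de-duplication pass is a straightforward alternative; a minimal sketch, assuming the element type is details::VariableInfo with the name_/type_/persistable_ fields used in the diff (illustrative only, not part of the patch series, and it needs <unordered_set>):

    std::unordered_set<std::string> seen;
    std::vector<details::VariableInfo> var_infos;
    for (auto &graph : graphs) {
      for (auto &node : graph->Nodes()) {
        if (node->IsVar() && !node->IsCtrlVar() && node->Var() &&
            seen.insert(node->Var()->Name()).second) {
          var_infos.emplace_back();
          var_infos.back().name_ = node->Var()->Name();
          var_infos.back().type_ = node->Var()->GetType();
          var_infos.back().persistable_ = node->Var()->Persistable();
        }
      }
    }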
From 33a004a779e8c4acb19ab13b641cc16d3827a582 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 10 Dec 2018 20:36:49 +0800 Subject: [PATCH 023/197] fix numel nce and prefetch --- .../distributed/parameter_prefetch.cc | 10 +++++++-- paddle/fluid/operators/nce_op.h | 21 ++++++++++++------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index 4cdeae81a1..aebf6376d1 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -114,9 +114,15 @@ static void MergeMultipleVarsIntoOneBySection( id_to_offset[ids_vector[i]].push_back(i); } - auto& id_tensor = scope.FindVar(id_name)->Get(); + auto& id_tensor = scope->FindVar(id_name)->Get(); auto* out_tensor = - scope.FindVar(out_name)->GetMutable(); + scope->FindVar(out_name)->GetMutable(); + + PADDLE_ENFORCE_GT( + out_tensor->numel(), 0, + "When calling this method, the Tensor's numel must larger than zero. " + "Please check Tensor::Resize has been called first."); + auto* out_tensor_data = out_tensor->mutable_data(id_tensor.place()); bool is_on_cpu_place = true; diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 862064be18..99a3baba92 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -166,11 +166,12 @@ class NCEKernel : public framework::OpKernel { std::set st(labels.begin(), labels.end()); labels.assign(st.begin(), st.end()); - auto &local_scope = context.scope().NewScope(); + framework::Scope &local_scope = context.scope().NewScope(); + auto height_sections = context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); - auto *ids = local_scope.Var("Ids@Local"); + auto *ids = local_scope.Var("Ids@Prefetch"); auto *x_tensor = ids->GetMutable(); x_tensor->mutable_data( framework::make_ddim({static_cast(labels.size()), 1}), @@ -179,12 +180,18 @@ class NCEKernel : public framework::OpKernel { std::memcpy(x_tensor->data(), labels.data(), labels.size() * sizeof(int64_t)); - local_scope.Var("Weight@Local"); + std::vector w_dims = paddle::framework::vectorize2int( + context.Input("Weight")->dims()); + w_dims[0] = static_cast(labels.size()); + + auto *w_tensor = local_scope.Var("Weight@Prefetch") + ->GetMutable(); + w_tensor->Resize(framework::make_ddim(w_dims)); #ifdef PADDLE_WITH_DISTRIBUTE - operators::distributed::prefetch("Ids@Local", "Weight@Local", table_names, - epmap, height_sections, context, - &local_scope); + operators::distributed::prefetch("Ids@Prefetch", "Weight@Prefetch", + table_names, epmap, height_sections, + context, local_scope); #else PADDLE_THROW( "paddle is not compiled with distribute support, can not do " @@ -192,7 +199,7 @@ class NCEKernel : public framework::OpKernel { #endif auto weight_mat = EigenMatrix::From( - (local_scope.Var("Weight@Local")->Get())); + (local_scope.Var("Weight@Prefetch")->Get())); for (int64_t i = 0; i < sample_labels->numel(); ++i) { std::vector::iterator it = std::find(labels.begin(), labels.end(), sample_labels_data[i]); From 59cbf06e2ec67b28bfd46df8ae492d3bf149a764 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 11 Dec 2018 10:41:18 +0800 Subject: [PATCH 024/197] fix numel nce and prefetch test=develop --- paddle/fluid/operators/nce_op.h | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 99a3baba92..2c97eef096 100644 --- 
a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -49,7 +49,6 @@ void PrepareSamples(const framework::ExecutionContext &context, auto label = context.Input("Label"); const int64_t *label_data = label->data(); auto label_dims = label->dims(); - // int num_total_classes = context.Attr("num_total_classes"); // for unitest std::vector custom_neg_classes = context.Attr>("custom_neg_classes"); From 82726402be966ede1e15486d88f9a17c1d1b52b9 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Tue, 11 Dec 2018 19:27:49 +0800 Subject: [PATCH 025/197] exception safe --- .../details/parallel_ssa_graph_executor.cc | 51 +++++++++++++++---- .../details/parallel_ssa_graph_executor.h | 1 + paddle/fluid/framework/parallel_executor.cc | 15 ------ paddle/fluid/framework/threadpool.h | 1 - 4 files changed, 42 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index dfb40721d8..f1a07edf08 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -34,32 +34,63 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( executors_.emplace_back(new details::ThreadedSSAGraphExecutor( strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); } - VLOG(1) << "pool size: " << places_.size(); } FeedFetchList ParallelSSAGraphExecutor::Run( const std::vector &fetch_tensors) { - std::vector> run_futures; - FeedFetchList fetch_data; + std::vector> run_futures; + + std::vector fetch_datas; + FeedFetchList ret; + + fetch_datas.reserve(places_.size()); + ret.reserve(fetch_tensors.size()); + exception_holder_.Clear(); for (size_t i = 0; i < places_.size(); ++i) { - auto call = [this, i] { - // FIXME(Yancey1989): need to fix fetch data failed. - std::vector empty; - executors_[i]->Run(empty); + auto call = [this, i, &fetch_tensors]() -> FeedFetchList { + return executors_[i]->Run(fetch_tensors); }; + if (pool_) { run_futures.emplace_back(pool_->enqueue(std::move(call))); } else { - call(); + try { + fetch_datas.emplace_back(std::move(call())); + } catch (...) { + exception_holder_.Catch(std::current_exception()); + break; + } } } + if (pool_) { for (auto &f : run_futures) { - f.wait(); + if (exception_holder_.IsCaught()) { + f.wait(); + } else { + try { + fetch_datas.emplace_back(std::move(f.get())); + } catch (...) 
{ + exception_holder_.Catch(std::current_exception()); + } + } + } + } + if (exception_holder_.IsCaught()) { + exception_holder_.ReThrow(); + } + + for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { + std::vector lodtensor_ptrs; + lodtensor_ptrs.reserve(local_scopes_.size()); + for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) { + lodtensor_ptrs.push_back(&fetch_datas.at(scope_idx).at(fetch_idx)); } + ret.emplace_back(); + ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace()); } - return fetch_data; + return ret; } } // namespace details diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h index 37784775f0..bd777e41f8 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -44,6 +44,7 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { std::vector> graphs_; std::vector> executors_; + ExceptionHolder exception_holder_; }; } // namespace details diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 2a9ca3e815..82a7bd2185 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -202,21 +202,6 @@ ParallelExecutor::ParallelExecutor( } } } - /** - std::vector> var_infos_list; - for (size_t i = 0; i < graphs.size(); ++i) { - std::vector var_infos; - for (auto &node : graphs[i]->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); - } - } - var_infos_list.push_back(std::move(var_infos)); - } - **/ // If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) { diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 5177b7ee02..8fd834be9a 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -14,7 +14,6 @@ limitations under the License. 
*/ #pragma once -#include #include // NOLINT #include #include // NOLINT From 5cc83f79bfa9516ef9c5f7f688f665deb47e0d07 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Tue, 11 Dec 2018 21:45:45 +0800 Subject: [PATCH 026/197] update by comment --- paddle/fluid/framework/parallel_executor.cc | 29 +++++++++++++-------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 82a7bd2185..b0cd1e8e90 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -110,23 +110,30 @@ ParallelExecutor::ParallelExecutor( // Bcast Parameters to all GPUs #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); - std::unique_ptr<ncclUniqueId> nccl_id = nullptr; + ncclUniqueId *nccl_id = nullptr; bool need_group_call = true; - if (nccl_id_var != nullptr) { - nccl_id.reset(nccl_id_var->GetMutable<ncclUniqueId>()); - } else if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { - nccl_id.reset(new ncclUniqueId()); - PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(nccl_id.get())); - *member_->global_scope_->Var(NCCL_ID_VARNAME) - ->GetMutable<ncclUniqueId>() = *nccl_id.get(); + if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + // parallel graph mode should initialize nccl by ncclCommInitRank since + // it calls nccl operators per device per thread. + if (nccl_id_var == nullptr) { + nccl_id = new ncclUniqueId(); + PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(nccl_id)); + *member_->global_scope_->Var(NCCL_ID_VARNAME) + ->GetMutable<ncclUniqueId>() = *nccl_id; + } else { + nccl_id = nccl_id_var->GetMutable<ncclUniqueId>(); + } need_group_call = false; + } else if (nccl_id_var != nullptr) { // the other executor types. + // distributed training with nccl mode initializes the nccl id in the + // startup_program. + nccl_id = nccl_id_var->GetMutable<ncclUniqueId>(); } else { - // init nccl_id in NCCLContextMap + // initialize NCCL by ncclCommInitAll; no nccl_id is needed.
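// (When nccl_id stays null, NCCLContextMap falls back to ncclCommInitAll
//  over all visible devices; when an id is present, each context joins
//  the clique individually, roughly, for each rank in [0, nranks):
//    ncclCommInitRank(&comms[rank], nranks, *nccl_id, rank);
//  using the stock NCCL signature; NCCLContextMap's surrounding
//  bookkeeping is omitted in this sketch.)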
} member_->nccl_ctxs_.reset(new platform::NCCLContextMap( - member_->places_, nccl_id.get(), num_trainers, trainer_id, - need_group_call)); + member_->places_, nccl_id, num_trainers, trainer_id, need_group_call)); #else PADDLE_THROW("Not compiled with CUDA"); #endif From c2e851f7b284ad122d20b932ff2df165d56b7994 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Wed, 12 Dec 2018 11:42:16 +0000 Subject: [PATCH 027/197] test=develop, remove sparse bias and add prefetch and related tests --- .../distributed/parameter_prefetch.cc | 12 +- .../distributed/parameter_prefetch.h | 24 ++ .../operators/hierarchical_sigmoid_op.cc | 47 ++- .../fluid/operators/hierarchical_sigmoid_op.h | 83 ++++-- .../fluid/operators/math/matrix_bit_code.cc | 17 -- paddle/fluid/operators/math/matrix_bit_code.h | 27 +- python/paddle/fluid/layers/nn.py | 17 +- .../fluid/tests/unittests/test_hsigmoid_op.py | 6 +- .../test_hsigmoid_remote_table_op.py | 271 ++++++++++++++++++ 9 files changed, 418 insertions(+), 86 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index aebf6376d1..52085482f4 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -32,7 +32,7 @@ namespace paddle { namespace operators { namespace distributed { -using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; @@ -120,8 +120,8 @@ static void MergeMultipleVarsIntoOneBySection( PADDLE_ENFORCE_GT( out_tensor->numel(), 0, - "When calling this method, the Tensor's numel must larger than zero. " - "Please check Tensor::Resize has been called first."); + "When calling this method, the LoDTensor's numel must larger than zero. 
" + "Please check LoDTensor::Resize has been called first."); auto* out_tensor_data = out_tensor->mutable_data(id_tensor.place()); @@ -144,7 +144,7 @@ static void MergeMultipleVarsIntoOneBySection( auto row_numel = dims[1]; - for (size_t i = 0; i < dims[0]; ++i) { + for (int64_t i = 0; i < dims[0]; ++i) { auto id = ids_in_this_section[i]; auto origin_id = id + abs_sections[section_idx]; auto& offsets = id_to_offset[origin_id]; @@ -201,7 +201,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, std::vector ids_vector; if (platform::is_cpu_place(id_tensor.place())) { auto* id_data = id_tensor.data(); - for (size_t i = 0; i < id_tensor.numel(); ++i) { + for (int64_t i = 0; i < id_tensor.numel(); ++i) { ids_vector.push_back(id_data[i]); } } else { @@ -209,7 +209,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, PADDLE_THROW("paddle is not compiled with CUDA!"); #else auto cpu_place = platform::CPUPlace(); - framework::Tensor cpu_tensor; + framework::LoDTensor cpu_tensor; auto* cpu_tensor_data = cpu_tensor.mutable_data(id_tensor.dims(), cpu_place); auto stream = diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h index 53482c4c40..882c6bd9b8 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -30,6 +30,30 @@ void prefetch(const std::string& id_name, const std::string& out_name, const framework::ExecutionContext& context, const framework::Scope& scope); +template +void prefetch_with_reconstruct(const std::string& id_name, + const std::string& out_name, + const std::vector& table_names, + const std::vector& epmap, + const std::vector& height_sections, + const framework::ExecutionContext& context, + const framework::Scope& scope, + framework::LoDTensor* original) { + prefetch(id_name, out_name, table_names, epmap, height_sections, context, + scope); + auto& out = scope.FindVar(out_name)->Get(); + auto& ids = scope.FindVar(id_name)->Get(); + auto* original_value = original->data(); + auto* out_value = out.data(); + size_t original_width = original->numel() / original->dims()[0]; + + for (int64_t i = 0; i < ids.numel(); i++) { + const T* out_rows = out_value + original_width * i; + T* original_row = original_value + original_width * ids.data()[i]; + std::memcpy(original_row, out_rows, original_width * sizeof(T)); + } +} + }; // namespace distributed }; // namespace operators }; // namespace paddle diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 0dbcc442df..b9059f6b05 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -67,6 +67,11 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); PADDLE_ENFORCE(ctx->HasOutput("PreOut"), "Output(PreOut) should not be null."); + auto with_prefetch = ctx->Attrs().Get("remote_prefetch"); + if (with_prefetch) { + PADDLE_ENFORCE(ctx->HasOutput("W_Out"), + "Output(W_Out) should not be null."); + } const int64_t batch_size = ctx->GetInputDim("X")[0]; std::vector output_shape({batch_size, 1}); ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); @@ -96,7 +101,7 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Label", "(LoDTensor, required), The labels of training data. 
It's a" "tensor with shape [N, 1]."); - AddInput("PTable", + AddInput("PathTable", "(LoDTensor, optional), The Path Table from root to current word" "it should have shape like [N, L], L is the length of the Path") .AsDispensable(); @@ -120,8 +125,30 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { "[batch_size, code_length], where code_length represents the " "maximum path length from root to leaf nodes.") .AsIntermediate(); + AddOutput( + "W_Out", + "(LoDTensor, optinal) using input 'W' as Output to make it mutable" + "When we are using prefetch") + .AsIntermediate(); AddAttr("num_classes", "(int, optional), The number of classes") .SetDefault(2); + // for parameter prefetch + AddAttr("remote_prefetch", "").SetDefault(false); + AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + AddAttr>( + "epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input variables for mapping") + .SetDefault({}); + AddAttr>( + "table_names", + "(string vector, the splited table names that will be fetched from " + "parameter server)" + "in the order of input variables for mapping") + .SetDefault({}); AddComment(R"DOC( The hierarchical sigmoid operator organize the classes into a binary tree. At each node, a sigmoid function is used to calculate the probability of @@ -191,23 +218,17 @@ class HierarchicalSigmoidGradOpGradVarTypeInference << " is set to SelectedRows"; block->Var(w_grad_var_name) ->SetType(framework::proto::VarType::SELECTED_ROWS); - if (hasBias) { - VLOG(30) << "hierarchical_sigmoid_grad op " - << framework::GradVarName("Bias") << " is set to SelectedRows"; - block->Var(bias_grad_var_name) - ->SetType(framework::proto::VarType::SELECTED_ROWS); - } } else { VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W") << " is set to LoDTensor"; block->Var(w_grad_var_name) ->SetType(framework::proto::VarType::LOD_TENSOR); - if (hasBias) { - VLOG(30) << "hierarchical_sigmoid_grad op " - << framework::GradVarName("Bias") << " is set to LoDTensor"; - block->Var(bias_grad_var_name) - ->SetType(framework::proto::VarType::LOD_TENSOR); - } + } + if (hasBias) { + VLOG(30) << "hierarchical_sigmoid_grad op " + << framework::GradVarName("Bias") << " is set to LoDTensor"; + block->Var(bias_grad_var_name) + ->SetType(framework::proto::VarType::LOD_TENSOR); } block->Var(w_grad_var_name)->SetDataType(block->Var("W")->GetDataType()); } diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index b73a32af89..d8e406a96b 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once #include +#include #include +#include #include #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" @@ -24,6 +26,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/matrix_bit_code.h" #include "paddle/fluid/platform/transform.h" +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" +#endif + namespace paddle { namespace operators { @@ -49,13 +55,55 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& in = detail::Ref(ctx.Input("X")); auto& w = detail::Ref(ctx.Input("W")); - auto* path = ctx.Input("PTable"); + auto* path = ctx.Input("PathTable"); auto* code = ctx.Input("PathCode"); auto& label = detail::Ref(ctx.Input("Label")); auto* bias = ctx.Input("Bias"); auto* out = ctx.Output("Out"); auto* pre_out = ctx.Output("PreOut"); size_t num_classes = static_cast(ctx.Attr("num_classes")); + // for remote prefetch + + auto epmap = ctx.Attr>("epmap"); + if (!epmap.empty()) { + // if epmap is not empty, then the parameter will be fetched from remote + // parameter + // server + auto height_sections = ctx.Attr>("height_sections"); + auto table_names = ctx.Attr>("table_names"); + VLOG(3) << "path type is " << path->type().name(); + std::vector real_rows = PathToRows(*path); + framework::Scope& local_scope = ctx.scope().NewScope(); + auto* ids = local_scope.Var("Ids@Prefetch"); + auto* x_tensor = ids->GetMutable(); + + x_tensor->mutable_data( + framework::make_ddim({static_cast(real_rows.size()), 1}), + ctx.GetPlace()); + // copy. + + std::memcpy(x_tensor->data(), real_rows.data(), + real_rows.size() * sizeof(int64_t)); + + framework::DDim w_dims = ctx.Input("W")->dims(); + w_dims[0] = x_tensor->dims()[0]; + auto* w_tensor = + local_scope.Var("W@Prefetch")->GetMutable(); + w_tensor->Resize(w_dims); + +#ifdef PADDLE_WITH_DISTRIBUTE + // w_Out is set to used by prefetch, never change it in other cases + auto* w_out = ctx.Output("W_Out"); + operators::distributed::prefetch_with_reconstruct( + "Ids@Prefetch", "W@Prefetch", table_names, epmap, height_sections, + ctx, local_scope, w_out); +#else + PADDLE_THROW( + "paddle is not compiled with distribute support, can not do " + "parameter prefetch!"); +#endif + } + bool is_custom = false; if (path) { is_custom = true; @@ -116,9 +164,8 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& in = detail::Ref(ctx.Input("X")); auto& w = detail::Ref(ctx.Input("W")); - auto* path = ctx.Input("PTable"); + auto* path = ctx.Input("PathTable"); auto* code = ctx.Input("PathCode"); - auto* bias = ctx.Input("Bias"); auto* in_grad = ctx.Output(framework::GradVarName("X")); bool is_sparse = ctx.Attr("is_sparse"); @@ -165,15 +212,14 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { pre_out_grad_mat * out_grad_mat.broadcast(bcast); // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to // be consistent with the clipping in forward. 
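// Context for the hunk below: the bias gradient is hoisted out of the
// is_sparse branch because the bias is dense either way; per the header
// comment on MatrixBitCodeFunctor::AddGrad, it only accumulates, for
// each sample i and path position j,
//   bias_grad(0, index(i, j)) += pre_out_grad(i, j)
// so a plain LoDTensor destination suffices regardless of how the weight
// gradient is stored.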
- + auto* bias_grad = + ctx.Output(framework::GradVarName("Bias")); + if (bias_grad) { + bias_grad->mutable_data(ctx.GetPlace()); + zero(dev_ctx, bias_grad, static_cast(0.0)); + bit_code->AddGrad(pre_out_grad, bias_grad); + } if (!is_sparse) { - auto* bias_grad = - ctx.Output(framework::GradVarName("Bias")); - if (bias_grad) { - bias_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, bias_grad, static_cast(0.0)); - bit_code->AddGrad(pre_out_grad, bias_grad); - } auto* w_grad = ctx.Output(framework::GradVarName("W")); w_grad->mutable_data(ctx.GetPlace()); @@ -192,21 +238,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { w_grad_value->mutable_data(temp_dim, ctx.GetPlace()); zero(dev_ctx, w_grad_value, static_cast(0.0)); - auto* bias_grad = - ctx.Output(framework::GradVarName("Bias")); - if (bias_grad) { - bias_grad->set_rows(real_rows); - // build ids -> rows index map - bias_grad->SyncIndex(); - bias_grad->set_height(bias->dims()[0]); - auto* bias_grad_value = bias_grad->mutable_value(); - std::vector dims = {static_cast(real_rows.size()), - bias->dims()[1]}; - bias_grad_value->mutable_data(framework::make_ddim(dims), - ctx.GetPlace()); - zero(dev_ctx, bias_grad_value, static_cast(0.0)); - bit_code->AddGrad(pre_out_grad, bias_grad); - } bit_code->MulGradWeight(pre_out_grad, w_grad, in); } bit_code->MulGradError(pre_out_grad, w, in_grad); diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 5a6e64b6f8..fed4639b01 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -48,23 +48,6 @@ void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, } } -template -void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, - framework::SelectedRows* vec) { - size_t batch_size = tmat.dims()[0]; - size_t width = tmat.dims()[1]; - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - int64_t row_index = vec->GetIndexFromId(static_cast(index)); - vec->mutable_value()->data()[row_index] += - tmat.data()[i * width + j]; - } - } -} - template void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat, framework::Tensor* sum, T scale_sum) { diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 35ca73802b..0bc09bdb35 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -139,11 +139,11 @@ class SimpleCode : public Code { template class CustomCode : public Code { public: - CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode, - const int64_t* ids, int index) + CustomCode(const framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids, int index) : ids_(ids), index_(index) { - ptable_ = ptable.Slice(index, index + 1); - pcode_ = pcode.Slice(index, index + 1); + ptable_ = path_table.Slice(index, index + 1); + pcode_ = path_code.Slice(index, index + 1); } /** * Here the id of root shoud be 1 rather than 0, thus the encoding of class c @@ -195,9 +195,9 @@ class SimpleCodeTable : public CodeTable { template class CustomCodeTable : public CodeTable { public: - CustomCodeTable(const framework::Tensor& ptable, - const framework::Tensor& pcode, const int64_t* ids) - : ptable_(ptable), pcode_(pcode), ids_(ids) {} + CustomCodeTable(const 
framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids) + : ptable_(path_table), pcode_(path_code), ids_(ids) {} std::unique_ptr get_code(int64_t code) const { std::unique_ptr coder(new CustomCode(ptable_, pcode_, ids_, code)); @@ -223,11 +223,11 @@ class MatrixBitCodeFunctor { ids_(ids), code_table_(new SimpleCodeTable(num_classes, ids)) {} - MatrixBitCodeFunctor(const framework::Tensor& ptable, - const framework::Tensor& pcode, const int64_t* ids) - : num_classes_(static_cast(ptable.dims()[1])), + MatrixBitCodeFunctor(const framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids) + : num_classes_(static_cast(path_table.dims()[1])), ids_(ids), - code_table_(new CustomCodeTable(ptable, pcode, ids)) {} + code_table_(new CustomCodeTable(path_table, path_code, ids)) {} /* For j < code_length tmat(i, j) += vec(0, index(i, j)) */ @@ -238,11 +238,6 @@ class MatrixBitCodeFunctor { */ void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec); - /* For selected rows For j < code_length - vec(0, index(i, j)) += tmat(i, j) - */ - void AddGrad(const framework::Tensor& tmat, framework::SelectedRows* vec); - /* For j < code_length sum(i, 0) = \sum_j bit(i, j) * tmat(i, j) */ diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 37ddfdf7d5..38dad85717 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4931,6 +4931,9 @@ def hsigmoid(input, pass weights = None + remote_prefetch = False + if os.environ.get('PADDLE_ENABLE_REMOTE_PREFETCH'): + remote_prefetch = True if not is_custom: weights = helper.create_parameter( @@ -4947,7 +4950,7 @@ def hsigmoid(input, inputs = { "X": input, "W": weights, - "PTable": path_table, + "PathTable": path_table, "PathCode": path_code, "Label": label } @@ -4970,9 +4973,13 @@ def hsigmoid(input, type="hierarchical_sigmoid", inputs=inputs, outputs={"Out": out, - "PreOut": pre_out}, - attrs={"num_classes": num_classes, - "is_sparse": is_sparse}) + "PreOut": pre_out, + "W_Out": weights}, + attrs={ + "num_classes": num_classes, + "is_sparse": is_sparse, + "remote_prefetch": remote_prefetch + }) return out @@ -7440,7 +7447,7 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None): Examples: - .. code-block:: python + .. 
code-block:: python x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") y = fluid.layers.brelu(x, t_min=1.0, t_max=20.0) diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 2a6c93f75f..8ed5074dc2 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -185,7 +185,7 @@ class TestHSigmoidOpSparse(OpTest): self.inputs = { 'X': x, 'W': w, - 'PTable': path_table, + 'PathTable': path_table, 'PathCode': path_code, 'Label': label, 'Bias': bias @@ -287,7 +287,7 @@ class TestHSigmoidOpWithCostumTree(OpTest): self.inputs = { 'X': x, 'W': w, - 'PTable': path_table, + 'PathTable': path_table, 'PathCode': path_code, 'Label': label, 'Bias': bias @@ -324,7 +324,7 @@ class TestHSigmoidOpWithCostumTreeWithoutBias(OpTest): self.inputs = { 'X': x, 'W': w, - 'PTable': path_table, + 'PathTable': path_table, 'PathCode': path_code, 'Label': label, } diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py new file mode 100644 index 0000000000..9ed6c94bd2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py @@ -0,0 +1,271 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
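# Shape of this test file, for orientation (all names are defined
# below): run_pserver starts a listen_and_serv server holding a 5x8
# 'table' parameter; each test discovers the server's port from
# /tmp/paddle.<pid>.port and then runs hierarchical_sigmoid with
# remote_prefetch=True against it:
#   p0 = self._start_pserver(0, False, True, run_pserver)
#   self._wait_ps_ready(p0.pid)
#   port0 = self._get_pserver_port(p0.pid)
#   self._run_hsigmoid_op_one_pserver(place, port0)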
+ +from __future__ import print_function + +import os +import signal +import time +import unittest +from multiprocessing import Process + +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.framework import Program, program_guard + + +def run_pserver(pserver_id, use_cuda, sync_mode): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + # create table parameter in scope + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + # create and initialize Param Variable + param = scope.var('table').get_tensor() + + param_array = np.ones((5, 8)).astype("float32") + for i in range(len(param_array)): + param_array[i] *= param_array[i] * i + pserver_id * 10 + 1 + param.set(param_array, place) + + optimize_block = program._create_block(program.global_block().idx) + program.global_block().append_op( + type="listen_and_serv", + inputs={'X': []}, + outputs={}, + attrs={ + "optimize_blocks": [optimize_block], + "endpoint": '127.0.0.1:0', + "Fanin": 1, + "sync_mode": True, + "grad_to_block_id": [] + }) + + exe = fluid.Executor(place) + exe.run(program) + + +class TestListenAndServOp(unittest.TestCase): + def setUp(self): + self.ps_timeout = 5 + + def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func): + p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode)) + p.daemon = True + p.start() + return p + + def _wait_ps_ready(self, pid): + start_left_time = self.ps_timeout + sleep_time = 0.5 + while True: + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. 
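#                 (os.stat below raises OSError until the server process
#                 writes its port file; the while loop turns that into a
#                 poll bounded by self.ps_timeout)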
+ os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + start_left_time -= sleep_time + + def _get_pserver_port(self, pid): + with open("/tmp/paddle.%d.port" % pid, 'r') as f: + port = int(f.read().strip()) + return port + + def _run_hsigmoid_op_one_pserver(self, place, port): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('X').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") * 2 + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") * 2 + param.set(param_array, place) + + path_table = scope.var('PathTable').get_tensor() + path_table_array = np.array( + [(0, 2, -1, -1, -1), (0, 1, 2, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, -1)]).astype( + "int64" + ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + path_table.set(path_table_array, place) + + path_code = scope.var('PathCode').get_tensor() + path_code_array = np.array( + [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), + (0, 1, -1, -1, -1)]).astype("int64") #np.array to store + path_code.set(path_code_array, place) + + label = scope.var('Label').get_tensor() + label_array = np.array([0, 1, 4, 5]) + label.set(label_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + out = scope.var('Out').get_tensor() + + pre_out = scope.var('PreOut').get_tensor + + w_out = scope.var('W_Out').get_tensor() + w_out.set(param_array, place) + + emaps = ['127.0.0.1:' + str(port)] + table_names = ['table'] + height_sections = [2] + + # create and run sgd operator + hsigmoid_op = Operator( + "hierarchical_sigmoid", + X='X', + W='W', + PathTable='PathTable', + PathCode='PathCode', + Label='Label', + Bias='Bias', + Out='Out', + PreOut='PreOut', + W_Out='W_Out', + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + + hsigmoid_op.run(scope, place) + + # get and compare result + result_array = np.array(w_out) + self.assertEqual(list(result_array.shape), [5, 8]) + correct = None + for i in range(5): + if i != 3: + correct = np.full((1, 8), i + 1).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + else: + correct = np.full((1, 8), 0).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + + def _run_hsigmoid_op_two_pserver(self, place, port0, port1): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('X').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") * 2 + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") * 2 + param.set(param_array, place) + + path_table = scope.var('PathTable').get_tensor() + path_table_array = np.array( + [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, -1)]).astype( + "int64" + ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + path_table.set(path_table_array, place) + + path_code = scope.var('PathCode').get_tensor() + path_code_array = np.array( + [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), + (0, 1, -1, -1, -1)]).astype("int64") #np.array to store + path_code.set(path_code_array, place) + + label = scope.var('Label').get_tensor() 
+ label_array = np.array([0, 1, 4, 5]) + label.set(label_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + out = scope.var('Out').get_tensor() + + pre_out = scope.var('PreOut').get_tensor + + w_out = scope.var('W_Out').get_tensor() + w_out.set(param_array, place) + + emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)] + table_names = ['table', 'table'] + height_sections = [2, 3] + + # create and run sgd operator + hsigmoid_op = Operator( + "hierarchical_sigmoid", + X='X', + W='W', + PathTable='PathTable', + PathCode='PathCode', + Label='Label', + Bias='Bias', + Out='Out', + PreOut='PreOut', + W_Out='W_Out', + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + hsigmoid_op.run(scope, place) + + # get and compare result + result_array = np.array(w_out) + self.assertEqual(list(result_array.shape), [5, 8]) + correct = None + for i in range(5): + if i < 2: + correct = np.full((1, 8), i + 1).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + else: + correct = np.full((1, 8), i + 9).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + + def test_hsigmoid_op_remote(self): + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + # run pserver on CPU in sync mode + p0 = self._start_pserver(0, False, True, run_pserver) + self._wait_ps_ready(p0.pid) + port0 = self._get_pserver_port(p0.pid) + + p1 = self._start_pserver(1, False, True, run_pserver) + self._wait_ps_ready(p1.pid) + port1 = self._get_pserver_port(p1.pid) + + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for place in places: + self._run_hsigmoid_op_one_pserver(place, port0) + self._run_hsigmoid_op_two_pserver(place, port0, port1) + + # raise SIGTERM to pserver + os.kill(p0.pid, signal.SIGINT) + p0.join() + os.kill(p1.pid, signal.SIGINT) + p1.join() + + +if __name__ == '__main__': + unittest.main() From 106e28523641ef6bdffe301b2a63b6d0f13de29a Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 12 Dec 2018 19:57:49 +0800 Subject: [PATCH 028/197] add unittest for parllelgraph mode test=develop --- .../details/multi_devices_graph_pass.cc | 8 +- .../details/parallel_ssa_graph_executor.cc | 20 +-- paddle/fluid/framework/parallel_executor.cc | 2 +- paddle/fluid/operators/reader/ctr_reader.h | 2 +- .../unittests/parallel_executor_test_base.py | 164 +++++++++--------- .../unittests/test_parallel_executor_crf.py | 3 + .../unittests/test_parallel_executor_mnist.py | 38 ++-- .../test_parallel_executor_seresnext.py | 49 ++++-- .../test_parallel_executor_transformer.py | 6 +- 9 files changed, 164 insertions(+), 128 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index c16e3006d7..e264906b57 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -300,7 +300,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; - // int num_trainers = Get(kNumTrainers); + int num_trainers = Get(kNumTrainers); for (auto &node : nodes) { if (node->IsVar() && node->Var()) { @@ -387,7 +387,11 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } // if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { - if (!is_forwarding && nccl_ctxs_->contexts_.size() > 1) { + // 
insert synchronous ops at the backpropagation; and + // insert synchronous ops if the graph contains mutilple places. + if (!is_forwarding && + (places_.size() > 1 || num_trainers > 1 || + (nccl_ctxs_ && nccl_ctxs_->contexts_.size() > 1))) { // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. if (static_cast(boost::get(node->Op()->GetAttr( diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index f1a07edf08..214c2f7625 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -49,18 +49,18 @@ FeedFetchList ParallelSSAGraphExecutor::Run( for (size_t i = 0; i < places_.size(); ++i) { auto call = [this, i, &fetch_tensors]() -> FeedFetchList { - return executors_[i]->Run(fetch_tensors); + try { + return executors_[i]->Run(fetch_tensors); + } catch (...) { + exception_holder_.Catch(std::current_exception()); + } + return FeedFetchList(); }; if (pool_) { run_futures.emplace_back(pool_->enqueue(std::move(call))); } else { - try { - fetch_datas.emplace_back(std::move(call())); - } catch (...) { - exception_holder_.Catch(std::current_exception()); - break; - } + call(); } } @@ -69,11 +69,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run( if (exception_holder_.IsCaught()) { f.wait(); } else { - try { - fetch_datas.emplace_back(std::move(f.get())); - } catch (...) { - exception_holder_.Catch(std::current_exception()); - } + fetch_datas.emplace_back(std::move(f.get())); } } } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b0cd1e8e90..8d35361eb6 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -87,7 +87,7 @@ ParallelExecutor::ParallelExecutor( "the number of places must be greater than 1."); PADDLE_ENFORCE(exec_strategy.type_ != ExecutionStrategy::kParallelGraph, "You should set build_strategy.reduce with 'AllReduce' for " - "ParallelGraph executor type"); + "the ParallelGraph executor type"); } // Step 1. Bcast the params to devs. 
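
Note on the API at this point in the series: the hunk above rejects the new kParallelGraph executor type unless build_strategy.reduce stays 'AllReduce' (a CUDA-only check is added a few patches later). A minimal sketch of how this enum-based interface is driven from Python, matching the test helpers further down; the toy network, its variable names, and the batch size are illustrative only:

    import numpy as np
    import paddle.fluid as fluid

    # The enum is exposed on ExecutionStrategy instances, as the unit tests below do it.
    ExecutorType = fluid.ExecutionStrategy().ExecutorType

    img = fluid.layers.data(name='img', shape=[784], dtype='float32')
    loss = fluid.layers.mean(fluid.layers.fc(input=img, size=10))
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

    place = fluid.CUDAPlace(0)
    fluid.Executor(place).run(fluid.default_startup_program())

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.executor_type = ExecutorType.ParallelGraph  # one graph and one thread per GPU

    build_strategy = fluid.BuildStrategy()
    # ParallelGraph is only accepted together with the (default) AllReduce reduce
    # strategy, per the PADDLE_ENFORCE in the hunk above.
    build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

    pe = fluid.ParallelExecutor(
        use_cuda=True,
        loss_name=loss.name,
        exec_strategy=exec_strategy,
        build_strategy=build_strategy)
    loss_val, = pe.run(feed={'img': np.random.random((32, 784)).astype('float32')},
                       fetch_list=[loss.name])
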
diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 9b2a11bae1..517d669744 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -48,7 +48,7 @@ void MonitorThread(std::vector* thread_status, class CTRReader : public framework::FileReader { public: explicit CTRReader(const std::shared_ptr& queue, - int batch_size, int thread_num, + int batch_size, size_t thread_num, const std::vector& slots, const std::vector& file_list) : batch_size_(batch_size), slots_(slots), file_list_(file_list) { diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 86f861674c..73b8fb74fa 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -26,23 +26,26 @@ import sys __all__ = ['TestParallelExecutorBase'] +ExecutorType = fluid.ExecutionStrategy().ExecutorType + class TestParallelExecutorBase(unittest.TestCase): - def check_network_convergence(self, - method, - use_cuda=True, - memory_opt=True, - iter=50, - batch_size=None, - allow_op_delay=False, - feed_dict=None, - seed=None, - use_parallel_executor=True, - use_reduce=False, - fuse_elewise_add_act_ops=False, - optimizer=fluid.optimizer.Adam, - use_fast_executor=False, - enable_sequential_execution=False): + def check_network_convergence( + self, + method, + use_cuda=True, + memory_opt=True, + iter=50, + batch_size=None, + allow_op_delay=False, + feed_dict=None, + seed=None, + use_parallel_executor=True, + use_reduce=False, + fuse_elewise_add_act_ops=False, + optimizer=fluid.optimizer.Adam, + exec_type=fluid.ExecutionStrategy().ExecutorType.Default, + enable_sequential_execution=False): def run_executor(exe, feed, fetch_list, program=None): if isinstance(exe, fluid.ParallelExecutor): res = exe.run(fetch_list=fetch_list, feed=feed) @@ -58,68 +61,69 @@ class TestParallelExecutorBase(unittest.TestCase): startup = fluid.Program() startup.random_seed = 1 # Fix random seed main.random_seed = 1 - with fluid.program_guard(main, startup): - if seed is not None: - startup.random_seed = seed - main.random_seed = seed - - loss = method(use_feed=feed_dict is not None) - - optimizer().minimize(loss) - - if memory_opt: - fluid.memory_optimize(main) - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - startup_exe = fluid.Executor(place) - startup_exe.run(startup) - exec_strategy = fluid.ExecutionStrategy() - exec_strategy.allow_op_delay = allow_op_delay - if use_fast_executor: - exec_strategy.use_experimental_executor = True - - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ - if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce - build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops - build_strategy.enable_sequential_execution = enable_sequential_execution - if use_cuda and core.is_compiled_with_cuda(): - build_strategy.remove_unnecessary_lock = True - - if use_parallel_executor: - exe = fluid.ParallelExecutor( - use_cuda, - loss_name=loss.name, - exec_strategy=exec_strategy, - build_strategy=build_strategy) - else: - exe = fluid.Executor(place=place) - - if batch_size is not None: - batch_size *= fluid.core.get_cuda_device_count( - ) if use_cuda else int( - os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - begin = time.time() - first_loss, = run_executor( - 
exe=exe, feed=feed_dict, fetch_list=[loss.name]) - - for i in range(iter): - run_executor(exe=exe, feed=feed_dict, fetch_list=[]) - - last_loss, = run_executor( - exe=exe, feed=feed_dict, fetch_list=[loss.name]) - end = time.time() - - if batch_size is not None: - print("%.4f Instance per second" % ( - (batch_size * iter + 2) / (end - begin))) - - avg_last_loss_val = np.array(last_loss).mean() - avg_first_loss_val = np.array(first_loss).mean() - if math.isnan(float(avg_last_loss_val)) or math.isnan( - float(avg_first_loss_val)): - sys.exit("got NaN loss, training failed.") - - print(first_loss, last_loss) - # self.assertGreater(first_loss[0], last_loss[0]) - return first_loss, last_loss + scope = fluid.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(main, startup): + if seed is not None: + startup.random_seed = seed + main.random_seed = seed + + loss = method(use_feed=feed_dict is not None) + + optimizer().minimize(loss) + + if memory_opt: + fluid.memory_optimize(main) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + startup_exe = fluid.Executor(place) + startup_exe.run(startup) + exec_strategy = fluid.ExecutionStrategy() + exec_strategy.allow_op_delay = allow_op_delay + exec_strategy.executor_type = exec_type + + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ + if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce + build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops + build_strategy.enable_sequential_execution = enable_sequential_execution + if use_cuda and core.is_compiled_with_cuda(): + build_strategy.remove_unnecessary_lock = True + + if use_parallel_executor: + exe = fluid.ParallelExecutor( + use_cuda, + loss_name=loss.name, + exec_strategy=exec_strategy, + build_strategy=build_strategy) + else: + exe = fluid.Executor(place=place) + + if batch_size is not None: + batch_size *= fluid.core.get_cuda_device_count( + ) if use_cuda else int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + begin = time.time() + first_loss, = run_executor( + exe=exe, feed=feed_dict, fetch_list=[loss.name]) + + for i in range(iter): + run_executor(exe=exe, feed=feed_dict, fetch_list=[]) + + last_loss, = run_executor( + exe=exe, feed=feed_dict, fetch_list=[loss.name]) + end = time.time() + + if batch_size is not None: + print("%.4f Instance per second" % ( + (batch_size * iter + 2) / (end - begin))) + + avg_last_loss_val = np.array(last_loss).mean() + avg_first_loss_val = np.array(first_loss).mean() + if math.isnan(float(avg_last_loss_val)) or math.isnan( + float(avg_first_loss_val)): + sys.exit("got NaN loss, training failed.") + + print(first_loss, last_loss) + # self.assertGreater(first_loss[0], last_loss[0]) + return first_loss, last_loss diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index 84b0aad8ac..d75761153c 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -181,6 +181,9 @@ class TestCRFModel(unittest.TestCase): if core.is_compiled_with_cuda(): self.check_network_convergence( is_sparse=True, build_strategy=build_strategy, use_cuda=True) + self.check_network_convergence( + is_sparse=True, build_strategy=build_strategy, use_cuda=True) + self.check_network_convergence( is_sparse=True, build_strategy=build_strategy, use_cuda=False) diff --git 
a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 3eecc46701..3dddff0d99 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -20,7 +20,7 @@ import numpy as np import paddle.fluid.core as core import os import paddle.fluid as fluid -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, ExecutorType def simple_fc_net(use_feed): @@ -99,7 +99,10 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEqual(loss[0], loss[1], delta=1e-4) # simple_fc - def check_simple_fc_convergence(self, use_cuda, use_reduce=False): + def check_simple_fc_convergence(self, + use_cuda, + use_reduce=False, + exec_type=ExecutorType.Default): if use_cuda and not core.is_compiled_with_cuda(): return @@ -110,19 +113,21 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - use_reduce=use_reduce) + use_reduce=use_reduce, + exec_type=exec_type) def test_simple_fc(self): # use_cuda - self.check_simple_fc_convergence(True) + self.check_simple_fc_convergence(True, ExecutorType.Default) + self.check_simple_fc_convergence(True, ExecutorType.ParallelGraph) self.check_simple_fc_convergence(False) def test_simple_fc_with_new_strategy(self): - # use_cuda, use_reduce + # use_cuda, use_reducea self._compare_reduce_and_allreduce(simple_fc_net, True) self._compare_reduce_and_allreduce(simple_fc_net, False) - def check_simple_fc_parallel_accuracy(self, use_cuda): + def check_simple_fc_parallel_accuracy(self, use_cuda, exec_type): if use_cuda and not core.is_compiled_with_cuda(): return @@ -134,14 +139,16 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - use_parallel_executor=False) + use_parallel_executor=False, + exec_type=exec_type) parallel_first_loss, parallel_last_loss = self.check_network_convergence( method=simple_fc_net, seed=1, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - use_parallel_executor=True) + use_parallel_executor=True, + exec_type=exec_type) self.assertAlmostEquals( np.mean(parallel_first_loss), @@ -151,10 +158,12 @@ class TestMNIST(TestParallelExecutorBase): np.mean(parallel_last_loss), single_last_loss, delta=1e-6) def test_simple_fc_parallel_accuracy(self): - self.check_simple_fc_parallel_accuracy(True) - self.check_simple_fc_parallel_accuracy(False) + self.check_simple_fc_parallel_accuracy(True, ExecutorType.Default) + self.check_simple_fc_parallel_accuracy(True, ExecutorType.ParallelGraph) + # FIXME(Yancey1989): ParallelGraph executor type support CPU mode + self.check_simple_fc_parallel_accuracy(False, ExecutorType.Default) - def check_batchnorm_fc_convergence(self, use_cuda, use_fast_executor): + def check_batchnorm_fc_convergence(self, use_cuda, exec_type): if use_cuda and not core.is_compiled_with_cuda(): return @@ -165,12 +174,13 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - use_fast_executor=use_fast_executor) + exec_type=exec_type) def test_batchnorm_fc(self): for use_cuda in (False, True): - for use_fast_executor in (False, True): - self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor) + for exec_type in (ExecutorType.Default, ExecutorType.Experimental, + ExecutorType.ParallelGraph): + 
self.check_batchnorm_fc_convergence(use_cuda, exec_type) def test_batchnorm_fc_with_new_strategy(self): # FIXME(zcd): close this test temporally. diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index e7a56bb638..bada38894f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -19,7 +19,7 @@ import paddle.fluid.layers.ops as ops from paddle.fluid.initializer import init_on_cpu from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter import paddle.fluid.core as core -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, ExecutorType import unittest import math import os @@ -167,13 +167,17 @@ def cosine_decay(learning_rate, step_each_epoch, epochs=120): return decayed_lr -def optimizer(learning_rate=0.01): - optimizer = fluid.optimizer.Momentum( - learning_rate=cosine_decay( - learning_rate=learning_rate, step_each_epoch=2, epochs=1), - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) - return optimizer +def optimizer(learning_rate=0.01, lr_scale=1.0): + def _opt(): + return fluid.optimizer.Momentum( + learning_rate=cosine_decay( + learning_rate=learning_rate / lr_scale, + step_each_epoch=2, + epochs=1), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + + return _opt class TestResnet(TestParallelExecutorBase): @@ -216,7 +220,7 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=False, - optimizer=optimizer) + optimizer=optimizer()) reduce_first_loss, reduce_last_loss = self.check_network_convergence( model, feed_dict={"image": img, @@ -225,7 +229,7 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=True, - optimizer=optimizer) + optimizer=optimizer()) for loss in zip(all_reduce_first_loss, reduce_first_loss): self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) @@ -243,7 +247,7 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=False, - optimizer=optimizer, + optimizer=optimizer(), enable_sequential_execution=True) reduce_first_loss_seq, reduce_last_loss_seq = self.check_network_convergence( @@ -254,7 +258,7 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=True, - optimizer=optimizer, + optimizer=optimizer(), enable_sequential_execution=True) for loss in zip(all_reduce_first_loss, all_reduce_first_loss_seq): @@ -277,7 +281,9 @@ class TestResnet(TestParallelExecutorBase): use_cuda=True, use_reduce=False, iter=20, - delta2=1e-6): + delta2=1e-6, + exec_type=ExecutorType.Default, + lr_scale=1.0): if use_cuda and not core.is_compiled_with_cuda(): return @@ -295,8 +301,9 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=use_reduce, - optimizer=optimizer, - use_parallel_executor=False) + optimizer=optimizer(), + use_parallel_executor=False, + exec_type=exec_type) parallel_first_loss, parallel_last_loss = self.check_network_convergence( model, feed_dict={"image": img, @@ -305,7 +312,8 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=use_reduce, - optimizer=optimizer) + optimizer=optimizer(lr_scale=lr_scale), + exec_type=exec_type) self.assertAlmostEquals( 
np.mean(parallel_first_loss), single_first_loss[0], delta=1e-6) @@ -313,7 +321,14 @@ class TestResnet(TestParallelExecutorBase): np.mean(parallel_last_loss), single_last_loss[0], delta=delta2) def test_seresnext_with_learning_rate_decay(self): - self._check_resnet_convergence(model=SE_ResNeXt50Small, use_cuda=True) + if core.is_compiled_with_cuda(): + self._check_resnet_convergence( + model=SE_ResNeXt50Small, use_cuda=True) + self._check_resnet_convergence( + model=SE_ResNeXt50Small, + use_cuda=True, + exec_type=ExecutorType.ParallelGraph, + lr_scale=core.get_cuda_device_count()) self._check_resnet_convergence( model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index 3827743908..b5ee72a24e 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -17,7 +17,7 @@ from __future__ import print_function import paddle.fluid as fluid import transformer_model import numpy as np -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, ExecutorType import unittest import paddle import paddle.fluid.core as core @@ -173,6 +173,10 @@ class TestTransformer(TestParallelExecutorBase): def test_main(self): if core.is_compiled_with_cuda(): self.check_network_convergence(transformer, use_cuda=True) + self.check_network_convergence( + transformer, + use_cuda=True, + exec_type=ExecutorType.ParallelGraph) self.check_network_convergence( transformer, use_cuda=True, enable_sequential_execution=True) self.check_network_convergence(transformer, use_cuda=False, iter=5) From 23eb8c4299ce9908d07505df413c4a2b79f14d32 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 13 Dec 2018 14:02:15 +0800 Subject: [PATCH 029/197] fix ci test=develop --- .../framework/details/multi_devices_graph_pass.cc | 10 +++++++--- paddle/fluid/operators/reader/ctr_reader.h | 2 +- paddle/fluid/pybind/pybind.cc | 13 ++++++++++++- .../unittests/test_parallel_executor_dry_run.py | 10 ++++++---- 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index e264906b57..6c4e0e9168 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -386,12 +386,16 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( CreateComputationalOps(&result, node, places_.size()); } - // if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { - // insert synchronous ops at the backpropagation; and - // insert synchronous ops if the graph contains mutilple places. +// insert synchronous ops at the backpropagation; and +// insert synchronous ops if the graph contains mutilple places. + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (!is_forwarding && (places_.size() > 1 || num_trainers > 1 || (nccl_ctxs_ && nccl_ctxs_->contexts_.size() > 1))) { +#else + if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { +#endif // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. 
if (static_cast(boost::get(node->Op()->GetAttr( diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 517d669744..635483158f 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -95,7 +95,7 @@ class CTRReader : public framework::FileReader { queue_->ReOpen(); VLOG(3) << "reopen success"; VLOG(3) << "thread_num " << thread_num_; - for (int thread_id = 0; thread_id < thread_num_; thread_id++) { + for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) { read_threads_.emplace_back(new std::thread( std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_, thread_id, &read_thread_status_, queue_))); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 9cebdda693..3beb93e7b3 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -789,7 +789,18 @@ All parameter, weight, gradient are variables in Paddle. [](ExecutionStrategy &self, ExecutionStrategy::ExecutorType type) { self.type_ = type; }, - R"DOC()DOC"); + R"DOC(The type is ExecutorType which is the enum ranging from Default, +ParallelGraph and Experiment: + +Default: Compile the main_program into a multi-devices graph, + and execute this graph on multi-devices with multiple threads which + specified by build_strategy.num_threads. +ParallelGraph: Compile the main_program into multiple graphs, and execute each of the graphs on one + device with one thread. Please note, this mode only supports all-reduce mode and use_cuda=True. + This approach can achieve better performance in some scenarios. +Experimental: Compile the main_program into a multi-devices graph, + and executor this graph with a faster execution mode than the Default, + this approach is on the experiments.)DOC"); py::class_ build_strategy(pe, "BuildStrategy", R"DOC( BuildStrategy allows the user to more preciously control how to diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py index 18d95c94ad..eff76ce0d4 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py @@ -17,6 +17,8 @@ import unittest import logging import six +ExecutorType = fluid.ExecutionStrategy().ExecutorType + class TestBase(unittest.TestCase): def main(self, @@ -24,7 +26,7 @@ class TestBase(unittest.TestCase): iter=10, iter_per_pe=10, use_gpu=True, - use_experimental_executor=False): + exec_type=ExecutorType.Default): if use_gpu and not fluid.core.is_compiled_with_cuda(): logging.warning( "Paddle is not compiled with CUDA, skip GPU unittests") @@ -43,7 +45,7 @@ class TestBase(unittest.TestCase): for _ in six.moves.xrange(iter): exe_strategy = fluid.ExecutionStrategy() exe_strategy._dry_run = True - exe_strategy.use_experimental_executor = use_experimental_executor + exe_strategy.executor_type = exec_type pe = fluid.ParallelExecutor( use_cuda=use_gpu, loss_name=loss.name, @@ -56,11 +58,11 @@ class TestBase(unittest.TestCase): class TestMNISTDryRun(TestBase): def test_mnist_dry_run(self): for use_gpu in (False, True): - for use_experimental_executor in (False, True): + for exec_type in (ExecutorType.Default, ExecutorType.Experimental): self.main( network_func=TestMNISTDryRun.network_func, use_gpu=use_gpu, - use_experimental_executor=use_experimental_executor) + exec_type=exec_type) @staticmethod def network_func(): From 
affdd70976f1ad2a2de959ceb082b95791961d91 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 13 Dec 2018 14:26:52 +0800 Subject: [PATCH 030/197] fix api.spec test=develop --- paddle/fluid/API.spec | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 2722ea078e..db7b0d34a3 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -27,6 +27,7 @@ paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None +paddle.fluid.ExecutionStrategy.ExecutorType.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy.ExecutorType, arg0: int) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None From 1bac8f918c9fca90db7e95dd2f27d7946ffebc40 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 13 Dec 2018 15:00:20 +0800 Subject: [PATCH 031/197] fix api.spec test=develop --- paddle/fluid/API.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index db7b0d34a3..6c6026911b 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -26,8 +26,8 @@ paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], vara paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) -paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None paddle.fluid.ExecutionStrategy.ExecutorType.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy.ExecutorType, arg0: int) -> None +paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None From 4f304eaa6fcdc5af93a4878a09387f4d7fbd5aed Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 13 Dec 2018 19:46:35 +0800 Subject: [PATCH 032/197] 
fix unittest test=develop --- paddle/fluid/framework/parallel_executor.cc | 14 +++++++++++--- .../unittests/test_parallel_executor_mnist.py | 2 ++ .../test_parallel_executor_transformer.py | 4 ---- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 81d1024cb6..2604e41045 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -197,9 +197,17 @@ ParallelExecutor::ParallelExecutor( PADDLE_ENFORCE(places.size() > 1, "If you set build_strategy.reduce with 'Reduce'," "the number of places must be greater than 1."); - PADDLE_ENFORCE(exec_strategy.type_ != ExecutionStrategy::kParallelGraph, - "You should set build_strategy.reduce with 'AllReduce' for " - "the ParallelGraph executor type"); + } + + if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + PADDLE_ENFORCE( + member_->use_all_reduce_, + "build_strategy.reduce should be `AllReduce` if you want to use" + "ParallelGraph executor."); + PADDLE_ENFORCE( + member_->use_cuda_, + "execution_strategy.use_cuda should be True if you want to use" + "ParallelGraph executor."); } // Step 1. Bcast the params to devs. diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 3dddff0d99..0ff079b4e2 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -166,6 +166,8 @@ class TestMNIST(TestParallelExecutorBase): def check_batchnorm_fc_convergence(self, use_cuda, exec_type): if use_cuda and not core.is_compiled_with_cuda(): return + if not use_cuda and exec_type == ExecutorType.ParallelGraph: + return img, label = self._init_data() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index b5ee72a24e..8a1a3ab3ca 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -173,10 +173,6 @@ class TestTransformer(TestParallelExecutorBase): def test_main(self): if core.is_compiled_with_cuda(): self.check_network_convergence(transformer, use_cuda=True) - self.check_network_convergence( - transformer, - use_cuda=True, - exec_type=ExecutorType.ParallelGraph) self.check_network_convergence( transformer, use_cuda=True, enable_sequential_execution=True) self.check_network_convergence(transformer, use_cuda=False, iter=5) From f702ab74b9edfe6310470ad1ad98ae054f3120fc Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Fri, 14 Dec 2018 07:36:45 +0000 Subject: [PATCH 033/197] add dist transpiler test --- .../tests/unittests/test_dist_transpiler.py | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 650a745cdc..27575897b5 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -875,5 +875,53 @@ class TestRemoteNce(TestDistLookupTableBase): pass +# test for remote prefetch +class TestRemoteHsigmoid(TestDistLookupTableBase): + def network_with_table(self, is_sparse, is_distributed): + + num_total_classes = 10 + + input = fluid.layers.data(name="input", shape=[10], 
dtype="float32")
+        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+        path_table = fluid.layers.data(
+            name='path_table', shape=[10], dtype='int64')
+        path_code = fluid.layers.data(
+            name='path_code', shape=[10], dtype='int64')
+        w_param = fluid.default_main_program().global_block().create_parameter(
+            shape=[num_total_classes, 10],
+            dtype='float32',
+            name='hs_w',
+            initializer=fluid.initializer.ConstantInitializer())
+        b_param = fluid.default_main_program().global_block().create_parameter(
+            shape=[num_total_classes, 1],
+            dtype='float32',
+            name='hs_b',
+            initializer=fluid.initializer.ConstantInitializer())
+
+        cost = fluid.layers.hsigmoid(
+            input=input,
+            label=label,
+            num_classes=num_total_classes,
+            path_table=path_table,
+            path_code=path_code,
+            is_custom=True,
+            is_sparse=is_sparse)
+        avg_cost = fluid.layers.mean(cost)
+        # optimizer
+        optimizer = fluid.optimizer.SGD(learning_rate=0.003)
+        optimizer.minimize(avg_cost)
+
+    def net_conf(self):
+        import os
+        os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
+        self.network_with_table(is_sparse=True, is_distributed=False)
+
+    def transpiler_test_impl(self):
+        trainer, _ = self.get_trainer()
+        for op in trainer.blocks[0].ops:
+            if op.type == "recv":
+                pass
+
+
 if __name__ == "__main__":
     unittest.main()

From 4a4ccac1d060ccf5758b7ff0d32dfb90ab3c5b7f Mon Sep 17 00:00:00 2001
From: Yancey1989
Date: Fri, 14 Dec 2018 15:53:13 +0800
Subject: [PATCH 034/197] update by comment test=develop

---
 .../framework/details/all_reduce_op_handle.cc      | 14 ++++++--------
 .../framework/details/multi_devices_graph_pass.cc  |  4 ++--
 paddle/fluid/framework/details/op_handle_base.cc   |  1 +
 .../details/threaded_ssa_graph_executor.cc         |  1 +
 paddle/fluid/framework/parallel_executor.cc        | 14 ++++++++++----
 paddle/fluid/framework/threadpool.h                |  1 +
 .../reader/create_double_buffer_reader_op.cc       |  1 +
 paddle/fluid/platform/nccl_helper.h                |  5 +----
 .../unittests/test_parallel_executor_mnist.py      |  2 +-
 9 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 6b7bbf9003..5a4f218077 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -107,22 +107,20 @@ void AllReduceOpHandle::RunImpl() {
         PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
             buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
             comm, stream));
-        if (!nccl_ctxs_->need_group_call_) cudaStreamSynchronize(stream);
+        // TODO(Yancey1989): synchronizing here can give better performance
+        // if we don't use the NCCL group call, but this needs more profiling.
+        if (local_scopes_.size() == 1UL) cudaStreamSynchronize(stream);
       });
     }
 
     this->RunAndRecordEvent([&] {
-      // TODO(Yancey1989): need allreduce operator to avoid this flag
-      if (nccl_ctxs_->need_group_call_) {
+      if (all_reduce_calls.size() == 1UL) {
+        all_reduce_calls[0]();
+      } else {
         platform::NCCLGroupGuard guard;
         for (auto &call : all_reduce_calls) {
           call();
         }
-      } else {
-        // only used in executor_type == ParallalGraph, one thread one GPU
-        // TODO(Yancey1989): use allreduce operator to avoid this tricky.
- PADDLE_ENFORCE(all_reduce_calls.size() == 1UL); - all_reduce_calls[0](); } }); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 6e8cf86fcc..5b82805ad9 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -386,8 +386,8 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( CreateComputationalOps(&result, node, places_.size()); } -// insert synchronous ops at the backpropagation; and -// insert synchronous ops if the graph contains mutilple places. +// insert collective ops at the backpropagation; and +// insert collective ops if the graph contains mutilple places. #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (!is_forwarding && diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 4914e0a5ad..4822627ac3 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -52,6 +52,7 @@ void OpHandleBase::Run(bool use_cuda) { #else PADDLE_ENFORCE(!use_cuda); #endif + RunImpl(); } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index cebf63364d..677a293794 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -216,6 +216,7 @@ void ThreadedSSAGraphExecutor::RunOp( if (LIKELY(!strategy_.dry_run_)) { op->Run(strategy_.use_cuda_); } + VLOG(10) << op << " " << op->Name() << " Done "; running_ops_--; ready_var_q->Extend(op->Outputs()); VLOG(10) << op << " " << op->Name() << "Signal posted"; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 2604e41045..63f3ef0eac 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -231,7 +231,6 @@ ParallelExecutor::ParallelExecutor( #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); ncclUniqueId *nccl_id = nullptr; - bool need_group_call = true; if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { // parallel graph mode should initialize nccl by ncclCommInitRank since // it call nccl operator per device per thread. @@ -243,17 +242,16 @@ ParallelExecutor::ParallelExecutor( } else { nccl_id = nccl_id_var->GetMutable(); } - need_group_call = false; } else if (nccl_id_var != nullptr) { // the other executor type. // the distributed training with nccl mode would initialize the nccl id in // startup_program. nccl_id = nccl_id_var->GetMutable(); } else { - // initlize NCCL by ncclCommInitAll, do not need nccl_id. + // initlize NCCL by ncclCommInitAll, do not need to intialize the nccl_id. } member_->nccl_ctxs_.reset(new platform::NCCLContextMap( - member_->places_, nccl_id, num_trainers, trainer_id, need_group_call)); + member_->places_, nccl_id, num_trainers, trainer_id)); #else PADDLE_THROW("Not compiled with CUDA"); #endif @@ -288,6 +286,14 @@ ParallelExecutor::ParallelExecutor( graphs.push_back(std::move(graph)); #endif + auto max_memory_size = GetEagerDeletionThreshold(); + // TODO(Yancey1989): fix gc failed on ParallelGraph executor. 
+ if (max_memory_size >= 0 && + exec_strategy.type_ != ExecutionStrategy::kParallelGraph) { + graphs[0] = member_->PrepareGCAndRefCnts( + std::move(graphs[0]), static_cast(max_memory_size)); + } + // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars std::vector var_infos; diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 8fd834be9a..7a51d18fbb 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -27,6 +27,7 @@ limitations under the License. */ namespace paddle { namespace framework { + struct ExceptionHandler { mutable std::future> future_; explicit ExceptionHandler( diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 440b16cf91..ed719f91d0 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -46,6 +46,7 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { sin >> num; place = platform::CUDAPlace(static_cast(num)); } + out->Reset(framework::MakeDecoratedReader(underlying_reader, place, 2)); } diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 23a0222239..8d062dcdb4 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -82,15 +82,12 @@ struct NCCLContext { struct NCCLContextMap { std::unordered_map contexts_; std::vector order_; - bool need_group_call_; explicit NCCLContextMap(const std::vector &places, ncclUniqueId *nccl_id = nullptr, - size_t num_trainers = 1, size_t trainer_id = 0, - bool need_group_call = true) { + size_t num_trainers = 1, size_t trainer_id = 0) { PADDLE_ENFORCE(!places.empty()); order_.reserve(places.size()); - need_group_call_ = need_group_call; for (auto &p : places) { int dev_id = boost::get(p).device; order_.emplace_back(dev_id); diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 0ff079b4e2..fffe8bee58 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -123,7 +123,7 @@ class TestMNIST(TestParallelExecutorBase): self.check_simple_fc_convergence(False) def test_simple_fc_with_new_strategy(self): - # use_cuda, use_reducea + # use_cuda, use_reduce self._compare_reduce_and_allreduce(simple_fc_net, True) self._compare_reduce_and_allreduce(simple_fc_net, False) From 723f68727db273902674e6046ead5f0ebdb78bf4 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 14 Dec 2018 17:00:48 +0800 Subject: [PATCH 035/197] add ut about nce in transpiler --- .../fluid/tests/unittests/test_dist_transpiler.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 650a745cdc..8abd7d9e0c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -870,9 +870,21 @@ class TestRemoteNce(TestDistLookupTableBase): def transpiler_test_impl(self): trainer, _ = self.get_trainer() + + out_vars = ["nce_w.block0", "nce_w.block1"] + in_vars = ["nce_b.block0", "nce_b.block1"] + + recv_var_names = [] + for op in trainer.blocks[0].ops: if 
op.type == "recv": - pass + for var in op.output("Out"): + recv_var_names.append(var) + + for out_var in out_vars: + self.assertFalse(out_var in recv_var_names) + for in_var in in_vars: + self.assertTrue(in_var in recv_var_names) if __name__ == "__main__": From e196fa367bc6087f08bfce44bdc194ed426c69cf Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 17 Dec 2018 10:52:05 +0800 Subject: [PATCH 036/197] update ut, test=develop --- .../unittests/test_nce_remote_table_op.py | 271 ++++++++++++++++++ 1 file changed, 271 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py diff --git a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py new file mode 100644 index 0000000000..f08b270d89 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py @@ -0,0 +1,271 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import signal +import time +import unittest +from multiprocessing import Process + +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.framework import Program, program_guard + + +def run_pserver(pserver_id, use_cuda, sync_mode): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + # create table parameter in scope + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + # create and initialize Param Variable + param = scope.var('table').get_tensor() + + param_array = np.ones((5, 8)).astype("float32") + for i in range(len(param_array)): + param_array[i] *= param_array[i] * i + pserver_id * 10 + 1 + param.set(param_array, place) + + optimize_block = program._create_block(program.global_block().idx) + program.global_block().append_op( + type="listen_and_serv", + inputs={'X': []}, + outputs={}, + attrs={ + "optimize_blocks": [optimize_block], + "endpoint": '127.0.0.1:0', + "Fanin": 1, + "sync_mode": True, + "grad_to_block_id": [] + }) + + exe = fluid.Executor(place) + exe.run(program) + + +class TestListenAndServOp(unittest.TestCase): + def setUp(self): + self.ps_timeout = 5 + + def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func): + p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode)) + p.daemon = True + p.start() + return p + + def _wait_ps_ready(self, pid): + start_left_time = self.ps_timeout + sleep_time = 0.5 + while True: + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. 
+ os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + start_left_time -= sleep_time + + def _get_pserver_port(self, pid): + with open("/tmp/paddle.%d.port" % pid, 'r') as f: + port = int(f.read().strip()) + return port + + def _run_nce_op_one_pserver(self, place, port): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('X').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") * 2 + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") * 2 + param.set(param_array, place) + + path_table = scope.var('PathTable').get_tensor() + path_table_array = np.array( + [(0, 2, -1, -1, -1), (0, 1, 2, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, -1)]).astype( + "int64" + ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + path_table.set(path_table_array, place) + + path_code = scope.var('PathCode').get_tensor() + path_code_array = np.array( + [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), + (0, 1, -1, -1, -1)]).astype("int64") #np.array to store + path_code.set(path_code_array, place) + + label = scope.var('Label').get_tensor() + label_array = np.array([0, 1, 4, 5]) + label.set(label_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + out = scope.var('Out').get_tensor() + + pre_out = scope.var('PreOut').get_tensor + + w_out = scope.var('W_Out').get_tensor() + w_out.set(param_array, place) + + emaps = ['127.0.0.1:' + str(port)] + table_names = ['table'] + height_sections = [2] + + # create and run sgd operator + hsigmoid_op = Operator( + "hierarchical_sigmoid", + X='X', + W='W', + PathTable='PathTable', + PathCode='PathCode', + Label='Label', + Bias='Bias', + Out='Out', + PreOut='PreOut', + W_Out='W_Out', + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + + hsigmoid_op.run(scope, place) + + # get and compare result + result_array = np.array(w_out) + self.assertEqual(list(result_array.shape), [5, 8]) + correct = None + for i in range(5): + if i != 3: + correct = np.full((1, 8), i + 1).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + else: + correct = np.full((1, 8), 0).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + + def _run_nce_op_two_pserver(self, place, port0, port1): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('X').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") * 2 + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") * 2 + param.set(param_array, place) + + path_table = scope.var('PathTable').get_tensor() + path_table_array = np.array( + [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, -1)]).astype( + "int64" + ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + path_table.set(path_table_array, place) + + path_code = scope.var('PathCode').get_tensor() + path_code_array = np.array( + [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), + (0, 1, -1, -1, -1)]).astype("int64") #np.array to store + path_code.set(path_code_array, place) + + label = scope.var('Label').get_tensor() + 
label_array = np.array([0, 1, 4, 5]) + label.set(label_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + out = scope.var('Out').get_tensor() + + pre_out = scope.var('PreOut').get_tensor + + w_out = scope.var('W_Out').get_tensor() + w_out.set(param_array, place) + + emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)] + table_names = ['table', 'table'] + height_sections = [2, 3] + + # create and run sgd operator + hsigmoid_op = Operator( + "hierarchical_sigmoid", + X='X', + W='W', + PathTable='PathTable', + PathCode='PathCode', + Label='Label', + Bias='Bias', + Out='Out', + PreOut='PreOut', + W_Out='W_Out', + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + hsigmoid_op.run(scope, place) + + # get and compare result + result_array = np.array(w_out) + self.assertEqual(list(result_array.shape), [5, 8]) + correct = None + for i in range(5): + if i < 2: + correct = np.full((1, 8), i + 1).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + else: + correct = np.full((1, 8), i + 9).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + + def test_nce_op_remote(self): + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + # run pserver on CPU in sync mode + p0 = self._start_pserver(0, False, True, run_pserver) + self._wait_ps_ready(p0.pid) + port0 = self._get_pserver_port(p0.pid) + + p1 = self._start_pserver(1, False, True, run_pserver) + self._wait_ps_ready(p1.pid) + port1 = self._get_pserver_port(p1.pid) + + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for place in places: + self._run_nce_op_one_pserver(place, port0) + self._run_nce_op_two_pserver(place, port0, port1) + + # raise SIGTERM to pserver + os.kill(p0.pid, signal.SIGINT) + p0.join() + os.kill(p1.pid, signal.SIGINT) + p1.join() + + +if __name__ == '__main__': + unittest.main() From fd144954ed06128bab0b8b99cdb6722cc52881ba Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 17 Dec 2018 13:13:53 +0800 Subject: [PATCH 037/197] redefine api test=develop --- paddle/fluid/API.spec | 1 - .../fluid/framework/details/build_strategy.cc | 4 +- .../fluid/framework/details/build_strategy.h | 2 + .../framework/details/execution_strategy.h | 2 +- .../details/parallel_ssa_graph_executor.cc | 1 - paddle/fluid/framework/ir/node.h | 1 - paddle/fluid/framework/parallel_executor.cc | 29 +++++----- paddle/fluid/pybind/pybind.cc | 43 +++++++------- .../unittests/parallel_executor_test_base.py | 41 +++++++------- .../unittests/test_parallel_executor_mnist.py | 56 +++++++++++-------- .../test_parallel_executor_seresnext.py | 10 ++-- .../test_parallel_executor_transformer.py | 4 +- 12 files changed, 101 insertions(+), 93 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index da3b2f6347..8e6482ca98 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -26,7 +26,6 @@ paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], vara paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], 
varargs=None, keywords=None, defaults=(None, None, True)) -paddle.fluid.ExecutionStrategy.ExecutorType.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy.ExecutorType, arg0: int) -> None paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index d8526b3f24..e9688ea276 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -26,7 +26,9 @@ namespace framework { namespace details { static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) { - return (!strategy.enable_sequential_execution_ && strategy.num_trainers_ > 1); + return (!strategy.enable_sequential_execution_ && + strategy.num_trainers_ > 1) || + strategy.enable_parallel_graph_; } class ParallelExecutorPassBuilder : public ir::PassBuilder { diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index c97be16957..f66ecd80f1 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -73,6 +73,8 @@ struct BuildStrategy { bool fuse_broadcast_op_{false}; + bool enable_parallel_graph_{false}; + int num_trainers_{1}; int trainer_id_{0}; std::vector trainers_endpoints_; diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index d3d5b6bf54..15c496130c 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -20,7 +20,7 @@ namespace framework { namespace details { struct ExecutionStrategy { - enum ExecutorType { kDefault = 0, kExperimental = 1, kParallelGraph = 2 }; + enum ExecutorType { kDefault = 0, kExperimental = 1 }; size_t num_threads_{0}; bool use_cuda_{true}; diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 214c2f7625..845c4379e6 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -29,7 +29,6 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( graphs_(std::move(graphs)) { PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); // do not use threadpool for each graph execution. 
-  strategy_.num_threads_ = 1UL;
   for (size_t i = 0; i < places.size(); ++i) {
     executors_.emplace_back(new details::ThreadedSSAGraphExecutor(
         strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i])));
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index d2a393b3f1..10ae3a1c74 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -49,7 +49,6 @@ class Node {
  public:
   virtual ~Node() {
     if (!wrapper_.empty()) {
-      VLOG(4) << "ir::Node deleting a wrapper node " << Name();
       wrapper_deleter_();
     }
   }
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 63f3ef0eac..152b9b2702 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -199,7 +199,7 @@ ParallelExecutor::ParallelExecutor(
                    "the number of places must be greater than 1.");
   }

-  if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) {
+  if (build_strategy.enable_parallel_graph_) {
     PADDLE_ENFORCE(
         member_->use_all_reduce_,
         "build_strategy.reduce should be `AllReduce` if you want to use"
@@ -231,7 +231,7 @@ ParallelExecutor::ParallelExecutor(
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
   ncclUniqueId *nccl_id = nullptr;
-  if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) {
+  if (build_strategy.enable_parallel_graph_) {
     // parallel graph mode should initialize nccl by ncclCommInitRank since
     // it calls the nccl operator once per device per thread.
     if (nccl_id_var == nullptr) {
@@ -265,7 +265,7 @@ ParallelExecutor::ParallelExecutor(
   // ncclOp
   std::vector<std::unique_ptr<ir::Graph>> graphs;
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) {
+  if (build_strategy.enable_parallel_graph_) {
     for (size_t i = 0; i < member_->places_.size(); ++i) {
       std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
           main_program, {member_->places_[i]}, loss_var_name, params,
@@ -287,9 +287,8 @@ ParallelExecutor::ParallelExecutor(
 #endif
   auto max_memory_size = GetEagerDeletionThreshold();
-  // TODO(Yancey1989): fix gc failed on ParallelGraph executor.
-  if (max_memory_size >= 0 &&
-      exec_strategy.type_ != ExecutionStrategy::kParallelGraph) {
+  // TODO(Yancey1989): fix gc failed on ParallelGraph strategy.
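+  // Until that is fixed, eager deletion is only wired into graphs[0], and
+  // only when parallel graph mode is disabled: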
+ if (max_memory_size >= 0 && !build_strategy.enable_parallel_graph_) { graphs[0] = member_->PrepareGCAndRefCnts( std::move(graphs[0]), static_cast(max_memory_size)); } @@ -323,18 +322,20 @@ ParallelExecutor::ParallelExecutor( } } - if (exec_strategy.type_ == ExecutionStrategy::kDefault) { - member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, - std::move(graphs[0]))); - } else if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + if (build_strategy.enable_parallel_graph_) { member_->executor_.reset(new details::ParallelSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, std::move(graphs))); } else { - member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, - std::move(graphs[0]))); + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs[0]))); + } else { + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs[0]))); + } } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 1fa91114a8..866a5137de 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -761,11 +761,6 @@ All parameter, weight, gradient are variables in Paddle. )DOC"); - py::enum_(exec_strategy, "ExecutorType") - .value("Default", ExecutionStrategy::ExecutorType::kDefault) - .value("Experimental", ExecutionStrategy::ExecutorType::kExperimental) - .value("ParallelGraph", ExecutionStrategy::ExecutorType::kParallelGraph); - exec_strategy.def(py::init()) .def_property( "num_threads", @@ -823,25 +818,17 @@ All parameter, weight, gradient are variables in Paddle. [](const ExecutionStrategy &self) { return self.dry_run_; }, [](ExecutionStrategy &self, bool dry_run) { self.dry_run_ = dry_run; - }) - .def_property( - "executor_type", - [](const ExecutionStrategy &self) { return self.type_; }, - [](ExecutionStrategy &self, ExecutionStrategy::ExecutorType type) { - self.type_ = type; - }, - R"DOC(The type is ExecutorType which is the enum ranging from Default, -ParallelGraph and Experiment: - -Default: Compile the main_program into a multi-devices graph, - and execute this graph on multi-devices with multiple threads which - specified by build_strategy.num_threads. -ParallelGraph: Compile the main_program into multiple graphs, and execute each of the graphs on one - device with one thread. Please note, this mode only supports all-reduce mode and use_cuda=True. - This approach can achieve better performance in some scenarios. -Experimental: Compile the main_program into a multi-devices graph, - and executor this graph with a faster execution mode than the Default, - this approach is on the experiments.)DOC"); + }); + + exec_strategy.def_property( + "use_experimental_executor", + [](const ExecutionStrategy &self) { + return self.type_ == ExecutionStrategy::kExperimental; + }, + [](ExecutionStrategy &self, bool experimental) { + self.type_ = experimental ? 
ExecutionStrategy::kExperimental + : ExecutionStrategy::kDefault; + }); py::class_ build_strategy(pe, "BuildStrategy", R"DOC( BuildStrategy allows the user to more preciously control how to @@ -964,6 +951,14 @@ Experimental: Compile the main_program into a multi-devices graph, R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether to fuse elementwise_add_op and activation_op, it may make the execution faster. Default False)DOC") + .def_property( + "enable_parallel_graph", + [](const BuildStrategy &self) { return self.enable_parallel_graph_; }, + [](BuildStrategy &self, bool b) { self.enable_parallel_graph_ = b; }, + R"DOC(The type is BOOL, if set True, ParallelExecutor would build the main_program into multiple graphs, + each of the graphs would run with one device. This approach can achieve better performance in + some scenarios. Please note, this approach only supports all-reduce mode + on GPU device)DOC") .def("_finalize_strategy_and_create_passes", [](BuildStrategy &self) -> std::shared_ptr { return self.CreatePassesFromStrategy(true); diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 73b8fb74fa..4e50614515 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -26,26 +26,24 @@ import sys __all__ = ['TestParallelExecutorBase'] -ExecutorType = fluid.ExecutionStrategy().ExecutorType - class TestParallelExecutorBase(unittest.TestCase): - def check_network_convergence( - self, - method, - use_cuda=True, - memory_opt=True, - iter=50, - batch_size=None, - allow_op_delay=False, - feed_dict=None, - seed=None, - use_parallel_executor=True, - use_reduce=False, - fuse_elewise_add_act_ops=False, - optimizer=fluid.optimizer.Adam, - exec_type=fluid.ExecutionStrategy().ExecutorType.Default, - enable_sequential_execution=False): + def check_network_convergence(self, + method, + use_cuda=True, + memory_opt=True, + iter=50, + batch_size=None, + allow_op_delay=False, + feed_dict=None, + seed=None, + use_parallel_executor=True, + use_reduce=False, + use_parallel_graph=False, + fuse_elewise_add_act_ops=False, + optimizer=fluid.optimizer.Adam, + use_fast_executor=False, + enable_sequential_execution=False): def run_executor(exe, feed, fetch_list, program=None): if isinstance(exe, fluid.ParallelExecutor): res = exe.run(fetch_list=fetch_list, feed=feed) @@ -61,8 +59,8 @@ class TestParallelExecutorBase(unittest.TestCase): startup = fluid.Program() startup.random_seed = 1 # Fix random seed main.random_seed = 1 - scope = fluid.Scope() - with fluid.scope_guard(scope): + self.scope = fluid.Scope() + with fluid.scope_guard(self.scope): with fluid.program_guard(main, startup): if seed is not None: startup.random_seed = seed @@ -80,13 +78,14 @@ class TestParallelExecutorBase(unittest.TestCase): startup_exe.run(startup) exec_strategy = fluid.ExecutionStrategy() exec_strategy.allow_op_delay = allow_op_delay - exec_strategy.executor_type = exec_type + exec_strategy.use_experimental_executor = use_fast_executor build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops build_strategy.enable_sequential_execution = enable_sequential_execution + build_strategy.enable_parallel_graph = use_parallel_graph if use_cuda and 
core.is_compiled_with_cuda(): build_strategy.remove_unnecessary_lock = True diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index fffe8bee58..c8ac6a90c1 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -20,7 +20,7 @@ import numpy as np import paddle.fluid.core as core import os import paddle.fluid as fluid -from parallel_executor_test_base import TestParallelExecutorBase, ExecutorType +from parallel_executor_test_base import TestParallelExecutorBase def simple_fc_net(use_feed): @@ -79,30 +79,32 @@ class TestMNIST(TestParallelExecutorBase): return img, label = self._init_data() - + """ all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( model, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, use_reduce=False) + """ reduce_first_loss, reduce_last_loss = self.check_network_convergence( model, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, use_reduce=True) - + """ for loss in zip(all_reduce_first_loss, reduce_first_loss): self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) for loss in zip(all_reduce_last_loss, reduce_last_loss): self.assertAlmostEqual(loss[0], loss[1], delta=1e-4) + """ # simple_fc def check_simple_fc_convergence(self, use_cuda, use_reduce=False, - exec_type=ExecutorType.Default): + use_parallel_graph=False): if use_cuda and not core.is_compiled_with_cuda(): return @@ -114,20 +116,24 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda, use_reduce=use_reduce, - exec_type=exec_type) + use_parallel_graph=use_parallel_graph) - def test_simple_fc(self): + def notest_simple_fc(self): # use_cuda - self.check_simple_fc_convergence(True, ExecutorType.Default) - self.check_simple_fc_convergence(True, ExecutorType.ParallelGraph) + if core.is_compiled_with_cuda(): + self.check_simple_fc_convergence(True) + self.check_simple_fc_convergence( + True, use_reduce=False, use_parallel_graph=True) self.check_simple_fc_convergence(False) - def test_simple_fc_with_new_strategy(self): + def notest_simple_fc_with_new_strategy(self): # use_cuda, use_reduce self._compare_reduce_and_allreduce(simple_fc_net, True) self._compare_reduce_and_allreduce(simple_fc_net, False) - def check_simple_fc_parallel_accuracy(self, use_cuda, exec_type): + def check_simple_fc_parallel_accuracy(self, + use_cuda, + use_parallel_graph=False): if use_cuda and not core.is_compiled_with_cuda(): return @@ -140,7 +146,7 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda, use_parallel_executor=False, - exec_type=exec_type) + use_parallel_graph=use_parallel_graph) parallel_first_loss, parallel_last_loss = self.check_network_convergence( method=simple_fc_net, seed=1, @@ -148,7 +154,7 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda, use_parallel_executor=True, - exec_type=exec_type) + use_parallel_graph=use_parallel_graph) self.assertAlmostEquals( np.mean(parallel_first_loss), @@ -157,17 +163,20 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEquals( np.mean(parallel_last_loss), single_last_loss, delta=1e-6) - def test_simple_fc_parallel_accuracy(self): - self.check_simple_fc_parallel_accuracy(True, ExecutorType.Default) - self.check_simple_fc_parallel_accuracy(True, ExecutorType.ParallelGraph) + def notest_simple_fc_parallel_accuracy(self): + if 
core.is_compiled_with_cuda(): + self.check_simple_fc_parallel_accuracy(True) + self.check_simple_fc_parallel_accuracy( + True, use_parallel_graph=True) # FIXME(Yancey1989): ParallelGraph executor type support CPU mode - self.check_simple_fc_parallel_accuracy(False, ExecutorType.Default) + self.check_simple_fc_parallel_accuracy(False) - def check_batchnorm_fc_convergence(self, use_cuda, exec_type): + def check_batchnorm_fc_convergence(self, + use_cuda, + use_fast_executor, + use_parallel_graph=False): if use_cuda and not core.is_compiled_with_cuda(): return - if not use_cuda and exec_type == ExecutorType.ParallelGraph: - return img, label = self._init_data() @@ -176,13 +185,14 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - exec_type=exec_type) + use_fast_executor=use_fast_executor, + use_parallel_graph=use_parallel_graph) def test_batchnorm_fc(self): for use_cuda in (False, True): - for exec_type in (ExecutorType.Default, ExecutorType.Experimental, - ExecutorType.ParallelGraph): - self.check_batchnorm_fc_convergence(use_cuda, exec_type) + for use_fast_executor in (False, True): + self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor) + self.check_batchnorm_fc_convergence(use_cuda, False, True) def test_batchnorm_fc_with_new_strategy(self): # FIXME(zcd): close this test temporally. diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index bada38894f..531c99a835 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -19,7 +19,7 @@ import paddle.fluid.layers.ops as ops from paddle.fluid.initializer import init_on_cpu from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter import paddle.fluid.core as core -from parallel_executor_test_base import TestParallelExecutorBase, ExecutorType +from parallel_executor_test_base import TestParallelExecutorBase import unittest import math import os @@ -282,7 +282,7 @@ class TestResnet(TestParallelExecutorBase): use_reduce=False, iter=20, delta2=1e-6, - exec_type=ExecutorType.Default, + use_parallel_graph=False, lr_scale=1.0): if use_cuda and not core.is_compiled_with_cuda(): return @@ -303,7 +303,7 @@ class TestResnet(TestParallelExecutorBase): use_reduce=use_reduce, optimizer=optimizer(), use_parallel_executor=False, - exec_type=exec_type) + use_parallel_graph=use_parallel_graph) parallel_first_loss, parallel_last_loss = self.check_network_convergence( model, feed_dict={"image": img, @@ -313,7 +313,7 @@ class TestResnet(TestParallelExecutorBase): use_cuda=use_cuda, use_reduce=use_reduce, optimizer=optimizer(lr_scale=lr_scale), - exec_type=exec_type) + use_parallel_graph=use_parallel_graph) self.assertAlmostEquals( np.mean(parallel_first_loss), single_first_loss[0], delta=1e-6) @@ -327,7 +327,7 @@ class TestResnet(TestParallelExecutorBase): self._check_resnet_convergence( model=SE_ResNeXt50Small, use_cuda=True, - exec_type=ExecutorType.ParallelGraph, + use_parallel_graph=True, lr_scale=core.get_cuda_device_count()) self._check_resnet_convergence( model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index 8a1a3ab3ca..c3ac9d92b4 100644 --- 
a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -17,7 +17,7 @@ from __future__ import print_function import paddle.fluid as fluid import transformer_model import numpy as np -from parallel_executor_test_base import TestParallelExecutorBase, ExecutorType +from parallel_executor_test_base import TestParallelExecutorBase import unittest import paddle import paddle.fluid.core as core @@ -175,6 +175,8 @@ class TestTransformer(TestParallelExecutorBase): self.check_network_convergence(transformer, use_cuda=True) self.check_network_convergence( transformer, use_cuda=True, enable_sequential_execution=True) + self.check_network_convergence( + transformer, use_cuda=True, use_parallel_graph=True) self.check_network_convergence(transformer, use_cuda=False, iter=5) From a7d6b1f92141f398cb442e3f5eee99d3ac156265 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 17 Dec 2018 14:17:26 +0800 Subject: [PATCH 038/197] code cleanup test=develop --- .../framework/details/computation_op_handle.h | 1 - .../details/multi_devices_graph_pass.cc | 1 + .../scope_buffered_ssa_graph_executor.cc | 2 ++ .../details/threaded_ssa_graph_executor.h | 1 - paddle/fluid/framework/details/var_handle.cc | 2 +- paddle/fluid/framework/ir/node.h | 1 + .../unittests/test_parallel_executor_dry_run.py | 10 ++++------ .../unittests/test_parallel_executor_mnist.py | 17 +++++++++-------- 8 files changed, 18 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 5b8b70c564..601ae4f8c6 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -17,7 +17,6 @@ #include #include -#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 5b82805ad9..2ab7da2d57 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -134,6 +134,7 @@ static const char kParams[] = "params"; static const char kLocalScopes[] = "local_scopes"; static const char kStrategy[] = "strategy"; static const char kNumTrainers[] = "num_trainers"; +static const char kNumLossScaled[] = "num_loss_scaled"; void MultiDevSSAGraphBuilder::Init() const { all_vars_.clear(); diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index edb7b5e70a..f432079087 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -41,10 +41,12 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( Scope &local_scope = scope->NewScope(); *scope->Var(details::kLocalExecScopeName)->GetMutable() = &local_scope; + for (auto &info : var_infos_) { if (scope->FindVar(info.name_) != nullptr) { continue; } + if (info.persistable_) { // Persistable InitializeVariable(scope->Var(info.name_), info.type_); } else { diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 
b45afbc046..24da56c09e 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -24,7 +24,6 @@ #include #include "ThreadPool.h" // ThreadPool in thrird party #include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc index 7de6025a28..30da029ca2 100644 --- a/paddle/fluid/framework/details/var_handle.cc +++ b/paddle/fluid/framework/details/var_handle.cc @@ -20,7 +20,7 @@ namespace details { VarHandleBase::~VarHandleBase() {} -VarHandle::~VarHandle() { VLOG(5) << "deleting var handle " << DebugString(); } +VarHandle::~VarHandle() { VLOG(4) << "deleting var handle " << DebugString(); } std::string VarHandle::DebugString() const { std::stringstream ss; diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 10ae3a1c74..d2a393b3f1 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -49,6 +49,7 @@ class Node { public: virtual ~Node() { if (!wrapper_.empty()) { + VLOG(4) << "ir::Node deleting a wrapper node " << Name(); wrapper_deleter_(); } } diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py index eff76ce0d4..18d95c94ad 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py @@ -17,8 +17,6 @@ import unittest import logging import six -ExecutorType = fluid.ExecutionStrategy().ExecutorType - class TestBase(unittest.TestCase): def main(self, @@ -26,7 +24,7 @@ class TestBase(unittest.TestCase): iter=10, iter_per_pe=10, use_gpu=True, - exec_type=ExecutorType.Default): + use_experimental_executor=False): if use_gpu and not fluid.core.is_compiled_with_cuda(): logging.warning( "Paddle is not compiled with CUDA, skip GPU unittests") @@ -45,7 +43,7 @@ class TestBase(unittest.TestCase): for _ in six.moves.xrange(iter): exe_strategy = fluid.ExecutionStrategy() exe_strategy._dry_run = True - exe_strategy.executor_type = exec_type + exe_strategy.use_experimental_executor = use_experimental_executor pe = fluid.ParallelExecutor( use_cuda=use_gpu, loss_name=loss.name, @@ -58,11 +56,11 @@ class TestBase(unittest.TestCase): class TestMNISTDryRun(TestBase): def test_mnist_dry_run(self): for use_gpu in (False, True): - for exec_type in (ExecutorType.Default, ExecutorType.Experimental): + for use_experimental_executor in (False, True): self.main( network_func=TestMNISTDryRun.network_func, use_gpu=use_gpu, - exec_type=exec_type) + use_experimental_executor=use_experimental_executor) @staticmethod def network_func(): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index c8ac6a90c1..7d2349fad4 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -79,26 +79,25 @@ class TestMNIST(TestParallelExecutorBase): return img, label = self._init_data() - """ + all_reduce_first_loss, all_reduce_last_loss = 
self.check_network_convergence( model, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, use_reduce=False) - """ + reduce_first_loss, reduce_last_loss = self.check_network_convergence( model, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, use_reduce=True) - """ + for loss in zip(all_reduce_first_loss, reduce_first_loss): self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) for loss in zip(all_reduce_last_loss, reduce_last_loss): self.assertAlmostEqual(loss[0], loss[1], delta=1e-4) - """ # simple_fc def check_simple_fc_convergence(self, @@ -118,7 +117,7 @@ class TestMNIST(TestParallelExecutorBase): use_reduce=use_reduce, use_parallel_graph=use_parallel_graph) - def notest_simple_fc(self): + def test_simple_fc(self): # use_cuda if core.is_compiled_with_cuda(): self.check_simple_fc_convergence(True) @@ -126,7 +125,7 @@ class TestMNIST(TestParallelExecutorBase): True, use_reduce=False, use_parallel_graph=True) self.check_simple_fc_convergence(False) - def notest_simple_fc_with_new_strategy(self): + def test_simple_fc_with_new_strategy(self): # use_cuda, use_reduce self._compare_reduce_and_allreduce(simple_fc_net, True) self._compare_reduce_and_allreduce(simple_fc_net, False) @@ -163,7 +162,7 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEquals( np.mean(parallel_last_loss), single_last_loss, delta=1e-6) - def notest_simple_fc_parallel_accuracy(self): + def test_simple_fc_parallel_accuracy(self): if core.is_compiled_with_cuda(): self.check_simple_fc_parallel_accuracy(True) self.check_simple_fc_parallel_accuracy( @@ -192,7 +191,9 @@ class TestMNIST(TestParallelExecutorBase): for use_cuda in (False, True): for use_fast_executor in (False, True): self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor) - self.check_batchnorm_fc_convergence(use_cuda, False, True) + + self.check_batchnorm_fc_convergence( + use_cuda=True, use_fast_executor=False, use_parallel_graph=True) def test_batchnorm_fc_with_new_strategy(self): # FIXME(zcd): close this test temporally. From 49870f507de0d68fe23cd60479dab9da65d2d916 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 17 Dec 2018 18:52:21 +0800 Subject: [PATCH 039/197] delete unused code test=develop --- paddle/fluid/framework/details/all_reduce_op_handle.cc | 1 + paddle/fluid/framework/details/multi_devices_graph_pass.cc | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 5a4f218077..59a0aef480 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -51,6 +51,7 @@ void AllReduceOpHandle::RunImpl() { // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, // this is a distributed or inter-process call, find a better way. #ifdef PADDLE_WITH_CUDA + // Find NCCL ID from the global scope. 
if (NoDummyInputSize() == 1 && local_scopes_[0]->FindVar(NCCL_ID_VARNAME) == nullptr) { #else diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 2ab7da2d57..5b82805ad9 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -134,7 +134,6 @@ static const char kParams[] = "params"; static const char kLocalScopes[] = "local_scopes"; static const char kStrategy[] = "strategy"; static const char kNumTrainers[] = "num_trainers"; -static const char kNumLossScaled[] = "num_loss_scaled"; void MultiDevSSAGraphBuilder::Init() const { all_vars_.clear(); From d3a4da5cf663a37d77af4670bdad85e06b32fae3 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 17 Dec 2018 18:53:44 +0800 Subject: [PATCH 040/197] fix comment test=develop --- paddle/fluid/framework/details/all_reduce_op_handle.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 59a0aef480..6bca299813 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -109,7 +109,7 @@ void AllReduceOpHandle::RunImpl() { buffer, buffer, numel, static_cast(dtype), ncclSum, comm, stream)); // TODO(Yancey1989): synchronize here can get better performance - // if don't use NCCL group call, but need more profileing. + // if don't use NCCL group call, but need more profiling. if (local_scopes_.size() == 1UL) cudaStreamSynchronize(stream); }); } From 06936a2ff59ba67f6be0526bf97c26a3cf036b18 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Tue, 18 Dec 2018 11:16:14 +0800 Subject: [PATCH 041/197] fix 1gpu test=develop --- paddle/fluid/framework/details/all_reduce_op_handle.cc | 3 ++- paddle/fluid/framework/details/parallel_ssa_graph_executor.cc | 2 +- paddle/fluid/framework/parallel_executor.cc | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 6bca299813..4a0347d07a 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -51,7 +51,8 @@ void AllReduceOpHandle::RunImpl() { // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, // this is a distributed or inter-process call, find a better way. #ifdef PADDLE_WITH_CUDA - // Find NCCL ID from the global scope. + // All-reduce op_handle can run on the sub-scope, find the nccl id from + // the global scope. 
if (NoDummyInputSize() == 1 && local_scopes_[0]->FindVar(NCCL_ID_VARNAME) == nullptr) { #else diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 845c4379e6..2377f2c963 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -59,7 +59,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run( if (pool_) { run_futures.emplace_back(pool_->enqueue(std::move(call))); } else { - call(); + fetch_datas.emplace_back(std::move(call())); } } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 152b9b2702..0042ccaa4f 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -231,7 +231,7 @@ ParallelExecutor::ParallelExecutor( #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); ncclUniqueId *nccl_id = nullptr; - if (build_strategy.enable_parallel_graph_) { + if (build_strategy.enable_parallel_graph_ && places.size() > 1) { // parallel graph mode should initialize nccl by ncclCommInitRank since // it call nccl operator per device per thread. if (nccl_id_var == nullptr) { From 41790f13662a8a86fe5b6f4e3cee7a35703230a8 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 18 Dec 2018 14:04:40 +0800 Subject: [PATCH 042/197] add ut about nce --- .../unittests/test_nce_remote_table_op.py | 152 ++++-------------- 1 file changed, 33 insertions(+), 119 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py index f08b270d89..e87545cb9c 100644 --- a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py @@ -88,158 +88,73 @@ class TestListenAndServOp(unittest.TestCase): port = int(f.read().strip()) return port - def _run_nce_op_one_pserver(self, place, port): + def _run_nce_op_two_pserver(self, place, port0, port1): scope = fluid.core.Scope() program = Program() with fluid.scope_guard(scope): with program_guard(program, startup_program=Program()): - x = scope.var('X').get_tensor() + x = scope.var('Input').get_tensor() x_array = np.random.random((4, 8)).astype("float32") * 2 x.set(x_array, place) # create and initialize Param Variable - param = scope.var('W').get_tensor() + param = scope.var('Weight').get_tensor() param_array = np.zeros((5, 8)).astype("float32") * 2 param.set(param_array, place) - path_table = scope.var('PathTable').get_tensor() - path_table_array = np.array( - [(0, 2, -1, -1, -1), (0, 1, 2, -1, -1), (0, 1, 4, -1, -1), - (0, 2, -1, -1, -1)]).astype( - "int64" - ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) - path_table.set(path_table_array, place) - - path_code = scope.var('PathCode').get_tensor() - path_code_array = np.array( - [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), - (0, 1, -1, -1, -1)]).astype("int64") #np.array to store - path_code.set(path_code_array, place) - - label = scope.var('Label').get_tensor() - label_array = np.array([0, 1, 4, 5]) - label.set(label_array, place) - bias = scope.var('Bias').get_tensor() bias_array = np.random.random((5, 1)).astype("float32") bias.set(bias_array, place) - out = scope.var('Out').get_tensor() - - pre_out = scope.var('PreOut').get_tensor - - w_out = scope.var('W_Out').get_tensor() - w_out.set(param_array, place) - - emaps = 
['127.0.0.1:' + str(port)] - table_names = ['table'] - height_sections = [2] - - # create and run sgd operator - hsigmoid_op = Operator( - "hierarchical_sigmoid", - X='X', - W='W', - PathTable='PathTable', - PathCode='PathCode', - Label='Label', - Bias='Bias', - Out='Out', - PreOut='PreOut', - W_Out='W_Out', - remote_prefetch=True, - epmap=emaps, - table_names=table_names, - height_sections=height_sections) - - hsigmoid_op.run(scope, place) - - # get and compare result - result_array = np.array(w_out) - self.assertEqual(list(result_array.shape), [5, 8]) - correct = None - for i in range(5): - if i != 3: - correct = np.full((1, 8), i + 1).astype("float32") - self.assertTrue((result_array[i] == correct).all()) - else: - correct = np.full((1, 8), 0).astype("float32") - self.assertTrue((result_array[i] == correct).all()) - - def _run_nce_op_two_pserver(self, place, port0, port1): - scope = fluid.core.Scope() - program = Program() - with fluid.scope_guard(scope): - with program_guard(program, startup_program=Program()): - x = scope.var('X').get_tensor() - x_array = np.random.random((4, 8)).astype("float32") * 2 - x.set(x_array, place) - # create and initialize Param Variable - param = scope.var('W').get_tensor() - param_array = np.zeros((5, 8)).astype("float32") * 2 - param.set(param_array, place) - - path_table = scope.var('PathTable').get_tensor() - path_table_array = np.array( - [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), - (0, 2, -1, -1, -1)]).astype( - "int64" - ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) - path_table.set(path_table_array, place) - - path_code = scope.var('PathCode').get_tensor() - path_code_array = np.array( - [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), - (0, 1, -1, -1, -1)]).astype("int64") #np.array to store - path_code.set(path_code_array, place) + sample_w = scope.var('SampleWeight').get_tensor() + sample_weight = np.random.random((4, 1)).astype("float32") + sample_w.set(sample_weight, place) label = scope.var('Label').get_tensor() label_array = np.array([0, 1, 4, 5]) label.set(label_array, place) - bias = scope.var('Bias').get_tensor() - bias_array = np.random.random((5, 1)).astype("float32") - bias.set(bias_array, place) + cost = scope.var('Cost').get_tensor() + cost_w = np.zeros((4, 1)).astype("float32") + cost.set(cost_w, place) - out = scope.var('Out').get_tensor() + sample_l = scope.var('SampleLogits').get_tensor() + sample_l_w = np.zeros((4, 3)).astype("float32") + sample_l.set(sample_l_w, place) - pre_out = scope.var('PreOut').get_tensor - - w_out = scope.var('W_Out').get_tensor() - w_out.set(param_array, place) + sample_la = scope.var('SampleLabels').get_tensor() + sample_la_w = np.zeros((4, 3)).astype("float32") + sample_la.set(sample_la_w, place) emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)] table_names = ['table', 'table'] height_sections = [2, 3] - # create and run sgd operator - hsigmoid_op = Operator( - "hierarchical_sigmoid", - X='X', - W='W', - PathTable='PathTable', - PathCode='PathCode', + # create and run nce operator + nce_op = Operator( + "nce", + Input='Input', + Weight='Weight', Label='Label', Bias='Bias', - Out='Out', - PreOut='PreOut', - W_Out='W_Out', + Cost='Cost', + SampleLogits='SampleLogits', + SampleLabels='SampleLabels', + num_total_classes=5, + num_neg_samples=2, + sampler=0, + seed=1, + is_sparse=True, remote_prefetch=True, epmap=emaps, table_names=table_names, height_sections=height_sections) - hsigmoid_op.run(scope, place) + + nce_op.run(scope, place) # get and compare 
result - result_array = np.array(w_out) - self.assertEqual(list(result_array.shape), [5, 8]) - correct = None - for i in range(5): - if i < 2: - correct = np.full((1, 8), i + 1).astype("float32") - self.assertTrue((result_array[i] == correct).all()) - else: - correct = np.full((1, 8), i + 9).astype("float32") - self.assertTrue((result_array[i] == correct).all()) + o_cost = np.array(cost_w) + o_logits = np.array(sample_l) + o_labels = np.array(sample_la) def test_nce_op_remote(self): os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" @@ -257,7 +172,6 @@ class TestListenAndServOp(unittest.TestCase): places.append(core.CUDAPlace(0)) for place in places: - self._run_nce_op_one_pserver(place, port0) self._run_nce_op_two_pserver(place, port0, port1) # raise SIGTERM to pserver From aed3872c1c5c0c9957f9567071f63a89c1ace455 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 18 Dec 2018 16:17:20 +0800 Subject: [PATCH 043/197] add int cast, test=develop --- python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py index e87545cb9c..5e440bf35d 100644 --- a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py @@ -141,6 +141,7 @@ class TestListenAndServOp(unittest.TestCase): SampleLabels='SampleLabels', num_total_classes=5, num_neg_samples=2, + custom_neg_classes=list(range(2)), sampler=0, seed=1, is_sparse=True, From b5fa916413aebd0d35af8b3ae04d4d555ecb4629 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Tue, 18 Dec 2018 08:38:52 +0000 Subject: [PATCH 044/197] fix bug after merge reyoung optimization, test=develop --- .../fluid/operators/hierarchical_sigmoid_op.h | 1 - .../fluid/operators/math/matrix_bit_code.cc | 35 ------------------- paddle/fluid/operators/math/matrix_bit_code.h | 29 +++++++-------- 3 files changed, 15 insertions(+), 50 deletions(-) diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 802b444d7c..b47bf49ecb 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -71,7 +71,6 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { // server auto height_sections = ctx.Attr>("height_sections"); auto table_names = ctx.Attr>("table_names"); - VLOG(3) << "path type is " << path->type().name(); std::vector real_rows = PathToRows(*path); framework::Scope& local_scope = ctx.scope().NewScope(); auto* ids = local_scope.Var("Ids@Prefetch"); diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index d55e832cc2..d6f51c6e5c 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -84,41 +84,6 @@ void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, code_table_.apply_visitor(func); } -template -struct MatrixBitCodeFunctorSelectedRowsAddGrad - : public boost::static_visitor { - const framework::Tensor &tmat_; - framework::SelectedRows *vec_; - - MatrixBitCodeFunctorSelectedRowsAddGrad(const framework::Tensor &tmat, - framework::SelectedRows *vec) - : tmat_(tmat), vec_(vec) {} - - template - void operator()(const CodeTable &code_table) { - size_t batch_size = tmat_.dims()[0]; - size_t width = tmat_.dims()[1]; - auto *vec_data = vec_->mutable_value()->template data(); - auto 
*tmat_data = tmat_.data(); - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table.get_code(i); - int code_length = code.get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code.calc_index(j); - int64_t row_index = vec_->GetIndexFromId(static_cast(index)); - vec_data[row_index] += tmat_data[i * width + j]; - } - } - } -}; - -template -void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, - framework::SelectedRows *vec) { - MatrixBitCodeFunctorSelectedRowsAddGrad func(tmat, vec); - code_table_.apply_visitor(func); -} - template struct MatrixBitCodeFunctorSum : public boost::static_visitor { const framework::Tensor &tmat_; diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 7a084a41e5..c399cb5d44 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -124,11 +124,12 @@ class SimpleCode { template class CustomCode { public: - CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode, - const int64_t* ids, int index) { - seq_len_ = ptable.dims()[1]; - ptable_data_ = ptable.data() + seq_len_ * index; - pcode_data_ = pcode.data() + seq_len_ * index; + CustomCode(const framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids, + int index) { + seq_len_ = path_table.dims()[1]; + path_table_data_ = path_table.data() + seq_len_ * index; + path_code_data_ = path_code.data() + seq_len_ * index; } /** * Here the id of root should be 1 rather than 0, thus the encoding of class c @@ -139,25 +140,25 @@ class CustomCode { * Binary classification path is the suffixes of encoding, thus leave out the * left most bit in calc_bit. */ - size_t calc_index(int bit) const { return ptable_data_[bit]; } - bool calc_bit(int bit) const { return pcode_data_[bit]; } + size_t calc_index(int bit) const { return path_table_data_[bit]; } + bool calc_bit(int bit) const { return path_code_data_[bit]; } // NOTE: this function is not thread-safe. 
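+  // get_length() computes the code length lazily on first use and caches it
+  // in the mutable member length_, so concurrent first calls may race on the
+  // cache.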
int get_length() const { if (length_ < 0) { auto len = seq_len_; - length_ = - static_cast(std::find_if(ptable_data_, ptable_data_ + len, - [](const T& val) { return val < 0; }) - - ptable_data_); + length_ = static_cast( + std::find_if(path_table_data_, path_table_data_ + len, + [](const T& val) { return val < 0; }) - + path_table_data_); } return length_; } private: int64_t seq_len_; - const T* ptable_data_; - const T* pcode_data_; + const T* path_table_data_; + const T* path_code_data_; mutable int length_{-1}; }; @@ -214,7 +215,7 @@ class MatrixBitCodeFunctor { const framework::Tensor& path_code, const int64_t* ids) : num_classes_(static_cast(path_table.dims()[1])), ids_(ids), - code_table_(CustomCodeTable(ptable, pcode, ids)) {} + code_table_(CustomCodeTable(path_table, path_code, ids)) {} /* For j < code_length tmat(i, j) += vec(0, index(i, j)) */ From e0c3c56b0664ee92e5eb86dca810c029e5cd1d67 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 18 Dec 2018 20:29:49 +0800 Subject: [PATCH 045/197] add nce remote ut, test=develop --- .../unittests/test_nce_remote_table_op.py | 68 ++++++++++++++++--- 1 file changed, 60 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py index 5e440bf35d..b5f93f93a1 100644 --- a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py @@ -27,6 +27,45 @@ from paddle.fluid.op import Operator from paddle.fluid.framework import Program, program_guard +def nce(input, weight, bias, sample_weight, labels, num_classes, + num_sample_class): + samples = [] + sample_labels = [] + batch_size = input.shape[0] + num_true_class = labels.shape[1] + for i in range(batch_size): + w = 1 if sample_weight is None else sample_weight[i] + for label in labels[i]: + samples.append((i, label, True, w)) + sample_labels.append(label) + for num in range(num_sample_class): + samples.append((i, num, False, w)) + sample_labels.append(num) + # forward bias + sample_out = np.zeros(len(samples)).astype(np.float32) + if bias is not None: + for i in range(len(samples)): + sample_out[i] = bias[samples[i][1]] + # forward weight + for i in range(len(samples)): + sample_out[i] += np.dot(input[samples[i][0]], weight[samples[i][1]]) + + # forward activation + sample_out = 1.0 / (1.0 + np.exp(-sample_out)) + # forward cost + out = np.zeros(batch_size).astype(np.float32) + b = 1.0 / num_classes * num_sample_class + + for i in range(len(samples)): + o = sample_out[i] + cost = -np.log(o / (o + b)) if samples[i][2] else -np.log(b / (o + b)) + out[samples[i][0]] += cost * samples[i][3] + return (out[:, np.newaxis], np.array(sample_out).reshape( + batch_size, num_sample_class + num_true_class), + np.array(sample_labels).reshape(batch_size, + num_sample_class + num_true_class)) + + def run_pserver(pserver_id, use_cuda, sync_mode): scope = fluid.core.Scope() program = Program() @@ -94,11 +133,11 @@ class TestListenAndServOp(unittest.TestCase): with fluid.scope_guard(scope): with program_guard(program, startup_program=Program()): x = scope.var('Input').get_tensor() - x_array = np.random.random((4, 8)).astype("float32") * 2 + x_array = np.random.random((4, 8)).astype("float32") x.set(x_array, place) # create and initialize Param Variable param = scope.var('Weight').get_tensor() - param_array = np.zeros((5, 8)).astype("float32") * 2 + param_array = np.zeros((5, 8)).astype("float32") param.set(param_array, place) 
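                # Rows 0-1 of the 5x8 'Weight' table are expected on the first
                # pserver and rows 2-4 on the second (height_sections = [2, 3]
                # below).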
bias = scope.var('Bias').get_tensor() @@ -110,7 +149,7 @@ class TestListenAndServOp(unittest.TestCase): sample_w.set(sample_weight, place) label = scope.var('Label').get_tensor() - label_array = np.array([0, 1, 4, 5]) + label_array = np.array([[0], [1], [4], [3]]) label.set(label_array, place) cost = scope.var('Cost').get_tensor() @@ -122,7 +161,7 @@ class TestListenAndServOp(unittest.TestCase): sample_l.set(sample_l_w, place) sample_la = scope.var('SampleLabels').get_tensor() - sample_la_w = np.zeros((4, 3)).astype("float32") + sample_la_w = np.zeros((4, 3)).astype("int") sample_la.set(sample_la_w, place) emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)] @@ -139,11 +178,12 @@ class TestListenAndServOp(unittest.TestCase): Cost='Cost', SampleLogits='SampleLogits', SampleLabels='SampleLabels', + SampleWeight='SampleWeight', num_total_classes=5, num_neg_samples=2, custom_neg_classes=list(range(2)), sampler=0, - seed=1, + seed=0, is_sparse=True, remote_prefetch=True, epmap=emaps, @@ -153,9 +193,21 @@ class TestListenAndServOp(unittest.TestCase): nce_op.run(scope, place) # get and compare result - o_cost = np.array(cost_w) - o_logits = np.array(sample_l) - o_labels = np.array(sample_la) + o_cost = np.array(scope.var('Cost').get_tensor()) + o_logits = np.array(scope.var('SampleLogits').get_tensor()) + o_labels = np.array(scope.var('SampleLabels').get_tensor()) + + param_array = np.ones((5, 8)).astype("float32") + for i in range(2): + param_array[i] *= param_array[i] * i + 0 * 10 + 1 + for i in range(2, 5): + param_array[i] *= param_array[i] * i + 1 * 10 + 1 + out = nce(x_array, param_array, bias_array, sample_weight, + label_array, 5, 2) + + self.assertAlmostEqual(o_cost.all(), out[0].all(), delta=1e-6) + self.assertAlmostEqual(o_logits.all(), out[1].all(), delta=1e-6) + self.assertAlmostEqual(o_labels.all(), out[2].all(), delta=1e-6) def test_nce_op_remote(self): os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" From b2f789c66dc847d9fbc030a2db218be670e7752f Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Tue, 18 Dec 2018 12:47:58 +0000 Subject: [PATCH 046/197] add test transpiler dist test, test=develop --- .../tests/unittests/test_dist_transpiler.py | 43 +++++++++++++++---- .../fluid/transpiler/distribute_transpiler.py | 2 +- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 27575897b5..f572d69277 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -879,29 +879,36 @@ class TestRemoteNce(TestDistLookupTableBase): class TestRemoteHsigmoid(TestDistLookupTableBase): def network_with_table(self, is_sparse, is_distributed): - num_total_classes = 10 + num_total_classes = 3 - input = fluid.layers.data(name="input", shape=[10], dtype="float32") + input = fluid.layers.data(name="input", shape=[1], dtype="float32") label = fluid.layers.data(name="label", shape=[1], dtype="int64") path_table = fluid.layers.data( - name='path_table', shape=[10], dtype='int64') + name='path_table', shape=[3], dtype='int64') path_code = fluid.layers.data( - name='path_code', shape=[10], dtype='int64') + name='path_code', shape=[3], dtype='int64') w_param = fluid.default_main_program().global_block().create_parameter( shape=[num_total_classes, 10], dtype='float32', name='hs_w', initializer=fluid.initializer.ConstantInitializer()) b_param = 
fluid.default_main_program().global_block().create_parameter( - shape=[num_total_classes, 1], + shape=[3, 1], dtype='float32', name='hs_b', initializer=fluid.initializer.ConstantInitializer()) - cost = fluid.layers.hsigmoid( + emb = fluid.layers.embedding( input=input, + is_sparse=is_sparse, + size=[3, 3], + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal( + scale=1 / math.sqrt(num_total_classes)))) + + cost = fluid.layers.hsigmoid( + input=emb, label=label, - num_classes=non_leaf_num, + num_classes=num_total_classes, path_table=path_table, path_code=path_code, is_custom=True, @@ -918,9 +925,29 @@ class TestRemoteHsigmoid(TestDistLookupTableBase): def transpiler_test_impl(self): trainer, _ = self.get_trainer() + params_to_check = list() for op in trainer.blocks[0].ops: - if op.type == "recv": + if op.type == "hierarchical_sigmoid": + params_to_check = [op.input("W")[0], op.input("Bias")[0]] + for name in ["epmap", "table_names", "epmap"]: + assert op.has_attr(name) + if name == "epmap": + assert op.attr(name)[0] == u'127.0.0.1:6174' + elif name == "table_names": + assert op.attr(name)[0] == u'hierarchical_sigmoid_0.w_0' + else: + assert op.attr(name) == 3 + elif op.type == "lookup_table": + params_to_check.append(op.input("W")[0]) + else: pass + op_count = 0 + for op in trainer.blocks[0].ops: + if op.type == "recv": + assert len(op.output("Out")) == 1 + assert op.output("Out")[0] == u'hierarchical_sigmoid_0.b_0' + op_count += 1 + assert op_count == 1 if __name__ == "__main__": diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 378654ab5b..f5ca3dffb7 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -242,7 +242,7 @@ class DistributeTranspiler(object): def _get_all_remote_sparse_update_op(self, main_program): sparse_update_ops = [] - sparse_update_op_types = ["lookup_table", "nce"] + sparse_update_op_types = ["lookup_table", "nce", "hierarchical_sigmoid"] for op in main_program.global_block().ops: if op.type in sparse_update_op_types and op.attr( 'remote_prefetch') is True: From 19a8d965858173789376248b076fc0339422d313 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Tue, 18 Dec 2018 13:18:11 +0000 Subject: [PATCH 047/197] fix nce in test_dist_transpiler, test=develop --- python/paddle/fluid/tests/unittests/test_dist_transpiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 73795a2154..0555db4cba 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -871,8 +871,8 @@ class TestRemoteNce(TestDistLookupTableBase): def transpiler_test_impl(self): trainer, _ = self.get_trainer() - out_vars = ["nce_w.block0", "nce_w.block1"] - in_vars = ["nce_b.block0", "nce_b.block1"] + out_vars = ["nce_w"] + in_vars = ["nce_b"] recv_var_names = [] From f7fb937bfe64a1017f0b4c87706e6655764c775d Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 18 Dec 2018 21:29:47 +0800 Subject: [PATCH 048/197] fix in cmake, test=develop --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6d6fe245d8..950029ed94 100644 --- 
a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -21,6 +21,8 @@ if(NOT WITH_DISTRIBUTE)
     LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow)
     LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge)
     LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification)
+    LIST(REMOVE_ITEM TEST_OPS test_nce_remote_table_op)
+    LIST(REMOVE_ITEM TEST_OPS test_hsigmoid_remote_table_op)
 endif(NOT WITH_DISTRIBUTE)

 if (NOT ${WITH_GPU})
@@ -32,7 +34,6 @@ endif()
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184
 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185
-list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778
 list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152
 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957

From 5ec9b377983417e6a29f43b18bf5c830f6ca8a81 Mon Sep 17 00:00:00 2001
From: JiabinYang
Date: Wed, 19 Dec 2018 04:48:45 +0000
Subject: [PATCH 049/197] test=develop, fix compile error under gpu mode

---
 paddle/fluid/operators/lookup_table_op.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu
index 6a0d6bad51..fd15539f7b 100644
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
@@ -92,7 +92,8 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
       // server
 #ifdef PADDLE_WITH_DISTRIBUTE
       operators::distributed::prefetch(id_name, out_name, table_names, epmap,
-                                       height_sections, context);
+                                       height_sections, context,
+                                       context.scope());
 #else
       PADDLE_THROW(
           "paddle is not compiled with distribute support, can not do "

From 4877f5d71f73b49f94b3a775cb0b967ae15e5277 Mon Sep 17 00:00:00 2001
From: JiabinYang
Date: Wed, 19 Dec 2018 04:58:52 +0000
Subject: [PATCH 050/197] test=develop, fix compile error under gpu mode

---
 .../operators/distributed/parameter_prefetch.h | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h
index 882c6bd9b8..89671bd741 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.h
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.h
@@ -47,10 +47,26 @@ void prefetch_with_reconstruct(const std::string& id_name,
   auto* out_value = out.data<T>();
   size_t original_width = original->numel() / original->dims()[0];

+  bool is_on_cpu_place = true;
+  if (!platform::is_cpu_place(ids.place())) {
+    is_on_cpu_place = false;
+  }
+
   for (int64_t i = 0; i < ids.numel(); i++) {
     const T* out_rows = out_value + original_width * i;
     T* original_row =
         original_value + original_width * ids.data<int64_t>()[i];
-    std::memcpy(original_row, out_rows, original_width * sizeof(T));
+    if (is_on_cpu_place) {
+      std::memcpy(original_row, out_rows, original_width * sizeof(T));
+    } else {
+#ifndef PADDLE_WITH_CUDA
+      PADDLE_THROW("paddle is not compiled with CUDA!");
+#else
+      auto stream =
+          static_cast<platform::CUDADeviceContext*>(actual_ctx)->stream();
+      memory::Copy(boost::get<platform::CUDAPlace>(ids.place()), original_row,
+                   cpu_place, out_rows, original_width * sizeof(T), stream);
+#endif
+ } } } From 8d88c5a87d2e2485b7a7f8714e874f9c69c0620a Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 19 Dec 2018 14:48:03 +0800 Subject: [PATCH 051/197] Shameless copy --- paddle/fluid/imperative/layer.cc | 6 +- paddle/fluid/imperative/tracer.h | 25 ++++-- paddle/fluid/operators/mul_op.cc | 3 +- paddle/fluid/pybind/imperative.cc | 5 +- python/paddle/fluid/backward.py | 7 +- python/paddle/fluid/framework.py | 3 + python/paddle/fluid/imperative/base.py | 3 +- python/paddle/fluid/imperative/layers.py | 11 ++- python/paddle/fluid/layers/nn.py | 45 +++++++++++ .../fluid/tests/unittests/test_imperative.py | 79 ++++++++++++++++++- 10 files changed, 166 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 6125037680..342cb68ab2 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -188,11 +188,13 @@ std::vector OpBase::ApplyGrad(framework::Scope* scope) { std::vector ret; for (size_t i = 0; i < input_vars_->size(); ++i) { bool found = false; + VarBase* origin_var = (*input_vars_)[i]; for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { Variable* var = scope->FindVar(outvar); - VarBase* origin_var = (*input_vars_)[i]; std::string orig_var = grad_to_var_->at(outvar); - PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var); + if (origin_var->var_desc_->Name() != orig_var) { + continue; + } VLOG(3) << "apply grad " << outvar << " with origin " << orig_var; origin_var->ApplyGrad(scope, var); found = true; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 433d07c0e5..97772dc110 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -43,9 +43,12 @@ void CreateGradOp(const framework::OpDesc& op_desc, class Tracer { public: - explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { + explicit Tracer(framework::BlockDesc* root_block, + framework::BlockDesc* startup_block) + : root_block_(root_block), startup_block_(startup_block) { root_scope_ = new framework::Scope(); scopes_[root_block_] = root_scope_; + scopes_[startup_block_] = root_scope_; } virtual ~Tracer() { delete root_scope_; } @@ -80,6 +83,8 @@ class Tracer { } else { op->pre_ops_->push_back(nullptr); } + VLOG(3) << "input vname " << vname << " " + << var->Get().dims().size(); } *op->output_vars_ = outputs; @@ -98,12 +103,19 @@ class Tracer { outputs[i]->pre_op_ = op; outputs[i]->pre_op_out_idx_ = i; } + + VLOG(3) << "tracer running " << op_desc->Type(); op_base->Run(*scope, platform::CPUPlace()); - framework::OpDesc* grad_op_desc; - auto grad_to_var = new std::unordered_map(); - CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); - op->grad_op_desc_ = grad_op_desc; - op->grad_to_var_ = grad_to_var; + if (block == startup_block_) { + op->grad_op_desc_ = nullptr; + op->grad_to_var_ = nullptr; + } else { + framework::OpDesc* grad_op_desc; + auto grad_to_var = new std::unordered_map(); + CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); + op->grad_op_desc_ = grad_op_desc; + op->grad_to_var_ = grad_to_var; + } op->block_ = block; } @@ -121,6 +133,7 @@ class Tracer { private: std::map scopes_; framework::BlockDesc* root_block_; + framework::BlockDesc* startup_block_; framework::Scope* root_scope_; }; diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 8a111e6065..271428408c 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -49,7 +49,8 @@ 
class MulOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT( y_dims.size(), y_num_col_dims, "The input tensor Y's rank of MulOp should be larger than " - "y_num_col_dims."); + "y_num_col_dims: %ld vs %ld", + y_dims.size(), y_num_col_dims); auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims); auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 34e9c897d9..be63fb8778 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -24,8 +24,9 @@ namespace pybind { void BindTracer(pybind11::module *m) { pybind11::class_(*m, "Tracer", "") .def("__init__", - [](imperative::Tracer &self, framework::BlockDesc *root_block) { - new (&self) imperative::Tracer(root_block); + [](imperative::Tracer &self, framework::BlockDesc *root_block, + framework::BlockDesc *startup_block) { + new (&self) imperative::Tracer(root_block, startup_block); }) .def("trace", &imperative::Tracer::Trace) .def("get_scope", &imperative::Tracer::GetScope, diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 17fe8dc3c8..34f2f2c2da 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -564,8 +564,11 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, grad_to_var = dict() op_desc = _create_op_desc_( - "fill_constant", {}, {"Out": [_append_grad_suffix_(loss.name)]}, { - "shape": [1], + "fill_constant", + {}, + {"Out": [_append_grad_suffix_(loss.name)]}, + { + "shape": [1], # TODO(panyx0718): This can be loss.shape. "value": 1.0, "dtype": loss.dtype, "force_cpu": False, diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 0897920594..10d441cf3e 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1316,6 +1316,9 @@ class Block(object): def _prepend_op(self, *args, **kwargs): op_desc = self.desc._prepend_op() op = Operator(self, op_desc, *args, **kwargs) + if _in_imperative_mode(): + _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], + [v._ivar for v in op.outputs], self.desc) self.ops.insert(0, op) return op diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py index 15d38ddb56..aa48ef71aa 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/imperative/base.py @@ -28,7 +28,8 @@ def enabled(): def guard(): train = framework.Program() startup = framework.Program() - tracer = core.Tracer(train.current_block().desc) + tracer = core.Tracer(train.current_block().desc, + startup.current_block().desc) with framework.program_guard(train, startup): with framework.unique_name.guard(): with framework._imperative_guard(tracer): diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 1a28f7f4ae..044717c319 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -25,11 +25,9 @@ __all__ = ['PyLayer'] class PyLayer(core.Layer): def __init__(self): - pass + self._built = False def __call__(self, inputs): - # TODO(panyx0718): Support declarative mode as well. 
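Note on the imperative entry point: guard() above now constructs the Tracer over both the train block and the startup block, sharing one root scope, so every op appended under it runs eagerly. A minimal driver, sketched from the pieces in this patch series (to_variable, _numpy and _backward as they are defined here, not a stable public API):

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.imperative.base import to_variable

    with fluid.imperative.guard():
        # each appended op is traced and executed immediately on CPU
        x = to_variable(np.array([1.0, 2.0, -1.0], dtype=np.float32))
        y = fluid.layers.relu(x)
        print(y._numpy())   # fetch the eager result, no Executor needed
        y._backward()       # replay the recorded grad ops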
- assert base.enabled() if not isinstance(inputs, list) and not isinstance(inputs, tuple): inputs = [inputs] @@ -37,8 +35,15 @@ class PyLayer(core.Layer): for x in inputs: py_var = base.to_variable(x) var_inputs.append(py_var) + if not self._built: + self._build_once(inputs) + self._built = True + outputs = self.forward(var_inputs) return outputs + def _build_once(self, inputs): + pass + def forward(self, inputs): return [] diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4d8311a0d3..d8bc919784 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -29,6 +29,7 @@ from . import utils from .. import unique_name from functools import reduce from .. import core +from ..imperative import layers __all__ = [ 'fc', @@ -9426,3 +9427,47 @@ def huber_loss(input, label, delta): 'Residual': residual}, attrs={'delta': delta}) return out + + +class FC(layers.PyLayer): + def __init__(self, + size, + param_attr=None, + num_flatten_dims=1, + dtype=core.VarDesc.VarType.FP32): + super(FC, self).__init__() + self._size = size + self._num_flatten_dims = num_flatten_dims + self._dtype = dtype + self._helper = LayerHelper('FC', param_attr=param_attr) + + def _build_once(self, inputs): + input_shape = inputs[0].shape + param_shape = [ + reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1) + ] + [self._size] + self._w = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=False) + + def forward(self, inputs): + tmp = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="mul", + inputs={"X": inputs[0], + "Y": self._w}, + outputs={"Out": tmp}, + attrs={ + "x_num_col_dims": self._num_flatten_dims, + "y_num_col_dims": 1 + }) + + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="sum", + inputs={"X": [tmp]}, + outputs={"Out": out}, + attrs={"use_mkldnn": False}) + return out diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index b5b6305155..0fe69d1bd4 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -12,12 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
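The _built/_build_once flow added to PyLayer above, and the FC layer built on it, defer parameter creation until the first call, when the input shape is finally known (FC sizes its weight from input_shape[num_flatten_dims:]). The idiom itself, stripped of framework details (a hypothetical class, for illustration only):

    class LazyBuiltLayer(object):
        # framework-free sketch of PyLayer's build-once idiom
        def __init__(self):
            self._built = False

        def __call__(self, inputs):
            if not self._built:
                self._build_once(inputs)  # size parameters from first input
                self._built = True
            return self.forward(inputs)

        def _build_once(self, inputs):
            pass  # subclasses create shape-dependent parameters here

        def forward(self, inputs):
            raise NotImplementedError

FC then implements forward as a mul op (input times W, flattened at num_flatten_dims) followed by a sum op, so the layer stays a thin recorder of ops on its LayerHelper.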
+import contextlib import unittest -import sys import numpy as np import paddle.fluid as fluid from paddle.fluid import core +from paddle.fluid.layers.nn import FC + + +@contextlib.contextmanager +def new_program_scope(): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield class MyLayer(fluid.imperative.PyLayer): @@ -30,6 +41,23 @@ class MyLayer(fluid.imperative.PyLayer): return [fluid.layers.elementwise_mul(x, x)] +class MLP(fluid.imperative.PyLayer): + def __init__(self): + super(MLP, self).__init__() + self._fc1 = FC(3, + fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1))) + self._fc2 = FC(4, + fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1))) + + def forward(self, inputs): + x = self._fc1(inputs[0]) + x = self._fc2(x) + x = fluid.layers.reduce_sum(x) + return x + + class TestImperative(unittest.TestCase): def test_layer(self): with fluid.imperative.guard(): @@ -39,13 +67,56 @@ class TestImperative(unittest.TestCase): l.forward([]) def test_layer_in_out(self): + np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) with fluid.imperative.guard(): l = MyLayer() - x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0] + x = l(np_inp)[0] self.assertIsNotNone(x) - sys.stderr.write("%s output: %s\n" % (x, x._numpy())) + dy_out = x._numpy() x._backward() - sys.stderr.write("grad %s\n" % l._x_for_debug._gradient()) + dy_grad = l._x_for_debug._gradient() + + with new_program_scope(): + inp = fluid.layers.data( + name="inp", shape=[3], append_batch_size=False) + l = MyLayer() + x = l(inp)[0] + param_grads = fluid.backward.append_backward( + x, parameter_list=[l._x_for_debug.name])[0] + exe = fluid.Executor(fluid.CPUPlace()) + + static_out, static_grad = exe.run( + feed={inp.name: np_inp}, + fetch_list=[x.name, param_grads[1].name]) + + self.assertTrue(np.allclose(dy_out, static_out)) + self.assertTrue(np.allclose(dy_grad, static_grad)) + + def test_mlp(self): + np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + with fluid.imperative.guard(): + mlp = MLP() + out = mlp(np_inp) + dy_out = out._numpy() + out._backward() + dy_grad = mlp._fc1._w._gradient() + + with new_program_scope(): + inp = fluid.layers.data( + name="inp", shape=[2, 2], append_batch_size=False) + mlp = MLP() + out = mlp(inp) + param_grads = fluid.backward.append_backward( + out, parameter_list=[mlp._fc1._w.name])[0] + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + + static_out, static_grad = exe.run( + feed={inp.name: np_inp}, + fetch_list=[out.name, param_grads[1].name]) + + self.assertTrue(np.allclose(dy_out, static_out)) + self.assertTrue(np.allclose(dy_grad, static_grad)) if __name__ == '__main__': From bfcb5e52350bd63d9ea8b3505ae7914bdd4ee9b4 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Wed, 19 Dec 2018 13:38:58 +0000 Subject: [PATCH 052/197] test=develop, fix gpu compile error on prefetch, and fix hs/nce ut failed on gpu --- .../fluid/operators/distributed/parameter_prefetch.h | 10 +++++++--- .../tests/unittests/test_hsigmoid_remote_table_op.py | 2 -- .../fluid/tests/unittests/test_nce_remote_table_op.py | 2 -- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h index 89671bd741..47d082c4af 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ 
b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -39,6 +39,9 @@ void prefetch_with_reconstruct(const std::string& id_name, const framework::ExecutionContext& context, const framework::Scope& scope, framework::LoDTensor* original) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& actual_ctx = *pool.Get(context.GetPlace()); + prefetch(id_name, out_name, table_names, epmap, height_sections, context, scope); auto& out = scope.FindVar(out_name)->Get(); @@ -62,9 +65,10 @@ void prefetch_with_reconstruct(const std::string& id_name, PADDLE_THROW("paddle is not compiled with CUDA!"); #else auto stream = - static_cast(actual_ctx)->stream(); - memory::Copy(boost::get(ids.place()), out_rows, - cpu_place, original_row, original_width * sizeof(T), stream); + static_cast(&actual_ctx)->stream(); + memory::Copy(boost::get(ids.place()), original_row, + platform::CPUPlace(), out_rows, original_width * sizeof(T), + stream); #endif } } diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py index 9ed6c94bd2..da343dd503 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py @@ -253,8 +253,6 @@ class TestListenAndServOp(unittest.TestCase): port1 = self._get_pserver_port(p1.pid) places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) for place in places: self._run_hsigmoid_op_one_pserver(place, port0) diff --git a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py index b5f93f93a1..cc6f40de86 100644 --- a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py @@ -221,8 +221,6 @@ class TestListenAndServOp(unittest.TestCase): port1 = self._get_pserver_port(p1.pid) places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) for place in places: self._run_nce_op_two_pserver(place, port0, port1) From 3cd10a7c4fd0f8063aac326e5542163d3fb3cae2 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 20 Dec 2018 00:14:03 +0800 Subject: [PATCH 053/197] Add Conv2D forward test=develop --- paddle/fluid/imperative/layer.cc | 3 + paddle/fluid/imperative/layer.h | 3 +- paddle/fluid/imperative/tracer.h | 43 ++---- paddle/fluid/pybind/imperative.cc | 5 +- paddle/fluid/pybind/pybind.cc | 6 + python/paddle/fluid/framework.py | 15 +- python/paddle/fluid/imperative/__init__.py | 4 + python/paddle/fluid/imperative/base.py | 5 +- python/paddle/fluid/imperative/layers.py | 13 +- python/paddle/fluid/initializer.py | 24 ++-- python/paddle/fluid/layer_helper.py | 2 +- python/paddle/fluid/layers/nn.py | 57 +------- .../fluid/tests/unittests/test_imperative.py | 123 ----------------- .../tests/unittests/test_imperative_mnist.py | 129 ++++++++++++++++++ 14 files changed, 198 insertions(+), 234 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/test_imperative.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_mnist.py diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 342cb68ab2..35640ca6dc 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -144,6 +144,9 @@ void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) { std::vector 
OpBase::ApplyGrad(framework::Scope* scope) { VLOG(3) << "op grad " << grad_op_desc_->Type(); + if (!grad_to_var_) { + return {}; + } for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) { if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) { diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 85a71ca83d..faa64ff9ea 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -60,7 +60,8 @@ class OpBase { pre_ops_(new std::vector()), pre_ops_out_idx_(new std::vector()), op_desc_(nullptr), - grad_op_desc_(nullptr) {} + grad_op_desc_(nullptr), + grad_to_var_(nullptr) {} virtual ~OpBase() { delete input_vars_; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 97772dc110..f6dac762fd 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -43,20 +43,14 @@ void CreateGradOp(const framework::OpDesc& op_desc, class Tracer { public: - explicit Tracer(framework::BlockDesc* root_block, - framework::BlockDesc* startup_block) - : root_block_(root_block), startup_block_(startup_block) { - root_scope_ = new framework::Scope(); - scopes_[root_block_] = root_scope_; - scopes_[startup_block_] = root_scope_; - } + explicit Tracer(framework::BlockDesc* root_block) + : root_scope_(new framework::Scope()) {} - virtual ~Tracer() { delete root_scope_; } + virtual ~Tracer() {} void Trace(OpBase* op, const std::vector& inputs, - const std::vector& outputs, - framework::BlockDesc* block) { - framework::Scope* scope = GetScope(block); + const std::vector& outputs, framework::BlockDesc* block, + const bool stop_gradient) { framework::OpDesc* op_desc = op->op_desc_; VLOG(3) << "tracer tracing " << op_desc->Type(); op_desc->InferShape(*block); @@ -67,7 +61,7 @@ class Tracer { *op->input_vars_ = inputs; for (VarBase* input : inputs) { const std::string vname = input->var_desc_->Name(); - framework::Variable* var = scope->Var(vname); + framework::Variable* var = root_scope_->Var(vname); input->var_ = var; if (!var->IsInitialized()) { framework::VarDesc* var_desc = block->FindVar(vname); @@ -90,7 +84,7 @@ class Tracer { *op->output_vars_ = outputs; for (size_t i = 0; i < outputs.size(); ++i) { const std::string vname = outputs[i]->var_desc_->Name(); - framework::Variable* var = scope->Var(vname); + framework::Variable* var = root_scope_->Var(vname); if (!var->IsInitialized()) { framework::VarDesc* var_desc = block->FindVar(vname); if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { @@ -105,11 +99,8 @@ class Tracer { } VLOG(3) << "tracer running " << op_desc->Type(); - op_base->Run(*scope, platform::CPUPlace()); - if (block == startup_block_) { - op->grad_op_desc_ = nullptr; - op->grad_to_var_ = nullptr; - } else { + op_base->Run(*root_scope_, platform::CPUPlace()); + if (!stop_gradient) { framework::OpDesc* grad_op_desc; auto grad_to_var = new std::unordered_map(); CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); @@ -119,22 +110,10 @@ class Tracer { op->block_ = block; } - framework::Scope* GetScope(framework::BlockDesc* block) { - if (scopes_.find(block) != scopes_.end()) { - return scopes_.at(block); - } - framework::BlockDesc* parent_block = block->ParentBlock(); - PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end()); - framework::Scope* scope = &scopes_[parent_block]->NewScope(); - scopes_[block] = scope; - return scope; - } + framework::Scope* GetScope() { return root_scope_.get(); } private: - std::map scopes_; - 
framework::BlockDesc* root_block_; - framework::BlockDesc* startup_block_; - framework::Scope* root_scope_; + std::unique_ptr root_scope_; }; } // namespace imperative diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index be63fb8778..34e9c897d9 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -24,9 +24,8 @@ namespace pybind { void BindTracer(pybind11::module *m) { pybind11::class_(*m, "Tracer", "") .def("__init__", - [](imperative::Tracer &self, framework::BlockDesc *root_block, - framework::BlockDesc *startup_block) { - new (&self) imperative::Tracer(root_block, startup_block); + [](imperative::Tracer &self, framework::BlockDesc *root_block) { + new (&self) imperative::Tracer(root_block); }) .def("trace", &imperative::Tracer::Trace) .def("get_scope", &imperative::Tracer::GetScope, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 737ae2dd9c..db6c88e01c 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -117,6 +117,12 @@ PYBIND11_MODULE(core, m) { self.RunBackward(scope); }) .def("_grad", &imperative::VarBase::Grad) + .def_property("value", + [](const imperative::VarBase &self) { return self.var_; }, + [](imperative::VarBase &self, framework::Variable *var) { + self.var_ = var; + }, + py::return_value_policy::reference) .def_property( "desc", [](const imperative::VarBase &self) { return self.var_desc_; }, diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 10d441cf3e..bcf5bc3498 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -361,7 +361,7 @@ class Variable(object): self._ivar.desc = self.desc def _numpy(self): - scope = _imperative_tracer().get_scope(self.block.desc) + scope = _imperative_tracer().get_scope() tensor = core.get_variable_tensor(scope, self.desc.name()) return np.array(tensor) @@ -573,7 +573,8 @@ class Operator(object): type=None, inputs=None, outputs=None, - attrs=None): + attrs=None, + stop_gradient=False): self.block = block self.desc = desc # note: not add self.attrs here: @@ -1264,9 +1265,12 @@ class Block(object): """ op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) + print("append_op", kwargs.get("type"), kwargs.get("stop_gradient", + False)) if _in_imperative_mode(): _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], - [v._ivar for v in op.outputs], self.desc) + [v._ivar for v in op.outputs], self.desc, + kwargs.get("stop_gradient", False)) self.ops.append(op) return op @@ -1316,9 +1320,12 @@ class Block(object): def _prepend_op(self, *args, **kwargs): op_desc = self.desc._prepend_op() op = Operator(self, op_desc, *args, **kwargs) + print("prepend_op", kwargs.get("type"), kwargs.get("stop_gradient", + False)) if _in_imperative_mode(): _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], - [v._ivar for v in op.outputs], self.desc) + [v._ivar for v in op.outputs], self.desc, + kwargs.get("stop_gradient", False)) self.ops.insert(0, op) return op diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py index 922308b6b1..54dc794ea6 100644 --- a/python/paddle/fluid/imperative/__init__.py +++ b/python/paddle/fluid/imperative/__init__.py @@ -20,6 +20,10 @@ from .base import * from . import layers from .layers import * +from . 
import nn +from .nn import * + __all__ = [] __all__ += layers.__all__ __all__ += base.__all__ +__all__ += nn.__all__ diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py index aa48ef71aa..a33ad4b72c 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/imperative/base.py @@ -28,8 +28,7 @@ def enabled(): def guard(): train = framework.Program() startup = framework.Program() - tracer = core.Tracer(train.current_block().desc, - startup.current_block().desc) + tracer = core.Tracer(train.current_block().desc) with framework.program_guard(train, startup): with framework.unique_name.guard(): with framework._imperative_guard(tracer): @@ -46,7 +45,7 @@ def to_variable(value, block=None): name=None, shape=value.shape, dtype=value.dtype) - scope = framework._imperative_tracer().get_scope(block.desc) + scope = framework._imperative_tracer().get_scope() var = scope.var(py_var.name) tensor = var.get_tensor() tensor.set(value, core.CPUPlace()) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 044717c319..305e083644 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -24,8 +24,10 @@ __all__ = ['PyLayer'] class PyLayer(core.Layer): - def __init__(self): - self._built = False + def __init__(self, *args, **kwargs): + from ..layer_helper import LayerHelper + self._helper = LayerHelper(type(self).__name__, **kwargs) + self._dtype = kwargs.get("dtype", core.VarDesc.VarType.FP32) def __call__(self, inputs): if not isinstance(inputs, list) and not isinstance(inputs, tuple): @@ -35,15 +37,10 @@ class PyLayer(core.Layer): for x in inputs: py_var = base.to_variable(x) var_inputs.append(py_var) - if not self._built: - self._build_once(inputs) - self._built = True outputs = self.forward(var_inputs) - return outputs - def _build_once(self, inputs): - pass + return outputs def forward(self, inputs): return [] diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index b37ebbe517..7acaed2250 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -161,7 +161,8 @@ class ConstantInitializer(Initializer): "dtype": int(var.dtype), "value": float(self._value), 'force_cpu': self._force_cpu or force_init_on_cpu() - }) + }, + stop_gradient=True) var.op = op return op @@ -216,7 +217,8 @@ class UniformInitializer(Initializer): "min": self._low, "max": self._high, "seed": self._seed - }) + }, + stop_gradient=True) var.op = op return op @@ -271,7 +273,8 @@ class NormalInitializer(Initializer): "std": self._std_dev, "seed": self._seed, "use_mkldnn": False - }) + }, + stop_gradient=True) var.op = op return op @@ -325,7 +328,8 @@ class TruncatedNormalInitializer(Initializer): "mean": self._mean, "std": self._std_dev, "seed": self._seed - }) + }, + stop_gradient=True) var.op = op return op @@ -415,7 +419,8 @@ class XavierInitializer(Initializer): "min": -limit, "max": limit, "seed": self._seed - }) + }, + stop_gradient=True) else: std = np.sqrt(2.0 / float(fan_in + fan_out)) @@ -428,7 +433,8 @@ class XavierInitializer(Initializer): "mean": 0.0, "std": std, "seed": self._seed - }) + }, + stop_gradient=True) var.op = op return op @@ -513,7 +519,8 @@ class MSRAInitializer(Initializer): "min": -limit, "max": limit, "seed": self._seed - }) + }, + stop_gradient=True) else: std = np.sqrt(2.0 / float(fan_in)) @@ -526,7 +533,8 @@ class MSRAInitializer(Initializer): "mean": 0.0, "std": std, "seed": self._seed - }) 
+ }, + stop_gradient=True) var.op = op return op diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 74b4a977db..eba5417723 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -22,8 +22,8 @@ import numpy as np from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating from . import unique_name +from paddle.fluid.imperative.base import to_variable from paddle.fluid.initializer import Constant, Xavier -from paddle.fluid.imperative import base from .param_attr import ParamAttr, WeightNormParamAttr from . import core from six.moves import zip diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d8bc919784..793509252d 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -29,7 +29,6 @@ from . import utils from .. import unique_name from functools import reduce from .. import core -from ..imperative import layers __all__ = [ 'fc', @@ -2537,12 +2536,12 @@ def adaptive_pool2d(input, Examples: .. code-block:: python - # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n], + # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n], # output shape is [N, C, m, n], adaptive pool divide H and W dimentions - # of input data into m * n grids averagely and performs poolings in each + # of input data into m * n grids averagely and performs poolings in each # grid to get output. # adaptive average pool performs calculations as follow: - # + # # for i in range(m): # for j in range(n): # hstart = floor(i * H / m) @@ -2636,10 +2635,10 @@ def adaptive_pool3d(input, # suppose input data in shape of [N, C, D, H, W], `pool_size` is [l, m, n], # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimentions - # of input data into l * m * n grids averagely and performs poolings in each + # of input data into l * m * n grids averagely and performs poolings in each # grid to get output. 
# adaptive average pool performs calculations as follow: - # + # # for i in range(l): # for j in range(m): # for k in range(n): @@ -2649,7 +2648,7 @@ def adaptive_pool3d(input, # hend = ceil((j + 1) * H / m) # wstart = floor(k * W / n) # wend = ceil((k + 1) * W / n) - # output[:, :, i, j, k] = + # output[:, :, i, j, k] = # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) # data = fluid.layers.data( @@ -9427,47 +9426,3 @@ def huber_loss(input, label, delta): 'Residual': residual}, attrs={'delta': delta}) return out - - -class FC(layers.PyLayer): - def __init__(self, - size, - param_attr=None, - num_flatten_dims=1, - dtype=core.VarDesc.VarType.FP32): - super(FC, self).__init__() - self._size = size - self._num_flatten_dims = num_flatten_dims - self._dtype = dtype - self._helper = LayerHelper('FC', param_attr=param_attr) - - def _build_once(self, inputs): - input_shape = inputs[0].shape - param_shape = [ - reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1) - ] + [self._size] - self._w = self._helper.create_parameter( - attr=self._helper.param_attr, - shape=param_shape, - dtype=self._dtype, - is_bias=False) - - def forward(self, inputs): - tmp = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type="mul", - inputs={"X": inputs[0], - "Y": self._w}, - outputs={"Out": tmp}, - attrs={ - "x_num_col_dims": self._num_flatten_dims, - "y_num_col_dims": 1 - }) - - out = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type="sum", - inputs={"X": [tmp]}, - outputs={"Out": out}, - attrs={"use_mkldnn": False}) - return out diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py deleted file mode 100644 index 0fe69d1bd4..0000000000 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
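The adaptive pooling docstrings above describe the grid arithmetic in pseudo-code: output cell (i, j) averages the input window [floor(i*H/m), ceil((i+1)*H/m)) by [floor(j*W/n), ceil((j+1)*W/n)). The same computation in plain NumPy, as a reference sketch rather than the operator's actual kernel:

    import numpy as np

    def adaptive_avg_pool2d(x, m, n):
        # NumPy rendering of the docstring's floor/ceil grid math
        N, C, H, W = x.shape
        out = np.zeros((N, C, m, n), dtype=x.dtype)
        for i in range(m):
            hstart, hend = (i * H) // m, -((-(i + 1) * H) // m)
            for j in range(n):
                wstart, wend = (j * W) // n, -((-(j + 1) * W) // n)
                out[:, :, i, j] = x[:, :, hstart:hend,
                                    wstart:wend].mean(axis=(2, 3))
        return out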
- -import contextlib -import unittest -import numpy as np - -import paddle.fluid as fluid -from paddle.fluid import core -from paddle.fluid.layers.nn import FC - - -@contextlib.contextmanager -def new_program_scope(): - prog = fluid.Program() - startup_prog = fluid.Program() - scope = fluid.core.Scope() - with fluid.scope_guard(scope): - with fluid.program_guard(prog, startup_prog): - yield - - -class MyLayer(fluid.imperative.PyLayer): - def __init__(self): - super(MyLayer, self).__init__() - - def forward(self, inputs): - x = fluid.layers.relu(inputs[0]) - self._x_for_debug = x - return [fluid.layers.elementwise_mul(x, x)] - - -class MLP(fluid.imperative.PyLayer): - def __init__(self): - super(MLP, self).__init__() - self._fc1 = FC(3, - fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1))) - self._fc2 = FC(4, - fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1))) - - def forward(self, inputs): - x = self._fc1(inputs[0]) - x = self._fc2(x) - x = fluid.layers.reduce_sum(x) - return x - - -class TestImperative(unittest.TestCase): - def test_layer(self): - with fluid.imperative.guard(): - cl = core.Layer() - cl.forward([]) - l = fluid.imperative.PyLayer() - l.forward([]) - - def test_layer_in_out(self): - np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) - with fluid.imperative.guard(): - l = MyLayer() - x = l(np_inp)[0] - self.assertIsNotNone(x) - dy_out = x._numpy() - x._backward() - dy_grad = l._x_for_debug._gradient() - - with new_program_scope(): - inp = fluid.layers.data( - name="inp", shape=[3], append_batch_size=False) - l = MyLayer() - x = l(inp)[0] - param_grads = fluid.backward.append_backward( - x, parameter_list=[l._x_for_debug.name])[0] - exe = fluid.Executor(fluid.CPUPlace()) - - static_out, static_grad = exe.run( - feed={inp.name: np_inp}, - fetch_list=[x.name, param_grads[1].name]) - - self.assertTrue(np.allclose(dy_out, static_out)) - self.assertTrue(np.allclose(dy_grad, static_grad)) - - def test_mlp(self): - np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - with fluid.imperative.guard(): - mlp = MLP() - out = mlp(np_inp) - dy_out = out._numpy() - out._backward() - dy_grad = mlp._fc1._w._gradient() - - with new_program_scope(): - inp = fluid.layers.data( - name="inp", shape=[2, 2], append_batch_size=False) - mlp = MLP() - out = mlp(inp) - param_grads = fluid.backward.append_backward( - out, parameter_list=[mlp._fc1._w.name])[0] - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(fluid.default_startup_program()) - - static_out, static_grad = exe.run( - feed={inp.name: np_inp}, - fetch_list=[out.name, param_grads[1].name]) - - self.assertTrue(np.allclose(dy_out, static_out)) - self.assertTrue(np.allclose(dy_grad, static_grad)) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py new file mode 100644 index 0000000000..999d5d1450 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py @@ -0,0 +1,129 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import unittest +import numpy as np + +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.imperative.nn import Conv2D + + +@contextlib.contextmanager +def new_program_scope(): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield + + +class MNIST(fluid.imperative.PyLayer): + def __init__(self): + super(MNIST, self).__init__() + + groups = 1 + dilation = [1, 1] + pad = [0, 0] + stride = [1, 1] + input_size = [2, 3, 5, 5] # NCHW + assert np.mod(input_size[1], groups) == 0 + f_c = input_size[1] // groups + filter_size = [6, f_c, 3, 3] + + self._conv2d = Conv2D( + num_channels=3, + num_filters=20, + filter_size=3, + stride=stride, + padding=pad, + dilation=dilation, + groups=groups, + use_cudnn=False) + + def forward(self, inputs): + x = self._conv2d(inputs) + return x + + +class TestImperativeMnist(unittest.TestCase): + # def test_layer(self): + # with fluid.imperative.guard(): + # cl = core.Layer() + # cl.forward([]) + # l = fluid.imperative.PyLayer() + # l.forward([]) + + # def test_layer_in_out(self): + # np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) + # with fluid.imperative.guard(): + # l = MyLayer() + # x = l(np_inp)[0] + # self.assertIsNotNone(x) + # dy_out = x._numpy() + # x._backward() + # dy_grad = l._x_for_debug._gradient() + + # with new_program_scope(): + # inp = fluid.layers.data( + # name="inp", shape=[3], append_batch_size=False) + # l = MyLayer() + # x = l(inp)[0] + # param_grads = fluid.backward.append_backward( + # x, parameter_list=[l._x_for_debug.name])[0] + # exe = fluid.Executor(fluid.CPUPlace()) + + # static_out, static_grad = exe.run( + # feed={inp.name: np_inp}, + # fetch_list=[x.name, param_grads[1].name]) + + # self.assertTrue(np.allclose(dy_out, static_out)) + # self.assertTrue(np.allclose(dy_grad, static_grad)) + + def test_mnist_cpu_float32(self): + with fluid.imperative.guard(): + mnist = MNIST() + + data = np.random.rand(2, 3, 5, 5).astype('float32') + mnist(data) + # np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + # with fluid.imperative.guard(): + # mlp = MLP() + # out = mlp(np_inp) + # dy_out = out._numpy() + # out._backward() + # dy_grad = mlp._fc1._w._gradient() + + # with new_program_scope(): + # inp = fluid.layers.data( + # name="inp", shape=[2, 2], append_batch_size=False) + # mlp = MLP() + # out = mlp(inp) + # param_grads = fluid.backward.append_backward( + # out, parameter_list=[mlp._fc1._w.name])[0] + # exe = fluid.Executor(fluid.CPUPlace()) + # exe.run(fluid.default_startup_program()) + + # static_out, static_grad = exe.run( + # feed={inp.name: np_inp}, + # fetch_list=[out.name, param_grads[1].name]) + + # self.assertTrue(np.allclose(dy_out, static_out)) + # self.assertTrue(np.allclose(dy_grad, static_grad)) + + +if __name__ == '__main__': + unittest.main() From 1bec52f581adec2ddb8038ca1bef78f9e2fc763f Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Thu, 20 Dec 2018 05:50:12 +0000 Subject: [PATCH 054/197] test=develop, fix cpu running error --- 
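At this point the imperative MNIST check above is forward-only (backward arrives with the stop_gradient support two patches later). What it exercises, as a standalone sketch with shapes taken from the test, where MNIST is the PyLayer defined above:

    import numpy as np
    import paddle.fluid as fluid

    with fluid.imperative.guard():
        mnist = MNIST()
        data = np.random.rand(2, 3, 5, 5).astype('float32')
        out = mnist(data)  # numpy input is wrapped by to_variable in __call__
        # conv and pool ops have already run eagerly; no Executor involved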
.../distributed/parameter_prefetch.h | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h index 47d082c4af..2f850a0332 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -39,9 +39,6 @@ void prefetch_with_reconstruct(const std::string& id_name, const framework::ExecutionContext& context, const framework::Scope& scope, framework::LoDTensor* original) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& actual_ctx = *pool.Get(context.GetPlace()); - prefetch(id_name, out_name, table_names, epmap, height_sections, context, scope); auto& out = scope.FindVar(out_name)->Get(); @@ -54,23 +51,30 @@ void prefetch_with_reconstruct(const std::string& id_name, if (!platform::is_cpu_place(ids.place())) { is_on_cpu_place = false; } - - for (int64_t i = 0; i < ids.numel(); i++) { - const T* out_rows = out_value + original_width * i; - T* original_row = original_value + original_width * ids.data()[i]; - if (is_on_cpu_place) { + if (is_on_cpu_place) { + for (int64_t i = 0; i < ids.numel(); i++) { + const T* out_rows = out_value + original_width * i; + T* original_row = + original_value + original_width * ids.data()[i]; std::memcpy(original_row, out_rows, original_width * sizeof(T)); - } else { + } + } else { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("paddle is not compiled with CUDA!"); + PADDLE_THROW("paddle is not compiled with CUDA!"); #else + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& actual_ctx = *pool.Get(context.GetPlace()); + for (int64_t i = 0; i < ids.numel(); i++) { + const T* out_rows = out_value + original_width * i; + T* original_row = + original_value + original_width * ids.data()[i]; auto stream = static_cast(&actual_ctx)->stream(); memory::Copy(boost::get(ids.place()), original_row, platform::CPUPlace(), out_rows, original_width * sizeof(T), stream); -#endif } +#endif } } From fba3712a7b326e54c9137e81a2360488cf36ee7c Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 20 Dec 2018 15:58:24 +0800 Subject: [PATCH 055/197] Add multi-input to forward function in Layer --- python/paddle/fluid/imperative/layers.py | 17 +-- .../tests/unittests/test_imperative_mnist.py | 132 ++++++++++-------- 2 files changed, 75 insertions(+), 74 deletions(-) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 305e083644..5ebc0430cc 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -29,18 +29,9 @@ class PyLayer(core.Layer): self._helper = LayerHelper(type(self).__name__, **kwargs) self._dtype = kwargs.get("dtype", core.VarDesc.VarType.FP32) - def __call__(self, inputs): - if not isinstance(inputs, list) and not isinstance(inputs, tuple): - inputs = [inputs] - - var_inputs = [] - for x in inputs: - py_var = base.to_variable(x) - var_inputs.append(py_var) - - outputs = self.forward(var_inputs) - + def __call__(self, *inputs): + outputs = self.forward(*inputs) return outputs - def forward(self, inputs): - return [] + def forward(self, *inputs): + raise NotImplementedError diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py index 999d5d1450..981e9eb2d6 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py +++ 
b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py @@ -18,81 +18,91 @@ import numpy as np import paddle.fluid as fluid from paddle.fluid import core -from paddle.fluid.imperative.nn import Conv2D +from paddle.fluid.imperative.nn import Conv2D, Pool2D + + +class SimpleImgConvPool(fluid.imperative.PyLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + pool_size, + pool_stride, + pool_padding=0, + pool_type='max', + global_pooling=False, + conv_stride=1, + conv_padding=0, + conv_dilation=1, + conv_groups=1, + act=None, + use_cudnn=False, + param_attr=None, + bias_attr=None): + super(SimpleImgConvPool, self).__init__() + + # groups = 1 + # dilation = [1, 1] + # pad = [0, 0] + # stride = [1, 1] + # input_size = [2, 3, 5, 5] # NCHW + # assert np.mod(input_size[1], groups) == 0 + # f_c = input_size[1] // groups + # filter_size = [6, f_c, 3, 3] + self._conv2d = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=conv_stride, + padding=conv_padding, + dilation=conv_dilation, + groups=conv_groups, + param_attr=None, + bias_attr=None, + use_cudnn=use_cudnn) + + self._pool2d = Pool2D( + pool_size=pool_size, + pool_type=pool_type, + pool_stride=pool_stride, + pool_padding=pool_padding, + global_pooling=global_pooling, + use_cudnn=use_cudnn) -@contextlib.contextmanager -def new_program_scope(): - prog = fluid.Program() - startup_prog = fluid.Program() - scope = fluid.core.Scope() - with fluid.scope_guard(scope): - with fluid.program_guard(prog, startup_prog): - yield + def forward(self, inputs): + x = self._conv2d(inputs) + x = self._pool2d(x) + return x class MNIST(fluid.imperative.PyLayer): - def __init__(self): - super(MNIST, self).__init__() - - groups = 1 - dilation = [1, 1] - pad = [0, 0] - stride = [1, 1] - input_size = [2, 3, 5, 5] # NCHW - assert np.mod(input_size[1], groups) == 0 - f_c = input_size[1] // groups - filter_size = [6, f_c, 3, 3] + def __init__(self, param_attr=None, bias_attr=None): + super(MNIST, self).__init__(param_attr=param_attr, bias_attr=bias_attr) - self._conv2d = Conv2D( + self._simple_img_conv_pool_1 = SimpleImgConvPool( num_channels=3, + filter_size=5, num_filters=20, - filter_size=3, - stride=stride, - padding=pad, - dilation=dilation, - groups=groups, - use_cudnn=False) + pool_size=2, + pool_stride=2, + act="relu") + + self._simple_img_conv_pool_2 = SimpleImgConvPool( + num_channels=3, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") def forward(self, inputs): - x = self._conv2d(inputs) + x = self._simple_img_conv_pool_1(inputs) + x = self._simple_img_conv_pool_2(x) return x class TestImperativeMnist(unittest.TestCase): - # def test_layer(self): - # with fluid.imperative.guard(): - # cl = core.Layer() - # cl.forward([]) - # l = fluid.imperative.PyLayer() - # l.forward([]) - - # def test_layer_in_out(self): - # np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) - # with fluid.imperative.guard(): - # l = MyLayer() - # x = l(np_inp)[0] - # self.assertIsNotNone(x) - # dy_out = x._numpy() - # x._backward() - # dy_grad = l._x_for_debug._gradient() - - # with new_program_scope(): - # inp = fluid.layers.data( - # name="inp", shape=[3], append_batch_size=False) - # l = MyLayer() - # x = l(inp)[0] - # param_grads = fluid.backward.append_backward( - # x, parameter_list=[l._x_for_debug.name])[0] - # exe = fluid.Executor(fluid.CPUPlace()) - - # static_out, static_grad = exe.run( - # feed={inp.name: np_inp}, - # fetch_list=[x.name, param_grads[1].name]) - - # 
self.assertTrue(np.allclose(dy_out, static_out)) - # self.assertTrue(np.allclose(dy_grad, static_grad)) - def test_mnist_cpu_float32(self): with fluid.imperative.guard(): mnist = MNIST() From 29697c2e259c5b6142c2ddac8448e3f8597a63e1 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 20 Dec 2018 23:03:15 +0800 Subject: [PATCH 056/197] Add stop_gradient to VarBase to support loss function test=develop --- paddle/fluid/framework/framework.proto | 2 +- paddle/fluid/imperative/layer.cc | 39 +++++++++++++--- paddle/fluid/imperative/layer.h | 7 ++- paddle/fluid/operators/cross_entropy_op.h | 2 + paddle/fluid/pybind/pybind.cc | 11 ++++- python/paddle/fluid/framework.py | 12 ++++- python/paddle/fluid/imperative/layers.py | 10 +++++ .../tests/unittests/test_imperative_mnist.py | 45 ++++++++++++------- 8 files changed, 99 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index efdabffb9b..665adfd8cb 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ syntax = "proto2"; -option optimize_for = LITE_RUNTIME; +/* option optimize_for = LITE_RUNTIME; */ package paddle.framework.proto; // Any incompatible changes to ProgramDesc and its dependencies should diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 35640ca6dc..395fbd1000 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -115,6 +115,7 @@ framework::Variable* CreateVariable(const std::string& name, varname = string::Sprintf("%s@%d", varname, id); } + LOG(ERROR) << "creating var " << varname; VLOG(3) << "creating var " << varname; framework::Variable* var = scope->Var(varname); framework::LoDTensor* tensor = var->GetMutable(); @@ -130,13 +131,22 @@ framework::LoDTensor& VarBase::Grad() { } void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) { + PADDLE_ENFORCE(grad->IsInitialized(), "grad %s must be initialized", + var_desc_->Name()); + + PADDLE_ENFORCE(grad->Get().IsInitialized(), + "variable %s has NO gradient, please set stop_gradient to it", + var_desc_->Name()); + VLOG(3) << "apply var grad " << var_desc_->Name() << " " << grad->Get().data()[0]; + if (!grads_) { grads_ = CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()), var_->Get().dims(), 0.0, scope); } + AddTo(grad, grads_); VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " " << grads_->Get().data()[0]; @@ -153,8 +163,9 @@ std::vector OpBase::ApplyGrad(framework::Scope* scope) { // grad op inputs can be forward inputs, so not in grad_to_var. 
continue; } - VLOG(3) << "op grad in var " << grad_invar; - block_->FindRecursiveOrCreateVar(grad_invar); + VLOG(3) << "op grad input var " << grad_invar; + framework::VarDesc& grad_invar_desc = + block_->FindRecursiveOrCreateVar(grad_invar); framework::Variable* var = scope->Var(grad_invar); const std::string& invar = grad_to_var_->at(grad_invar); for (VarBase* varbase : *output_vars_) { @@ -165,21 +176,33 @@ std::vector OpBase::ApplyGrad(framework::Scope* scope) { break; } } + grad_invar_desc.SetShape( + framework::vectorize(var->Get().dims())); + VLOG(3) + << "set op grad var desc's shape size " + << framework::vectorize(var->Get().dims()).size(); } + LOG(ERROR) << "grad_op_desc_" << grad_op_desc_->Proto()->DebugString(); + for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { - VLOG(3) << "grad outvar " << outvar; + VLOG(3) << "op grad output var " << outvar; block_->FindRecursiveOrCreateVar(outvar); framework::Variable* var = scope->Var(outvar); if (!var->IsInitialized()) { + VLOG(3) << "init op grad output var " << outvar; framework::VarDesc* var_desc = block_->FindVar(outvar); if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { var->GetMutable(); + // framework::Tensor* tensor = var->GetMutable(); + // tensor->mutable_data(platform::CPUPlace()); } else { LOG(ERROR) << "tracer doesn't support yet"; } } + VLOG(3) << "op grad output var " << outvar << " is inited"; } + grad_op_desc_->InferShape(*block_); grad_op_desc_->InferVarType(block_); std::unique_ptr opbase = @@ -194,11 +217,15 @@ std::vector OpBase::ApplyGrad(framework::Scope* scope) { VarBase* origin_var = (*input_vars_)[i]; for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { Variable* var = scope->FindVar(outvar); - std::string orig_var = grad_to_var_->at(outvar); - if (origin_var->var_desc_->Name() != orig_var) { + if (var->IsInitialized()) { + VLOG(3) << "get grad op output var " << outvar; + } + std::string orig_var_name = grad_to_var_->at(outvar); + if (origin_var->var_desc_->Name() != orig_var_name || + origin_var->stop_gradient_) { continue; } - VLOG(3) << "apply grad " << outvar << " with origin " << orig_var; + VLOG(3) << "apply grad " << outvar << " with origin " << orig_var_name; origin_var->ApplyGrad(scope, var); found = true; ret.push_back(var); diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index faa64ff9ea..90cc3ae1a9 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -29,12 +29,13 @@ class OpBase; class VarBase { public: - VarBase() + explicit VarBase(bool stop_gradient = false) : pre_op_(nullptr), pre_op_out_idx_(-1), var_desc_(nullptr), var_(nullptr), - grads_(nullptr) {} + grads_(nullptr), + stop_gradient_(stop_gradient) {} virtual ~VarBase() {} @@ -50,6 +51,8 @@ class VarBase { framework::VarDesc* var_desc_; framework::Variable* var_; framework::Variable* grads_; + + bool stop_gradient_; }; class OpBase { diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index f123e11542..2500c0443f 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -110,6 +110,8 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { auto* dy = ctx.Input(framework::GradVarName("Y")); auto* label = ctx.Input("Label"); auto* dx = ctx.Output(framework::GradVarName("X")); + LOG(ERROR) << "CROSS ENTROPY GRAD DX: " + << ctx.op().Output(framework::GradVarName("X")); T* dx_data = dx->mutable_data(ctx.GetPlace()); 
// Following computation only depends on the last dimension size. So it's diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index db6c88e01c..e0d4505028 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -111,7 +111,8 @@ PYBIND11_MODULE(core, m) { BindException(&m); py::class_(m, "VarBase", R"DOC()DOC") - .def(py::init<>()) + // .def(py::init<>()) + .def(py::init(), py::arg("stop_gradient") = false) .def("_run_backward", [](imperative::VarBase &self, framework::Scope *scope) { self.RunBackward(scope); @@ -129,7 +130,13 @@ PYBIND11_MODULE(core, m) { [](imperative::VarBase &self, framework::VarDesc *var_desc) { self.var_desc_ = var_desc; }, - py::return_value_policy::reference); + py::return_value_policy::reference) + .def_property( + "stop_gradient", + [](const imperative::VarBase &self) { return self.stop_gradient_; }, + [](imperative::VarBase &self, bool stop_gradient) { + self.stop_gradient_ = stop_gradient; + }); py::class_(m, "OpBase", R"DOC()DOC") .def(py::init<>()) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index bcf5bc3498..dbe8fa429e 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -354,11 +354,11 @@ class Variable(object): self.block.vars[name] = self self.op = None - self.stop_gradient = stop_gradient self.is_data = is_data if _in_imperative_mode(): self._ivar = core.VarBase() self._ivar.desc = self.desc + self._ivar.stop_gradient = stop_gradient def _numpy(self): scope = _imperative_tracer().get_scope() @@ -366,7 +366,7 @@ class Variable(object): return np.array(tensor) def _backward(self): - scope = _imperative_tracer().get_scope(self.block.desc) + scope = _imperative_tracer().get_scope() self._ivar._run_backward(scope) def _gradient(self): @@ -415,6 +415,14 @@ class Variable(object): """ self.desc = input + @property + def _stop_gradient(self): + return self._ivar.stop_gradient + + @_stop_gradient.setter + def _stop_gradient(self, s): + self._ivar.stop_gradient = s + @property def persistable(self): return self.desc.persistable() diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 5ebc0430cc..80645acc8a 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -25,12 +25,22 @@ __all__ = ['PyLayer'] class PyLayer(core.Layer): def __init__(self, *args, **kwargs): + self._once_built = True + from ..layer_helper import LayerHelper self._helper = LayerHelper(type(self).__name__, **kwargs) self._dtype = kwargs.get("dtype", core.VarDesc.VarType.FP32) + def _build_once(self, inputs): + pass + def __call__(self, *inputs): + if self._once_built: + self._build_once(*inputs) + self._once_built = False + outputs = self.forward(*inputs) + return outputs def forward(self, *inputs): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py index 981e9eb2d6..85b613bddc 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py @@ -18,14 +18,15 @@ import numpy as np import paddle.fluid as fluid from paddle.fluid import core -from paddle.fluid.imperative.nn import Conv2D, Pool2D +from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC +from paddle.fluid.imperative.base import to_variable class SimpleImgConvPool(fluid.imperative.PyLayer): def __init__(self, num_channels, - num_filters, filter_size, + 
num_filters, pool_size, pool_stride, pool_padding=0, @@ -81,24 +82,24 @@ class MNIST(fluid.imperative.PyLayer): super(MNIST, self).__init__(param_attr=param_attr, bias_attr=bias_attr) self._simple_img_conv_pool_1 = SimpleImgConvPool( - num_channels=3, - filter_size=5, - num_filters=20, - pool_size=2, - pool_stride=2, - act="relu") + 1, 5, 20, 2, 2, act="relu") self._simple_img_conv_pool_2 = SimpleImgConvPool( - num_channels=3, - filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - act="relu") + 20, 5, 50, 2, 2, act="relu") + + pool_2_shape = 50 * 8 * 8 + SIZE = 10 + scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5 + self._fc = FC(-1, + 10, + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale))) def forward(self, inputs): x = self._simple_img_conv_pool_1(inputs) x = self._simple_img_conv_pool_2(x) + x = self._fc(x) return x @@ -107,8 +108,20 @@ class TestImperativeMnist(unittest.TestCase): with fluid.imperative.guard(): mnist = MNIST() - data = np.random.rand(2, 3, 5, 5).astype('float32') - mnist(data) + x_data = np.random.rand(128, 1, 28, 28).astype('float32') + img = to_variable(x_data) + y_data = np.random.rand(128, 1).astype('int64') + label = to_variable(y_data) + label._stop_gradient = True + + predict = mnist(img) + print(predict.shape, predict.dtype, label.shape, label.dtype) + out = fluid.layers.cross_entropy(predict, label) + print(out.shape, out.dtype) + out._backward() + filter_grad = mnist._simple_img_conv_pool_1._conv2d._filter_param._gradient( + ) + print(filter_grad) # np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) # with fluid.imperative.guard(): # mlp = MLP() From 9e24fa3aeba2ce9b2bd23e625019c84723031685 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 20 Dec 2018 23:11:55 +0800 Subject: [PATCH 057/197] Polish code test=develop --- paddle/fluid/imperative/layer.cc | 5 ----- paddle/fluid/operators/cross_entropy_op.h | 2 -- python/paddle/fluid/framework.py | 4 ---- python/paddle/fluid/tests/unittests/test_imperative_mnist.py | 2 -- 4 files changed, 13 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 15e237a0e8..ef6d8f4016 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -115,7 +115,6 @@ framework::Variable* CreateVariable(const std::string& name, varname = string::Sprintf("%s@%d", varname, id); } - LOG(ERROR) << "creating var " << varname; VLOG(3) << "creating var " << varname; framework::Variable* var = scope->Var(varname); framework::LoDTensor* tensor = var->GetMutable(); @@ -183,8 +182,6 @@ std::vector OpBase::ApplyGrad(framework::Scope* scope) { << framework::vectorize(var->Get().dims()).size(); } - LOG(ERROR) << "grad_op_desc_" << grad_op_desc_->Proto()->DebugString(); - for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { VLOG(3) << "op grad output var " << outvar; block_->FindRecursiveOrCreateVar(outvar); @@ -194,8 +191,6 @@ std::vector OpBase::ApplyGrad(framework::Scope* scope) { framework::VarDesc* var_desc = block_->FindVar(outvar); if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { var->GetMutable(); - // framework::Tensor* tensor = var->GetMutable(); - // tensor->mutable_data(platform::CPUPlace()); } else { LOG(ERROR) << "tracer doesn't support yet"; } diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index 2500c0443f..f123e11542 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ 
b/paddle/fluid/operators/cross_entropy_op.h @@ -110,8 +110,6 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { auto* dy = ctx.Input(framework::GradVarName("Y")); auto* label = ctx.Input("Label"); auto* dx = ctx.Output(framework::GradVarName("X")); - LOG(ERROR) << "CROSS ENTROPY GRAD DX: " - << ctx.op().Output(framework::GradVarName("X")); T* dx_data = dx->mutable_data(ctx.GetPlace()); // Following computation only depends on the last dimension size. So it's diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index dde08a79d2..3dc23bd060 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1281,8 +1281,6 @@ class Block(object): """ op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) - print("append_op", kwargs.get("type"), kwargs.get("stop_gradient", - False)) if _in_imperative_mode(): _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], [v._ivar for v in op.outputs], self.desc, @@ -1336,8 +1334,6 @@ class Block(object): def _prepend_op(self, *args, **kwargs): op_desc = self.desc._prepend_op() op = Operator(self, op_desc, *args, **kwargs) - print("prepend_op", kwargs.get("type"), kwargs.get("stop_gradient", - False)) if _in_imperative_mode(): _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], [v._ivar for v in op.outputs], self.desc, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py index 85b613bddc..9d1e079998 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py @@ -115,9 +115,7 @@ class TestImperativeMnist(unittest.TestCase): label._stop_gradient = True predict = mnist(img) - print(predict.shape, predict.dtype, label.shape, label.dtype) out = fluid.layers.cross_entropy(predict, label) - print(out.shape, out.dtype) out._backward() filter_grad = mnist._simple_img_conv_pool_1._conv2d._filter_param._gradient( ) From 3de0f612e8b5340b502045a4fd0d2b41b4465ff4 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 21 Dec 2018 13:09:31 +0800 Subject: [PATCH 058/197] Polish code test=develop --- paddle/fluid/pybind/imperative.cc | 5 ++--- python/paddle/fluid/imperative/base.py | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index be63fb8778..34e9c897d9 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -24,9 +24,8 @@ namespace pybind { void BindTracer(pybind11::module *m) { pybind11::class_(*m, "Tracer", "") .def("__init__", - [](imperative::Tracer &self, framework::BlockDesc *root_block, - framework::BlockDesc *startup_block) { - new (&self) imperative::Tracer(root_block, startup_block); + [](imperative::Tracer &self, framework::BlockDesc *root_block) { + new (&self) imperative::Tracer(root_block); }) .def("trace", &imperative::Tracer::Trace) .def("get_scope", &imperative::Tracer::GetScope, diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py index c9e7c86483..a33ad4b72c 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/imperative/base.py @@ -28,8 +28,7 @@ def enabled(): def guard(): train = framework.Program() startup = framework.Program() - tracer = core.Tracer(train.current_block().desc, - startup.current_block().desc) + tracer = core.Tracer(train.current_block().desc) with 
framework.program_guard(train, startup): with framework.unique_name.guard(): with framework._imperative_guard(tracer): From 74ead6ff35c7002ab735eaa52a25c20080e2aca3 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 21 Dec 2018 15:24:14 +0800 Subject: [PATCH 059/197] Polish code --- paddle/fluid/imperative/layer.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index ef6d8f4016..fcddcc4ed4 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -153,9 +153,6 @@ void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) { std::vector OpBase::ApplyGrad(framework::Scope* scope) { VLOG(3) << "op grad " << grad_op_desc_->Type(); - if (!grad_to_var_) { - return {}; - } for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) { if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) { From 224c90a84c99eff9f0bbeda0dfa6ac21d58d6b3a Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 21 Dec 2018 15:26:01 +0800 Subject: [PATCH 060/197] Add nn to imperative test=develop --- python/paddle/fluid/imperative/nn.py | 239 +++++++++++++++++++++++++++ 1 file changed, 239 insertions(+) create mode 100644 python/paddle/fluid/imperative/nn.py diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py new file mode 100644 index 0000000000..15d0fcaf77 --- /dev/null +++ b/python/paddle/fluid/imperative/nn.py @@ -0,0 +1,239 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from six.moves import reduce + +from .. import core +from ..layers import utils +from . import layers +from ..framework import Variable, OpProtoHolder +from ..param_attr import ParamAttr +from ..initializer import Normal, Constant + +__all__ = [ + 'Conv2D', + 'Pool2D', + 'FC', +] + + +class Conv2D(layers.PyLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=None, + use_cudnn=True, + act=None, + param_attr=None, + bias_attr=None, + name=None, + dtype=core.VarDesc.VarType.FP32): + assert param_attr is not False, "param_attr should not be False here." 
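+        # Everything below runs eagerly at construction time: the filter
+        # parameter (and the cuDNN algorithm-cache variables) are created in
+        # __init__, so the layer owns its weights before the first forward().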
+ super(Conv2D, self).__init__( + param_attr=param_attr, bias_attr=bias_attr, name=name, dtype=dtype) + + self._groups = groups + self._stride = utils.convert_to_list(stride, 2, 'stride') + self._padding = utils.convert_to_list(padding, 2, 'padding') + self._dilation = utils.convert_to_list(dilation, 2, 'dilation') + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + self._use_cudnn = use_cudnn + self._num_channels = num_channels + if (self._num_channels == self._groups and + num_filters % self._num_channels == 0 and not self._use_cudnn): + self._l_type = 'depthwise_conv2d' + else: + self._l_type = 'conv2d' + + if groups is None: + num_filter_channels = num_channels + else: + if num_channels % groups != 0: + raise ValueError("num_channels must be divisible by groups.") + num_filter_channels = num_channels // groups + filter_size = utils.convert_to_list(filter_size, 2, 'filter_size') + filter_shape = [num_filters, int(num_filter_channels)] + filter_size + + def _get_default_param_initializer(): + filter_elem_num = filter_size[0] * filter_size[1] * num_channels + std = (2.0 / filter_elem_num)**0.5 + return Normal(0.0, std, 0) + + self._filter_param = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=filter_shape, + dtype=self._dtype, + default_initializer=_get_default_param_initializer()) + + if self._use_cudnn: + self._helper.create_variable( + name="kCUDNNFwdAlgoCache", + persistable=True, + type=core.VarDesc.VarType.RAW) + self._helper.create_variable( + name="kCUDNNBwdDataAlgoCache", + persistable=True, + type=core.VarDesc.VarType.RAW) + self._helper.create_variable( + name="kCUDNNBwdFilterAlgoCache", + persistable=True, + type=core.VarDesc.VarType.RAW) + + self._pre_bias = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + def forward(self, input): + self._helper.append_op( + type=self._l_type, + inputs={ + 'Input': input, + 'Filter': self._filter_param, + }, + outputs={"Output": self._pre_bias}, + attrs={ + 'strides': self._stride, + 'paddings': self._padding, + 'dilations': self._dilation, + 'groups': self._groups, + 'use_cudnn': self._use_cudnn, + 'use_mkldnn': False, + }) + + self._pre_act = self._helper.append_bias_op( + self._pre_bias, dim_start=1, dim_end=2) + + out = self._helper.append_activation(self._pre_act) + return out + + +class Pool2D(layers.PyLayer): + def __init__(self, + pool_size=-1, + pool_type="max", + pool_stride=1, + pool_padding=0, + global_pooling=False, + use_cudnn=True, + ceil_mode=False, + exclusive=True, + name=None, + dtype=core.VarDesc.VarType.FP32): + if pool_type not in ["max", "avg"]: + raise ValueError( + "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", + str(pool_type)) + + if global_pooling is False and pool_size == -1: + raise ValueError( + "When the global_pooling is False, pool_size must be passed " + "and be a valid value. 
Received pool_size: " + str(pool_size)) + + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + + super(Pool2D, self).__init__(name=name, dtype=dtype) + + self._pool_type = pool_type + self._pool_size = utils.convert_to_list(pool_size, 2, 'pool_size') + self._pool_padding = utils.convert_to_list(pool_padding, 2, + 'pool_padding') + self._pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride') + self._global_pooling = global_pooling + self._use_cudnn = use_cudnn + self._ceil_mode = ceil_mode + self._exclusive = exclusive + self._l_type = 'pool2d' + + self._pool_out = self._helper.create_variable_for_type_inference( + self._dtype) + + def forward(self, input): + self._helper.append_op( + type=self._l_type, + inputs={"X": input}, + outputs={"Out": self._pool_out}, + attrs={ + "pooling_type": self._pool_type, + "ksize": self._pool_size, + "global_pooling": self._global_pooling, + "strides": self._pool_stride, + "paddings": self._pool_padding, + "use_cudnn": self._use_cudnn, + "ceil_mode": self._ceil_mode, + "use_mkldnn": False, + "exclusive": self._exclusive, + }) + return self._pool_out + + +class FC(layers.PyLayer): + def __init__(self, + size_in, + size_out, + num_flatten_dims=1, + param_attr=None, + dtype=core.VarDesc.VarType.FP32): + super(FC, self).__init__(param_attr=param_attr, dtype=dtype) + + self._size_in = size_in + self._size_out = size_out + self._num_flatten_dims = num_flatten_dims + self._dtype = dtype + if self._size_in != -1: + self._w = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=[size_in, size_out], + dtype=self._dtype, + is_bias=False) + self._tmp = self._helper.create_variable_for_type_inference(self._dtype) + self._out = self._helper.create_variable_for_type_inference(self._dtype) + + def _build_once(self, input): + if self._size_in != -1: + return + + input_shape = input.shape + param_shape = [ + reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1) + ] + [self._size_out] + self._w = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=False) + + def forward(self, input): + self._helper.append_op( + type="mul", + inputs={"X": input, + "Y": self._w}, + outputs={"Out": self._tmp}, + attrs={ + "x_num_col_dims": self._num_flatten_dims, + "y_num_col_dims": 1 + }) + + self._helper.append_op( + type="sum", + inputs={"X": [self._tmp]}, + outputs={"Out": self._out}, + attrs={"use_mkldnn": False}) + return self._out From 1a8cbb679989be672afd76a72f85fe694769a049 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Mon, 24 Dec 2018 10:14:59 +0000 Subject: [PATCH 061/197] test=develop, accelerate_hs_op and add prefetch with is_sparse --- paddle/fluid/operators/hierarchical_sigmoid_op.h | 3 ++- python/paddle/fluid/layers/nn.py | 15 ++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index b47bf49ecb..1a7ca96301 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -40,8 +40,9 @@ using platform::Transform; static std::vector PathToRows(const framework::LoDTensor& path) { std::set rows; + const int64_t* paths = path.data(); for (int64_t i = 0; i < path.numel(); ++i) { - int64_t row = path.data()[i]; + int64_t row = paths[i]; if (row < 0) { continue; } diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 
861bc32026..6379031ee4 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -5013,9 +5013,10 @@ def nce(input,
     else:
         num_neg_samples = int(num_neg_samples)
 
-    remote_prefetch = False
-    if os.environ.get('PADDLE_ENABLE_REMOTE_PREFETCH'):
-        remote_prefetch = True
+    remote_prefetch = is_sparse
+    print(
+        "With sparse mode, if your model has only small parameters, prefetch may cause a slowdown"
+    )
 
     attrs = {
         'num_total_classes': int(num_total_classes),
@@ -5133,10 +5134,10 @@ def hsigmoid(input,
         pass
 
     weights = None
-    remote_prefetch = False
-    if os.environ.get('PADDLE_ENABLE_REMOTE_PREFETCH'):
-        remote_prefetch = True
-
+    remote_prefetch = is_sparse
+    print(
+        "With sparse mode, if your model has only small parameters, prefetch may cause a slowdown"
+    )
     if not is_custom:
         weights = helper.create_parameter(
             attr=helper.param_attr,

From 2e38faa3fe279520f98b2030e35ae8db68ba66d8 Mon Sep 17 00:00:00 2001
From: JiabinYang
Date: Mon, 24 Dec 2018 10:17:32 +0000
Subject: [PATCH 062/197] test=develop, accelerate_hs_op and add prefetch with is_sparse

---
 python/paddle/fluid/layers/nn.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 6379031ee4..9af62bf06f 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -336,9 +336,7 @@ def embedding(input,
     """
 
     helper = LayerHelper('embedding', **locals())
-    remote_prefetch = False
-    if os.environ.get('PADDLE_ENABLE_REMOTE_PREFETCH'):
-        remote_prefetch = True
+    remote_prefetch = is_sparse
     if remote_prefetch:
         assert is_sparse is True and is_distributed is False
     w = helper.create_parameter(

From 5bfb26a8b2c252702bb140a9c146e10288ea806e Mon Sep 17 00:00:00 2001
From: JiabinYang
Date: Tue, 25 Dec 2018 02:56:25 +0000
Subject: [PATCH 063/197] test=develop, fix embedding: distributed and sparse can't both be true at the same time

---
 python/paddle/fluid/layers/nn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 9af62bf06f..96ea720b9a 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -336,7 +336,7 @@ def embedding(input,
     """
 
     helper = LayerHelper('embedding', **locals())
-    remote_prefetch = is_sparse
+    remote_prefetch = is_sparse and (not is_distributed)
     if remote_prefetch:
         assert is_sparse is True and is_distributed is False
     w = helper.create_parameter(

From cb478f7a94f52b48750cbe64ef20941732b06e9b Mon Sep 17 00:00:00 2001
From: JiabinYang
Date: Tue, 25 Dec 2018 09:04:05 +0000
Subject: [PATCH 064/197] just for test

---
 python/paddle/fluid/tests/unittests/test_dist_transpiler.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 0555db4cba..e166ab43de 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -521,7 +521,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
             'split_selected_rows', 'send', 'sequence_pool_grad',
             'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
             'sum', 'split_selected_rows', 'send', 'send_barrier', 'recv',
-            'recv', 'recv', 'recv', 'fetch_barrier', 'concat', 'concat'
+            'recv', 'fetch_barrier'
         ]
         self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)

@@ -608,8 +608,7 @@ class
TestAsyncLocalLookupTable(TestDistLookupTableBase): 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', - 'sum', 'split_selected_rows', 'send', 'recv', 'recv', 'recv', - 'recv', 'concat', 'concat' + 'sum', 'split_selected_rows', 'send', 'recv', 'recv' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) From 2aa1dc67cee9c0a1e04b1b72ff7358e4a57661d5 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Wed, 26 Dec 2018 04:35:11 +0000 Subject: [PATCH 065/197] test=develop, fix test_dist_transpiler failed --- python/paddle/fluid/tests/unittests/test_dist_transpiler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index e166ab43de..3d1ce6b27c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -561,7 +561,7 @@ class TestDistLookupTable(TestDistLookupTableBase): 'lookup_table_grad', 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', 'sum', 'split_ids', 'send', 'send_barrier', - 'recv', 'recv', 'recv', 'fetch_barrier', 'concat' + 'recv', 'recv', 'fetch_barrier' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) startup_ops = [ @@ -648,8 +648,7 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase): 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', - 'lookup_table_grad', 'sum', 'split_ids', 'send', 'recv', 'recv', - 'recv', 'concat' + 'lookup_table_grad', 'sum', 'split_ids', 'send', 'recv', 'recv' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) startup_ops = [ From 845bfd5807df67e18f2657712c46f055f11a76ad Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 26 Dec 2018 17:05:05 +0800 Subject: [PATCH 066/197] cleanup code --- .../framework/details/all_reduce_op_handle.cc | 189 +++++++++--------- .../fluid/framework/details/build_strategy.cc | 8 +- .../fluid/framework/details/build_strategy.h | 1 + .../details/multi_devices_graph_pass.cc | 20 +- paddle/fluid/framework/parallel_executor.cc | 72 ++++--- .../unittests/parallel_executor_test_base.py | 134 ++++++------- .../unittests/test_parallel_executor_crf.py | 55 +++-- .../test_parallel_executor_seresnext.py | 2 +- 8 files changed, 260 insertions(+), 221 deletions(-) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 414b0970c7..47872a9f2a 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -19,6 +19,13 @@ #include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/platform/profiler.h" +// async nccl allreduce or sync issue: +// https://github.com/PaddlePaddle/Paddle/issues/15049 +DEFINE_bool( + sync_nccl_allreduce, true, + "If set true, will call `cudaStreamSynchronize(nccl_stream)`" + "after allreduce, this mode can get better performance in some scenarios."); + namespace paddle { namespace framework { namespace details { @@ -48,111 +55,107 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, void AllReduceOpHandle::RunImpl() { platform::RecordEvent record_event(Name(), 
dev_ctxes_.cbegin()->second); -// FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, -// this is a distributed or inter-process call, find a better way. -#ifdef PADDLE_WITH_CUDA - // All-reduce op_handle can run on the sub-scope, find the nccl id from - // the global scope. - if (NoDummyInputSize() == 1 && - local_scopes_[0]->FindVar(NCCL_ID_VARNAME) == nullptr) { -#else - if (NoDummyInputSize() == 1) { -#endif - return; // No need to all reduce when GPU count = 1; - } else { - // Wait input done - WaitInputVarGenerated(); - auto in_var_handles = DynamicCast(this->Inputs()); - auto out_var_handles = DynamicCast(this->Outputs()); - PADDLE_ENFORCE_EQ( - in_var_handles.size(), places_.size(), - "The NoDummyInputSize should be equal to the number of places."); - PADDLE_ENFORCE_EQ( - in_var_handles.size(), out_var_handles.size(), - "The NoDummyInputSize and NoDummyOutputSize should be equal."); - - std::vector lod_tensors; - for (size_t i = 0; i < local_scopes_.size(); ++i) { - auto *s = local_scopes_[i]; - auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get(); - auto &lod_tensor = - local_scope.FindVar(in_var_handles[i]->name_)->Get(); - lod_tensors.emplace_back(&lod_tensor); - PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_, - "The name of input and output should be equal."); - } + // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, + // this is a distributed or inter-process call, find a better way. + // Wait input done + WaitInputVarGenerated(); + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), places_.size(), + "The NoDummyInputSize should be equal to the number of places."); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); + + std::vector lod_tensors; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto *s = local_scopes_[i]; + auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get(); + auto &lod_tensor = + local_scope.FindVar(in_var_handles[i]->name_)->Get(); + lod_tensors.emplace_back(&lod_tensor); + PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_, + "The name of input and output should be equal."); + } - if (platform::is_gpu_place(lod_tensors[0]->place())) { + if (platform::is_gpu_place(lod_tensors[0]->place())) { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); - int dtype = -1; - size_t numel = 0; - std::vector> all_reduce_calls; - for (size_t i = 0; i < local_scopes_.size(); ++i) { - auto &p = places_[i]; - auto &lod_tensor = *lod_tensors[i]; - void *buffer = const_cast(lod_tensor.data()); - - if (dtype == -1) { - dtype = platform::ToNCCLDataType(lod_tensor.type()); - } + PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); + int dtype = -1; + size_t numel = 0; + std::vector> all_reduce_calls; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &p = places_[i]; + auto &lod_tensor = *lod_tensors[i]; + void *buffer = const_cast(lod_tensor.data()); + + if (dtype == -1) { + dtype = platform::ToNCCLDataType(lod_tensor.type()); + } + + if (numel == 0) { + numel = static_cast(lod_tensor.numel()); + } + + int dev_id = boost::get(p).device; + auto &nccl_ctx = nccl_ctxs_->at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + all_reduce_calls.emplace_back([=] { + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + 
buffer, buffer, numel, static_cast(dtype), ncclSum, + comm, stream)); + }); + } - if (numel == 0) { - numel = static_cast(lod_tensor.numel()); + this->RunAndRecordEvent([&] { + if (all_reduce_calls.size() == 1UL) { + // Do not use NCCLGroup when manage NCCL by per thread per device + all_reduce_calls[0](); + } else { + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); } + } + }); + if (FLAGS_sync_nccl_allreduce) { + for (auto &p : places_) { int dev_id = boost::get(p).device; auto &nccl_ctx = nccl_ctxs_->at(dev_id); auto stream = nccl_ctx.stream(); - auto comm = nccl_ctx.comm_; - all_reduce_calls.emplace_back([=] { - PADDLE_ENFORCE(platform::dynload::ncclAllReduce( - buffer, buffer, numel, static_cast(dtype), - ncclSum, comm, stream)); - // TODO(Yancey1989): synchronize here can get better performance - // if don't use NCCL group call, but need more profiling. - if (local_scopes_.size() == 1UL) cudaStreamSynchronize(stream); - }); + cudaStreamSynchronize(stream); } - - this->RunAndRecordEvent([&] { - if (all_reduce_calls.size() == 1UL) { - all_reduce_calls[0](); - } else { - platform::NCCLGroupGuard guard; - for (auto &call : all_reduce_calls) { - call(); - } - } - }); + } #else - PADDLE_THROW("Not compiled with CUDA"); + PADDLE_THROW("Not compiled with CUDA"); #endif - } else { // Special handle CPU only Operator's gradient. Like CRF - auto &trg = *this->local_scopes_[0] - ->FindVar(kLocalExecScopeName) - ->Get() - ->FindVar(out_var_handles[0]->name_) - ->GetMutable(); - - // Reduce All Tensor to trg in CPU - ReduceLoDTensor func(lod_tensors, &trg); - VisitDataType(lod_tensors[0]->type(), func); - - for (size_t i = 1; i < local_scopes_.size(); ++i) { - auto &scope = - *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get(); - auto &p = places_[i]; - auto *var = scope.FindVar(out_var_handles[i]->name_); - auto *dev_ctx = dev_ctxes_.at(p); - - RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { - auto &tensor_gpu = *var->GetMutable(); - auto &tensor_cpu = trg; - TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu); - }); - } + } else { // Special handle CPU only Operator's gradient. Like CRF + auto &trg = *this->local_scopes_[0] + ->FindVar(kLocalExecScopeName) + ->Get() + ->FindVar(out_var_handles[0]->name_) + ->GetMutable(); + + // Reduce All Tensor to trg in CPU + ReduceLoDTensor func(lod_tensors, &trg); + VisitDataType(lod_tensors[0]->type(), func); + + for (size_t i = 1; i < local_scopes_.size(); ++i) { + auto &scope = + *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get(); + auto &p = places_[i]; + auto *var = scope.FindVar(out_var_handles[i]->name_); + auto *dev_ctx = dev_ctxes_.at(p); + + RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { + auto &tensor_gpu = *var->GetMutable(); + auto &tensor_cpu = trg; + TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu); + }); } } } diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index b927b21b6f..cb660cb8c2 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -31,6 +31,8 @@ namespace framework { namespace details { static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) { + // Should fix the allreduce op order if scheduling + // them in multiple threads or processes to avoid hang. 
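+  // That is, the allreduce order must stay deterministic for multi-trainer
+  // runs without sequential execution and for parallel-graph mode.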
return (!strategy.enable_sequential_execution_ && strategy.num_trainers_ > 1) || strategy.enable_parallel_graph_; @@ -88,8 +90,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { auto multi_devices_pass = AppendPass("multi_devices_pass"); multi_devices_pass->SetNotOwned("strategy", &strategy_); - multi_devices_pass->Set("num_trainers", - new int(strategy_.num_trainers_)); // Add a graph print pass to record a graph with device info. if (!strategy_.debug_graphviz_path_.empty()) { @@ -134,6 +134,7 @@ std::shared_ptr BuildStrategy::CreatePassesFromStrategy( std::unique_ptr BuildStrategy::Apply( const ProgramDesc &main_program, const std::vector &places, const std::string &loss_var_name, const std::vector &local_scopes, + const size_t &num_parallel_devices, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const { #else @@ -152,6 +153,9 @@ std::unique_ptr BuildStrategy::Apply( pass->Erase("local_scopes"); pass->SetNotOwned>("local_scopes", &local_scopes); + pass->Set("num_parallel_devices", + new size_t(num_parallel_devices)); + #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; pass->Erase("nccl_ctxs"); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index f9351fb8d2..b31e60ad8e 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -112,6 +112,7 @@ struct BuildStrategy { const std::vector &places, const std::string &loss_var_name, const std::vector &local_scopes, + const size_t &num_parallel_devices_, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const; diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 0be81a48ff..a6d583777a 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -132,7 +132,7 @@ static const char kLossVarName[] = "loss_var_name"; static const char kPlaces[] = "places"; static const char kLocalScopes[] = "local_scopes"; static const char kStrategy[] = "strategy"; -static const char kNumTrainers[] = "num_trainers"; +static const char kNumParallelDevices[] = "num_parallel_devices"; void MultiDevSSAGraphBuilder::Init() const { all_vars_.clear(); @@ -296,7 +296,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; - int num_trainers = Get(kNumTrainers); + size_t num_parallel_devices = Get(kNumParallelDevices); for (auto &node : nodes) { if (node->IsVar() && node->Var()) { @@ -382,16 +382,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( CreateComputationalOps(&result, node, places_.size()); } -// insert collective ops at the backpropagation; and -// insert collective ops if the graph contains mutilple places. - -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - if (!is_forwarding && - (places_.size() > 1 || num_trainers > 1 || - (nccl_ctxs_ && nccl_ctxs_->contexts_.size() > 1))) { -#else - if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { -#endif + if (!is_forwarding && num_parallel_devices > 1) { // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. 
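+        // num_parallel_devices is num_trainers * places.size(), so the
+        // collective path is skipped for single-process, single-device runs.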
if (static_cast(boost::get(node->Op()->GetAttr( @@ -668,12 +659,13 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID( void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( ir::Graph *result, const std::string &loss_grad_name, ir::Node *out_var_node) const { + size_t num_parallel_devices = Get("num_parallel_devices"); for (size_t i = 0; i < places_.size(); ++i) { // Insert ScaleCost OpHandle auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]); auto *op_handle = new ScaleLossGradOpHandle( result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), - local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx); + num_parallel_devices, local_scopes_[i], places_[i], dev_ctx); result->Get(kGraphOps).emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale @@ -903,4 +895,4 @@ REGISTER_PASS(multi_devices_pass, .RequirePassAttr(paddle::framework::details::kPlaces) .RequirePassAttr(paddle::framework::details::kLocalScopes) .RequirePassAttr(paddle::framework::details::kStrategy) - .RequirePassAttr(paddle::framework::details::kNumTrainers); + .RequirePassAttr(paddle::framework::details::kNumParallelDevices); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 1637ee3c7e..ec44cae3b3 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -107,6 +107,7 @@ class ParallelExecutorPrivate { bool own_local_scope_; bool use_cuda_; bool use_all_reduce_; + size_t num_parallel_devices_; // global_ref_cnts_ is only initialized when ParallelExecutor constructs, and // then keeps unchanged @@ -202,6 +203,7 @@ ParallelExecutor::ParallelExecutor( member_->build_strategy_ = build_strategy; member_->use_all_reduce_ = build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; + member_->num_parallel_devices_ = num_trainers * places.size(); if (!member_->use_all_reduce_) { PADDLE_ENFORCE(places.size() > 1, @@ -212,12 +214,12 @@ ParallelExecutor::ParallelExecutor( if (build_strategy.enable_parallel_graph_) { PADDLE_ENFORCE( member_->use_all_reduce_, - "build_strategy.reduce should be `AllReduce` if you want to use" - "ParallelGraph executor."); + "build_strategy.reduce should be `AllReduce` if you want to enable" + "ParallelGraph."); PADDLE_ENFORCE( member_->use_cuda_, - "execution_strategy.use_cuda should be True if you want to use" - "ParallelGraph executor."); + "execution_strategy.use_cuda should be True if you want to enable " + "ParallelGraph."); } // Step 1. Bcast the bcast_vars to devs. @@ -241,27 +243,43 @@ ParallelExecutor::ParallelExecutor( #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); ncclUniqueId *nccl_id = nullptr; + // nccl collective would broadcast nccl id by gen_nccl_id operator. + if (nccl_id_var != nullptr) { + nccl_id = nccl_id_var->GetMutable(); + } + if (build_strategy.enable_parallel_graph_ && places.size() > 1) { - // parallel graph mode should initialize nccl by ncclCommInitRank since - // it call nccl operator per device per thread. - if (nccl_id_var == nullptr) { + if (nccl_id == nullptr) { nccl_id = new ncclUniqueId(); PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(nccl_id)); - *member_->global_scope_->Var(NCCL_ID_VARNAME) - ->GetMutable() = *nccl_id; - } else { - nccl_id = nccl_id_var->GetMutable(); } - } else if (nccl_id_var != nullptr) { // the other executor type. 
- // the distributed training with nccl mode would initialize the nccl id in - // startup_program. - nccl_id = nccl_id_var->GetMutable(); - } else { - // initlize NCCL by ncclCommInitAll, do not need to intialize the nccl_id. } - member_->nccl_ctxs_.reset(new platform::NCCLContextMap( member_->places_, nccl_id, num_trainers, trainer_id)); + +/** +if (build_strategy.enable_parallel_graph_ && places.size() > 1) { + // parallel graph mode should initialize nccl by ncclCommInitRank since + // it call nccl operator per device per thread. + if (nccl_id_var == nullptr) { + nccl_id = new ncclUniqueId(); + PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(nccl_id)); + *member_->global_scope_->Var(NCCL_ID_VARNAME) + ->GetMutable() = *nccl_id; + } else { + nccl_id = nccl_id_var->GetMutable(); + } +} else if (nccl_id_var != nullptr) { // the other executor type. + // the distributed training with nccl mode would initialize the nccl id in + // startup_program. + nccl_id = nccl_id_var->GetMutable(); +} else { + // initlize NCCL by ncclCommInitAll, do not need to intialize the nccl_id. +} + +member_->nccl_ctxs_.reset(new platform::NCCLContextMap( + member_->places_, nccl_id, num_trainers, trainer_id)); +**/ #else PADDLE_THROW("Not compiled with CUDA"); #endif @@ -274,25 +292,27 @@ ParallelExecutor::ParallelExecutor( // Step 2. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp std::vector> graphs; + member_->num_parallel_devices_ = member_->places_.size() * num_trainers; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (build_strategy.enable_parallel_graph_) { for (size_t i = 0; i < member_->places_.size(); ++i) { - std::unique_ptr graph = - build_strategy.Apply(main_program, {member_->places_[i]}, - loss_var_name, {member_->local_scopes_[i]}, - member_->use_cuda_, member_->nccl_ctxs_.get()); + std::unique_ptr graph = build_strategy.Apply( + main_program, {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, member_->num_parallel_devices_, + member_->use_cuda_, member_->nccl_ctxs_.get()); graphs.push_back(std::move(graph)); } } else { std::unique_ptr graph = build_strategy.Apply( main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->use_cuda_, member_->nccl_ctxs_.get()); + member_->num_parallel_devices_, member_->use_cuda_, + member_->nccl_ctxs_.get()); graphs.push_back(std::move(graph)); } #else - std::unique_ptr graph = - build_strategy.Apply(main_program, member_->places_, loss_var_name, - member_->local_scopes_, member_->use_cuda_); + std::unique_ptr graph = build_strategy.Apply( + main_program, member_->places_, loss_var_name, member_->local_scopes_, + member_->num_parallel_devices_, member_->use_cuda_); graphs.push_back(std::move(graph)); #endif auto max_memory_size = GetEagerDeletionThreshold(); diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index a2c8ee120f..36b13d4558 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -60,71 +60,69 @@ class TestParallelExecutorBase(unittest.TestCase): startup = fluid.Program() startup.random_seed = 1 # Fix random seed main.random_seed = 1 - self.scope = fluid.Scope() - with fluid.scope_guard(self.scope): - with fluid.program_guard(main, startup): - if seed is not None: - startup.random_seed = seed - main.random_seed = seed - - loss = method(use_feed=feed_dict is not None) - - 
optimizer().minimize(loss) - - if memory_opt: - fluid.memory_optimize(main) - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - startup_exe = fluid.Executor(place) - startup_exe.run(startup) - exec_strategy = fluid.ExecutionStrategy() - exec_strategy.allow_op_delay = allow_op_delay - if use_fast_executor: - exec_strategy.use_experimental_executor = True - build_strategy.enable_parallel_graph = use_parallel_graph - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ - if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce - build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops - build_strategy.memory_optimize = use_ir_memory_optimize - build_strategy.enable_sequential_execution = enable_sequential_execution - if use_cuda and core.is_compiled_with_cuda(): - build_strategy.remove_unnecessary_lock = True - - if use_parallel_executor: - exe = fluid.ParallelExecutor( - use_cuda, - loss_name=loss.name, - exec_strategy=exec_strategy, - build_strategy=build_strategy) - else: - exe = fluid.Executor(place=place) - - if batch_size is not None: - batch_size *= fluid.core.get_cuda_device_count( - ) if use_cuda else int( - os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - begin = time.time() - first_loss, = run_executor( - exe=exe, feed=feed_dict, fetch_list=[loss.name]) - - for i in range(iter): - run_executor(exe=exe, feed=feed_dict, fetch_list=[]) - - last_loss, = run_executor( - exe=exe, feed=feed_dict, fetch_list=[loss.name]) - end = time.time() - - if batch_size is not None: - print("%.4f Instance per second" % ( - (batch_size * iter + 2) / (end - begin))) - - avg_last_loss_val = np.array(last_loss).mean() - avg_first_loss_val = np.array(first_loss).mean() - if math.isnan(float(avg_last_loss_val)) or math.isnan( - float(avg_first_loss_val)): - sys.exit("got NaN loss, training failed.") - - print(first_loss, last_loss) - # self.assertGreater(first_loss[0], last_loss[0]) - return first_loss, last_loss + with fluid.program_guard(main, startup): + if seed is not None: + startup.random_seed = seed + main.random_seed = seed + + loss = method(use_feed=feed_dict is not None) + + optimizer().minimize(loss) + + if memory_opt: + fluid.memory_optimize(main) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + startup_exe = fluid.Executor(place) + startup_exe.run(startup) + exec_strategy = fluid.ExecutionStrategy() + exec_strategy.allow_op_delay = allow_op_delay + if use_fast_executor: + exec_strategy.use_experimental_executor = True + build_strategy = fluid.BuildStrategy() + build_strategy.enable_parallel_graph = use_parallel_graph + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ + if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce + build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops + build_strategy.memory_optimize = use_ir_memory_optimize + build_strategy.enable_sequential_execution = enable_sequential_execution + if use_cuda and core.is_compiled_with_cuda(): + build_strategy.remove_unnecessary_lock = True + + if use_parallel_executor: + exe = fluid.ParallelExecutor( + use_cuda, + loss_name=loss.name, + exec_strategy=exec_strategy, + build_strategy=build_strategy) + else: + exe = fluid.Executor(place=place) + + if batch_size is not None: + batch_size *= fluid.core.get_cuda_device_count( + ) if use_cuda else int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + begin = time.time() + first_loss, = run_executor( + 
exe=exe, feed=feed_dict, fetch_list=[loss.name]) + + for i in range(iter): + run_executor(exe=exe, feed=feed_dict, fetch_list=[]) + + last_loss, = run_executor( + exe=exe, feed=feed_dict, fetch_list=[loss.name]) + end = time.time() + + if batch_size is not None: + print("%.4f Instance per second" % ( + (batch_size * iter + 2) / (end - begin))) + + avg_last_loss_val = np.array(last_loss).mean() + avg_first_loss_val = np.array(first_loss).mean() + if math.isnan(float(avg_last_loss_val)) or math.isnan( + float(avg_first_loss_val)): + sys.exit("got NaN loss, training failed.") + + print(first_loss, last_loss) + # self.assertGreater(first_loss[0], last_loss[0]) + return first_loss, last_loss diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index d75761153c..3e4490aa58 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -175,44 +175,65 @@ class TestCRFModel(unittest.TestCase): print(pe.run(feed=feeder.feed(cur_batch), fetch_list=[avg_cost.name])[0]) - def test_update_sparse_parameter_all_reduce(self): + def _new_build_strategy(self, use_reduce=False, use_parallel_graph=False): build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce + + if use_reduce: + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce + else: + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce + build_strategy.enable_parallel_graph = use_parallel_graph + + return build_strategy + + def test_update_sparse_parameter_all_reduce(self): if core.is_compiled_with_cuda(): self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy, use_cuda=True) - self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy, use_cuda=True) + is_sparse=True, + build_strategy=self._new_build_strategy(), + use_cuda=True) self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy, use_cuda=False) + is_sparse=True, + build_strategy=self._new_build_strategy(), + use_cuda=False) def test_update_dense_parameter_all_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce if core.is_compiled_with_cuda(): self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy, use_cuda=True) + is_sparse=False, + build_strategy=self._new_build_strategy(), + use_cuda=True) + self.check_network_convergence( + is_sparse=False, + build_strategy=self._new_build_strategy( + use_parallel_graph=True), + use_cuda=True) + self.check_network_convergence( is_sparse=False, build_strategy=build_strategy, use_cuda=False) def test_update_sparse_parameter_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if core.is_compiled_with_cuda(): self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy, use_cuda=True) + is_sparse=True, + build_strategy=self._new_build_strategy(use_reduce=True), + use_cuda=True) self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy, use_cuda=False) + is_sparse=True, + build_strategy=self._new_build_strategy(use_reduce=True), + use_cuda=False) def test_update_dense_parameter_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = 
fluid.BuildStrategy.ReduceStrategy.Reduce if core.is_compiled_with_cuda(): self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy, use_cuda=True) + is_sparse=False, + build_strategy=self._new_build_strategy(use_reduce=True), + use_cuda=True) self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy, use_cuda=False) + is_sparse=False, + build_strategy=self._new_build_strategy(use_reduce=True), + use_cuda=False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index 531c99a835..5515ff0bb2 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -312,7 +312,7 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=use_reduce, - optimizer=optimizer(lr_scale=lr_scale), + optimizer=optimizer(), use_parallel_graph=use_parallel_graph) self.assertAlmostEquals( From 28cdfbc2b0b2df44ea94eefd8f4839fa99e4b39d Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 26 Dec 2018 17:07:05 +0800 Subject: [PATCH 067/197] delete comment code --- paddle/fluid/framework/parallel_executor.cc | 26 +-------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ec44cae3b3..6ad86e900d 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -247,39 +247,15 @@ ParallelExecutor::ParallelExecutor( if (nccl_id_var != nullptr) { nccl_id = nccl_id_var->GetMutable(); } - if (build_strategy.enable_parallel_graph_ && places.size() > 1) { if (nccl_id == nullptr) { nccl_id = new ncclUniqueId(); PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(nccl_id)); } } + member_->nccl_ctxs_.reset(new platform::NCCLContextMap( member_->places_, nccl_id, num_trainers, trainer_id)); - -/** -if (build_strategy.enable_parallel_graph_ && places.size() > 1) { - // parallel graph mode should initialize nccl by ncclCommInitRank since - // it call nccl operator per device per thread. - if (nccl_id_var == nullptr) { - nccl_id = new ncclUniqueId(); - PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(nccl_id)); - *member_->global_scope_->Var(NCCL_ID_VARNAME) - ->GetMutable() = *nccl_id; - } else { - nccl_id = nccl_id_var->GetMutable(); - } -} else if (nccl_id_var != nullptr) { // the other executor type. - // the distributed training with nccl mode would initialize the nccl id in - // startup_program. - nccl_id = nccl_id_var->GetMutable(); -} else { - // initlize NCCL by ncclCommInitAll, do not need to intialize the nccl_id. 
-} - -member_->nccl_ctxs_.reset(new platform::NCCLContextMap( - member_->places_, nccl_id, num_trainers, trainer_id)); -**/ #else PADDLE_THROW("Not compiled with CUDA"); #endif From 495e73d766014189a2d544094e37ddb09189f84c Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 26 Dec 2018 17:18:44 +0800 Subject: [PATCH 068/197] enable gc --- paddle/fluid/framework/parallel_executor.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 6ad86e900d..45e34cab4c 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -292,10 +292,11 @@ ParallelExecutor::ParallelExecutor( graphs.push_back(std::move(graph)); #endif auto max_memory_size = GetEagerDeletionThreshold(); - // TODO(Yancey1989): fix gc failed on ParallelGraph strategy. - if (max_memory_size >= 0 && !build_strategy.enable_parallel_graph_) { - graphs[0] = member_->PrepareGCAndRefCnts( - std::move(graphs[0]), static_cast(max_memory_size)); + if (max_memory_size >= 0) { + for (size_t i = 0; i < graphs.size(); ++i) { + graphs[i] = member_->PrepareGCAndRefCnts( + std::move(graphs[i]), static_cast(max_memory_size)); + } } // Step 3. Create vars in each scope. Passes may also create new vars. From a8612adb04cbf2383c1c7a9a23ab1e8b7997130f Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 26 Dec 2018 17:49:02 +0800 Subject: [PATCH 069/197] fix lr scale test=develop --- .../test_parallel_executor_seresnext.py | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index 5515ff0bb2..9bdaab162f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -167,17 +167,13 @@ def cosine_decay(learning_rate, step_each_epoch, epochs=120): return decayed_lr -def optimizer(learning_rate=0.01, lr_scale=1.0): - def _opt(): - return fluid.optimizer.Momentum( - learning_rate=cosine_decay( - learning_rate=learning_rate / lr_scale, - step_each_epoch=2, - epochs=1), - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) - - return _opt +def optimizer(learning_rate=0.01): + optimizer = fluid.optimizer.Momentum( + learning_rate=cosine_decay( + learning_rate=learning_rate, step_each_epoch=2, epochs=1), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + return optimizer class TestResnet(TestParallelExecutorBase): @@ -220,7 +216,7 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=False, - optimizer=optimizer()) + optimizer=optimizer) reduce_first_loss, reduce_last_loss = self.check_network_convergence( model, feed_dict={"image": img, @@ -229,7 +225,7 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=True, - optimizer=optimizer()) + optimizer=optimizer) for loss in zip(all_reduce_first_loss, reduce_first_loss): self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) @@ -247,7 +243,7 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=False, - optimizer=optimizer(), + optimizer=optimizer, enable_sequential_execution=True) reduce_first_loss_seq, reduce_last_loss_seq = self.check_network_convergence( @@ -258,7 +254,7 @@ class 
TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=True, - optimizer=optimizer(), + optimizer=optimizer, enable_sequential_execution=True) for loss in zip(all_reduce_first_loss, all_reduce_first_loss_seq): @@ -301,7 +297,7 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=use_reduce, - optimizer=optimizer(), + optimizer=optimizer, use_parallel_executor=False, use_parallel_graph=use_parallel_graph) parallel_first_loss, parallel_last_loss = self.check_network_convergence( @@ -312,7 +308,7 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=use_reduce, - optimizer=optimizer(), + optimizer=optimizer, use_parallel_graph=use_parallel_graph) self.assertAlmostEquals( From 1a4f79a7dedfa962a87a51c56dd0f422ea73b8e2 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 26 Dec 2018 19:01:53 +0800 Subject: [PATCH 070/197] fix unittest test=develop --- paddle/fluid/framework/details/build_strategy.cc | 1 + .../fluid/tests/unittests/test_parallel_executor_crf.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index cb660cb8c2..5042652602 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -153,6 +153,7 @@ std::unique_ptr BuildStrategy::Apply( pass->Erase("local_scopes"); pass->SetNotOwned>("local_scopes", &local_scopes); + pass->Erase("num_parallel_devices"); pass->Set("num_parallel_devices", new size_t(num_parallel_devices)); diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index 3e4490aa58..41286ba08c 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -211,7 +211,9 @@ class TestCRFModel(unittest.TestCase): use_cuda=True) self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy, use_cuda=False) + is_sparse=False, + build_strategy=self._new_build_strategy(), + use_cuda=False) def test_update_sparse_parameter_reduce(self): if core.is_compiled_with_cuda(): From 68e9b841ab69f8484944e77c486aa226e12ed5f2 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 27 Dec 2018 10:28:30 +0800 Subject: [PATCH 071/197] Add support for optimizer --- paddle/fluid/imperative/layer.cc | 2 +- paddle/fluid/imperative/layer.h | 9 +++ paddle/fluid/imperative/tracer.h | 8 ++- paddle/fluid/operators/optimizers/sgd_op.h | 5 ++ paddle/fluid/pybind/pybind.cc | 13 +++++ python/paddle/fluid/framework.py | 28 ++++++++- python/paddle/fluid/initializer.py | 1 + python/paddle/fluid/layer_helper.py | 2 +- python/paddle/fluid/layers/tensor.py | 57 ++++++++++++------- python/paddle/fluid/optimizer.py | 45 +++++++++++---- .../tests/unittests/test_imperative_mnist.py | 7 ++- 11 files changed, 139 insertions(+), 38 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index fcddcc4ed4..2c615275d1 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -104,7 +104,7 @@ class Autograd { framework::Variable* CreateVariable(const std::string& name, const framework::DDim& dim, float val, framework::Scope* scope, - bool random_name = true) { + bool random_name = false) { std::string varname = name; if (random_name) { std::mt19937 rng; diff 
--git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 90cc3ae1a9..56112f9a90 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -45,6 +45,15 @@ class VarBase { framework::LoDTensor& Grad(); + inline framework::Variable* GradVar() { return grads_; } + + inline std::string GradName() const { + PADDLE_ENFORCE( + var_desc_, + "Couldn't get gradient variable's name, please call backward() first"); + return string::Sprintf("%s@IGrad", var_desc_->Name()); + } + OpBase* pre_op_; int pre_op_out_idx_; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index f6dac762fd..c885f39ced 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -52,7 +52,7 @@ class Tracer { const std::vector& outputs, framework::BlockDesc* block, const bool stop_gradient) { framework::OpDesc* op_desc = op->op_desc_; - VLOG(3) << "tracer tracing " << op_desc->Type(); + LOG(ERROR) << "tracer tracing " << op_desc->Type(); op_desc->InferShape(*block); op_desc->InferVarType(block); std::unique_ptr op_base = @@ -61,7 +61,10 @@ class Tracer { *op->input_vars_ = inputs; for (VarBase* input : inputs) { const std::string vname = input->var_desc_->Name(); + LOG(ERROR) << "input: " << vname; + LOG(ERROR) << "input var: " << input->var_; framework::Variable* var = root_scope_->Var(vname); + LOG(ERROR) << "var_ in tracer pointer: " << var; input->var_ = var; if (!var->IsInitialized()) { framework::VarDesc* var_desc = block->FindVar(vname); @@ -84,6 +87,7 @@ class Tracer { *op->output_vars_ = outputs; for (size_t i = 0; i < outputs.size(); ++i) { const std::string vname = outputs[i]->var_desc_->Name(); + LOG(ERROR) << "output name: " << vname; framework::Variable* var = root_scope_->Var(vname); if (!var->IsInitialized()) { framework::VarDesc* var_desc = block->FindVar(vname); @@ -98,7 +102,7 @@ class Tracer { outputs[i]->pre_op_out_idx_ = i; } - VLOG(3) << "tracer running " << op_desc->Type(); + LOG(ERROR) << "tracer running " << op_desc->Type(); op_base->Run(*root_scope_, platform::CPUPlace()); if (!stop_gradient) { framework::OpDesc* grad_op_desc; diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index 98bae5e1d3..ec4218497a 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -29,6 +29,8 @@ class SGDOpKernel : public framework::OpKernel { const auto *param_var = ctx.InputVar("Param"); const auto *grad_var = ctx.InputVar("Grad"); + LOG(ERROR) << "grad_var: " << grad_var; + if (param_var->IsType()) { const auto *param = ctx.Input("Param"); auto *param_out = ctx.Output("ParamOut"); @@ -39,8 +41,11 @@ class SGDOpKernel : public framework::OpKernel { const auto *grad = ctx.Input("Grad"); auto p = framework::EigenVector::Flatten(*param); + LOG(ERROR) << "param flattened"; auto g = framework::EigenVector::Flatten(*grad); + LOG(ERROR) << "grad flattened"; auto o = framework::EigenVector::Flatten(*param_out); + LOG(ERROR) << "paramout flattened"; auto *lr = learning_rate->data(); o = p - lr[0] * g; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 9608aa9d69..c690d1b8b3 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -117,10 +117,23 @@ PYBIND11_MODULE(core, m) { [](imperative::VarBase &self, framework::Scope *scope) { self.RunBackward(scope); }) + .def("_grad_var", + [](const imperative::VarBase &self) { + LOG(ERROR) << "grad_var_ pointer: " << self.grads_; 
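+           // Returned by reference (see py::return_value_policy below), so
+           // Python holds the live gradient Variable rather than a copy.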
+ return self.grads_; + }, + py::return_value_policy::reference) + .def("_grad_name", &imperative::VarBase::GradName) .def("_grad", &imperative::VarBase::Grad) + .def("_print_var_pointer", + [](const imperative::VarBase &self) { + LOG(ERROR) << self.var_desc_->Name() + << " print_var pointer: " << self.var_; + }) .def_property("value", [](const imperative::VarBase &self) { return self.var_; }, [](imperative::VarBase &self, framework::Variable *var) { + LOG(ERROR) << "set var to pointer: " << var; self.var_ = var; }, py::return_value_policy::reference) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 3dc23bd060..9073fa79b0 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -19,7 +19,6 @@ import contextlib import os import re import six -import sys import numpy as np @@ -369,6 +368,7 @@ class Variable(object): self._ivar.stop_gradient = stop_gradient def _numpy(self): + print("get_variable_tensor", self.desc.name()) scope = _imperative_tracer().get_scope() tensor = core.get_variable_tensor(scope, self.desc.name()) return np.array(tensor) @@ -380,6 +380,14 @@ class Variable(object): def _gradient(self): return np.array(self._ivar._grad()) + @property + def _value(self): + return self._ivar.value + + @_value.setter + def _value(self, v): + self._ivar.value = v + def __str__(self): return self.to_string(True) @@ -632,6 +640,7 @@ class Operator(object): if inputs is not None: for in_proto in proto.inputs: + print("create op: find_name", in_proto.name) found = find_name(inputs, in_proto.name) assert found or in_proto.dispensable, "Input {} not found".format( in_proto.name) @@ -695,9 +704,11 @@ class Operator(object): self._update_desc_attr(attr_name, attr_val) self.desc.check_attrs() + if self._has_kernel(type): self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) + if _in_imperative_mode(): self.iop = core.OpBase() self.iop.desc = self.desc @@ -1167,6 +1178,7 @@ class Block(object): def create_var(self, *args, **kwargs): var = Variable(block=self, *args, **kwargs) if 'initializer' in kwargs: + print("initializer, ", type(kwargs['initializer'])) kwargs['initializer'](var, self) return var @@ -1281,6 +1293,16 @@ class Block(object): """ op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) + print("op inputs: ", [v._numpy() for v in op.inputs]) + print("op inputs: ", [v for v in op.inputs]) + import sys + sys.stdout.flush() + for v in op.inputs: + v._ivar._print_var_pointer() + print("print var pointer end") + import sys + sys.stdout.flush() + if _in_imperative_mode(): _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], [v._ivar for v in op.outputs], self.desc, @@ -1338,6 +1360,10 @@ class Block(object): _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], [v._ivar for v in op.outputs], self.desc, kwargs.get("stop_gradient", False)) + print([v.name for v in op.outputs]) + for v in op.outputs: + v._ivar._print_var_pointer() + print("fill_constant end") self.ops.insert(0, op) return op diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 7acaed2250..fe8357aa06 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -153,6 +153,7 @@ class ConstantInitializer(Initializer): assert isinstance(var, framework.Variable) assert isinstance(block, framework.Block) # Initialization Ops should be prepended and not appended + print("fill_constant") op = block._prepend_op( 
type="fill_constant", outputs={"Out": var}, diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index eba5417723..f3413d7296 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -369,7 +369,7 @@ class LayerHelper(object): def set_variable_initializer(self, var, initializer): assert isinstance(var, Variable) - self.startup_program.global_block().create_var( + return self.startup_program.global_block().create_var( name=var.name, type=var.type, dtype=var.dtype, diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 49a486cf0c..a7565aa108 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -20,6 +20,7 @@ from ..framework import convert_np_dtype_to_dtype_ from ..framework import Variable from ..initializer import Constant, force_init_on_cpu from ..core import VarDesc +from ..imperative import base as imperative_base from .layer_function_generator import templatedoc import numpy @@ -104,15 +105,15 @@ def create_global_var(shape, Args: shape(list[int]): shape of the variable - value(float): the value of the variable. The new created + value(float): the value of the variable. The new created variable will be filled with it. dtype(string): data type of the variable - persistable(bool): if this variable is persistable. + persistable(bool): if this variable is persistable. Default: False - force_cpu(bool): force this variable to be on CPU. + force_cpu(bool): force this variable to be on CPU. Default: False - name(str|None): The name of the variable. If set to None the variable - name will be generated automatically. + name(str|None): The name of the variable. If set to None the variable + name will be generated automatically. Default: None Returns: @@ -121,21 +122,33 @@ def create_global_var(shape, Examples: .. code-block:: python - var = fluid.create_global_var(shape=[2,3], value=1.0, dtype='float32', + var = fluid.create_global_var(shape=[2,3], value=1.0, dtype='float32', persistable=True, force_cpu=True, name='new_var') """ helper = LayerHelper("global_var", **locals()) var = helper.create_global_variable( - dtype=dtype, shape=shape, persistable=persistable, name=name) - helper.set_variable_initializer( - var, initializer=Constant( - value=float(value), force_cpu=force_cpu)) + dtype=dtype, + shape=shape, + persistable=persistable, + name=name, + stop_gradient=True) + print("set_variable_initializer, ", var.name) + if imperative_base.enabled(): + var = helper.set_variable_initializer( + var, initializer=Constant( + value=float(value), force_cpu=force_cpu)) + print("get var", var) + else: + helper.set_variable_initializer( + var, initializer=Constant( + value=float(value), force_cpu=force_cpu)) + return var def cast(x, dtype): """ - This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts + This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts it to the output with :attr:`dtype`. Args: @@ -199,9 +212,9 @@ def tensor_array_to_tensor(input, axis=1, name=None): and returns that as the output. A simple example as below: - + .. 
code-block:: text - + Given: input.data = {[[0.6, 0.1, 0.3], @@ -210,9 +223,9 @@ def tensor_array_to_tensor(input, axis=1, name=None): [1.8]], [[2.3, 2.1], [2.5, 2.4]]} - + axis = 1 - + Then: output.data = [[0.6, 0.1, 0.3, 1.3, 2.3, 2.1], @@ -493,12 +506,12 @@ def argmax(x, axis=0): def argsort(input, axis=-1, name=None): """ - Performs sorting on the input Variable along the given axis, and outputs - sorted data Varibale and its corresponding index Variable with the same + Performs sorting on the input Variable along the given axis, and outputs + sorted data Varibale and its corresponding index Variable with the same shape as :attr:`input`. .. code-block:: text - + For example, the given axis is -1 and the input Variable input = [[0.15849551, 0.45865775, 0.8563702 ], @@ -511,15 +524,15 @@ def argsort(input, axis=-1, name=None): and the sorted indices along the given axis turn outs to be - indices = [[0, 1, 2], + indices = [[0, 1, 2], [0, 2, 1]] Args: input(Variable): The input Variable for sorting. - axis(int): The axis along which to sort the input Variable. When - :attr:`axis` < 0, the actual axis will be :attr:`axis` + + axis(int): The axis along which to sort the input Variable. When + :attr:`axis` < 0, the actual axis will be :attr:`axis` + rank(:attr:`input`). Default -1, the last dimension. - name(str|None): (optional) A name for this layer. If set None, the + name(str|None): (optional) A name for this layer. If set None, the layer will be named automatically. Returns: diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 59c22d4e49..7e90d47870 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -30,6 +30,7 @@ from .initializer import Constant from .layer_helper import LayerHelper from .layers import ops from .regularizer import append_regularization_ops +from .imperative import base as imperative_base __all__ = [ 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl', @@ -108,6 +109,7 @@ class Optimizer(object): # create learning rate variable for every parameter param = param_and_grad[0] param_lr = param.optimize_attr['learning_rate'] + print("param_lr: ", param_lr, self._global_learning_rate()._numpy()) if type(param_lr) == Variable: return param_lr else: @@ -301,19 +303,38 @@ class Optimizer(object): This method combines interface `append_backward()` and `create_optimization_pass()` into one. 
""" - params_grads = append_backward(loss, parameter_list, no_grad_set, - [error_clip_callback]) + if imperative_base.enabled: + if parameter_list is not None: + params_grads = parameter_list + else: + program = loss.block.program + parameters = program.global_block().all_parameters() + params_grads = [] + for param in parameters: + grad_var = Variable( + block=loss.block, + name=param._ivar._grad_name(), + stop_gradient=True) + grad_var._value = param._ivar._grad_var() + print("create grad var: ", grad_var.name) + print("grad_var value: ", grad_var._numpy()) + import sys + sys.stdout.flush() + params_grads.append((param, grad_var)) + else: + params_grads = append_backward(loss, parameter_list, no_grad_set, + [error_clip_callback]) - params_grads = sorted(params_grads, key=lambda x: x[0].name) + params_grads = sorted(params_grads, key=lambda x: x[0].name) - params_grads, table_param_and_grad, table_optimize_op = \ - self._process_distribute_lookuptable(params_grads, loss, startup_program) + params_grads, table_param_and_grad, table_optimize_op = \ + self._process_distribute_lookuptable(params_grads, loss, startup_program) - params_grads = append_gradient_clip_ops(params_grads) + params_grads = append_gradient_clip_ops(params_grads) - # Add regularization if any - params_grads = append_regularization_ops(params_grads, - self.regularization) + # Add regularization if any + params_grads = append_regularization_ops(params_grads, + self.regularization) optimize_ops = self._create_optimization_pass(params_grads, loss, startup_program) @@ -356,6 +377,10 @@ class SGDOptimizer(Optimizer): def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) + print("append sgd") + import sys + sys.stdout.flush() + # create the optimize op sgd_op = block.append_op( type=self.type, @@ -477,7 +502,7 @@ class LarsMomentumOptimizer(Optimizer): regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer. name: A optional name prefix. - + Examples: .. 
code-block:: python diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py index 9d1e079998..12d605316c 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py @@ -18,6 +18,7 @@ import numpy as np import paddle.fluid as fluid from paddle.fluid import core +from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC from paddle.fluid.imperative.base import to_variable @@ -119,7 +120,11 @@ class TestImperativeMnist(unittest.TestCase): out._backward() filter_grad = mnist._simple_img_conv_pool_1._conv2d._filter_param._gradient( ) - print(filter_grad) + # print(filter_grad) + + sgd = SGDOptimizer(learning_rate=1e-3) + sgd.minimize(out) + # np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) # with fluid.imperative.guard(): # mlp = MLP() From fff44af83f30fa698816fa888d3bf2b3b440d9d7 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 27 Dec 2018 10:37:25 +0800 Subject: [PATCH 072/197] Support simple optimizer test=develop --- python/paddle/fluid/optimizer.py | 56 +++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 7e90d47870..ba3902bcb7 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -321,6 +321,9 @@ class Optimizer(object): import sys sys.stdout.flush() params_grads.append((param, grad_var)) + + optimize_ops = self._create_optimization_pass(params_grads, loss, + startup_program) else: params_grads = append_backward(loss, parameter_list, no_grad_set, [error_clip_callback]) @@ -336,11 +339,12 @@ class Optimizer(object): params_grads = append_regularization_ops(params_grads, self.regularization) - optimize_ops = self._create_optimization_pass(params_grads, loss, - startup_program) - if table_optimize_op is not None: - optimize_ops.append(table_optimize_op) - params_grads.append(table_param_and_grad) + optimize_ops = self._create_optimization_pass(params_grads, loss, + startup_program) + if table_optimize_op is not None: + optimize_ops.append(table_optimize_op) + params_grads.append(table_param_and_grad) + return optimize_ops, params_grads @@ -389,7 +393,8 @@ class SGDOptimizer(Optimizer): "Grad": param_and_grad[1], "LearningRate": self._create_param_lr(param_and_grad) }, - outputs={"ParamOut": param_and_grad[0]}) + outputs={"ParamOut": param_and_grad[0]}, + stop_gradient=True) return sgd_op @@ -473,7 +478,8 @@ class MomentumOptimizer(Optimizer): "VelocityOut": velocity_acc }, attrs={"mu": self._momentum, - "use_nesterov": self._use_nesterov}) + "use_nesterov": self._use_nesterov}, + stop_gradient=True) return momentum_op @@ -558,7 +564,8 @@ class LarsMomentumOptimizer(Optimizer): "mu": self._momentum, "lars_coeff": self._lars_coeff, "lars_weight_decay": self._lars_weight_decay - }) + }, + stop_gradient=True) return momentum_op @@ -633,7 +640,8 @@ class AdagradOptimizer(Optimizer): }, outputs={"ParamOut": param_and_grad[0], "MomentOut": moment_acc}, - attrs={"epsilon": self._epsilon}) + attrs={"epsilon": self._epsilon}, + stop_gradient=True) return adagrad_op @@ -763,7 +771,8 @@ class AdamOptimizer(Optimizer): "beta2": self._beta2, "epsilon": self._epsilon, "lazy_mode": self._lazy_mode - }) + }, + stop_gradient=True) return adam_op @@ -785,13 +794,15 @@ class AdamOptimizer(Optimizer): type="scale", inputs={"X": 
beta1_pow_acc}, outputs={"Out": beta1_pow_acc}, - attrs={"scale": self._beta1}) + attrs={"scale": self._beta1}, + stop_gradient=True) main_block.append_op( type="scale", inputs={"X": beta2_pow_acc}, outputs={"Out": beta2_pow_acc}, - attrs={"scale": self._beta2}) + attrs={"scale": self._beta2}, + stop_gradient=True) class AdamaxOptimizer(Optimizer): @@ -902,7 +913,8 @@ class AdamaxOptimizer(Optimizer): "beta1": self._beta1, "beta2": self._beta2, "epsilon": self._epsilon - }) + }, + stop_gradient=True) return adamax_op @@ -922,7 +934,8 @@ class AdamaxOptimizer(Optimizer): type="scale", inputs={"X": beta1_pow_acc}, outputs={"Out": beta1_pow_acc}, - attrs={"scale": self._beta1}) + attrs={"scale": self._beta1}, + stop_gradient=True) class DecayedAdagradOptimizer(Optimizer): @@ -1004,7 +1017,8 @@ class DecayedAdagradOptimizer(Optimizer): }, outputs={"ParamOut": param_and_grad[0], "MomentOut": moment_acc}, - attrs={"epsilon": self._epsilon}) + attrs={"epsilon": self._epsilon}, + stop_gradient=True) return decayed_adagrad_op @@ -1100,7 +1114,8 @@ class AdadeltaOptimizer(Optimizer): "AvgSquaredUpdateOut": avg_squared_update_acc }, attrs={"epsilon": self._epsilon, - "rho": self._rho}) + "rho": self._rho}, + stop_gradient=True) return adadelta_op @@ -1249,7 +1264,8 @@ class RMSPropOptimizer(Optimizer): "decay": self._rho, "momentum": self._momentum, "centered": self._centered - }) + }, + stop_gradient=True) return rmsprop_op @@ -1370,7 +1386,8 @@ class FtrlOptimizer(Optimizer): }, attrs={"l1": self._l1, "l2": self._l1, - "lr_power": self._lr_power}) + "lr_power": self._lr_power}, + stop_gradient=True) return ftrl_op @@ -1534,7 +1551,8 @@ class ModelAverage(Optimizer): "average_window": self.average_window, "min_average_window": self.min_average_window, "max_average_window": self.max_average_window, - }) + }, + stop_gradient=True) @contextmanager def apply(self, executor, need_restore=True): From 8cad371a60d4933be202a9316fc340ede24ec6d4 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 27 Dec 2018 11:05:53 +0800 Subject: [PATCH 073/197] fix nccl unittest acc test=develop --- python/paddle/fluid/tests/unittests/test_dist_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 07cc44aaa2..0caab08f0d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -442,10 +442,10 @@ class TestDistBase(unittest.TestCase): tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2 --lr %f" tr0_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 0, w0_ep, self._lr / 2) + 0, w0_ep, self._lr) tr1_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 1, w1_ep, self._lr / 2) + 1, w1_ep, self._lr) if self._mem_opt: tr0_cmd += " --mem_opt" From 5822f7f1d8b1f53ac57c53f3118407b7024068e2 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 27 Dec 2018 12:46:08 +0800 Subject: [PATCH 074/197] Polish code test=develop --- paddle/fluid/framework/framework.proto | 2 +- paddle/fluid/imperative/layer.cc | 1 - paddle/fluid/imperative/tracer.h | 7 ++----- paddle/fluid/operators/optimizers/sgd_op.h | 5 ----- paddle/fluid/pybind/pybind.cc | 17 ++++++----------- python/paddle/fluid/framework.py | 21 ++------------------- python/paddle/fluid/initializer.py | 1 - python/paddle/fluid/layer_helper.py | 18 +++++++++++------- 
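The recurring stop_gradient=True added across the optimizers above encodes one contract: parameter-update ops mutate ParamOut in place and must never themselves be traced for gradients. Reduced to the SGD case, a sketch of the pattern (no new behavior beyond the hunks above):

    def _append_optimize_op(self, block, param_and_grad):
        # The update op writes ParamOut in place; stop_gradient=True tells
        # the imperative tracer not to record a grad op for it.
        return block.append_op(
            type=self.type,
            inputs={
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "LearningRate": self._create_param_lr(param_and_grad)
            },
            outputs={"ParamOut": param_and_grad[0]},
            stop_gradient=True)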
python/paddle/fluid/layers/tensor.py | 13 +++---------- python/paddle/fluid/optimizer.py | 10 +--------- 10 files changed, 26 insertions(+), 69 deletions(-) diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 665adfd8cb..efdabffb9b 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ syntax = "proto2"; -/* option optimize_for = LITE_RUNTIME; */ +option optimize_for = LITE_RUNTIME; package paddle.framework.proto; // Any incompatible changes to ProgramDesc and its dependencies should diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 2c615275d1..02d9ef866c 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -192,7 +192,6 @@ std::vector OpBase::ApplyGrad(framework::Scope* scope) { LOG(ERROR) << "tracer doesn't support yet"; } } - VLOG(3) << "op grad output var " << outvar << " is inited"; } grad_op_desc_->InferShape(*block_); diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index c885f39ced..de7899055d 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -52,7 +52,7 @@ class Tracer { const std::vector& outputs, framework::BlockDesc* block, const bool stop_gradient) { framework::OpDesc* op_desc = op->op_desc_; - LOG(ERROR) << "tracer tracing " << op_desc->Type(); + VLOG(3) << "tracer tracing " << op_desc->Type(); op_desc->InferShape(*block); op_desc->InferVarType(block); std::unique_ptr op_base = @@ -61,10 +61,7 @@ class Tracer { *op->input_vars_ = inputs; for (VarBase* input : inputs) { const std::string vname = input->var_desc_->Name(); - LOG(ERROR) << "input: " << vname; - LOG(ERROR) << "input var: " << input->var_; framework::Variable* var = root_scope_->Var(vname); - LOG(ERROR) << "var_ in tracer pointer: " << var; input->var_ = var; if (!var->IsInitialized()) { framework::VarDesc* var_desc = block->FindVar(vname); @@ -102,7 +99,7 @@ class Tracer { outputs[i]->pre_op_out_idx_ = i; } - LOG(ERROR) << "tracer running " << op_desc->Type(); + VLOG(3) << "tracer running " << op_desc->Type(); op_base->Run(*root_scope_, platform::CPUPlace()); if (!stop_gradient) { framework::OpDesc* grad_op_desc; diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index ec4218497a..98bae5e1d3 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -29,8 +29,6 @@ class SGDOpKernel : public framework::OpKernel { const auto *param_var = ctx.InputVar("Param"); const auto *grad_var = ctx.InputVar("Grad"); - LOG(ERROR) << "grad_var: " << grad_var; - if (param_var->IsType()) { const auto *param = ctx.Input("Param"); auto *param_out = ctx.Output("ParamOut"); @@ -41,11 +39,8 @@ class SGDOpKernel : public framework::OpKernel { const auto *grad = ctx.Input("Grad"); auto p = framework::EigenVector::Flatten(*param); - LOG(ERROR) << "param flattened"; auto g = framework::EigenVector::Flatten(*grad); - LOG(ERROR) << "grad flattened"; auto o = framework::EigenVector::Flatten(*param_out); - LOG(ERROR) << "paramout flattened"; auto *lr = learning_rate->data(); o = p - lr[0] * g; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c690d1b8b3..6c74eef07a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -117,19 +117,14 @@ 
PYBIND11_MODULE(core, m) { [](imperative::VarBase &self, framework::Scope *scope) { self.RunBackward(scope); }) - .def("_grad_var", - [](const imperative::VarBase &self) { - LOG(ERROR) << "grad_var_ pointer: " << self.grads_; - return self.grads_; - }, - py::return_value_policy::reference) .def("_grad_name", &imperative::VarBase::GradName) .def("_grad", &imperative::VarBase::Grad) - .def("_print_var_pointer", - [](const imperative::VarBase &self) { - LOG(ERROR) << self.var_desc_->Name() - << " print_var pointer: " << self.var_; - }) + .def_property("grad_value", + [](const imperative::VarBase &self) { return self.grads_; }, + [](imperative::VarBase &self, framework::Variable *grad) { + self.grads_ = grad; + }, + py::return_value_policy::reference) .def_property("value", [](const imperative::VarBase &self) { return self.var_; }, [](imperative::VarBase &self, framework::Variable *var) { diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 9073fa79b0..6c5dd84460 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -361,6 +361,7 @@ class Variable(object): self.block.vars[name] = self self.op = None + self.stop_gradient = stop_gradient self.is_data = is_data if _in_imperative_mode(): self._ivar = core.VarBase() @@ -368,7 +369,6 @@ class Variable(object): self._ivar.stop_gradient = stop_gradient def _numpy(self): - print("get_variable_tensor", self.desc.name()) scope = _imperative_tracer().get_scope() tensor = core.get_variable_tensor(scope, self.desc.name()) return np.array(tensor) @@ -597,8 +597,7 @@ class Operator(object): type=None, inputs=None, outputs=None, - attrs=None, - stop_gradient=False): + attrs=None): self.block = block self.desc = desc # note: not add self.attrs here: @@ -640,7 +639,6 @@ class Operator(object): if inputs is not None: for in_proto in proto.inputs: - print("create op: find_name", in_proto.name) found = find_name(inputs, in_proto.name) assert found or in_proto.dispensable, "Input {} not found".format( in_proto.name) @@ -1178,7 +1176,6 @@ class Block(object): def create_var(self, *args, **kwargs): var = Variable(block=self, *args, **kwargs) if 'initializer' in kwargs: - print("initializer, ", type(kwargs['initializer'])) kwargs['initializer'](var, self) return var @@ -1293,16 +1290,6 @@ class Block(object): """ op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) - print("op inputs: ", [v._numpy() for v in op.inputs]) - print("op inputs: ", [v for v in op.inputs]) - import sys - sys.stdout.flush() - for v in op.inputs: - v._ivar._print_var_pointer() - print("print var pointer end") - import sys - sys.stdout.flush() - if _in_imperative_mode(): _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], [v._ivar for v in op.outputs], self.desc, @@ -1360,10 +1347,6 @@ class Block(object): _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], [v._ivar for v in op.outputs], self.desc, kwargs.get("stop_gradient", False)) - print([v.name for v in op.outputs]) - for v in op.outputs: - v._ivar._print_var_pointer() - print("fill_constant end") self.ops.insert(0, op) return op diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index fe8357aa06..7acaed2250 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -153,7 +153,6 @@ class ConstantInitializer(Initializer): assert isinstance(var, framework.Variable) assert isinstance(block, framework.Block) # Initialization Ops should be prepended 
and not appended - print("fill_constant") op = block._prepend_op( type="fill_constant", outputs={"Out": var}, diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index f3413d7296..8a8470db46 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -22,6 +22,7 @@ import numpy as np from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating from . import unique_name +from paddle.fluid.imperative import base as imperative_base from paddle.fluid.imperative.base import to_variable from paddle.fluid.initializer import Constant, Xavier from .param_attr import ParamAttr, WeightNormParamAttr @@ -369,13 +370,16 @@ class LayerHelper(object): def set_variable_initializer(self, var, initializer): assert isinstance(var, Variable) - return self.startup_program.global_block().create_var( - name=var.name, - type=var.type, - dtype=var.dtype, - shape=var.shape, - persistable=True, - initializer=initializer) + if imperative_base.enabled(): + initializer(var, self.startup_program.global_block()) + else: + self.startup_program.global_block().create_var( + name=var.name, + type=var.type, + dtype=var.dtype, + shape=var.shape, + persistable=True, + initializer=initializer) def append_bias_op(self, input_var, dim_start=1, dim_end=None): """ diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index a7565aa108..fcad14c748 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -132,16 +132,9 @@ def create_global_var(shape, persistable=persistable, name=name, stop_gradient=True) - print("set_variable_initializer, ", var.name) - if imperative_base.enabled(): - var = helper.set_variable_initializer( - var, initializer=Constant( - value=float(value), force_cpu=force_cpu)) - print("get var", var) - else: - helper.set_variable_initializer( - var, initializer=Constant( - value=float(value), force_cpu=force_cpu)) + helper.set_variable_initializer( + var, initializer=Constant( + value=float(value), force_cpu=force_cpu)) return var diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index ba3902bcb7..5a13ee5368 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -109,7 +109,6 @@ class Optimizer(object): # create learning rate variable for every parameter param = param_and_grad[0] param_lr = param.optimize_attr['learning_rate'] - print("param_lr: ", param_lr, self._global_learning_rate()._numpy()) if type(param_lr) == Variable: return param_lr else: @@ -311,15 +310,12 @@ class Optimizer(object): parameters = program.global_block().all_parameters() params_grads = [] for param in parameters: + # create gradient variable grad_var = Variable( block=loss.block, name=param._ivar._grad_name(), stop_gradient=True) grad_var._value = param._ivar._grad_var() - print("create grad var: ", grad_var.name) - print("grad_var value: ", grad_var._numpy()) - import sys - sys.stdout.flush() params_grads.append((param, grad_var)) optimize_ops = self._create_optimization_pass(params_grads, loss, @@ -381,10 +377,6 @@ class SGDOptimizer(Optimizer): def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) - print("append sgd") - import sys - sys.stdout.flush() - # create the optimize op sgd_op = block.append_op( type=self.type, From 28013a50488eb02086fa2f591f11c7c0d2bc49b9 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 27 Dec 2018 12:51:56 +0800 
Subject: [PATCH 075/197] Polish code test=develop --- python/paddle/fluid/optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 5a13ee5368..5cdbe7c10d 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -315,7 +315,7 @@ class Optimizer(object): block=loss.block, name=param._ivar._grad_name(), stop_gradient=True) - grad_var._value = param._ivar._grad_var() + grad_var._value = param._ivar.grad_value() params_grads.append((param, grad_var)) optimize_ops = self._create_optimization_pass(params_grads, loss, From 336160e65118354c134274b09bb996b03ddf8460 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 27 Dec 2018 17:26:29 +0800 Subject: [PATCH 076/197] Complete imperative optimizer implementation test=develop --- paddle/fluid/imperative/tracer.h | 1 - paddle/fluid/pybind/pybind.cc | 1 - python/paddle/fluid/framework.py | 30 ++++++--- python/paddle/fluid/layer_helper.py | 5 +- python/paddle/fluid/optimizer.py | 4 +- .../tests/unittests/test_imperative_mnist.py | 63 +++++-------------- 6 files changed, 39 insertions(+), 65 deletions(-) diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index de7899055d..f6dac762fd 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -84,7 +84,6 @@ class Tracer { *op->output_vars_ = outputs; for (size_t i = 0; i < outputs.size(); ++i) { const std::string vname = outputs[i]->var_desc_->Name(); - LOG(ERROR) << "output name: " << vname; framework::Variable* var = root_scope_->Var(vname); if (!var->IsInitialized()) { framework::VarDesc* var_desc = block->FindVar(vname); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 23248a5dee..74fee64671 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -139,7 +139,6 @@ PYBIND11_MODULE(core, m) { .def_property("value", [](const imperative::VarBase &self) { return self.var_; }, [](imperative::VarBase &self, framework::Variable *var) { - LOG(ERROR) << "set var to pointer: " << var; self.var_ = var; }, py::return_value_policy::reference) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 6c5dd84460..fc00adfbb6 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1289,13 +1289,22 @@ class Block(object): Operator: the append Operator. 
""" op_desc = self.desc.append_op() - op = Operator(block=self, desc=op_desc, *args, **kwargs) + op = Operator( + block=self, + desc=op_desc, + type=kwargs.get("type", None), + inputs=kwargs.get("inputs", None), + outputs=kwargs.get("outputs", None), + attrs=kwargs.get("attrs", None)) + self.ops.append(op) + self._trace_op(op, kwargs.get("stop_gradient", False)) + return op + + def _trace_op(self, op, stop_gradient=False): if _in_imperative_mode(): _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], [v._ivar for v in op.outputs], self.desc, - kwargs.get("stop_gradient", False)) - self.ops.append(op) - return op + stop_gradient) def _insert_op(self, index, *args, **kwargs): """ @@ -1342,12 +1351,15 @@ class Block(object): def _prepend_op(self, *args, **kwargs): op_desc = self.desc._prepend_op() - op = Operator(self, op_desc, *args, **kwargs) - if _in_imperative_mode(): - _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], - [v._ivar for v in op.outputs], self.desc, - kwargs.get("stop_gradient", False)) + op = Operator( + self, + op_desc, + type=kwargs.get("type", None), + inputs=kwargs.get("inputs", None), + outputs=kwargs.get("outputs", None), + attrs=kwargs.get("attrs", None)) self.ops.insert(0, op) + self._trace_op(op, kwargs.get("stop_gradient", False)) return op def _sync_with_cpp(self): diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 8a8470db46..5429a73533 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -23,7 +23,6 @@ import numpy as np from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating from . import unique_name from paddle.fluid.imperative import base as imperative_base -from paddle.fluid.imperative.base import to_variable from paddle.fluid.initializer import Constant, Xavier from .param_attr import ParamAttr, WeightNormParamAttr from . import core @@ -51,7 +50,7 @@ class LayerHelper(object): return default_startup_program() def to_variable(self, x): - return base.to_variable(x, self.main_program.current_block()) + return imperative_base.to_variable(x, self.main_program.current_block()) def append_op(self, *args, **kwargs): return self.main_program.current_block().append_op(*args, **kwargs) @@ -371,7 +370,7 @@ class LayerHelper(object): def set_variable_initializer(self, var, initializer): assert isinstance(var, Variable) if imperative_base.enabled(): - initializer(var, self.startup_program.global_block()) + initializer(var, var.block) else: self.startup_program.global_block().create_var( name=var.name, diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 5cdbe7c10d..779cb5f961 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -302,7 +302,7 @@ class Optimizer(object): This method combines interface `append_backward()` and `create_optimization_pass()` into one. 
""" - if imperative_base.enabled: + if imperative_base.enabled(): if parameter_list is not None: params_grads = parameter_list else: @@ -315,7 +315,7 @@ class Optimizer(object): block=loss.block, name=param._ivar._grad_name(), stop_gradient=True) - grad_var._value = param._ivar.grad_value() + grad_var._value = param._ivar.grad_value params_grads.append((param, grad_var)) optimize_ops = self._create_optimization_pass(params_grads, loss, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py index 12d605316c..a2e008615c 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py @@ -43,15 +43,6 @@ class SimpleImgConvPool(fluid.imperative.PyLayer): bias_attr=None): super(SimpleImgConvPool, self).__init__() - # groups = 1 - # dilation = [1, 1] - # pad = [0, 0] - # stride = [1, 1] - # input_size = [2, 3, 5, 5] # NCHW - # assert np.mod(input_size[1], groups) == 0 - # f_c = input_size[1] // groups - # filter_size = [6, f_c, 3, 3] - self._conv2d = Conv2D( num_channels=num_channels, num_filters=num_filters, @@ -108,47 +99,21 @@ class TestImperativeMnist(unittest.TestCase): def test_mnist_cpu_float32(self): with fluid.imperative.guard(): mnist = MNIST() - - x_data = np.random.rand(128, 1, 28, 28).astype('float32') - img = to_variable(x_data) - y_data = np.random.rand(128, 1).astype('int64') - label = to_variable(y_data) - label._stop_gradient = True - - predict = mnist(img) - out = fluid.layers.cross_entropy(predict, label) - out._backward() - filter_grad = mnist._simple_img_conv_pool_1._conv2d._filter_param._gradient( - ) - # print(filter_grad) - sgd = SGDOptimizer(learning_rate=1e-3) - sgd.minimize(out) - - # np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - # with fluid.imperative.guard(): - # mlp = MLP() - # out = mlp(np_inp) - # dy_out = out._numpy() - # out._backward() - # dy_grad = mlp._fc1._w._gradient() - - # with new_program_scope(): - # inp = fluid.layers.data( - # name="inp", shape=[2, 2], append_batch_size=False) - # mlp = MLP() - # out = mlp(inp) - # param_grads = fluid.backward.append_backward( - # out, parameter_list=[mlp._fc1._w.name])[0] - # exe = fluid.Executor(fluid.CPUPlace()) - # exe.run(fluid.default_startup_program()) - - # static_out, static_grad = exe.run( - # feed={inp.name: np_inp}, - # fetch_list=[out.name, param_grads[1].name]) - - # self.assertTrue(np.allclose(dy_out, static_out)) - # self.assertTrue(np.allclose(dy_grad, static_grad)) + + for i in range(1): + x_data = np.random.rand(128, 1, 28, 28).astype('float32') + img = to_variable(x_data) + y_data = np.random.rand(128, 1).astype('int64') + label = to_variable(y_data) + label._stop_gradient = True + + predict = mnist(img) + out = fluid.layers.cross_entropy(predict, label) + out._backward() + filter_grad = mnist._simple_img_conv_pool_1._conv2d._filter_param._gradient( + ) + sgd.minimize(out) if __name__ == '__main__': From 6bb84490af42ebf77c3fa1caf8416d7ba15e2b8e Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 27 Dec 2018 22:57:59 +0800 Subject: [PATCH 077/197] Fix imperative unit test test=develop --- paddle/fluid/imperative/layer.cc | 5 +- paddle/fluid/imperative/tracer.h | 3 + python/paddle/fluid/layers/nn.py | 62 +++++++++---------- .../fluid/tests/unittests/test_imperative.py | 12 ++-- .../tests/unittests/test_imperative_mnist.py | 2 +- 5 files changed, 46 insertions(+), 38 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc 
b/paddle/fluid/imperative/layer.cc index 02d9ef866c..cf330cda5e 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -61,6 +61,9 @@ class Autograd { for (size_t i = 0; i < input_grads.size(); ++i) { if (!input_grads[i]) continue; + if (ready_op->input_vars_->at(i)->stop_gradient_) { + continue; + } OpBase* pre_op = ready_op->pre_ops_->at(i); if (!pre_op) continue; @@ -152,7 +155,7 @@ void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) { } std::vector OpBase::ApplyGrad(framework::Scope* scope) { - VLOG(3) << "op grad " << grad_op_desc_->Type(); + VLOG(3) << "op grad type: " << grad_op_desc_->Type(); for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) { if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) { diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index f6dac762fd..776f228875 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -93,6 +93,8 @@ class Tracer { LOG(ERROR) << "tracer doesn't support yet"; } } + + outputs[i]->stop_gradient_ = stop_gradient; outputs[i]->var_ = var; outputs[i]->pre_op_ = op; outputs[i]->pre_op_out_idx_ = i; @@ -106,6 +108,7 @@ class Tracer { CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); op->grad_op_desc_ = grad_op_desc; op->grad_to_var_ = grad_to_var; + VLOG(3) << "tracer create grad op " << grad_op_desc->Type(); } op->block_ = block; } diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 613025d3c6..541c757389 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9348,7 +9348,7 @@ class PyFuncRegistry(object): raise TypeError('func must be a Python function') self._func = func - # find named args using reflection + # find named args using reflection args = inspect.getargspec(self._func) if len(args[0]) == 0 and args[1] is None and args[2] is None: # Function with no inputs @@ -9359,15 +9359,15 @@ class PyFuncRegistry(object): ''' Why record self here? - 1. For debug usage. Users can call - :code:`py_func.registered_func(idx)` method + 1. For debug usage. Users can call + :code:`py_func.registered_func(idx)` method to find the registered function corresponding - to :code:`idx`. + to :code:`idx`. - 2. For increasing reference count of self. - It seems that to release Python object + 2. For increasing reference count of self. + It seems that to release Python object whose reference count is 1 would cause - segmentation fault error in C++ side. + segmentation fault error in C++ side. May be lack of Python GC in C++ side? ''' PyFuncRegistry._register_funcs.append(self) @@ -9418,7 +9418,7 @@ class PyFuncRegistry(object): def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): """ PyFunc Operator. - + User can use :code:`py_func` to register operators in Python side. The inputs of :code:`func` is :code:`LoDTensor` and outputs can be numpy array or :code:`LoDTensor`. Paddle would call the registered @@ -9436,7 +9436,7 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): no gradient, users should return None. This function can also be used to debug the running network. User can - add a :code:`py_func` operator without output, and print input + add a :code:`py_func` operator without output, and print input :code:`x` inside :code:`func`. 
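With the layer.cc and tracer.h hunks above, stop_gradient now propagates end to end: the tracer stamps it onto every traced output, and the backward walk skips any pre-op branch reached through a stopped input. In user terms, a fragment in the spirit of the MNIST test (y_data and predict as defined there):

    label = to_variable(y_data)
    label._stop_gradient = True   # sets VarBase::stop_gradient_ in C++

    loss = fluid.layers.cross_entropy(predict, label)
    loss._backward()
    # Autograd's pre-op loop above skips label's branch entirely, so no
    # gradient is ever accumulated for the label data.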
Args: @@ -9444,50 +9444,50 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): x (Variable|list(Variable)|tuple(Variable)): inputs of :code:`func`. out (Variable|list(Variable)|tuple(Variable)): outputs of :code:`func`. Paddle cannot infer shapes and data types of :code:`out`. Users - should create :code:`out` beforehand. + should create :code:`out` beforehand. backward_func (callable|None): backward Python function. - None means no backward. Default None. + None means no backward. Default None. skip_vars_in_backward_input (Variable|list(Variable)|tuple(Variable)): - Variables that are not needed in :code:`backward_func` inputs. + Variables that are not needed in :code:`backward_func` inputs. These variables must be any of :code:`x` and :code:`out`. If set, these vars would not be inputs of :code:`backward_func`, - Only useful when :code:`backward_func` is not None. Default None. + Only useful when :code:`backward_func` is not None. Default None. Returns: out (Variable|list(Variable)|tuple(Variable)): input :code:`out` Examples: - + >>> import paddle.fluid as fluid >>> import six >>> >>> def create_tmp_var(name, dtype, shape): >>> return fluid.default_main_program().current_block().create_var( - >>> name=name, dtype=dtype, shape=shape) + >>> name=name, dtype=dtype, shape=shape) >>> >>> # tanh activation has been provided by Paddle C++ op - >>> # Here, we only use tanh to be an example to show the usage + >>> # Here, we only use tanh to be an example to show the usage >>> # of py_func >>> def tanh(x): >>> return np.tanh(x) - >>> + >>> >>> # forward input x is skipped >>> def tanh_grad(y, dy): >>> return np.array(dy) * (1 - np.square(np.array(y))) >>> >>> def debug_func(x): - >>> print(x) + >>> print(x) >>> >>> def simple_net(img, label): >>> hidden = img >>> for idx in six.moves.range(4): >>> hidden = fluid.layers.fc(hidden, size=200) >>> new_hidden = create_tmp_var(name='hidden_{}'.format(idx), - >>> dtype=hidden.dtype, shape=hidden.shape) + >>> dtype=hidden.dtype, shape=hidden.shape) >>> >>> # user-defined layers with forward and backward - >>> hidden = fluid.layers.py_func(func=tanh, x=hidden, - >>> out=new_hidden, backward_func=tanh_grad, + >>> hidden = fluid.layers.py_func(func=tanh, x=hidden, + >>> out=new_hidden, backward_func=tanh_grad, >>> skip_vars_in_backward_input=hidden) >>> >>> # user-defined debug layers to print variables @@ -9666,14 +9666,15 @@ class FC(layers.PyLayer): param_attr=None, num_flatten_dims=1, dtype=core.VarDesc.VarType.FP32): - super(FC, self).__init__() + super(FC, self).__init__(param_attr=param_attr) self._size = size self._num_flatten_dims = num_flatten_dims self._dtype = dtype - self._helper = LayerHelper('FC', param_attr=param_attr) + self._tmp = self._helper.create_variable_for_type_inference(self._dtype) + self._out = self._helper.create_variable_for_type_inference(self._dtype) def _build_once(self, inputs): - input_shape = inputs[0].shape + input_shape = inputs.shape param_shape = [ reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1) ] + [self._size] @@ -9684,21 +9685,20 @@ class FC(layers.PyLayer): is_bias=False) def forward(self, inputs): - tmp = self._helper.create_variable_for_type_inference(self._dtype) self._helper.append_op( type="mul", - inputs={"X": inputs[0], + inputs={"X": inputs, "Y": self._w}, - outputs={"Out": tmp}, + outputs={"Out": self._tmp}, attrs={ "x_num_col_dims": self._num_flatten_dims, "y_num_col_dims": 1 }) - out = self._helper.create_variable_for_type_inference(self._dtype) 
self._helper.append_op( type="sum", - inputs={"X": [tmp]}, - outputs={"Out": out}, + inputs={"X": [self._tmp]}, + outputs={"Out": self._out}, attrs={"use_mkldnn": False}) - return out + + return self._out diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 0fe69d1bd4..ccf0743ea6 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -36,7 +36,7 @@ class MyLayer(fluid.imperative.PyLayer): super(MyLayer, self).__init__() def forward(self, inputs): - x = fluid.layers.relu(inputs[0]) + x = fluid.layers.relu(inputs) self._x_for_debug = x return [fluid.layers.elementwise_mul(x, x)] @@ -52,7 +52,7 @@ class MLP(fluid.imperative.PyLayer): initializer=fluid.initializer.Constant(value=0.1))) def forward(self, inputs): - x = self._fc1(inputs[0]) + x = self._fc1(inputs) x = self._fc2(x) x = fluid.layers.reduce_sum(x) return x @@ -64,13 +64,14 @@ class TestImperative(unittest.TestCase): cl = core.Layer() cl.forward([]) l = fluid.imperative.PyLayer() - l.forward([]) + self.assertRaises(NotImplementedError, l.forward, []) def test_layer_in_out(self): np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) with fluid.imperative.guard(): + var_inp = fluid.imperative.base.to_variable(np_inp) l = MyLayer() - x = l(np_inp)[0] + x = l(var_inp)[0] self.assertIsNotNone(x) dy_out = x._numpy() x._backward() @@ -95,8 +96,9 @@ class TestImperative(unittest.TestCase): def test_mlp(self): np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) with fluid.imperative.guard(): + var_inp = fluid.imperative.base.to_variable(np_inp) mlp = MLP() - out = mlp(np_inp) + out = mlp(var_inp) dy_out = out._numpy() out._backward() dy_grad = mlp._fc1._w._gradient() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py index a2e008615c..802db5d1e0 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py @@ -101,7 +101,7 @@ class TestImperativeMnist(unittest.TestCase): mnist = MNIST() sgd = SGDOptimizer(learning_rate=1e-3) - for i in range(1): + for i in range(2): x_data = np.random.rand(128, 1, 28, 28).astype('float32') img = to_variable(x_data) y_data = np.random.rand(128, 1).astype('int64') From 49cce3fd0eac5d1247350290e9642acefbb549fa Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 28 Dec 2018 12:08:15 +0800 Subject: [PATCH 078/197] fix dist sparse l2 decay test=develop --- .../fluid/tests/unittests/dist_se_resnext.py | 1 - .../fluid/transpiler/distribute_transpiler.py | 24 ++++++++++--------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index 5da3705706..c3d84dba0a 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -235,7 +235,6 @@ class DistSeResneXt2x2(TestDistRunnerBase): bd = [step * e for e in epochs] base_lr = 0.1 - lr = [] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] optimizer = fluid.optimizer.Momentum( diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index d21ec42dcc..f223d86554 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ 
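The FC rewrite above depends on a build-on-first-call pattern: _build_once runs with a concrete input, so parameter shapes can be inferred rather than declared up front. A minimal sketch of that contract; the real dispatch lives in the PyLayer base class and is assumed here, not quoted:

    class LazyLayer(object):
        def __init__(self):
            self._built = False

        def __call__(self, inputs):
            if not self._built:
                self._build_once(inputs)  # first call sees concrete shapes
                self._built = True
            return self.forward(inputs)

        def _build_once(self, inputs):
            pass  # create parameters whose shapes depend on inputs.shape

        def forward(self, inputs):
            raise NotImplementedError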
b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -744,12 +744,6 @@ class DistributeTranspiler(object): elif op not in lr_ops: self._append_pserver_non_opt_ops(block, op) - def __op_have_grad_input__(op): - for varname in op.input_arg_names: - if varname.find("@GRAD") >= 0: - return varname - return "" - def __clone_lr_op_sub_block__(op, program, lr_block): if not op.has_attr('sub_block'): return @@ -800,7 +794,7 @@ class DistributeTranspiler(object): merged_var = None for _, op in enumerate(self.optimize_ops): # find the origin grad var before clipping/L2Decay, - # merged_var should be the input var name of L2Decaybuil + # merged_var should be the input var name of L2Decay grad_varname_for_block = op.attr(OP_ROLE_VAR_ATTR_NAME)[1] if op.attr(OP_ROLE_VAR_ATTR_NAME)[ 0] == optimize_target_param_name: @@ -1278,9 +1272,8 @@ class DistributeTranspiler(object): # create table param and grad var in pserver program # create table optimize block in pserver program table_opt_op = [ - op for op in self.optimize_ops - if 'Param' in op.input_names and op.input("Param")[0] == - self.table_name + op for op in self.optimize_ops if 'Param' in op.input_names and + op.input("Param")[0] == self.table_name ][0] origin_param_var = self.origin_program.global_block().vars[ @@ -1676,7 +1669,16 @@ class DistributeTranspiler(object): if self.config.enable_dc_asgd: new_inputs[key] = dc else: - new_inputs[key] = merged_var + # Note!! This is for l2decay on sparse gradient, because it will create a new tensor for + # decayed gradient but not inplace modify the origin one + origin_grad_name = opt_op.input(key)[0] + if core.kNewGradSuffix( + ) in origin_grad_name and pserver_block.has_var( + origin_grad_name): + new_grad = pserver_block.var(origin_grad_name) + new_inputs[key] = new_grad + else: + new_inputs[key] = merged_var elif key == "Param": param_block = _get_param_block(opt_op) if not param_block: From e77f54734b04484aac99fa866cf9d40db53da876 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 28 Dec 2018 12:28:52 +0800 Subject: [PATCH 079/197] add unit test for dist sparse l2 decay --- .../paddle/fluid/tests/unittests/dist_ctr.py | 13 ++++++++- .../tests/unittests/dist_ctr_with_l2_decay.py | 27 +++++++++++++++++++ .../fluid/tests/unittests/test_dist_ctr.py | 10 +++++++ 3 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 python/paddle/fluid/tests/unittests/dist_ctr_with_l2_decay.py diff --git a/python/paddle/fluid/tests/unittests/dist_ctr.py b/python/paddle/fluid/tests/unittests/dist_ctr.py index 6596982433..dd97853a4c 100644 --- a/python/paddle/fluid/tests/unittests/dist_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_ctr.py @@ -30,7 +30,12 @@ fluid.default_main_program().random_seed = 1 class TestDistCTR2x2(TestDistRunnerBase): + def config(self): + self.use_l2_decay = False + def get_model(self, batch_size=2): + self.config() + dnn_input_dim, lr_input_dim = dist_ctr_reader.load_data_meta() """ network definition """ dnn_data = fluid.layers.data( @@ -97,7 +102,13 @@ class TestDistCTR2x2(TestDistRunnerBase): inference_program = paddle.fluid.default_main_program().clone() - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001) + regularization = None + if self.use_l2_decay: + regularization = fluid.regularizer.L2DecayRegularizer( + regularization_coeff=1e-3) + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001, + regularization=regularization) sgd_optimizer.minimize(avg_cost) dataset = dist_ctr_reader.Dataset() diff --git 
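The transpiler change above exists because L2 decay applied to a sparse gradient cannot update it in place: it writes a new, decayed gradient variable, and the pserver-side optimizer has to consume that variable instead of the merged original. The selection logic in isolation (the function name is illustrative; core.kNewGradSuffix() is the real helper used above):

    def _grad_input_for(pserver_block, opt_op, merged_var):
        # Prefer the decayed gradient written by L2Decay on a sparse grad;
        # fall back to the merged gradient for the dense / in-place case.
        origin_grad_name = opt_op.input("Grad")[0]
        if (core.kNewGradSuffix() in origin_grad_name and
                pserver_block.has_var(origin_grad_name)):
            return pserver_block.var(origin_grad_name)
        return merged_var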
a/python/paddle/fluid/tests/unittests/dist_ctr_with_l2_decay.py b/python/paddle/fluid/tests/unittests/dist_ctr_with_l2_decay.py new file mode 100644 index 0000000000..a7fbfd644d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_ctr_with_l2_decay.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import dist_ctr +from test_dist_base import runtime_main + + +class TestDistCTRWithL2Decay(dist_ctr.TestDistCTR2x2): + def config(self): + self.use_l2_decay = True + + +if __name__ == "__main__": + runtime_main(TestDistCTRWithL2Decay) diff --git a/python/paddle/fluid/tests/unittests/test_dist_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_ctr.py index b2d979729b..f6b0971c5c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_ctr.py @@ -28,5 +28,15 @@ class TestDistCTR2x2(TestDistBase): self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False) +class TestDistCTR2x2WithL2Decay(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" + + def test_dist_ctr(self): + self.check_with_place( + "dist_ctr_with_l2_decay.py", delta=1e-7, check_error_log=False) + + if __name__ == "__main__": unittest.main() From 6a5f604607e06a0dffaf16ffe88d7033ecc42b30 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 28 Dec 2018 13:19:33 +0800 Subject: [PATCH 080/197] Support stop_gradients var in imperative backward test=develop --- paddle/fluid/framework/operator.h | 9 +++++++ paddle/fluid/framework/operator_test.cc | 9 +++++++ paddle/fluid/imperative/layer.cc | 33 ++++++++++++++----------- paddle/fluid/imperative/tracer.h | 2 +- paddle/fluid/pybind/pybind.cc | 4 +-- 5 files changed, 40 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e2bedc60d2..87bb28c0c5 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -69,6 +69,15 @@ inline std::string GradVarName(const std::string& var_name) { return result; } +inline std::string OriginVarName(const std::string& grad_var_name) { + std::size_t pos = grad_var_name.find_last_of(kGradVarSuffix); + if (pos == std::string::npos) { + return grad_var_name; + } else { + return grad_var_name.substr(0, pos); + } +} + proto::VarType::Type GetDataTypeOfVar(const Variable* var); const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var); Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var); diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index ab14732e4d..1623dfca6f 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -288,3 +288,12 @@ TEST(OpKernel, multi_inputs) { auto op = paddle::framework::OpRegistry::CreateOp(op_desc); op->Run(scope, cpu_place); } + +TEST(Functions, all) { + std::string var_name("X"); 
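OriginVarName() above is the inverse of GradVarName(): it recovers a forward variable's name from its gradient's name. The intended round trip, sketched with Python counterparts (named illustratively) and the @GRAD suffix from framework::kGradVarSuffix:

    GRAD_SUFFIX = "@GRAD"  # framework::kGradVarSuffix

    def grad_var_name(name):
        return name + GRAD_SUFFIX

    def origin_var_name(grad_name):
        # Strip a trailing "@GRAD" if present; otherwise the name is
        # already a forward variable's name and is returned unchanged.
        if grad_name.endswith(GRAD_SUFFIX):
            return grad_name[:-len(GRAD_SUFFIX)]
        return grad_name

    assert origin_var_name(grad_var_name("X")) == "X"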
+ std::string grad_var_name = paddle::framework::GradVarName(var_name); + ASSERT_EQ(grad_var_name.c_str(), "X@GRAD"); + std::string original_var_name = + paddle::framework::OriginVarName(grad_var_name); + ASSERT_EQ(original_var_name.c_str(), "X"); +} diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 0c07f77583..28ad829aa9 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/string/printf.h" namespace paddle { @@ -31,8 +32,9 @@ using framework::Variable; void AddTo(Variable* src, Variable* dst) { framework::LoDTensor* dst_tensor = dst->GetMutable(); framework::LoDTensor* src_tensor = src->GetMutable(); - PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld", - dst_tensor->numel(), src_tensor->numel()); + PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), + "dst_numel %lld vs. src_numel %lld", dst_tensor->numel(), + src_tensor->numel()); float* dst_data = dst_tensor->mutable_data(platform::CPUPlace()); const float* src_data = src_tensor->data(); for (size_t i = 0; i < src_tensor->numel(); ++i) { @@ -114,7 +116,7 @@ framework::LoDTensor& VarBase::Grad() { std::map> OpBase::ApplyGrad() { if (!grad_op_desc_) { - VLOG(3) << "op with no grad: " << op_desc_->Type(); + LOG(WARNING) << "op with no grad: " << op_desc_->Type(); return {}; } VLOG(3) << "op grad " << grad_op_desc_->Type(); @@ -124,20 +126,18 @@ std::map> OpBase::ApplyGrad() { for (auto it : grad_output_vars_) { auto& outputs = grad_outputs[it.first]; for (size_t i = 0; i < it.second.size(); ++i) { - tmp_vars.emplace_back(new framework::Variable()); - outputs.push_back(tmp_vars.back().get()); - outputs.back()->GetMutable(); + // Allocate a new variable + Variable* tmp_var = new framework::Variable(); + tmp_var->GetMutable(); + + tmp_vars.emplace_back(tmp_var); + outputs.push_back(tmp_var); } - grad_invar_desc.SetShape( - framework::vectorize(var->Get().dims())); - VLOG(3) - << "set op grad var desc's shape size " - << framework::vectorize(var->Get().dims()).size(); } framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); - // No need to do static infer shape here. + // No need to do compile time infer shape here. 
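The ApplyGrad rework above allocates a fresh output variable for every gradient slot, runs the grad op into those buffers, and afterwards accumulates the results into the original gradients, skipping forward inputs flagged stop_gradient (the accumulation loop continues just below). That rule in isolation, over numpy buffers; accumulate_grads is an illustrative stand-in:

    import numpy as np

    def accumulate_grads(new_grads, origin_grads, stop_flags):
        # AddTo(outputs[i], orig_grad), guarded by the stop_gradient check:
        # a stopped input keeps its original gradient buffer untouched.
        for new, origin, stopped in zip(new_grads, origin_grads, stop_flags):
            if not stopped:
                origin += new

    g = [np.zeros(3), np.zeros(3)]
    accumulate_grads([np.ones(3), np.ones(3)], g, [False, True])
    assert g[0].sum() == 3.0 and g[1].sum() == 0.0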
// grad_op_desc_->InferShape(*block_); grad_op_desc_->InferVarType(block_); @@ -156,9 +156,14 @@ std::map> OpBase::ApplyGrad() { for (auto it : grad_output_vars_) { auto& outputs = grad_outputs[it.first]; auto& origin_outputs = it.second; + + auto& forward_inputs = input_vars_[framework::OriginVarName(it.first)]; + for (size_t i = 0; i < outputs.size(); ++i) { - framework::Variable* orig_grad = origin_outputs[i]; - AddTo(outputs[i], orig_grad); + if (!forward_inputs[i]->stop_gradient_) { + framework::Variable* orig_grad = origin_outputs[i]; + AddTo(outputs[i], orig_grad); + } } } return input_vars_; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 4d4ea22ed2..420ca646e6 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -57,7 +57,7 @@ class Tracer { void Trace(OpBase* op, const std::map>& inputs, const std::map>& outputs, - framework::BlockDesc* block) { + framework::BlockDesc* block, const bool stop_gradient) { std::map vars; framework::OpDesc* op_desc = op->op_desc_; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 21208e8209..5e9d196531 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -152,9 +152,9 @@ PYBIND11_MODULE(core, m) { [](const imperative::VarBase &self) { return self.stop_gradient_; }, [](imperative::VarBase &self, bool stop_gradient) { self.stop_gradient_ = stop_gradient; - }) + }); - py::class_(m, "OpBase", R"DOC()DOC") + py::class_(m, "OpBase", R"DOC()DOC") .def(py::init<>()) .def_property( "desc", [](const imperative::OpBase &self) { return self.op_desc_; }, From 25d44d40acfca5ed92dbc57fbaa2b01367a66f99 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 28 Dec 2018 14:17:33 +0800 Subject: [PATCH 081/197] sum op support empty selected rows as input --- paddle/fluid/operators/math/selected_rows_functor.cc | 4 ++++ paddle/fluid/operators/sum_op.cc | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 1a11b584e2..5f169dda22 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -195,6 +195,10 @@ struct SelectedRowsAddToTensor { void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, framework::Tensor* input2) { + if (input1.rows().size() == 0) { + LOG(WARNING) << "input selected rows is empty!"; + return; + } auto in1_height = input1.height(); auto in2_dims = input2->dims(); PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 4f717a4355..83afe5819a 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -41,7 +41,9 @@ class SumOp : public framework::OperatorWithKernel { return; // skip runtime infershape when is tensor array; } + auto x_var_types = ctx->GetInputsVarType("X"); auto x_dims = ctx->GetInputsDim("X"); + size_t N = x_dims.size(); PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0."); if (N == 1) { @@ -49,7 +51,11 @@ class SumOp : public framework::OperatorWithKernel { } framework::DDim in_dim({0}); - for (auto& x_dim : x_dims) { + for (size_t i = 0; i < x_dims.size(); ++i) { + if (x_var_types[i] == framework::proto::VarType::SELECTED_ROWS) { + continue; + } + auto& x_dim = x_dims[i]; if (framework::product(x_dim) == 0) { continue; } From 
1e04222890511ab57d4b285d6e540a41be78e307 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 28 Dec 2018 14:38:40 +0800 Subject: [PATCH 082/197] add test_dist_ctr_with_l2_decay.py --- .../fluid/tests/unittests/CMakeLists.txt | 3 ++- .../paddle/fluid/tests/unittests/dist_ctr.py | 7 ++---- .../fluid/tests/unittests/test_dist_ctr.py | 11 --------- ...ecay.py => test_dist_ctr_with_l2_decay.py} | 23 +++++++++++++------ 4 files changed, 20 insertions(+), 24 deletions(-) rename python/paddle/fluid/tests/unittests/{dist_ctr_with_l2_decay.py => test_dist_ctr_with_l2_decay.py} (60%) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6d6fe245d8..c28c0809d8 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -18,6 +18,7 @@ if(NOT WITH_DISTRIBUTE) LIST(REMOVE_ITEM TEST_OPS test_dist_mnist) LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec) LIST(REMOVE_ITEM TEST_OPS test_dist_ctr) + LIST(REMOVE_ITEM TEST_OPS test_dist_ctr_with_l2_decay) LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow) LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge) LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification) @@ -100,7 +101,7 @@ if(WITH_DISTRIBUTE) # FIXME(typhoonzero): add these tests back # py_test_modules(test_dist_transformer MODULES test_dist_transformer) # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) - set_tests_properties(test_dist_ctr test_dist_mnist test_dist_mnist_batch_merge test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE) + set_tests_properties(test_dist_ctr test_dist_ctr_with_l2_decay test_dist_mnist test_dist_mnist_batch_merge test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE) endif(NOT APPLE) py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) endif() diff --git a/python/paddle/fluid/tests/unittests/dist_ctr.py b/python/paddle/fluid/tests/unittests/dist_ctr.py index dd97853a4c..e696ef23bd 100644 --- a/python/paddle/fluid/tests/unittests/dist_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_ctr.py @@ -30,11 +30,7 @@ fluid.default_main_program().random_seed = 1 class TestDistCTR2x2(TestDistRunnerBase): - def config(self): - self.use_l2_decay = False - def get_model(self, batch_size=2): - self.config() dnn_input_dim, lr_input_dim = dist_ctr_reader.load_data_meta() """ network definition """ @@ -103,7 +99,8 @@ class TestDistCTR2x2(TestDistRunnerBase): inference_program = paddle.fluid.default_main_program().clone() regularization = None - if self.use_l2_decay: + use_l2_decay = bool(os.getenv(['USE_L2_DECAY'], 0)) + if use_l2_decay: regularization = fluid.regularizer.L2DecayRegularizer( regularization_coeff=1e-3) diff --git a/python/paddle/fluid/tests/unittests/test_dist_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_ctr.py index f6b0971c5c..390393e04f 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_ctr.py @@ -18,7 +18,6 @@ import unittest from test_dist_base import TestDistBase -# FIXME(tangwei): sum op can not handle when inputs is empty. 
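
[Annotation] The FIXME being deleted above can go because PATCH 081 taught SelectedRowsAddToTensor to tolerate empty inputs. A self-contained sketch of that early-return guard, with a simplified SelectedRows stand-in (one float per row; not the real functor's signature):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Minimal stand-in for framework::SelectedRows: sparse row indices
    // plus one float value per row.
    struct SelectedRows {
      std::vector<int64_t> rows;
      std::vector<float> values;
    };

    // Mirrors the guard added in PATCH 081: an input with no rows is a
    // no-op instead of tripping the height checks further down.
    void SelectedRowsAddToTensor(const SelectedRows& in,
                                 std::vector<float>* dense) {
      if (in.rows.empty()) {
        std::cerr << "input selected rows is empty!\n";
        return;
      }
      for (size_t i = 0; i < in.rows.size(); ++i) {
        (*dense)[static_cast<size_t>(in.rows[i])] += in.values[i];
      }
    }

    int main() {
      std::vector<float> dense(4, 0.0f);
      SelectedRowsAddToTensor(SelectedRows{}, &dense);          // safely skipped
      SelectedRowsAddToTensor({{1, 3}, {2.0f, 5.0f}}, &dense);  // {0, 2, 0, 5}
      return 0;
    }
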
class TestDistCTR2x2(TestDistBase): def _setup_config(self): self._sync_mode = True @@ -28,15 +27,5 @@ class TestDistCTR2x2(TestDistBase): self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False) -class TestDistCTR2x2WithL2Decay(TestDistBase): - def _setup_config(self): - self._sync_mode = True - self._enforce_place = "CPU" - - def test_dist_ctr(self): - self.check_with_place( - "dist_ctr_with_l2_decay.py", delta=1e-7, check_error_log=False) - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dist_ctr_with_l2_decay.py b/python/paddle/fluid/tests/unittests/test_dist_ctr_with_l2_decay.py similarity index 60% rename from python/paddle/fluid/tests/unittests/dist_ctr_with_l2_decay.py rename to python/paddle/fluid/tests/unittests/test_dist_ctr_with_l2_decay.py index a7fbfd644d..558aee3653 100644 --- a/python/paddle/fluid/tests/unittests/dist_ctr_with_l2_decay.py +++ b/python/paddle/fluid/tests/unittests/test_dist_ctr_with_l2_decay.py @@ -11,17 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from __future__ import print_function -import dist_ctr -from test_dist_base import runtime_main +import os +import unittest +from test_dist_base import TestDistBase + +class TestDistCTR2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" -class TestDistCTRWithL2Decay(dist_ctr.TestDistCTR2x2): - def config(self): - self.use_l2_decay = True + def test_dist_ctr(self): + need_envs = {"USE_L2_DECAY": "1"} + self.check_with_place( + "dist_ctr.py", + delta=1e-7, + check_error_log=False, + need_envs=need_envs) if __name__ == "__main__": - runtime_main(TestDistCTRWithL2Decay) + unittest.main() From 877289c4ca0b1d0d9df30b8c29f490f9ee117fe2 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 28 Dec 2018 14:51:38 +0800 Subject: [PATCH 083/197] fix dist_ctr getenv, test=develop --- python/paddle/fluid/tests/unittests/dist_ctr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dist_ctr.py b/python/paddle/fluid/tests/unittests/dist_ctr.py index e696ef23bd..fd09d47258 100644 --- a/python/paddle/fluid/tests/unittests/dist_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_ctr.py @@ -99,10 +99,10 @@ class TestDistCTR2x2(TestDistRunnerBase): inference_program = paddle.fluid.default_main_program().clone() regularization = None - use_l2_decay = bool(os.getenv(['USE_L2_DECAY'], 0)) + use_l2_decay = bool(os.getenv('USE_L2_DECAY', 0)) if use_l2_decay: regularization = fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-3) + regularization_coeff=1e-1) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001, regularization=regularization) From 858e90323101236027aaec8e5685e97b0fb4f201 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 28 Dec 2018 16:11:18 +0800 Subject: [PATCH 084/197] Add unittest for operator test=develop --- paddle/fluid/framework/operator.h | 2 +- paddle/fluid/framework/operator_test.cc | 24 +++++++++++++++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 87bb28c0c5..51de4c9dfb 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -70,7 +70,7 @@ inline std::string GradVarName(const std::string& var_name) { } inline std::string OriginVarName(const std::string& 
grad_var_name) { - std::size_t pos = grad_var_name.find_last_of(kGradVarSuffix); + std::size_t pos = grad_var_name.rfind(kGradVarSuffix); if (pos == std::string::npos) { return grad_var_name; } else { diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index 1623dfca6f..3bbbda6424 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -289,11 +289,29 @@ TEST(OpKernel, multi_inputs) { op->Run(scope, cpu_place); } -TEST(Functions, all) { +TEST(VarNameTest, all) { std::string var_name("X"); std::string grad_var_name = paddle::framework::GradVarName(var_name); - ASSERT_EQ(grad_var_name.c_str(), "X@GRAD"); + ASSERT_EQ(grad_var_name, "X@GRAD"); std::string original_var_name = paddle::framework::OriginVarName(grad_var_name); - ASSERT_EQ(original_var_name.c_str(), "X"); + ASSERT_EQ(original_var_name, "X"); + original_var_name = paddle::framework::OriginVarName(original_var_name); + ASSERT_EQ(original_var_name, "X"); + + std::string var_name_2("XYZ"); + grad_var_name = paddle::framework::GradVarName(var_name_2); + ASSERT_EQ(grad_var_name, "XYZ@GRAD"); + original_var_name = paddle::framework::OriginVarName(grad_var_name); + ASSERT_EQ(original_var_name, "XYZ"); + original_var_name = paddle::framework::OriginVarName(original_var_name); + ASSERT_EQ(original_var_name, "XYZ"); + + std::string var_name_3(""); + grad_var_name = paddle::framework::GradVarName(var_name_3); + ASSERT_EQ(grad_var_name, "@GRAD"); + original_var_name = paddle::framework::OriginVarName(grad_var_name); + ASSERT_EQ(original_var_name, ""); + original_var_name = paddle::framework::OriginVarName(original_var_name); + ASSERT_EQ(original_var_name, ""); } From ca8c77d966c963c4afafe4750391de63014dea0f Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 28 Dec 2018 17:08:29 +0800 Subject: [PATCH 085/197] selecte execution according to strategy test=develop --- .../fluid/framework/details/build_strategy.cc | 7 +- .../fluid/framework/details/build_strategy.h | 11 ++- .../details/multi_devices_graph_pass.cc | 12 +-- paddle/fluid/framework/parallel_executor.cc | 77 ++++++++++++------- paddle/fluid/framework/parallel_executor.h | 3 + paddle/fluid/pybind/pybind.cc | 8 -- python/paddle/fluid/__init__.py | 3 +- .../unittests/parallel_executor_test_base.py | 2 - .../unittests/test_parallel_executor_crf.py | 8 +- .../unittests/test_parallel_executor_mnist.py | 39 +++------- .../test_parallel_executor_seresnext.py | 15 +--- .../test_parallel_executor_transformer.py | 2 - 12 files changed, 86 insertions(+), 101 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 5042652602..9a092104e6 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -134,7 +134,7 @@ std::shared_ptr BuildStrategy::CreatePassesFromStrategy( std::unique_ptr BuildStrategy::Apply( const ProgramDesc &main_program, const std::vector &places, const std::string &loss_var_name, const std::vector &local_scopes, - const size_t &num_parallel_devices, + const size_t &nranks, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const { #else @@ -153,9 +153,8 @@ std::unique_ptr BuildStrategy::Apply( pass->Erase("local_scopes"); pass->SetNotOwned>("local_scopes", &local_scopes); - pass->Erase("num_parallel_devices"); - pass->Set("num_parallel_devices", - new size_t(num_parallel_devices)); + 
pass->Erase("nranks"); + pass->Set("nranks", new size_t(nranks)); #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index b31e60ad8e..b75c01c485 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -84,8 +84,6 @@ struct BuildStrategy { bool fuse_broadcast_op_{false}; - bool enable_parallel_graph_{false}; - int num_trainers_{1}; int trainer_id_{0}; std::vector trainers_endpoints_; @@ -112,7 +110,7 @@ struct BuildStrategy { const std::vector &places, const std::string &loss_var_name, const std::vector &local_scopes, - const size_t &num_parallel_devices_, + const size_t &nranks, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const; @@ -120,6 +118,13 @@ struct BuildStrategy { const bool use_cuda) const; #endif + // If set true, ParallelExecutor would build the main_program into multiple + // graphs, + // each of the graphs would run with one device. This approach can achieve + // better performance + // on some scenarios. + mutable bool enable_parallel_graph_ = false; + private: mutable bool is_finalized_ = false; mutable std::shared_ptr pass_builder_; diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 211668b871..761c9ab904 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -138,7 +138,7 @@ static const char kLossVarName[] = "loss_var_name"; static const char kPlaces[] = "places"; static const char kLocalScopes[] = "local_scopes"; static const char kStrategy[] = "strategy"; -static const char kNumParallelDevices[] = "num_parallel_devices"; +static const char kNRanks[] = "nranks"; void MultiDevSSAGraphBuilder::Init() const { all_vars_.clear(); @@ -174,7 +174,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; - size_t num_parallel_devices = Get(kNumParallelDevices); + size_t nranks = Get(kNRanks); for (auto &node : nodes) { if (node->IsVar() && node->Var()) { @@ -251,7 +251,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( CreateComputationalOps(&result, node, places_.size()); } - if (!is_forwarding && num_parallel_devices > 1UL) { + if (!is_forwarding && nranks > 1UL) { bool is_bk_op = static_cast(boost::get(node->Op()->GetAttr( OpProtoAndCheckerMaker::OpRoleAttrName())) & @@ -649,13 +649,13 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID( void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( ir::Graph *result, const std::string &loss_grad_name, ir::Node *out_var_node, proto::VarType::Type dtype) const { - size_t num_parallel_devices = Get("num_parallel_devices"); + size_t nranks = Get("nranks"); for (size_t i = 0; i < places_.size(); ++i) { // Insert ScaleCost OpHandle auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]); auto *op_handle = new ScaleLossGradOpHandle( result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), - num_parallel_devices, local_scopes_[i], places_[i], dev_ctx, dtype); + nranks, local_scopes_[i], places_[i], dev_ctx, dtype); result->Get(kGraphOps).emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale @@ -888,4 +888,4 @@ REGISTER_PASS(multi_devices_pass, 
.RequirePassAttr(paddle::framework::details::kPlaces) .RequirePassAttr(paddle::framework::details::kLocalScopes) .RequirePassAttr(paddle::framework::details::kStrategy) - .RequirePassAttr(paddle::framework::details::kNumParallelDevices); + .RequirePassAttr(paddle::framework::details::kNRanks); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index fd566be44c..934cf34cbd 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -107,7 +107,7 @@ class ParallelExecutorPrivate { bool own_local_scope_; bool use_cuda_; bool use_all_reduce_; - size_t num_parallel_devices_; + size_t nranks_; // global_ref_cnts_ is only initialized when ParallelExecutor constructs, and // then keeps unchanged @@ -203,7 +203,7 @@ ParallelExecutor::ParallelExecutor( member_->build_strategy_ = build_strategy; member_->use_all_reduce_ = build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; - member_->num_parallel_devices_ = num_trainers * places.size(); + member_->nranks_ = num_trainers * places.size(); if (!member_->use_all_reduce_) { PADDLE_ENFORCE(places.size() > 1, @@ -211,16 +211,14 @@ ParallelExecutor::ParallelExecutor( "the number of places must be greater than 1."); } - if (build_strategy.enable_parallel_graph_) { - PADDLE_ENFORCE( - member_->use_all_reduce_, - "build_strategy.reduce should be `AllReduce` if you want to enable" - "ParallelGraph."); - PADDLE_ENFORCE( - member_->use_cuda_, - "execution_strategy.use_cuda should be True if you want to enable " - "ParallelGraph."); - } + // FIXME(Yancey1989): parallel graph mode get better performance + // in GPU allreduce distributed training. Need an elegant way to + // choice the execution strategy. + build_strategy.enable_parallel_graph_ = + EnableParallelGraphExecution(main_program, exec_strategy, build_strategy); + + VLOG(1) << "Enable ParallelGraph Execution: " + << build_strategy.enable_parallel_graph_; // Step 1. Bcast the bcast_vars to devs. // Create local scopes @@ -242,20 +240,20 @@ ParallelExecutor::ParallelExecutor( // Bcast Parameters to all GPUs #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); - ncclUniqueId *nccl_id = nullptr; - // nccl collective would broadcast nccl id by gen_nccl_id operator. + std::unique_ptr nccl_id; + // nccl collective would broadcast ncclUniqueId by gen_nccl_id operator. if (nccl_id_var != nullptr) { - nccl_id = nccl_id_var->GetMutable(); + nccl_id.reset(nccl_id_var->GetMutable()); } - if (build_strategy.enable_parallel_graph_ && places.size() > 1) { - if (nccl_id == nullptr) { - nccl_id = new ncclUniqueId(); - PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(nccl_id)); + if (build_strategy.enable_parallel_graph_ && member_->nranks_ > 1UL) { + if (nccl_id.get() == nullptr) { + nccl_id.reset(new ncclUniqueId()); + platform::dynload::ncclGetUniqueId(nccl_id.get()); } } member_->nccl_ctxs_.reset(new platform::NCCLContextMap( - member_->places_, nccl_id, num_trainers, trainer_id)); + member_->places_, nccl_id.get(), num_trainers, trainer_id)); #else PADDLE_THROW("Not compiled with CUDA"); #endif @@ -268,27 +266,25 @@ ParallelExecutor::ParallelExecutor( // Step 2. Convert main_program to SSA form and dependency graph. 
Also, insert // ncclOp std::vector> graphs; - member_->num_parallel_devices_ = member_->places_.size() * num_trainers; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (build_strategy.enable_parallel_graph_) { for (size_t i = 0; i < member_->places_.size(); ++i) { std::unique_ptr graph = build_strategy.Apply( main_program, {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, member_->num_parallel_devices_, - member_->use_cuda_, member_->nccl_ctxs_.get()); + {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_, + member_->nccl_ctxs_.get()); graphs.push_back(std::move(graph)); } } else { std::unique_ptr graph = build_strategy.Apply( main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->num_parallel_devices_, member_->use_cuda_, - member_->nccl_ctxs_.get()); + member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get()); graphs.push_back(std::move(graph)); } #else std::unique_ptr graph = build_strategy.Apply( main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->num_parallel_devices_, member_->use_cuda_); + member_->nranks_, member_->use_cuda_); graphs.push_back(std::move(graph)); #endif auto max_memory_size = GetEagerDeletionThreshold(); @@ -470,6 +466,35 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( } } +bool ParallelExecutor::EnableParallelGraphExecution( + const ProgramDesc &main_program, const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy) const { + bool enable_parallel_graph = true; + + // TODO(Yancey1989): support sparse update in ParallelGraph mode. + for (auto &var_desc : main_program.Block(0).AllVars()) { + if (var_desc->GetType() == proto::VarType::SELECTED_ROWS) { + enable_parallel_graph = false; + } + } + + // TODO(Yancey1989): support pserver mode + for (auto &op_desc : main_program.Block(0).AllOps()) { + if (op_desc->Type() == "send" || op_desc->Type() == "recv") { + enable_parallel_graph = false; + break; + } + } + + if (!member_->use_all_reduce_ || !member_->use_cuda_) + enable_parallel_graph = false; + + if (build_strategy.enable_sequential_execution_ || + exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental) + enable_parallel_graph = false; + return enable_parallel_graph; +} + ParallelExecutor::~ParallelExecutor() { for (auto &p : member_->places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 5f6c2159aa..dc70894dbd 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -68,6 +68,9 @@ class ParallelExecutor { private: void BCastParamsToDevices(const std::unordered_set &vars) const; + bool EnableParallelGraphExecution(const ProgramDesc &main_program, + const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy) const; ParallelExecutorPrivate *member_; }; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3bb08cbeb7..81d63aace0 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -980,14 +980,6 @@ All parameter, weight, gradient are variables in Paddle. R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether to fuse elementwise_add_op and activation_op, it may make the execution faster. 
Default False)DOC") - .def_property( - "enable_parallel_graph", - [](const BuildStrategy &self) { return self.enable_parallel_graph_; }, - [](BuildStrategy &self, bool b) { self.enable_parallel_graph_ = b; }, - R"DOC(The type is BOOL, if set True, ParallelExecutor would build the main_program into multiple graphs, - each of the graphs would run with one device. This approach can achieve better performance in - some scenarios. Please note, this approach only supports all-reduce mode - on GPU device)DOC") .def_property( "memory_optimize", [](const BuildStrategy &self) { return self.memory_optimize_; }, diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index e0078e5314..cdc631860f 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -156,7 +156,8 @@ def __bootstrap__(): read_env_flags += [ 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', - 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus' + 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', + 'sync_nccl_allreduce' ] core.init_gflags([sys.argv[0]] + diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 36b13d4558..2b0ab0cc3b 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -39,7 +39,6 @@ class TestParallelExecutorBase(unittest.TestCase): seed=None, use_parallel_executor=True, use_reduce=False, - use_parallel_graph=False, use_ir_memory_optimize=False, fuse_elewise_add_act_ops=False, optimizer=fluid.optimizer.Adam, @@ -80,7 +79,6 @@ class TestParallelExecutorBase(unittest.TestCase): if use_fast_executor: exec_strategy.use_experimental_executor = True build_strategy = fluid.BuildStrategy() - build_strategy.enable_parallel_graph = use_parallel_graph build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index 41286ba08c..1c6cfce0c2 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -175,14 +175,13 @@ class TestCRFModel(unittest.TestCase): print(pe.run(feed=feeder.feed(cur_batch), fetch_list=[avg_cost.name])[0]) - def _new_build_strategy(self, use_reduce=False, use_parallel_graph=False): + def _new_build_strategy(self, use_reduce=False): build_strategy = fluid.BuildStrategy() if use_reduce: build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce else: build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce - build_strategy.enable_parallel_graph = use_parallel_graph return build_strategy @@ -204,11 +203,6 @@ class TestCRFModel(unittest.TestCase): is_sparse=False, build_strategy=self._new_build_strategy(), use_cuda=True) - self.check_network_convergence( - is_sparse=False, - build_strategy=self._new_build_strategy( - use_parallel_graph=True), - use_cuda=True) self.check_network_convergence( is_sparse=False, diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py 
b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 7d2349fad4..0ff7b73123 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -100,10 +100,7 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEqual(loss[0], loss[1], delta=1e-4) # simple_fc - def check_simple_fc_convergence(self, - use_cuda, - use_reduce=False, - use_parallel_graph=False): + def check_simple_fc_convergence(self, use_cuda, use_reduce=False): if use_cuda and not core.is_compiled_with_cuda(): return @@ -114,15 +111,13 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - use_reduce=use_reduce, - use_parallel_graph=use_parallel_graph) + use_reduce=use_reduce) def test_simple_fc(self): # use_cuda if core.is_compiled_with_cuda(): self.check_simple_fc_convergence(True) - self.check_simple_fc_convergence( - True, use_reduce=False, use_parallel_graph=True) + self.check_simple_fc_convergence(True, use_reduce=False) self.check_simple_fc_convergence(False) def test_simple_fc_with_new_strategy(self): @@ -130,9 +125,7 @@ class TestMNIST(TestParallelExecutorBase): self._compare_reduce_and_allreduce(simple_fc_net, True) self._compare_reduce_and_allreduce(simple_fc_net, False) - def check_simple_fc_parallel_accuracy(self, - use_cuda, - use_parallel_graph=False): + def check_simple_fc_parallel_accuracy(self, use_cuda): if use_cuda and not core.is_compiled_with_cuda(): return @@ -144,16 +137,7 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - use_parallel_executor=False, - use_parallel_graph=use_parallel_graph) - parallel_first_loss, parallel_last_loss = self.check_network_convergence( - method=simple_fc_net, - seed=1, - feed_dict={"image": img, - "label": label}, - use_cuda=use_cuda, - use_parallel_executor=True, - use_parallel_graph=use_parallel_graph) + use_parallel_executor=False) self.assertAlmostEquals( np.mean(parallel_first_loss), @@ -165,15 +149,11 @@ class TestMNIST(TestParallelExecutorBase): def test_simple_fc_parallel_accuracy(self): if core.is_compiled_with_cuda(): self.check_simple_fc_parallel_accuracy(True) - self.check_simple_fc_parallel_accuracy( - True, use_parallel_graph=True) + self.check_simple_fc_parallel_accuracy(True) # FIXME(Yancey1989): ParallelGraph executor type support CPU mode self.check_simple_fc_parallel_accuracy(False) - def check_batchnorm_fc_convergence(self, - use_cuda, - use_fast_executor, - use_parallel_graph=False): + def check_batchnorm_fc_convergence(self, use_cuda, use_fast_executor): if use_cuda and not core.is_compiled_with_cuda(): return @@ -184,8 +164,7 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - use_fast_executor=use_fast_executor, - use_parallel_graph=use_parallel_graph) + use_fast_executor=use_fast_executor) def test_batchnorm_fc(self): for use_cuda in (False, True): @@ -193,7 +172,7 @@ class TestMNIST(TestParallelExecutorBase): self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor) self.check_batchnorm_fc_convergence( - use_cuda=True, use_fast_executor=False, use_parallel_graph=True) + use_cuda=True, use_fast_executor=False) def test_batchnorm_fc_with_new_strategy(self): # FIXME(zcd): close this test temporally. 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index 9bdaab162f..4f1d902f5c 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -277,9 +277,7 @@ class TestResnet(TestParallelExecutorBase): use_cuda=True, use_reduce=False, iter=20, - delta2=1e-6, - use_parallel_graph=False, - lr_scale=1.0): + delta2=1e-6): if use_cuda and not core.is_compiled_with_cuda(): return @@ -298,8 +296,7 @@ class TestResnet(TestParallelExecutorBase): use_cuda=use_cuda, use_reduce=use_reduce, optimizer=optimizer, - use_parallel_executor=False, - use_parallel_graph=use_parallel_graph) + use_parallel_executor=False) parallel_first_loss, parallel_last_loss = self.check_network_convergence( model, feed_dict={"image": img, @@ -308,8 +305,7 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=use_reduce, - optimizer=optimizer, - use_parallel_graph=use_parallel_graph) + optimizer=optimizer) self.assertAlmostEquals( np.mean(parallel_first_loss), single_first_loss[0], delta=1e-6) @@ -320,11 +316,6 @@ class TestResnet(TestParallelExecutorBase): if core.is_compiled_with_cuda(): self._check_resnet_convergence( model=SE_ResNeXt50Small, use_cuda=True) - self._check_resnet_convergence( - model=SE_ResNeXt50Small, - use_cuda=True, - use_parallel_graph=True, - lr_scale=core.get_cuda_device_count()) self._check_resnet_convergence( model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index c3ac9d92b4..3827743908 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -175,8 +175,6 @@ class TestTransformer(TestParallelExecutorBase): self.check_network_convergence(transformer, use_cuda=True) self.check_network_convergence( transformer, use_cuda=True, enable_sequential_execution=True) - self.check_network_convergence( - transformer, use_cuda=True, use_parallel_graph=True) self.check_network_convergence(transformer, use_cuda=False, iter=5) From 82b42e31f063ddf4210e43e8daba044878aa8d58 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 28 Dec 2018 18:32:31 +0800 Subject: [PATCH 086/197] polish unittest test=develop --- paddle/fluid/platform/profiler.cc | 1 - .../tests/unittests/test_parallel_executor_mnist.py | 12 +++++++----- .../unittests/test_parallel_executor_seresnext.py | 4 +--- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 040a68f672..85977366e6 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -186,7 +186,6 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) RecordEvent::~RecordEvent() { if (g_state == ProfilerState::kDisabled || !is_enabled_) return; - VLOG(5) << "call ~RecordEvent"; std::lock_guard l(profiler_mu); DeviceTracer* tracer = GetDeviceTracer(); if (tracer) { diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 0ff7b73123..63bc1de208 100644 --- 
a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -138,6 +138,13 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda, use_parallel_executor=False) + parallel_first_loss, parallel_last_loss = self.check_network_convergence( + method=simple_fc_net, + seed=1, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + use_parallel_executor=True) self.assertAlmostEquals( np.mean(parallel_first_loss), @@ -149,8 +156,6 @@ class TestMNIST(TestParallelExecutorBase): def test_simple_fc_parallel_accuracy(self): if core.is_compiled_with_cuda(): self.check_simple_fc_parallel_accuracy(True) - self.check_simple_fc_parallel_accuracy(True) - # FIXME(Yancey1989): ParallelGraph executor type support CPU mode self.check_simple_fc_parallel_accuracy(False) def check_batchnorm_fc_convergence(self, use_cuda, use_fast_executor): @@ -171,9 +176,6 @@ class TestMNIST(TestParallelExecutorBase): for use_fast_executor in (False, True): self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor) - self.check_batchnorm_fc_convergence( - use_cuda=True, use_fast_executor=False) - def test_batchnorm_fc_with_new_strategy(self): # FIXME(zcd): close this test temporally. # self._compare_reduce_and_allreduce(fc_with_batchnorm, True) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index 4f1d902f5c..e7a56bb638 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -313,9 +313,7 @@ class TestResnet(TestParallelExecutorBase): np.mean(parallel_last_loss), single_last_loss[0], delta=delta2) def test_seresnext_with_learning_rate_decay(self): - if core.is_compiled_with_cuda(): - self._check_resnet_convergence( - model=SE_ResNeXt50Small, use_cuda=True) + self._check_resnet_convergence(model=SE_ResNeXt50Small, use_cuda=True) self._check_resnet_convergence( model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3) From f1c973b0141b4396596bccaace1848ddec6faa24 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 28 Dec 2018 18:33:02 +0800 Subject: [PATCH 087/197] adam op should not create tmp var in compute --- paddle/fluid/operators/optimizers/adam_op.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 1138bb7400..de18edcd44 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -423,6 +423,7 @@ class AdamOpKernel : public framework::OpKernel { } } + framework::SelectedRows cpu_grad_merge; const framework::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = &grad; @@ -430,12 +431,16 @@ class AdamOpKernel : public framework::OpKernel { // merge duplicated rows if any. 
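
[Annotation] A note on the MergeAdd functor referenced in the comment above (and the sorted-rows remark that follows): it collapses duplicate row indices of a sparse gradient by summing their slices and emits rows in sorted order, so the Adam update visits each row exactly once. A rough one-value-per-row sketch (the real functor operates on embedding-width slices):

    #include <cstdint>
    #include <map>
    #include <vector>

    // Sum values of duplicate rows; std::map keeps the output rows sorted,
    // matching "rows have been sorted inside MergeAdd" in the comment.
    void MergeAdd(const std::vector<int64_t>& rows,
                  const std::vector<float>& values,
                  std::vector<int64_t>* out_rows,
                  std::vector<float>* out_values) {
      std::map<int64_t, float> acc;
      for (size_t i = 0; i < rows.size(); ++i) {
        acc[rows[i]] += values[i];
      }
      for (const auto& kv : acc) {
        out_rows->push_back(kv.first);
        out_values->push_back(kv.second);
      }
    }

    int main() {
      std::vector<int64_t> out_rows;
      std::vector<float> out_values;
      // Rows {3, 1, 3} merge to {1, 3} with values {2.0, 1.5}.
      MergeAdd({3, 1, 3}, {1.0f, 2.0f, 0.5f}, &out_rows, &out_values);
      return 0;
    }
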
// The rows of grad_merge have been sorted inside MergeAdd functor scatter::MergeAdd merge_func; - auto* grad_merge_var = const_cast(ctx.scope()) - .Var() - ->GetMutable(); + if (platform::is_cpu_place(ctx.GetPlace())) { + grad_merge_ptr = &cpu_grad_merge; + } else { + // FIXME(qiao): GPU also need to fix this + auto* grad_merge_var = const_cast(ctx.scope()) + .Var() + ->GetMutable(); + } merge_func(ctx.template device_context(), grad, - grad_merge_var, true); - grad_merge_ptr = grad_merge_var; + grad_merge_ptr, true); } auto& grad_merge = *grad_merge_ptr; From dfe85fb358d2b022ee4b4a73212e3d864b10ce4b Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 28 Dec 2018 19:02:28 +0800 Subject: [PATCH 088/197] fix build --- paddle/fluid/operators/optimizers/adam_op.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index dda4ffb908..61b9384f84 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -431,17 +431,19 @@ class AdamOpKernel : public framework::OpKernel { } else { // merge duplicated rows if any. // The rows of grad_merge have been sorted inside MergeAdd functor + framework::SelectedRows* grad_merge_var; scatter::MergeAdd merge_func; if (platform::is_cpu_place(ctx.GetPlace())) { - grad_merge_ptr = &cpu_grad_merge; + grad_merge_var = &cpu_grad_merge; } else { // FIXME(qiao): GPU also need to fix this - auto* grad_merge_var = const_cast(ctx.scope()) - .Var() - ->GetMutable(); + grad_merge_var = const_cast(ctx.scope()) + .Var() + ->GetMutable(); } merge_func(ctx.template device_context(), grad, - grad_merge_ptr, true); + grad_merge_var, true); + grad_merge_ptr = grad_merge_var; } auto& grad_merge = *grad_merge_ptr; From d319ffcd27c4962b34d75fe4bee7a5805c23dbe1 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Fri, 28 Dec 2018 14:57:07 +0800 Subject: [PATCH 089/197] update mkl version, and add mkl-mac version test=develop --- cmake/external/mklml.cmake | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index d49839a89d..96127e78d6 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -16,14 +16,6 @@ IF(NOT ${WITH_MKLML}) return() ENDIF(NOT ${WITH_MKLML}) -IF(APPLE) - MESSAGE(WARNING - "Mac is not supported with MKLML in Paddle yet." 
- "Force WITH_MKLML=OFF") - SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE) - return() -ENDIF() - INCLUDE(ExternalProject) SET(MKLML_DST_DIR "mklml") SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") @@ -47,10 +39,13 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL)) MESSAGE(STATUS "use pre defined download url") if(WIN32) - SET(MKLML_VER "mklml_win_2019.0.20180710" CACHE STRING "" FORCE) + SET(MKLML_VER "mklml_win_2019.0.1.20180928" CACHE STRING "" FORCE) SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE) + elseif(APPLE) + SET(MKLML_VER "mklml_mac_2019.0.1.20180928" CACHE STRING "" FORCE) + SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) else() - SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE) + SET(MKLML_VER "mklml_lnx_2019.0.1.20180928" CACHE STRING "" FORCE) SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) ENDIF() endif() From 2547f9d1b8ea0805a7101a1bef6f82275b250a89 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sat, 29 Dec 2018 12:02:25 +0800 Subject: [PATCH 090/197] Polish code test=develop --- paddle/fluid/framework/operator.h | 2 +- paddle/fluid/framework/operator_test.cc | 12 +++++----- paddle/fluid/imperative/layer.cc | 13 ++++++----- paddle/fluid/imperative/layer.h | 23 ++++++++----------- paddle/fluid/imperative/tracer.h | 3 ++- paddle/fluid/pybind/pybind.cc | 3 ++- python/paddle/fluid/imperative/layers.py | 23 +++++++++++++------ python/paddle/fluid/layer_helper.py | 4 +--- .../tests/unittests/test_imperative_mnist.py | 2 -- 9 files changed, 45 insertions(+), 40 deletions(-) diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 51de4c9dfb..5709eb1a7d 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -69,7 +69,7 @@ inline std::string GradVarName(const std::string& var_name) { return result; } -inline std::string OriginVarName(const std::string& grad_var_name) { +inline std::string GradOriginalVarName(const std::string& grad_var_name) { std::size_t pos = grad_var_name.rfind(kGradVarSuffix); if (pos == std::string::npos) { return grad_var_name; diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index 3bbbda6424..fe4804ac25 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -294,24 +294,24 @@ TEST(VarNameTest, all) { std::string grad_var_name = paddle::framework::GradVarName(var_name); ASSERT_EQ(grad_var_name, "X@GRAD"); std::string original_var_name = - paddle::framework::OriginVarName(grad_var_name); + paddle::framework::GradOriginalVarName(grad_var_name); ASSERT_EQ(original_var_name, "X"); - original_var_name = paddle::framework::OriginVarName(original_var_name); + original_var_name = paddle::framework::GradOriginalVarName(original_var_name); ASSERT_EQ(original_var_name, "X"); std::string var_name_2("XYZ"); grad_var_name = paddle::framework::GradVarName(var_name_2); ASSERT_EQ(grad_var_name, "XYZ@GRAD"); - original_var_name = paddle::framework::OriginVarName(grad_var_name); + original_var_name = paddle::framework::GradOriginalVarName(grad_var_name); ASSERT_EQ(original_var_name, "XYZ"); - original_var_name = paddle::framework::OriginVarName(original_var_name); + original_var_name = paddle::framework::GradOriginalVarName(original_var_name); 
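
[Annotation] Worth spelling out why PATCH 084 earlier in the series swapped find_last_of for rfind in this helper: find_last_of matches the last occurrence of any single character from the set "@GRAD", while rfind matches the whole substring, so the old code would mis-strip names. A standalone demonstration (kGradVarSuffix and the substr logic restate operator.h):

    #include <cassert>
    #include <string>

    constexpr char kGradVarSuffix[] = "@GRAD";

    std::string GradOriginalVarName(const std::string& grad_var_name) {
      std::size_t pos = grad_var_name.rfind(kGradVarSuffix);
      return pos == std::string::npos ? grad_var_name
                                      : grad_var_name.substr(0, pos);
    }

    int main() {
      const std::string name = "X@GRAD";
      // rfind locates the substring "@GRAD", which starts at index 1 ...
      assert(name.rfind(kGradVarSuffix) == 1);
      // ... but find_last_of finds the last char from the *set* {@,G,R,A,D}:
      // the trailing 'D' at index 5, which is why the old code was wrong.
      assert(name.find_last_of(kGradVarSuffix) == 5);
      assert(GradOriginalVarName("X@GRAD") == "X");
      assert(GradOriginalVarName("XYZ") == "XYZ");  // no suffix: unchanged
      return 0;
    }
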
ASSERT_EQ(original_var_name, "XYZ"); std::string var_name_3(""); grad_var_name = paddle::framework::GradVarName(var_name_3); ASSERT_EQ(grad_var_name, "@GRAD"); - original_var_name = paddle::framework::OriginVarName(grad_var_name); + original_var_name = paddle::framework::GradOriginalVarName(grad_var_name); ASSERT_EQ(original_var_name, ""); - original_var_name = paddle::framework::OriginVarName(original_var_name); + original_var_name = paddle::framework::GradOriginalVarName(original_var_name); ASSERT_EQ(original_var_name, ""); } diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 28ad829aa9..9813149865 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -32,6 +32,11 @@ using framework::Variable; void AddTo(Variable* src, Variable* dst) { framework::LoDTensor* dst_tensor = dst->GetMutable(); framework::LoDTensor* src_tensor = src->GetMutable(); + // FIXME(minqiyang): loss_grad op will pass a zero grad of label + // ugly fix for it + if (src_tensor->numel() == 0) { + return; + } PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "dst_numel %lld vs. src_numel %lld", dst_tensor->numel(), src_tensor->numel()); @@ -157,13 +162,9 @@ std::map> OpBase::ApplyGrad() { auto& outputs = grad_outputs[it.first]; auto& origin_outputs = it.second; - auto& forward_inputs = input_vars_[framework::OriginVarName(it.first)]; - for (size_t i = 0; i < outputs.size(); ++i) { - if (!forward_inputs[i]->stop_gradient_) { - framework::Variable* orig_grad = origin_outputs[i]; - AddTo(outputs[i], orig_grad); - } + framework::Variable* orig_grad = origin_outputs[i]; + AddTo(outputs[i], orig_grad); } } return input_vars_; diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 85740222e8..2abda933cf 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -81,7 +81,15 @@ class OpBase; class VarBase { public: - explicit VarBase(bool stop_gradient = false) + VarBase() + : pre_op_(nullptr), + pre_op_out_idx_(-1), + var_desc_(nullptr), + var_(new framework::Variable()), + grads_(new framework::Variable()), + stop_gradient_(false) {} + + explicit VarBase(bool stop_gradient) : pre_op_(nullptr), pre_op_out_idx_(-1), var_desc_(nullptr), @@ -89,23 +97,12 @@ class VarBase { grads_(new framework::Variable()), stop_gradient_(stop_gradient) {} - virtual ~VarBase() { - if (var_) { - delete var_; - var_ = nullptr; - } - if (grads_) { - delete grads_; - grads_ = nullptr; - } - } + virtual ~VarBase() {} void RunBackward(); framework::LoDTensor& Grad(); - inline framework::Variable* GradVar() { return grads_; } - inline std::string GradName() const { PADDLE_ENFORCE( var_desc_, diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 420ca646e6..c6eff86fac 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -57,7 +57,7 @@ class Tracer { void Trace(OpBase* op, const std::map>& inputs, const std::map>& outputs, - framework::BlockDesc* block, const bool stop_gradient) { + framework::BlockDesc* block, const bool stop_gradient = false) { std::map vars; framework::OpDesc* op_desc = op->op_desc_; @@ -153,6 +153,7 @@ class Tracer { } } } + for (auto it : grad_op_desc->Outputs()) { auto& grad_out_vars = op->grad_output_vars_[it.first]; for (const std::string& grad_outvar : it.second) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3be0d9085b..3b81d59ad9 100644 --- a/paddle/fluid/pybind/pybind.cc +++ 
b/paddle/fluid/pybind/pybind.cc @@ -125,7 +125,8 @@ PYBIND11_MODULE(core, m) { m.add_object("_cleanup", py::capsule([]() { ScopePool::Instance().Clear(); })); - py::class_(m, "VarBase", R"DOC()DOC") + py::class_>( + m, "VarBase", R"DOC()DOC") // .def(py::init<>()) .def(py::init(), py::arg("stop_gradient") = false) .def("_run_backward", diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 80645acc8a..c95b89a2c4 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -24,20 +24,29 @@ __all__ = ['PyLayer'] class PyLayer(core.Layer): - def __init__(self, *args, **kwargs): - self._once_built = True - + def __init__(self, + dtype=core.VarDesc.VarType.FP32, + param_attr=None, + bias_attr=None, + name=None): from ..layer_helper import LayerHelper - self._helper = LayerHelper(type(self).__name__, **kwargs) - self._dtype = kwargs.get("dtype", core.VarDesc.VarType.FP32) + self._helper = LayerHelper( + type(self).__name__, + param_attr=param_attr, + bias_attr=bias_attr, + dtype=dtype, + name=name) + + self._once_built = False + self._dtype = dtype def _build_once(self, inputs): pass def __call__(self, *inputs): - if self._once_built: + if not self._once_built: self._build_once(*inputs) - self._once_built = False + self._once_built = True outputs = self.forward(*inputs) diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index f44009a05d..ea9953f581 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -314,11 +314,9 @@ class LayerHelper(object): WeightNormParamAttr.params_with_weight_norm.append(param) return param if _in_imperative_mode(): - self.main_program.global_block().create_parameter( - dtype=dtype, shape=shape, **attr._to_kwargs()) # In imperative mode, we want the returned parameter to be # initialized so that it can be used imperatively. 
- return self.startup_program.global_block().create_parameter( + return self.main_program.global_block().create_parameter( dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py index 802db5d1e0..bda9f0e410 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py @@ -111,8 +111,6 @@ class TestImperativeMnist(unittest.TestCase): predict = mnist(img) out = fluid.layers.cross_entropy(predict, label) out._backward() - filter_grad = mnist._simple_img_conv_pool_1._conv2d._filter_param._gradient( - ) sgd.minimize(out) From d25395fc9876d439a477be59cb13f168d3dcd752 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Sat, 29 Dec 2018 07:17:59 +0000 Subject: [PATCH 091/197] remove tensor core lock test=develop --- paddle/fluid/operators/math/blas_impl.cu.h | 89 ++++++++-------------- paddle/fluid/platform/device_context.cc | 25 ++++++ paddle/fluid/platform/device_context.h | 53 ++----------- 3 files changed, 66 insertions(+), 101 deletions(-) diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index d35073029a..a4fb1cdcd9 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -62,27 +62,17 @@ struct CUBlas { cudaDataType_t Atype, int lda, const void *B, cudaDataType_t Btype, int ldb, const float *beta, void *C, cudaDataType_t Ctype, int ldc) { - // Because the gcc 4.8 doesn't expand template parameter pack that - // appears in a lambda-expression, I can not use template parameter pack - // here. - auto cublas_call = [&]() { +// Because the gcc 4.8 doesn't expand template parameter pack that +// appears in a lambda-expression, I can not use template parameter pack +// here. #if CUDA_VERSION >= 8000 - VLOG(5) << "use_tensor_op_math: " - << (platform::TensorCoreAvailable() ? "True" : "False"); - PADDLE_ENFORCE(platform::dynload::cublasSgemmEx( - dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, - lda, B, Btype, ldb, beta, C, Ctype, ldc)); + VLOG(5) << "use_tensor_op_math: " + << (dev_ctx->tensor_core_available() ? "True" : "False"); + PADDLE_ENFORCE(platform::dynload::cublasSgemmEx( + dev_ctx->possible_cublas_tensor_core_handle(), transa, transb, m, n, k, + alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc)); #else - PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); -#endif - }; - -#if CUDA_VERSION >= 9000 - // NOTES: To use Tensor Core, we should change the cublas config, - // but the cublas may be hold by multi-thread. - dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); -#else - cublas_call(); + PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); #endif } }; @@ -170,32 +160,23 @@ struct CUBlas { cudaDataType_t Btype, int ldb, const void *beta, void *C, cudaDataType_t Ctype, int ldc, cudaDataType_t computeType) { - auto cublas_call = [&]() { #if CUDA_VERSION >= 8000 - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = platform::TensorCoreAvailable(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? 
"True" : "False"); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); #endif // CUDA_VERSION >= 9000 - PADDLE_ENFORCE(platform::dynload::cublasGemmEx( - dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, - lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); -#else - PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); -#endif - }; - -#if CUDA_VERSION >= 9000 - // NOTES: To use Tensor Core, we should change the cublas config, - // but the cublas may be hold by multi-thread. - dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); + PADDLE_ENFORCE(platform::dynload::cublasGemmEx( + dev_ctx->possible_cublas_tensor_core_handle(), transa, transb, m, n, k, + alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, + algo)); #else - cublas_call(); + PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); #endif } }; @@ -353,22 +334,18 @@ void Blas::BatchedGEMM( #if CUDA_VERSION >= 9010 if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { - auto cublas_call = [&]() { - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = platform::TensorCoreAvailable(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); - - PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx( - context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, - CUDA_R_32F, ldb, strideB, A, CUDA_R_32F, lda, strideA, &beta, C, - CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, algo)); - }; - auto &dev_ctx = const_cast(context_); - dev_ctx.CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? 
"True" : "False"); + + PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx( + context_.possible_cublas_tensor_core_handle(), cuTransB, cuTransA, N, M, + K, &alpha, B, CUDA_R_32F, ldb, strideB, A, CUDA_R_32F, lda, strideA, + &beta, C, CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, algo)); } else { #endif // CUDA_VERSION >= 9010 diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 022afb686b..e40928fe5d 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -247,6 +247,18 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); + + if (TensorCoreAvailable()) { +#if CUDA_VERSION >= 9000 + cublas_tensor_core_handle_.reset(new cublasHandle_t()); + PADDLE_ENFORCE(dynload::cublasCreate(cublas_tensor_core_handle_.get())); + PADDLE_ENFORCE( + dynload::cublasSetStream(*cublas_tensor_core_handle_, stream_)); + PADDLE_ENFORCE(dynload::cublasSetMathMode(*cublas_tensor_core_handle_, + CUBLAS_TENSOR_OP_MATH)); +#endif + } + if (dynload::HasCUDNN()) { cudnn_holder_.reset(new CudnnHolder(&stream_, place)); } @@ -307,6 +319,10 @@ CUDADeviceContext::~CUDADeviceContext() { Wait(); WaitStreamCallback(); PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); + if (cublas_tensor_core_handle_) { + PADDLE_ENFORCE(dynload::cublasDestroy(*cublas_tensor_core_handle_)); + cublas_tensor_core_handle_.reset(); + } eigen_stream_.reset(); eigen_device_.reset(); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); @@ -339,6 +355,15 @@ cublasHandle_t CUDADeviceContext::cublas_handle() const { return cublas_handle_; } +cublasHandle_t CUDADeviceContext::possible_cublas_tensor_core_handle() const { + return cublas_tensor_core_handle_ ? *cublas_tensor_core_handle_ + : cublas_handle_; +} + +bool CUDADeviceContext::tensor_core_available() const { + return cublas_tensor_core_handle_ != nullptr; +} + cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_holder_->cudnn_handle(); } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 7e87580189..41b741a68f 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -209,39 +209,6 @@ class CudnnWorkspaceHandle { std::unique_ptr> guard_; }; -#if CUDA_VERSION >= 9000 -class ScopedCublasMathMode { - public: - ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode) - : handle_(handle) { - need_reset = false; - PADDLE_ENFORCE( - platform::dynload::cublasGetMathMode(handle_, &old_math_mode_), - "Failed to get old cublas math mode"); - if (old_math_mode_ != new_math_mode) { - PADDLE_ENFORCE( - platform::dynload::cublasSetMathMode(handle_, new_math_mode), - "Failed to set old cublas math mode"); - need_reset = true; - } - } - - ~ScopedCublasMathMode() { - if (need_reset) { - PADDLE_ENFORCE( - platform::dynload::cublasSetMathMode(handle_, old_math_mode_), - "Failed to set old cublas math mode"); - } - } - - private: - cublasHandle_t handle_; - cublasMath_t old_math_mode_; - bool need_reset; -}; - -#endif - class CUDADeviceContext : public DeviceContext { public: explicit CUDADeviceContext(CUDAPlace place); @@ -265,6 +232,13 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return cublas handle in the device context. */ cublasHandle_t cublas_handle() const; + /*! 
\brief Check whether tensor core is supported */ + bool tensor_core_available() const; + + /*! \brief Return cublas handle supporting Tensor Core. If Tensor Core is + * not supported, return the same handle as cublas_handle(). */ + cublasHandle_t possible_cublas_tensor_core_handle() const; + /*! \brief Return cudnn handle in the device context. */ cudnnHandle_t cudnn_handle() const; @@ -294,18 +268,6 @@ class CUDADeviceContext : public DeviceContext { void WaitStreamCallback() const { callback_manager_->Wait(); } -#if CUDA_VERSION >= 9000 - /*! \brief CublasCall may need to change cublas's config, - * but the cublas may be hold by multi-thread, so we should - * add lock here. */ - template - void CublasCall(Callback callback, cublasMath_t new_math) { - std::lock_guard guard(cublas_mtx_); - ScopedCublasMathMode scoped_cublas_math(cublas_handle_, new_math); - callback(); - } -#endif - private: CUDAPlace place_; @@ -314,6 +276,7 @@ class CUDADeviceContext : public DeviceContext { std::unique_ptr cudnn_holder_; cudaStream_t stream_; cublasHandle_t cublas_handle_; + std::unique_ptr cublas_tensor_core_handle_; int compute_capability_; int runtime_version_; From 1f423f84ace49e8377e3fc4dee44679fdf33954e Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sat, 29 Dec 2018 16:46:19 +0800 Subject: [PATCH 092/197] fix the huber loss compile issue on windows test=develop --- paddle/fluid/operators/huber_loss_op.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h index 9efda3dfc9..fa21bd01cb 100644 --- a/paddle/fluid/operators/huber_loss_op.h +++ b/paddle/fluid/operators/huber_loss_op.h @@ -105,14 +105,16 @@ class HuberLossGradKernel : public framework::OpKernel { out0->mutable_data(context.GetPlace()); auto x_grad = EigenVector::Flatten(*out0); x_grad.device(place) = - out_grad * residual.unaryExpr(HuberLossBackward(delta, -1.0)); + residual.unaryExpr(HuberLossBackward(delta, -1.0)); + x_grad.device(place) = out_grad * x_grad; } if (out1) { out1->mutable_data(context.GetPlace()); auto y_grad = EigenVector::Flatten(*out1); y_grad.device(place) = - out_grad * residual.unaryExpr(HuberLossBackward(delta, 1.0)); + residual.unaryExpr(HuberLossBackward(delta, 1.0)); + y_grad.device(place) = out_grad * y_grad; } } }; From 35cda13e9fd65cb2f41c5e7e58fe513c19a84f5b Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Sat, 29 Dec 2018 17:09:28 +0800 Subject: [PATCH 093/197] fix unittest test=develop --- .../details/parallel_ssa_graph_executor.cc | 8 +++- paddle/fluid/framework/parallel_executor.cc | 42 +++++++++---------- paddle/fluid/framework/parallel_executor.h | 7 ++++ paddle/fluid/pybind/pybind.cc | 2 +- python/paddle/fluid/parallel_executor.py | 2 +- 5 files changed, 37 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 2377f2c963..bb1f415128 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -28,7 +28,13 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( places_(std::move(places)), graphs_(std::move(graphs)) { PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); - // do not use threadpool for each graph execution. + + // set the correct size of thread pool to each device. + strategy_.num_threads_ = strategy_.num_threads_ < places_.size() + ? 
1UL
+                               : strategy_.num_threads_ / places_.size();
+  VLOG(1) << "set num_threads: " << strategy_.num_threads_
+          << " to schedule operators on each device.";
   for (size_t i = 0; i < places.size(); ++i) {
     executors_.emplace_back(new details::ThreadedSSAGraphExecutor(
         strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i])));
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 934cf34cbd..176c1db349 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -21,10 +21,6 @@ limitations under the License. */

 #include "paddle/fluid/framework/ir/graph.h"

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif

 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
@@ -39,6 +35,8 @@ limitations under the License. */
 DEFINE_string(pe_profile_fname, "",
              "Profiler filename for PE, which generated by gperftools."
              "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable.");
+DEFINE_bool(enable_parallel_graph, true,
+            "Force-disable parallel graph execution mode if set to false.");

 namespace paddle {
 namespace framework {
@@ -211,15 +209,6 @@ ParallelExecutor::ParallelExecutor(
                    "the number of places must be greater than 1.");
   }

-  // FIXME(Yancey1989): parallel graph mode get better performance
-  // in GPU allreduce distributed training. Need an elegant way to
-  // choice the execution strategy.
-  build_strategy.enable_parallel_graph_ =
-      EnableParallelGraphExecution(main_program, exec_strategy, build_strategy);
-
-  VLOG(1) << "Enable ParallelGraph Execution: "
-          << build_strategy.enable_parallel_graph_;
-
   // Step 1. Bcast the bcast_vars to devs.
   // Create local scopes
   if (local_scopes.empty()) {
@@ -236,24 +225,35 @@ ParallelExecutor::ParallelExecutor(
     }
   }

+  // FIXME(Yancey1989): parallel graph mode gets better performance
+  // in GPU allreduce distributed training. We still need an elegant way to
+  // choose the execution strategy.
+  build_strategy.enable_parallel_graph_ =
+      EnableParallelGraphExecution(main_program, exec_strategy, build_strategy);
+
+  VLOG(1) << "Enable ParallelGraph Execution: "
+          << build_strategy.enable_parallel_graph_;
+
   if (member_->use_cuda_) {
 // Bcast Parameters to all GPUs
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    ncclUniqueId *nccl_id = nullptr;
+    // The gen_nccl_id operator can broadcast the ncclUniqueId for nccl2
+    // collective distributed training.
     auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
-    std::unique_ptr<ncclUniqueId> nccl_id;
-    // nccl collective would broadcast ncclUniqueId by gen_nccl_id operator.
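The hunks above and below switch ParallelExecutor from an owning unique_ptr to a raw ncclUniqueId pointer backed by the new local_nccl_id_ member. A minimal standalone sketch of the resolution order being introduced, with an invented helper name and independent of the Paddle classes:

// Sketch only: prefer an id broadcast by the gen_nccl_id operator; fall
// back to generating a process-local id for single-node parallel graph runs.
#include <memory>
#include <nccl.h>

ncclUniqueId* ResolveNcclId(ncclUniqueId* broadcast_id,
                            std::unique_ptr<ncclUniqueId>* local_id) {
  if (broadcast_id != nullptr) return broadcast_id;  // nccl2 distributed mode
  local_id->reset(new ncclUniqueId());
  ncclGetUniqueId(local_id->get());  // fresh id owned by *local_id
  return local_id->get();
}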
if (nccl_id_var != nullptr) {
-      nccl_id.reset(nccl_id_var->GetMutable<ncclUniqueId>());
+      nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
     }
     if (build_strategy.enable_parallel_graph_ && member_->nranks_ > 1UL) {
-      if (nccl_id.get() == nullptr) {
-        nccl_id.reset(new ncclUniqueId());
-        platform::dynload::ncclGetUniqueId(nccl_id.get());
+      if (nccl_id == nullptr) {
+        local_nccl_id_.reset(new ncclUniqueId());
+        platform::dynload::ncclGetUniqueId(local_nccl_id_.get());
+        nccl_id = local_nccl_id_.get();
       }
     }

     member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
-        member_->places_, nccl_id.get(), num_trainers, trainer_id));
+        member_->places_, nccl_id, num_trainers, trainer_id));
 #else
     PADDLE_THROW("Not compiled with CUDA");
 #endif
@@ -492,7 +492,7 @@ bool ParallelExecutor::EnableParallelGraphExecution(
   if (build_strategy.enable_sequential_execution_ ||
       exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental)
     enable_parallel_graph = false;
-  return enable_parallel_graph;
+  return enable_parallel_graph && FLAGS_enable_parallel_graph;
 }

 ParallelExecutor::~ParallelExecutor() {
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index dc70894dbd..49d3f0d3f6 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -28,6 +28,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"

+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
 namespace paddle {
 namespace framework {

@@ -73,6 +77,9 @@ class ParallelExecutor {
                                     const BuildStrategy &build_strategy) const;

   ParallelExecutorPrivate *member_;
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+  std::unique_ptr<ncclUniqueId> local_nccl_id_;
+#endif
 };

 }  // namespace framework
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index d664107d57..1473603a74 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -810,7 +810,7 @@ All parameter, weight, gradient are variables in Paddle.
             If :math:`num\_threads=1`, all the operators will execute one by one,
             but the order may differ between iterations.
             If it is not set, it will be set in ParallelExecutor according to the
-            device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU,
+            device type and device count, for GPU, :math:`num\_threads=device\_count`, for CPU,
             :math:`num\_threads=CPU\_NUM*4`; the explanation of :math:`CPU\_NUM` is in ParallelExecutor.
             If it is not set, ParallelExecutor will get the cpu count by calling
             `multiprocessing.cpu_count()`. Default 0.)DOC")
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index c97a93ec36..9709961286 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -117,7 +117,7 @@ class ParallelExecutor(object):
             if use_cuda:
                 # Experiments on se-resnext show that too many threads hurt
                 # performance. Worth tuning for other models in the future.
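The one-line change below matches the pybind documentation above: on GPU the default becomes one scheduler thread per device rather than four. A hedged sketch of the documented defaults (the helper name is invented, not library code):

# Documented defaults only; CPU_NUM falls back to multiprocessing.cpu_count().
def default_num_threads(use_cuda, device_count, cpu_num):
    return device_count if use_cuda else cpu_num * 4

assert default_num_threads(True, 8, 16) == 8    # GPU: one thread per device
assert default_num_threads(False, 8, 16) == 64  # CPU: CPU_NUM * 4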
- exec_strategy.num_threads = len(self._places) * 4 + exec_strategy.num_threads = len(self._places) else: cpu_num = int( os.environ.get('CPU_NUM', multiprocessing.cpu_count())) From af91444cd6039ae7f57cfdcd3549adf433655f6d Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Sat, 29 Dec 2018 17:15:01 +0800 Subject: [PATCH 094/197] polish unittest test=develop --- .../fluid/tests/unittests/test_parallel_executor_mnist.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 63bc1de208..9768f7db26 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -115,9 +115,7 @@ class TestMNIST(TestParallelExecutorBase): def test_simple_fc(self): # use_cuda - if core.is_compiled_with_cuda(): - self.check_simple_fc_convergence(True) - self.check_simple_fc_convergence(True, use_reduce=False) + self.check_simple_fc_convergence(True) self.check_simple_fc_convergence(False) def test_simple_fc_with_new_strategy(self): @@ -154,8 +152,7 @@ class TestMNIST(TestParallelExecutorBase): np.mean(parallel_last_loss), single_last_loss, delta=1e-6) def test_simple_fc_parallel_accuracy(self): - if core.is_compiled_with_cuda(): - self.check_simple_fc_parallel_accuracy(True) + self.check_simple_fc_parallel_accuracy(True) self.check_simple_fc_parallel_accuracy(False) def check_batchnorm_fc_convergence(self, use_cuda, use_fast_executor): From cd2d60b4c822db2bc0fba5eec8163b90888a9e6f Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sat, 29 Dec 2018 17:18:26 +0800 Subject: [PATCH 095/197] fix build issue for density prior box op on windows test=develop --- paddle/fluid/operators/detection/density_prior_box_op.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu index acd5993154..6337a4837a 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cu +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -148,7 +148,7 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { // blockx is multiple of 32. 
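The cast added in the hunk below works around std::min's template deduction: both arguments must share a single type, and the shifted integer expression does not match the bare 512L literal on Windows builds. A self-contained sketch of the same rounding with explicit int types (helper names invented):

// ((n + 31) >> 5) << 5 rounds n up to the next multiple of 32 (one warp).
#include <algorithm>

inline int RoundUpToWarpMultiple(int n) { return ((n + 31) >> 5) << 5; }
inline int ChooseBlockX(int work_items) {
  return std::min(RoundUpToWarpMultiple(work_items), 512);  // cap at 512
}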
int blockx = std::min( static_cast(((feature_width * num_priors + 31) >> 5) << 5), - 512L); + static_cast(512L)); int gridx = (feature_width * num_priors + blockx - 1) / blockx; dim3 threads(blockx, 1); dim3 grids(gridx, feature_height); From 133f100552270cc98a762cbadb6934c9ccd5026d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sat, 29 Dec 2018 17:28:38 +0800 Subject: [PATCH 096/197] Complete the unittest of optimizers test=develop --- python/paddle/fluid/imperative/nn.py | 47 ++++++---- .../fluid/tests/unittests/test_imperative.py | 11 +-- .../tests/unittests/test_imperative_mnist.py | 91 +++++++++++++++++-- 3 files changed, 113 insertions(+), 36 deletions(-) diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py index 15d0fcaf77..7f3be20463 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/imperative/nn.py @@ -97,17 +97,23 @@ class Conv2D(layers.PyLayer): persistable=True, type=core.VarDesc.VarType.RAW) - self._pre_bias = self._helper.create_variable_for_type_inference( - dtype=self._dtype) + self._bias_param = self._helper.create_parameter( + attr=self._helper.bias_attr, + shape=[num_filter_channels], + dtype=self._dtype, + is_bias=True) def forward(self, input): + pre_bias = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + self._helper.append_op( type=self._l_type, inputs={ 'Input': input, 'Filter': self._filter_param, }, - outputs={"Output": self._pre_bias}, + outputs={"Output": pre_bias}, attrs={ 'strides': self._stride, 'paddings': self._padding, @@ -117,11 +123,17 @@ class Conv2D(layers.PyLayer): 'use_mkldnn': False, }) - self._pre_act = self._helper.append_bias_op( - self._pre_bias, dim_start=1, dim_end=2) + pre_act = self._helper.create_variable_for_type_inference( + dtype=self._dtype) - out = self._helper.append_activation(self._pre_act) - return out + self._helper.append_op( + type='elementwise_add', + inputs={'X': [pre_bias], + 'Y': [self._bias_param]}, + outputs={'Out': [pre_act]}, + attrs={'axis': 1}) + + return self._helper.append_activation(pre_act) class Pool2D(layers.PyLayer): @@ -162,14 +174,13 @@ class Pool2D(layers.PyLayer): self._exclusive = exclusive self._l_type = 'pool2d' - self._pool_out = self._helper.create_variable_for_type_inference( - self._dtype) - def forward(self, input): + pool_out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( type=self._l_type, inputs={"X": input}, - outputs={"Out": self._pool_out}, + outputs={"Out": pool_out}, attrs={ "pooling_type": self._pool_type, "ksize": self._pool_size, @@ -181,7 +192,7 @@ class Pool2D(layers.PyLayer): "use_mkldnn": False, "exclusive": self._exclusive, }) - return self._pool_out + return pool_out class FC(layers.PyLayer): @@ -203,8 +214,6 @@ class FC(layers.PyLayer): shape=[size_in, size_out], dtype=self._dtype, is_bias=False) - self._tmp = self._helper.create_variable_for_type_inference(self._dtype) - self._out = self._helper.create_variable_for_type_inference(self._dtype) def _build_once(self, input): if self._size_in != -1: @@ -221,19 +230,21 @@ class FC(layers.PyLayer): is_bias=False) def forward(self, input): + tmp = self._helper.create_variable_for_type_inference(self._dtype) self._helper.append_op( type="mul", inputs={"X": input, "Y": self._w}, - outputs={"Out": self._tmp}, + outputs={"Out": tmp}, attrs={ "x_num_col_dims": self._num_flatten_dims, "y_num_col_dims": 1 }) + out = self._helper.create_variable_for_type_inference(self._dtype) self._helper.append_op( type="sum", - 
inputs={"X": [self._tmp]}, - outputs={"Out": self._out}, + inputs={"X": [tmp]}, + outputs={"Out": out}, attrs={"use_mkldnn": False}) - return self._out + return out diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index d3df9f829a..f717801bae 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -19,16 +19,7 @@ import numpy as np import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.layers.nn import FC - - -@contextlib.contextmanager -def new_program_scope(): - prog = fluid.Program() - startup_prog = fluid.Program() - scope = fluid.core.Scope() - with fluid.scope_guard(scope): - with fluid.program_guard(prog, startup_prog): - yield +from test_imperative_base import new_program_scope class MyLayer(fluid.imperative.PyLayer): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py index bda9f0e410..775b10e6dc 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py @@ -15,12 +15,15 @@ import contextlib import unittest import numpy as np +import six +import paddle import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC from paddle.fluid.imperative.base import to_variable +from test_imperative_base import new_program_scope class SimpleImgConvPool(fluid.imperative.PyLayer): @@ -97,21 +100,93 @@ class MNIST(fluid.imperative.PyLayer): class TestImperativeMnist(unittest.TestCase): def test_mnist_cpu_float32(self): + seed = 90 + with fluid.imperative.guard(): - mnist = MNIST() + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + mnist = Conv2D(1, 20, 5) sgd = SGDOptimizer(learning_rate=1e-3) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=128) + + dy_param_value = {} + for param in fluid.default_main_program().global_block( + ).all_parameters(): + dy_param_value[param.name] = param._numpy() + + for batch_id, data in enumerate(train_reader()): + if batch_id >= 1: + break + + x_data = np.array( + [x[0].reshape(1, 28, 28) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + 128, 1) - for i in range(2): - x_data = np.random.rand(128, 1, 28, 28).astype('float32') img = to_variable(x_data) - y_data = np.random.rand(128, 1).astype('int64') label = to_variable(y_data) label._stop_gradient = True - predict = mnist(img) - out = fluid.layers.cross_entropy(predict, label) - out._backward() - sgd.minimize(out) + cost = mnist(img) + loss = fluid.layers.reduce_mean(cost) + dy_out = loss._numpy() + + loss._backward() + sgd.minimize(loss) + dy_filter_param = mnist._filter_param._numpy() + + with new_program_scope(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + exe = fluid.Executor(fluid.CPUPlace()) + + mnist = Conv2D(1, 20, 5) + sgd = SGDOptimizer(learning_rate=1e-3) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=128) + + img = fluid.layers.data( + name='pixel', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + cost = mnist(img) + loss = fluid.layers.reduce_mean(cost) + sgd.minimize(loss) + + # 
initialize params and fetch them + static_param_value = {} + static_param_name_list = [] + for param in fluid.default_startup_program().global_block( + ).all_parameters(): + static_param_name_list.append(param.name) + + out = exe.run(fluid.default_startup_program(), + fetch_list=static_param_name_list) + + for i in range(len(static_param_name_list)): + static_param_value[static_param_name_list[i]] = out[i] + + for batch_id, data in enumerate(train_reader()): + if batch_id >= 1: + break + + x_data = np.array( + [x[0].reshape(1, 28, 28) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + [128, 1]) + static_out, static_filter_param = exe.run( + fluid.default_main_program(), + feed={"pixel": x_data, + "label": y_data}, + fetch_list=[loss.name, mnist._filter_param.name]) + + for key, value in six.iteritems(static_param_value): + self.assertTrue(np.allclose(value.all(), dy_param_value[key].all())) + self.assertTrue(np.allclose(static_out.all(), dy_out.all())) + self.assertTrue( + np.allclose(static_filter_param.all(), dy_filter_param.all())) if __name__ == '__main__': From a7966e673b7bfe0de850af6b3eb7dc8dd7203e66 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sat, 29 Dec 2018 17:29:46 +0800 Subject: [PATCH 097/197] Polish code test=develop --- .../tests/unittests/test_imperative_base.py | 30 +++++++++++++++++++ ..._mnist.py => test_imperative_optimizer.py} | 0 2 files changed, 30 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_base.py rename python/paddle/fluid/tests/unittests/{test_imperative_mnist.py => test_imperative_optimizer.py} (100%) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_base.py b/python/paddle/fluid/tests/unittests/test_imperative_base.py new file mode 100644 index 0000000000..478cc13fb5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_base.py @@ -0,0 +1,30 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
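The helper this new file defines just below hands each test a fresh Program and Scope, so graph state cannot leak between unit tests. An illustrative call site (the layer built inside is invented for the sketch):

# Hypothetical usage of new_program_scope(); mirrors how the tests call it.
with new_program_scope():
    img = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
    # ops appended here go into the fresh Program and are discarded afterwards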
+ +import contextlib +import unittest +import numpy as np + +import paddle.fluid as fluid +from paddle.fluid import core + + +@contextlib.contextmanager +def new_program_scope(): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_imperative_mnist.py rename to python/paddle/fluid/tests/unittests/test_imperative_optimizer.py From 0f6ef8edba17736ced024c62e773f001299f84fb Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sat, 29 Dec 2018 18:17:05 +0800 Subject: [PATCH 098/197] Add MNIST test=develop --- python/paddle/fluid/imperative/nn.py | 2 +- .../unittests/test_imperative_optimizer.py | 60 ++++++++++++------- 2 files changed, 38 insertions(+), 24 deletions(-) diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py index 7f3be20463..8757670ef8 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/imperative/nn.py @@ -99,7 +99,7 @@ class Conv2D(layers.PyLayer): self._bias_param = self._helper.create_parameter( attr=self._helper.bias_attr, - shape=[num_filter_channels], + shape=[num_filters], dtype=self._dtype, is_bias=True) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index 775b10e6dc..e9dd158295 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -29,8 +29,8 @@ from test_imperative_base import new_program_scope class SimpleImgConvPool(fluid.imperative.PyLayer): def __init__(self, num_channels, - filter_size, num_filters, + filter_size, pool_size, pool_stride, pool_padding=0, @@ -77,10 +77,10 @@ class MNIST(fluid.imperative.PyLayer): super(MNIST, self).__init__(param_attr=param_attr, bias_attr=bias_attr) self._simple_img_conv_pool_1 = SimpleImgConvPool( - 1, 5, 20, 2, 2, act="relu") + 1, 20, 5, 2, 2, act="relu") self._simple_img_conv_pool_2 = SimpleImgConvPool( - 20, 5, 50, 2, 2, act="relu") + 20, 50, 5, 2, 2, act="relu") pool_2_shape = 50 * 8 * 8 SIZE = 10 @@ -106,18 +106,15 @@ class TestImperativeMnist(unittest.TestCase): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - mnist = Conv2D(1, 20, 5) + # mnist = Conv2D(1, 20, 5) + mnist = MNIST() sgd = SGDOptimizer(learning_rate=1e-3) train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=128) - dy_param_value = {} - for param in fluid.default_main_program().global_block( - ).all_parameters(): - dy_param_value[param.name] = param._numpy() - + dy_param_init_value = {} for batch_id, data in enumerate(train_reader()): - if batch_id >= 1: + if batch_id >= 2: break x_data = np.array( @@ -133,9 +130,17 @@ class TestImperativeMnist(unittest.TestCase): loss = fluid.layers.reduce_mean(cost) dy_out = loss._numpy() + if batch_id == 0: + for param in fluid.default_main_program().global_block( + ).all_parameters(): + dy_param_init_value[param.name] = param._numpy() + loss._backward() sgd.minimize(loss) - dy_filter_param = mnist._filter_param._numpy() + dy_param_value = {} + for param in fluid.default_main_program().global_block( + ).all_parameters(): + dy_param_value[param.name] = param._numpy() with 
new_program_scope(): fluid.default_startup_program().random_seed = seed @@ -143,7 +148,8 @@ class TestImperativeMnist(unittest.TestCase): exe = fluid.Executor(fluid.CPUPlace()) - mnist = Conv2D(1, 20, 5) + # mnist = Conv2D(1, 20, 5) + mnist = MNIST() sgd = SGDOptimizer(learning_rate=1e-3) train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=128) @@ -156,7 +162,7 @@ class TestImperativeMnist(unittest.TestCase): sgd.minimize(loss) # initialize params and fetch them - static_param_value = {} + static_param_init_value = {} static_param_name_list = [] for param in fluid.default_startup_program().global_block( ).all_parameters(): @@ -166,27 +172,35 @@ class TestImperativeMnist(unittest.TestCase): fetch_list=static_param_name_list) for i in range(len(static_param_name_list)): - static_param_value[static_param_name_list[i]] = out[i] + static_param_init_value[static_param_name_list[i]] = out[i] for batch_id, data in enumerate(train_reader()): - if batch_id >= 1: + if batch_id >= 2: break x_data = np.array( [x[0].reshape(1, 28, 28) for x in data]).astype('float32') y_data = np.array([x[1] for x in data]).astype('int64').reshape( [128, 1]) - static_out, static_filter_param = exe.run( - fluid.default_main_program(), - feed={"pixel": x_data, - "label": y_data}, - fetch_list=[loss.name, mnist._filter_param.name]) + fetch_list = [loss.name] + fetch_list.extend(static_param_name_list) + out = exe.run(fluid.default_main_program(), + feed={"pixel": x_data, + "label": y_data}, + fetch_list=fetch_list) + + static_param_value = {} + static_out = out[0] + for i in range(1, len(out)): + static_param_value[static_param_name_list[i - 1]] = out[i] + + for key, value in six.iteritems(static_param_init_value): + self.assertTrue( + np.allclose(value.all(), dy_param_init_value[key].all())) + self.assertTrue(np.allclose(static_out.all(), dy_out.all())) for key, value in six.iteritems(static_param_value): self.assertTrue(np.allclose(value.all(), dy_param_value[key].all())) - self.assertTrue(np.allclose(static_out.all(), dy_out.all())) - self.assertTrue( - np.allclose(static_filter_param.all(), dy_filter_param.all())) if __name__ == '__main__': From dba009dbbf1edeb7513505a17508cb1e294e68a3 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sat, 29 Dec 2018 19:15:43 +0800 Subject: [PATCH 099/197] fix script issue test=develop --- paddle/fluid/operators/CMakeLists.txt | 2 +- python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index ee15420775..e53a6a562a 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -53,7 +53,7 @@ if (WITH_GPU) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() # conv_fusion_op needs cudnn 7 above - if (NOT ${CUDNN_MAJOR_VERSION} VERSION_LESS 7) + if (NOT ${CUDNN_VERSION} VERSION_LESS 7100) op_library(conv_fusion_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n") endif() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6d6fe245d8..3441304995 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -25,7 +25,7 @@ endif(NOT WITH_DISTRIBUTE) if (NOT ${WITH_GPU}) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) -elseif(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) +elseif(${CUDNN_VERSION} VERSION_LESS 7100) 
LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) endif() From 229565303f71e5acb674cdb013ea1c66cea46643 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sat, 29 Dec 2018 23:44:54 +0800 Subject: [PATCH 100/197] Polish PyLayers test=develop --- python/paddle/fluid/imperative/layers.py | 14 +- python/paddle/fluid/imperative/nn.py | 38 ++--- python/paddle/fluid/layers/nn.py | 132 ++++++------------ .../fluid/tests/unittests/test_imperative.py | 2 +- .../unittests/test_imperative_optimizer.py | 5 +- 5 files changed, 67 insertions(+), 124 deletions(-) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index c95b89a2c4..d78d61eb3f 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -24,19 +24,7 @@ __all__ = ['PyLayer'] class PyLayer(core.Layer): - def __init__(self, - dtype=core.VarDesc.VarType.FP32, - param_attr=None, - bias_attr=None, - name=None): - from ..layer_helper import LayerHelper - self._helper = LayerHelper( - type(self).__name__, - param_attr=param_attr, - bias_attr=bias_attr, - dtype=dtype, - name=name) - + def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None): self._once_built = False self._dtype = dtype diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py index 8757670ef8..4f30417e99 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/imperative/nn.py @@ -46,8 +46,15 @@ class Conv2D(layers.PyLayer): name=None, dtype=core.VarDesc.VarType.FP32): assert param_attr is not False, "param_attr should not be False here." - super(Conv2D, self).__init__( - param_attr=param_attr, bias_attr=bias_attr, name=name, dtype=dtype) + super(Conv2D, self).__init__(name=name, dtype=dtype) + + from ..layer_helper import LayerHelper + self._helper = LayerHelper( + type(self).__name__, + param_attr=param_attr, + bias_attr=bias_attr, + dtype=dtype, + name=name) self._groups = groups self._stride = utils.convert_to_list(stride, 2, 'stride') @@ -163,6 +170,9 @@ class Pool2D(layers.PyLayer): super(Pool2D, self).__init__(name=name, dtype=dtype) + from ..layer_helper import LayerHelper + self._helper = LayerHelper(type(self).__name__, dtype=dtype, name=name) + self._pool_type = pool_type self._pool_size = utils.convert_to_list(pool_size, 2, 'pool_size') self._pool_padding = utils.convert_to_list(pool_padding, 2, @@ -197,32 +207,22 @@ class Pool2D(layers.PyLayer): class FC(layers.PyLayer): def __init__(self, - size_in, - size_out, - num_flatten_dims=1, + size, param_attr=None, + num_flatten_dims=1, dtype=core.VarDesc.VarType.FP32): - super(FC, self).__init__(param_attr=param_attr, dtype=dtype) - - self._size_in = size_in - self._size_out = size_out + super(FC, self).__init__() + self._size = size self._num_flatten_dims = num_flatten_dims self._dtype = dtype - if self._size_in != -1: - self._w = self._helper.create_parameter( - attr=self._helper.param_attr, - shape=[size_in, size_out], - dtype=self._dtype, - is_bias=False) + from ..layer_helper import LayerHelper + self._helper = LayerHelper('FC', param_attr=param_attr) def _build_once(self, input): - if self._size_in != -1: - return - input_shape = input.shape param_shape = [ reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1) - ] + [self._size_out] + ] + [self._size] self._w = self._helper.create_parameter( attr=self._helper.param_attr, shape=param_shape, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 81b2148989..9572fcb385 100644 --- 
a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -502,22 +502,22 @@ def lstm(input, If Device is GPU, This op will use cudnn LSTM implementation A four-gate Long Short-Term Memory network with no peephole connections. - In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, + In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: .. math:: - - i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) - - f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) - - o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) - + + i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) + + f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) + + o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) + \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) - - c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t} - - h_t &= o_t \odot tanh(c_t) + + c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t} + + h_t &= o_t \odot tanh(c_t) - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix of weights from the input gate to the input) @@ -531,19 +531,19 @@ def lstm(input, - :math:`\\tilde{c_t}` is also called candidate hidden state, which is computed based on the current input and the previous hidden state. - Where sigmoid is the sigmoid operator: :math:`sigmoid(x) = 1 / (1 + e^{-x})` , * represents a point-wise multiplication, + Where sigmoid is the sigmoid operator: :math:`sigmoid(x) = 1 / (1 + e^{-x})` , * represents a point-wise multiplication, X represensts a matrix multiplication Args: input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size ) - init_h(Variable): The initial hidden state of the LSTM + init_h(Variable): The initial hidden state of the LSTM This is a tensor with shape ( num_layers x batch_size x hidden_size) if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) init_c(Variable): The initial cell state of the LSTM. This is a tensor with shape ( num_layers x batch_size x hidden_size ) if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) - max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len + max_len (int): max length of LSTM. 
the first dim of input tensor CAN NOT greater than max_len hidden_size (int): hidden size of the LSTM num_layers (int): total layers number of the LSTM dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps @@ -558,18 +558,18 @@ def lstm(input, Returns: - rnn_out(Tensor),last_h(Tensor),last_c(Tensor): - + rnn_out(Tensor),last_h(Tensor),last_c(Tensor): + Three tensors, rnn_out, last_h, last_c: - + - rnn_out is result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) \ if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2) - last_h is the hidden state of the last step of LSTM \ shape is ( num_layers x batch_size x hidden_size ) \ - if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) + if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) - last_c(Tensor): the cell state of the last step of LSTM \ shape is ( num_layers x batch_size x hidden_size ) \ - if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) + if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) Examples: @@ -1255,7 +1255,7 @@ def dropout(x, (mask is a tensor same shape with input, value is 0 or 1 ratio of 0 is dropout_prob) - + Returns: Variable: A tensor variable is the shape with `x`. @@ -1346,10 +1346,10 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): ValueError: 1. the 1st dimension of ``input`` and ``label`` are not equal. - + 2. when ``soft_label == True``, and the 2nd dimension of ``input`` and ``label`` are not equal. - + 3. when ``soft_label == False``, and the 2nd dimension of ``label`` is not 1. @@ -1471,7 +1471,7 @@ def chunk_eval(input, This function computes and outputs the precision, recall and F1-score of chunk detection. - For some basics of chunking, please refer to + For some basics of chunking, please refer to `Chunking with Support Vector Machines `_ . ChunkEvalOp computes the precision, recall, and F1-score of chunk detection, @@ -2306,7 +2306,7 @@ def sequence_slice(input, offset, length, name=None): out.lod = [[2, 1]], out.dims = (3, 2). - Note: + Note: The first dimension size of **input**, **offset** and **length** should be equal. The **offset** should start from 0. @@ -4678,7 +4678,7 @@ def ctc_greedy_decoder(input, blank, name=None): [0.5, 0.1, 0.3, 0.1]] input.lod = [[4, 4]] - + Computation: step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get: @@ -4712,7 +4712,7 @@ def ctc_greedy_decoder(input, blank, name=None): Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. \ 'Lp' is the sum if all output sequences' length. If all the sequences \ in result were empty, the result LoDTensor will be [-1] with \ - LoD [[]] and dims [1, 1]. + LoD [[]] and dims [1, 1]. Examples: .. code-block:: python @@ -5065,7 +5065,7 @@ def hsigmoid(input, """ The hierarchical sigmoid operator is used to accelerate the training process of language model. This operator organizes the classes into a - complete binary tree, or you can use is_custom to pass your own tree to + complete binary tree, or you can use is_custom to pass your own tree to implement hierarchical. Each leaf node represents a class(a word) and each internal node acts as a binary classifier. For each word there's a unique path from root to it's leaf node, hsigmoid calculate the cost for each @@ -5082,7 +5082,7 @@ def hsigmoid(input, 2. 
build a dict to store word_id -> word's leaf to root path, we call it path_table. 3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code means label of each binary classification, using 1 indicate true, 0 indicate false. - 4. now, each word should has its path and code along the path, you can pass a batch of path and code + 4. now, each word should has its path and code along the path, you can pass a batch of path and code related to the same batch of inputs. Args: @@ -5091,8 +5091,8 @@ def hsigmoid(input, and :math:`D` is the feature size. label (Variable): The tensor variable contains labels of training data. It's a tensor with shape is :math:`[N \\times 1]`. - num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set, - it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num + num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set, + it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num which indicates the num of classes using by binary classify. param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid @@ -5105,15 +5105,15 @@ def hsigmoid(input, is not set, the bias is initialized zero. Default: None. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. Default: None. - path_table: (Variable|None) this variable can store each batch of samples' path to root, + path_table: (Variable|None) this variable can store each batch of samples' path to root, it should be in leaf -> root order - path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like - structure and each element in this array is indexes in parent nodes' Weight Matrix. - path_code: (Variable|None) this variable can store each batch of samples' code, + path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like + structure and each element in this array is indexes in parent nodes' Weight Matrix. + path_code: (Variable|None) this variable can store each batch of samples' code, each code consist with every code of parent nodes. it should be in leaf -> root order - is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is + is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is set you need to set path_table/path_code/num_classes, otherwise num_classes should be set - is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient + is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient of W and input will be sparse. Returns: @@ -6965,10 +6965,10 @@ def mean_iou(input, label, num_classes): num_classes (int): The possible number of labels. Returns: - mean_iou (Variable),out_wrong(Variable),out_correct(Variable): - + mean_iou (Variable),out_wrong(Variable),out_correct(Variable): + Three variables: - + - mean_iou : A Tensor representing the mean intersection-over-union with shape [1]. - out_wrong: A Tensor with shape [num_classes]. The wrong numbers of each class. - out_correct: A Tensor with shape [num_classes]. The correct numbers of each class. 
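As a reference for the metric described above, per-class IoU divides the intersection of the prediction and label masks by their union, and mean_iou averages over classes. A hedged NumPy sketch of that arithmetic (not the operator kernel):

import numpy as np

def mean_iou_reference(pred, label, num_classes):
    ious = []
    for c in range(num_classes):
        inter = np.logical_and(pred == c, label == c).sum()
        union = np.logical_or(pred == c, label == c).sum()
        if union > 0:  # skip classes absent from both tensors
            ious.append(inter / float(union))
    return np.mean(ious)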
@@ -7166,7 +7166,7 @@ def affine_grid(theta, out_shape, name=None): Args: theta (Variable): A batch of affine transform parameters with shape [N, 2, 3]. - out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W]. + out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W]. ``out_shape`` can be a Variable or a list or tuple. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -7762,9 +7762,9 @@ def flatten(x, axis=1, name=None): """ **Flatten layer** Flattens the input tensor into a 2D matrix. - + For Example: - + .. code-block:: text Case 1: @@ -8942,7 +8942,7 @@ def similarity_focus(input, axis, indexes, name=None): SimilarityFocus Operator Generate a similarity focus mask with the same shape of input using the following method: - + 1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding to the axis according to the indexes. For example, if axis=1 and indexes=[a], it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X @@ -9713,47 +9713,3 @@ def huber_loss(input, label, delta): 'Residual': residual}, attrs={'delta': delta}) return out - - -class FC(layers.PyLayer): - def __init__(self, - size, - param_attr=None, - num_flatten_dims=1, - dtype=core.VarDesc.VarType.FP32): - super(FC, self).__init__(param_attr=param_attr) - self._size = size - self._num_flatten_dims = num_flatten_dims - self._dtype = dtype - self._tmp = self._helper.create_variable_for_type_inference(self._dtype) - self._out = self._helper.create_variable_for_type_inference(self._dtype) - - def _build_once(self, inputs): - input_shape = inputs.shape - param_shape = [ - reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1) - ] + [self._size] - self._w = self._helper.create_parameter( - attr=self._helper.param_attr, - shape=param_shape, - dtype=self._dtype, - is_bias=False) - - def forward(self, inputs): - self._helper.append_op( - type="mul", - inputs={"X": inputs, - "Y": self._w}, - outputs={"Out": self._tmp}, - attrs={ - "x_num_col_dims": self._num_flatten_dims, - "y_num_col_dims": 1 - }) - - self._helper.append_op( - type="sum", - inputs={"X": [self._tmp]}, - outputs={"Out": self._out}, - attrs={"use_mkldnn": False}) - - return self._out diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index f717801bae..1dc13ec74e 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -18,7 +18,7 @@ import numpy as np import paddle.fluid as fluid from paddle.fluid import core -from paddle.fluid.layers.nn import FC +from paddle.fluid.imperative.nn import FC from test_imperative_base import new_program_scope diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index e9dd158295..5d97edf876 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -74,7 +74,7 @@ class SimpleImgConvPool(fluid.imperative.PyLayer): class MNIST(fluid.imperative.PyLayer): def __init__(self, param_attr=None, bias_attr=None): - super(MNIST, self).__init__(param_attr=param_attr, bias_attr=bias_attr) + super(MNIST, self).__init__() self._simple_img_conv_pool_1 = SimpleImgConvPool( 1, 20, 5, 2, 2, act="relu") @@ -85,8 +85,7 @@ class MNIST(fluid.imperative.PyLayer): 
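For concreteness, the FC initializer scale set just below works out to roughly 1.4e-4; a standalone arithmetic sketch:

# Standalone check of the scale computed in the class body below.
pool_2_shape = 50 * 8 * 8                       # 3200 features after pooling
SIZE = 10                                       # number of classes
scale = (2.0 / (pool_2_shape**2 * SIZE)) ** 0.5
print(scale)                                    # ~1.397e-04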
pool_2_shape = 50 * 8 * 8 SIZE = 10 scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5 - self._fc = FC(-1, - 10, + self._fc = FC(10, param_attr=fluid.param_attr.ParamAttr( initializer=fluid.initializer.NormalInitializer( loc=0.0, scale=scale))) From 9186451f6052d4fc87042b281097d65eee9bebf9 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 2 Jan 2019 11:14:52 +0800 Subject: [PATCH 101/197] hide GetTensor test=develop --- paddle/fluid/framework/operator.h | 24 ++++ paddle/fluid/framework/tensor_util.h | 22 ---- paddle/fluid/operators/conv_op.h | 11 +- paddle/fluid/platform/CMakeLists.txt | 4 +- .../platform/temporary_allocator_test.cc | 115 ++++++++++-------- 5 files changed, 91 insertions(+), 85 deletions(-) diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e2bedc60d2..7d45be9ddc 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -368,6 +368,30 @@ class ExecutionContext { return op_.Outputs(name); } + template + Tensor AllocateTmpTensor(const framework::DDim& dim, + const DevContext& dev_ctx) const { + auto tmp_allocation_ptr = platform::DeviceTemporaryAllocator::Instance() + .Get(dev_ctx) + .Allocate(product(dim) * sizeof(T)); + auto& deleter = tmp_allocation_ptr.get_deleter(); + auto* allocation_ptr = tmp_allocation_ptr.release(); + auto shared_allocation = std::shared_ptr( + allocation_ptr, deleter); + + PADDLE_ENFORCE( + dynamic_cast(allocation_ptr) != nullptr, + "The AllocationPtr must be TemporaryAllocation."); + PADDLE_ENFORCE_EQ(allocation_ptr->size(), + framework::product(dim) * sizeof(T)); + + paddle::framework::Tensor temp_tensor( + framework::ToDataType(std::type_index(typeid(T)))); + temp_tensor.Resize(dim); + temp_tensor.ResetHolder(std::move(shared_allocation)); + return temp_tensor; + } + private: const OperatorBase& op_; const Scope& scope_; diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 871c7bd2a7..1ffd357e62 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -151,27 +151,5 @@ void TensorToVector(const Tensor& src, std::vector* dst) { memory::Copy(dst_place, dst_ptr, boost::get(src.place()), src_ptr, size); } - -template -paddle::framework::Tensor GetTensor( - memory::allocation::AllocationPtr temp_allocation_ptr, - const framework::DDim& dim) { - auto& deleter = temp_allocation_ptr.get_deleter(); - auto* allocation_ptr = temp_allocation_ptr.release(); - auto shared_allocation = - std::shared_ptr(allocation_ptr, deleter); - - PADDLE_ENFORCE( - dynamic_cast(allocation_ptr) != nullptr, - "The AllocationPtr must be TemporaryAllocation."); - PADDLE_ENFORCE_EQ(allocation_ptr->size(), - framework::product(dim) * sizeof(T)); - - paddle::framework::Tensor temp_tensor( - framework::ToDataType(std::type_index(typeid(T)))); - temp_tensor.Resize(dim); - temp_tensor.ResetHolder(std::move(shared_allocation)); - return temp_tensor; -} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 2519f5e7ac..24b8e23879 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -18,7 +18,6 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" @@ -158,10 +157,7 @@ class GemmConvKernel : public framework::OpKernel { // to call the matrix multiplication interface. Tensor col_matrix; if (is_expand) { - auto tmp_allocation_ptr = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - framework::product(col_shape) * sizeof(T)); - col = framework::GetTensor(std::move(tmp_allocation_ptr), col_shape); + col = context.AllocateTmpTensor(col_shape, dev_ctx); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } @@ -293,10 +289,7 @@ class GemmConvGradKernel : public framework::OpKernel { // to call the matrix multiplication interface. Tensor col_matrix; if (is_expand) { - auto tmp_allocation_ptr = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - framework::product(col_shape) * sizeof(T)); - col = framework::GetTensor(std::move(tmp_allocation_ptr), col_shape); + col = context.AllocateTmpTensor(col_shape, dev_ctx); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 05a0f14440..1f51b5bab3 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -100,7 +100,7 @@ ENDIF() nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) if(WITH_GPU) - nv_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor) + nv_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor operator) else() - cc_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor) + cc_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor operator) endif() diff --git a/paddle/fluid/platform/temporary_allocator_test.cc b/paddle/fluid/platform/temporary_allocator_test.cc index e4e5be5b89..35d1d92981 100644 --- a/paddle/fluid/platform/temporary_allocator_test.cc +++ b/paddle/fluid/platform/temporary_allocator_test.cc @@ -14,12 +14,27 @@ #include "paddle/fluid/platform/temporary_allocator.h" #include +#include +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor_util.h" + DECLARE_double(limit_of_temporary_allocation); namespace paddle { namespace platform { +class DummyOp : public framework::OperatorBase { + public: + DummyOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + protected: + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override {} +}; + TEST(temporary_allocator, temporary_allocator) { platform::CPUPlace cpu_place; TemporaryAllocator alloc(cpu_place); @@ -68,96 +83,92 @@ TEST(temporary_allocator, add_callback) { } TEST(temporary_allocator, create_tensor_with_allocationptr) { - platform::CPUPlace cpu_place; - TemporaryAllocator cpu_alloc(cpu_place); + framework::VariableNameMap dummy_vars; + framework::AttributeMap dummy_attrs; + DummyOp op("dummy", dummy_vars, dummy_vars, dummy_attrs); + framework::Scope scope; + framework::VariableValueMap vars; + framework::RuntimeContext run_ctx(vars, vars); + size_t 
memory_size = 300; { - size_t memory_size = 200; - auto allocation = cpu_alloc.Allocate(memory_size); - void* address = allocation->ptr(); + platform::CPUPlace cpu_place; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = + static_cast(pool.Get(cpu_place)); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); + int numel = memory_size / sizeof(float); - framework::Tensor tensor = framework::GetTensor( - std::move(allocation), framework::make_ddim({numel})); - PADDLE_ENFORCE_EQ(address, tensor.data()); + framework::Tensor tensor = + ctx.AllocateTmpTensor( + framework::make_ddim({numel}), *dev_ctx); PADDLE_ENFORCE_EQ(tensor.numel(), numel); } #ifdef PADDLE_WITH_CUDA - platform::CUDAPlace gpu_place(0); - TemporaryAllocator gpu_alloc(gpu_place); - { - size_t memory_size = 300; - auto allocation = gpu_alloc.Allocate(memory_size); - void* address = allocation->ptr(); + platform::CUDAPlace gpu_place(0); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = + static_cast(pool.Get(gpu_place)); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); int numel = memory_size / sizeof(float); - framework::Tensor tensor = framework::GetTensor( - std::move(allocation), framework::make_ddim({numel})); - PADDLE_ENFORCE_EQ(address, tensor.data()); + framework::Tensor tensor = + ctx.AllocateTmpTensor( + framework::make_ddim({numel}), *dev_ctx); PADDLE_ENFORCE_EQ(tensor.numel(), numel); } - - // The allocation is not holded now, it should be placed to - // TemporaryAllocationQueue. - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); - gpu_alloc.Release([]() {}); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); #endif } TEST(temporary_allocator, create_tensor_with_allocationptr2) { - platform::CPUPlace cpu_place; - TemporaryAllocator cpu_alloc(cpu_place); + framework::VariableNameMap dummy_vars; + framework::AttributeMap dummy_attrs; + DummyOp op("dummy", dummy_vars, dummy_vars, dummy_attrs); + framework::Scope scope; + framework::VariableValueMap vars; + framework::RuntimeContext run_ctx(vars, vars); + size_t memory_size = 400; { - size_t memory_size = 400; + platform::CPUPlace cpu_place; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = + static_cast(pool.Get(cpu_place)); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); int numel = memory_size / sizeof(float); framework::Tensor out_side_tensor; - void* address; { - auto allocation = cpu_alloc.Allocate(memory_size); - address = allocation->ptr(); - framework::Tensor tensor = framework::GetTensor( - std::move(allocation), framework::make_ddim({numel})); - PADDLE_ENFORCE_EQ(address, tensor.data()); + framework::Tensor tensor = + ctx.AllocateTmpTensor( + framework::make_ddim({numel}), *dev_ctx); PADDLE_ENFORCE_EQ(tensor.numel(), numel); out_side_tensor.ShareDataWith(tensor); } - PADDLE_ENFORCE_EQ(address, out_side_tensor.data()); PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel); } #ifdef PADDLE_WITH_CUDA - platform::CUDAPlace gpu_place(0); - TemporaryAllocator gpu_alloc(gpu_place); { - void* address; + platform::CUDAPlace gpu_place(0); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = + static_cast(pool.Get(gpu_place)); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); + size_t memory_size = 500; int numel = memory_size / sizeof(float); framework::Tensor out_side_tensor; { - auto allocation = 
gpu_alloc.Allocate(memory_size); - address = allocation->ptr(); - framework::Tensor tensor = framework::GetTensor( - std::move(allocation), framework::make_ddim({numel})); - PADDLE_ENFORCE_EQ(address, tensor.data()); + framework::Tensor tensor = + ctx.AllocateTmpTensor( + framework::make_ddim({numel}), *dev_ctx); PADDLE_ENFORCE_EQ(tensor.numel(), numel); out_side_tensor.ShareDataWith(tensor); } - PADDLE_ENFORCE_EQ(address, out_side_tensor.data()); PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel); - // The allocation is holded by out_side_tensor. - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); - gpu_alloc.Release([]() {}); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); } - - // The allocation is not holded now, it should be placed to - // TemporaryAllocationQueue. - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); - gpu_alloc.Release([]() {}); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); #endif } From 1cb74b061b273db10ca79d0df926caefacb170f2 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 2 Jan 2019 13:12:21 +0800 Subject: [PATCH 102/197] fix the whl issue test=develop --- python/paddle/fluid/__init__.py | 7 ------- python/paddle/fluid/framework.py | 6 ++++++ .../unittests/test_eager_deletion_dynamic_rnn_base.py | 6 ++++++ 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 7a72670935..abcad4ca52 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -102,13 +102,6 @@ def __bootstrap__(): import sys import os import platform - - if os.name == 'nt': - third_lib_path = os.path.abspath(os.path.dirname( - __file__)) + os.sep + '..' + os.sep + 'libs' - os.environ['path'] += ';' + third_lib_path - sys.path.append(third_lib_path) - from . import core in_test = 'unittest' in sys.modules diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 921d59158f..c15d54a7f0 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -27,6 +27,12 @@ import numpy as np from .. import compat as cpt from .proto import framework_pb2 try: + if os.name == 'nt': + third_lib_path = os.path.abspath(os.path.dirname( + __file__)) + os.sep + '..' + os.sep + 'libs' + os.environ['path'] += ';' + third_lib_path + sys.path.append(third_lib_path) + from . 
import core except ImportError as e: if os.name == 'nt': diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py index 89476ee641..81b0b66781 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py @@ -29,6 +29,12 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2): print('Skip use_cuda=True because Paddle is not compiled with cuda') return + if use_parallel_executor and os.name == 'nt': + print( + 'Skip use_parallel_executor=True because Paddle comes without parallel support on windows' + ) + return + word_dict = paddle.dataset.imdb.word_dict() train_reader = paddle.batch( paddle.dataset.imdb.train(word_dict), batch_size=batch_size) From 227e0c4518901f1ece25db76dbcef384583cf8af Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Wed, 2 Jan 2019 13:18:45 +0800 Subject: [PATCH 103/197] fix nccl2 mode startup test=develop (#15132) --- paddle/fluid/framework/details/build_strategy.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 389366a8a9..7edbe596be 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -67,7 +67,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { context->endpoints_ = strategy_.trainers_endpoints_; context->trainer_id_ = strategy_.trainer_id_; PADDLE_ENFORCE(strategy_.trainer_id_ >= 0, "trainer_id_ >= 0"); - if (strategy_.trainer_id_ > 0) { + if (strategy_.trainer_id_ > 0 && strategy_.trainers_endpoints_.size() > 0) { PADDLE_ENFORCE((unsigned)(strategy_.trainer_id_) < strategy_.trainers_endpoints_.size(), "trainer_id_ < endpoints_ size"); From 8eb1f2621183766adbd7e36542caac77a6d5ff3f Mon Sep 17 00:00:00 2001 From: xiaolil1 <39753926+xiaolil1@users.noreply.github.com> Date: Wed, 2 Jan 2019 13:54:13 +0800 Subject: [PATCH 104/197] Enable INT8 pool OP (#15046) * Enable INT8 pool OP test=develop * fix unittest test=develop * Clean unittest code. test=develop --- paddle/fluid/operators/pool_mkldnn_op.cc | 31 +++-- .../unittests/test_pool2d_int8_mkldnn_op.py | 110 ++++++++++++++++++ .../tests/unittests/test_pool2d_mkldnn_op.py | 45 +++---- .../fluid/tests/unittests/test_pool2d_op.py | 5 +- 4 files changed, 150 insertions(+), 41 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index 0a9a29956a..f6f40b1daf 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/operators/pool_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" @@ -71,7 +72,6 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { void Compute(const paddle::framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); - auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); @@ -130,20 +130,25 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides, padding_right_bottom); } - auto src_md = platform::MKLDNNMemDesc( - src_tz, platform::MKLDNNGetDataType(), input_format); + + mkldnn::memory::data_type dt = + paddle::framework::ToMKLDNNDataType(input->type()); + + auto src_md = platform::MKLDNNMemDesc(src_tz, dt, input_format); /* create memory descriptor for pooling without specified format * ('any') which lets a primitive (pooling in this case) choose * the memory format preferred for best performance */ - auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32, - mkldnn::memory::format::any); - + auto dst_md = + platform::MKLDNNMemDesc(dst_tz, dt, mkldnn::memory::format::any); + auto propagation = src_md.data.data_type == mkldnn_f32 + ? mkldnn::prop_kind::forward_training + : mkldnn::prop_kind::forward_scoring; std::shared_ptr pool_pd = - CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top, - padding_right_bottom, ksize, pooling_type, - mkldnn_engine, ceil_mode, is_test); + CreatePrimitiveDesc(src_md, dst_md, propagation, strides, + padding_left_top, padding_right_bottom, ksize, + pooling_type, mkldnn_engine, ceil_mode, is_test); // save pool_pd into global device context to be referred in backward path if (!is_test) dev_ctx.SetBlob(key_pool_pd, pool_pd); @@ -203,7 +208,8 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { private: std::unique_ptr CreatePrimitiveDesc( const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst, - const std::vector& stride, const std::vector& padding_left_top, + const mkldnn::prop_kind& propagation, const std::vector& stride, + const std::vector& padding_left_top, const std::vector& padding_right_bot, const std::vector& kernel, const std::string& pooling_type, const mkldnn::engine& engine, bool ceil_mode, bool is_test) const { @@ -411,6 +417,9 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace, - ops::PoolMKLDNNOpKernel); + ops::PoolMKLDNNOpKernel, + ops::PoolMKLDNNOpKernel, + ops::PoolMKLDNNOpKernel); + REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace, ops::PoolMKLDNNGradOpKernel); diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py new file mode 100644 index 0000000000..f4495d0bc8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py @@ -0,0 +1,110 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +from __future__ import division + +import unittest +import numpy as np + +import paddle.fluid.core as core +from op_test import OpTest +from test_pool2d_op import TestPool2D_Op, avg_pool2D_forward_naive, max_pool2D_forward_naive + + +class TestPool2dMKLDNNInt8_Op(TestPool2D_Op): + def init_kernel_type(self): + self.use_mkldnn = True + + def init_data_type(self): + self.dtype = np.int8 + + def setUp(self): + TestPool2D_Op.setUp(self) + assert self.dtype in [np.int8, np.uint8 + ], 'Dtype should be int8 or uint8' + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace(), atol=1e-5) + + def test_check_grad(self): + pass + + +class TestCase1Avg(TestPool2dMKLDNNInt8_Op): + def init_test_case(self): + self.shape = [2, 3, 7, 7] + self.ksize = [3, 3] + self.strides = [1, 1] + self.paddings = [0, 0] + + def init_global_pool(self): + self.global_pool = False + + +class TestCase2Avg(TestPool2dMKLDNNInt8_Op): + def init_test_case(self): + self.shape = [2, 3, 7, 7] + self.ksize = [3, 3] + self.strides = [1, 1] + self.paddings = [1, 1] + + def init_global_pool(self): + self.global_pool = False + + +class TestCase0Max(TestPool2dMKLDNNInt8_Op): + def init_pool_type(self): + self.pool_type = "max" + self.pool2D_forward_naive = max_pool2D_forward_naive + + +class TestCase1Max(TestCase1Avg): + def init_pool_type(self): + self.pool_type = "max" + self.pool2D_forward_naive = max_pool2D_forward_naive + + +class TestCase2Max(TestCase2Avg): + def init_pool_type(self): + self.pool_type = "max" + self.pool2D_forward_naive = max_pool2D_forward_naive + + +def create_test_s8_u8_class(parent): + class TestS8Case(parent): + def init_data_type(self): + self.dtype = np.int8 + + class TestU8Case(parent): + def init_data_type(self): + self.dtype = np.uint8 + + cls_name_s8 = "{0}_{1}".format(parent.__name__, "mkldnn_s8") + cls_name_u8 = "{0}_{1}".format(parent.__name__, "mkldnn_u8") + TestS8Case.__name__ = cls_name_s8 + TestU8Case.__name__ = cls_name_u8 + globals()[cls_name_s8] = TestS8Case + globals()[cls_name_u8] = TestU8Case + + +create_test_s8_u8_class(TestPool2dMKLDNNInt8_Op) +create_test_s8_u8_class(TestCase1Avg) +create_test_s8_u8_class(TestCase2Avg) +create_test_s8_u8_class(TestCase0Max) +create_test_s8_u8_class(TestCase1Max) +create_test_s8_u8_class(TestCase2Max) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py index 19f29c7826..7de5fefc14 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py @@ -18,35 +18,22 @@ import unittest from test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5 -class TestMKLDNNCase1(TestPool2D_Op): - def init_kernel_type(self): - self.use_mkldnn = True - - -class TestMKLDNNCase2(TestCase1): - def init_kernel_type(self): - self.use_mkldnn = True - - -class TestMKLDNNCase3(TestCase2): - def init_kernel_type(self): - self.use_mkldnn = True - - -class 
TestMKLDNNCase4(TestCase3): - def init_kernel_type(self): - self.use_mkldnn = True - - -class TestMKLDNNCase5(TestCase4): - def init_kernel_type(self): - self.use_mkldnn = True - - -class TestMKLDNNCase6(TestCase5): - def init_kernel_type(self): - self.use_mkldnn = True - +def create_test_mkldnn_class(parent): + class TestMKLDNNCase(parent): + def init_kernel_type(self): + self.use_mkldnn = True + + cls_name = "{0}_{1}".format(parent.__name__, "MKLDNNOp") + TestMKLDNNCase.__name__ = cls_name + globals()[cls_name] = TestMKLDNNCase + + +create_test_mkldnn_class(TestPool2D_Op) +create_test_mkldnn_class(TestCase1) +create_test_mkldnn_class(TestCase2) +create_test_mkldnn_class(TestCase3) +create_test_mkldnn_class(TestCase4) +create_test_mkldnn_class(TestCase5) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index 5ccdf082e8..92515add59 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -115,7 +115,7 @@ class TestPool2D_Op(OpTest): self.op_type = "pool2d" self.use_cudnn = False self.use_mkldnn = False - self.dtype = np.float32 + self.init_data_type() self.init_test_case() self.init_global_pool() self.init_kernel_type() @@ -177,6 +177,9 @@ class TestPool2D_Op(OpTest): def init_kernel_type(self): pass + def init_data_type(self): + self.dtype = np.float32 + def init_pool_type(self): self.pool_type = "avg" self.pool2D_forward_naive = avg_pool2D_forward_naive From 94c80347b6a5ba4684a619585578b4940b512d7a Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 2 Jan 2019 15:03:22 +0800 Subject: [PATCH 105/197] update by comment --- .../framework/details/parallel_ssa_graph_executor.cc | 12 ++++++------ paddle/fluid/framework/parallel_executor.cc | 5 +++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index bb1f415128..128aaa33a2 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -34,7 +34,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( ? 
1UL : strategy_.num_threads_ / places_.size(); VLOG(1) << "set num_threads: " << strategy_.num_threads_ - << " to schedule operators on each device."; + << " to run the operators of the graph on each device."; for (size_t i = 0; i < places.size(); ++i) { executors_.emplace_back(new details::ThreadedSSAGraphExecutor( strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); @@ -45,10 +45,10 @@ FeedFetchList ParallelSSAGraphExecutor::Run( const std::vector &fetch_tensors) { std::vector> run_futures; - std::vector fetch_datas; + std::vector fetch_data; FeedFetchList ret; - fetch_datas.reserve(places_.size()); + fetch_data.reserve(places_.size()); ret.reserve(fetch_tensors.size()); exception_holder_.Clear(); @@ -65,7 +65,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run( if (pool_) { run_futures.emplace_back(pool_->enqueue(std::move(call))); } else { - fetch_datas.emplace_back(std::move(call())); + fetch_data.emplace_back(std::move(call())); } } @@ -74,7 +74,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run( if (exception_holder_.IsCaught()) { f.wait(); } else { - fetch_datas.emplace_back(std::move(f.get())); + fetch_data.emplace_back(std::move(f.get())); } } } @@ -86,7 +86,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run( std::vector lodtensor_ptrs; lodtensor_ptrs.reserve(local_scopes_.size()); for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) { - lodtensor_ptrs.push_back(&fetch_datas.at(scope_idx).at(fetch_idx)); + lodtensor_ptrs.push_back(&fetch_data.at(scope_idx).at(fetch_idx)); } ret.emplace_back(); ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace()); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 176c1db349..5a3f5e9e69 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -469,8 +469,9 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( bool ParallelExecutor::EnableParallelGraphExecution( const ProgramDesc &main_program, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy) const { - bool enable_parallel_graph = true; + if (!FLAGS_enable_parallel_graph) return false; + bool enable_parallel_graph = true; // TODO(Yancey1989): support sparse update in ParallelGraph mode. 
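As a reading aid for the Run() above: each place gets its own ThreadedSSAGraphExecutor, the per-device fetch lists are collected (through the thread pool when one is configured), and the tensors for each fetch target are merged across devices with MergeLoDTensor. A minimal Python sketch of that flow, assuming numpy arrays stand in for LoDTensors and np.concatenate for MergeLoDTensor; the executor callables and the helper name are hypothetical, and the exception_holder_ bookkeeping is omitted:

import numpy as np
from concurrent.futures import ThreadPoolExecutor

def run_parallel_graphs(device_executors, fetch_names):
    # One future per place, mirroring the pool_->enqueue(...) branch.
    with ThreadPoolExecutor(max_workers=len(device_executors)) as pool:
        futures = [pool.submit(exe, fetch_names) for exe in device_executors]
        # fetch_data[i] is the fetch list produced by the executor of place i.
        fetch_data = [f.result() for f in futures]

    ret = []
    for fetch_idx in range(len(fetch_names)):
        # Gather this fetch target from every local scope, then merge along
        # the batch dimension (MergeLoDTensor in the real code).
        per_device = [data[fetch_idx] for data in fetch_data]
        ret.append(np.concatenate(per_device, axis=0))
    return ret

# Two fake single-device executors, each producing a 2-sample "loss" batch.
fake_exe = lambda names: [np.ones((2, 1)) for _ in names]
print(run_parallel_graphs([fake_exe, fake_exe], ['loss']))

Note that merging happens per fetch index rather than per device, which is why the inner loop walks every local scope for a fixed fetch_idx.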
for (auto &var_desc : main_program.Block(0).AllVars()) { if (var_desc->GetType() == proto::VarType::SELECTED_ROWS) { @@ -492,7 +493,7 @@ bool ParallelExecutor::EnableParallelGraphExecution( if (build_strategy.enable_sequential_execution_ || exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental) enable_parallel_graph = false; - return enable_parallel_graph && FLAGS_enable_parallel_graph; + return enable_parallel_graph; } ParallelExecutor::~ParallelExecutor() { From d0a8a1e950f3b12b6a9bc03f559c2368111983de Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 2 Jan 2019 07:29:56 +0000 Subject: [PATCH 106/197] remove_op_handle_lock test=develop --- paddle/fluid/operators/math/blas_impl.cu.h | 73 ++++++++++++-------- paddle/fluid/platform/cuda_helper.h | 58 ++++++++++++++++ paddle/fluid/platform/device_context.cc | 27 ++------ paddle/fluid/platform/device_context.h | 31 ++++++--- paddle/fluid/platform/device_context_test.cu | 3 - 5 files changed, 128 insertions(+), 64 deletions(-) create mode 100644 paddle/fluid/platform/cuda_helper.h diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index a4fb1cdcd9..58f7be12ce 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -68,9 +68,11 @@ struct CUBlas { #if CUDA_VERSION >= 8000 VLOG(5) << "use_tensor_op_math: " << (dev_ctx->tensor_core_available() ? "True" : "False"); - PADDLE_ENFORCE(platform::dynload::cublasSgemmEx( - dev_ctx->possible_cublas_tensor_core_handle(), transa, transb, m, n, k, - alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc)); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE(platform::dynload::cublasSgemmEx( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc)); + }); #else PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); #endif @@ -171,10 +173,11 @@ struct CUBlas { << (use_tensor_op_math ? 
"True" : "False"); #endif // CUDA_VERSION >= 9000 - PADDLE_ENFORCE(platform::dynload::cublasGemmEx( - dev_ctx->possible_cublas_tensor_core_handle(), transa, transb, m, n, k, - alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, - algo)); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE(platform::dynload::cublasGemmEx( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, computeType, algo)); + }); #else PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); #endif @@ -204,9 +207,10 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, CUDA_R_32F, N); } else { #endif // CUDA_VERSION >= 8000 - - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, - &alpha, B, ldb, A, lda, &beta, C, N); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N); + }); #if CUDA_VERSION >= 8000 } @@ -247,9 +251,12 @@ inline void Blas::GEMM( CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, - N, M, K, &h_alpha, h_B, ldb, h_A, lda, - &h_beta, h_C, N); + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, + &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, + N); + }); #endif // CUDA_VERSION >= 8000 } @@ -273,8 +280,10 @@ void Blas::GEMM(bool transA, bool transB, int M, } else { #endif // CUDA_VERSION >= 8000 - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, - &alpha, B, ldb, A, lda, &beta, C, ldc); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc); + }); #if CUDA_VERSION >= 8000 } @@ -292,16 +301,19 @@ inline void Blas::GEMM( cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, - ldc); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, A, lda, &beta, C, ldc); + }); } template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { - CUBlas::AXPY(context_.cublas_handle(), n, &alpha, x, 1, y, 1); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); + }); } template <> @@ -311,8 +323,9 @@ void Blas::GEMV(bool trans_a, int M, int N, T beta, T *C) const { cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBlas::GEMV(context_.cublas_handle(), cuTransA, N, M, &alpha, A, N, B, 1, - &beta, C, 1); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); + }); } template <> @@ -342,16 +355,20 @@ void Blas::BatchedGEMM( VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : "False"); - PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx( - context_.possible_cublas_tensor_core_handle(), cuTransB, cuTransA, N, M, - K, &alpha, B, CUDA_R_32F, ldb, strideB, A, CUDA_R_32F, lda, strideA, - &beta, C, CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, algo)); + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx( + handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_32F, ldb, + strideB, A, CUDA_R_32F, lda, strideA, &beta, C, CUDA_R_32F, ldc, + strideC, batchCount, CUDA_R_32F, algo)); + }); } else { #endif // CUDA_VERSION >= 9010 - CUBlas::GEMM_STRIDED_BATCH(context_.cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, strideB, A, lda, - strideA, &beta, C, ldc, strideC, batchCount); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, strideB, A, lda, strideA, &beta, C, + ldc, strideC, batchCount); + }); #if CUDA_VERSION >= 9010 } diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h new file mode 100644 index 0000000000..122de72e15 --- /dev/null +++ b/paddle/fluid/platform/cuda_helper.h @@ -0,0 +1,58 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include <mutex> // NOLINT + +#include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/macros.h" + +#if CUDA_VERSION < 9000 +enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 }; +#endif + +namespace paddle { +namespace platform { + +class CublasHandleHolder { + public: + CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) { + PADDLE_ENFORCE(dynload::cublasCreate(&handle_)); + PADDLE_ENFORCE(dynload::cublasSetStream(handle_, stream)); +#if CUDA_VERSION >= 9000 + if (math_type == CUBLAS_TENSOR_OP_MATH) { + PADDLE_ENFORCE( + dynload::cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH)); + } +#endif + } + + ~CublasHandleHolder() { PADDLE_ENFORCE(dynload::cublasDestroy(handle_)); } + + template <typename Callback> + inline void Call(Callback &&callback) const { + std::lock_guard<std::mutex> guard(mtx_); + callback(handle_); + } + + private: + DISABLE_COPY_AND_ASSIGN(CublasHandleHolder); + + cublasHandle_t handle_; + mutable std::mutex mtx_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index e40928fe5d..be7f4949d6 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -245,17 +245,12 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_->Reinitialize(&stream_, place); eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); - PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); - PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); + cublas_handle_.reset(new CublasHandleHolder(stream_, CUBLAS_DEFAULT_MATH)); if (TensorCoreAvailable()) { #if CUDA_VERSION >= 9000 - cublas_tensor_core_handle_.reset(new cublasHandle_t()); - PADDLE_ENFORCE(dynload::cublasCreate(cublas_tensor_core_handle_.get())); - PADDLE_ENFORCE( - dynload::cublasSetStream(*cublas_tensor_core_handle_, stream_)); - PADDLE_ENFORCE(dynload::cublasSetMathMode(*cublas_tensor_core_handle_, - CUBLAS_TENSOR_OP_MATH)); + cublas_tensor_core_handle_.reset( + new CublasHandleHolder(stream_, CUBLAS_TENSOR_OP_MATH)); #endif } @@ -318,11 +313,8 @@ CUDADeviceContext::~CUDADeviceContext() { SetDeviceId(place_.device); Wait(); WaitStreamCallback(); - PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); - if (cublas_tensor_core_handle_) { - PADDLE_ENFORCE(dynload::cublasDestroy(*cublas_tensor_core_handle_)); - cublas_tensor_core_handle_.reset(); - } + cublas_handle_.reset(); + cublas_tensor_core_handle_.reset(); eigen_stream_.reset(); eigen_device_.reset(); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); @@ -351,15 +343,6 @@ Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { return eigen_device_.get(); } -cublasHandle_t CUDADeviceContext::cublas_handle() const { - return cublas_handle_; -} - -cublasHandle_t CUDADeviceContext::possible_cublas_tensor_core_handle() const { - return cublas_tensor_core_handle_ ? *cublas_tensor_core_handle_ - : cublas_handle_; -} - bool CUDADeviceContext::tensor_core_available() const { return cublas_tensor_core_handle_ != nullptr; } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 41b741a68f..c81d17380c 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -20,6 +20,7 @@ limitations under the License.
*/ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/temporary_allocator.h" #ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/gpu_info.h" @@ -229,15 +230,25 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return eigen device in the device context. */ Eigen::GpuDevice* eigen_device() const; - /*! \brief Return cublas handle in the device context. */ - cublasHandle_t cublas_handle() const; + /*! \brief Call cublas function safely. */ + template + inline void CublasCall(Callback&& callback) const { + cublas_handle_->Call(std::forward(callback)); + } /*! \brief Check whether tensor core is supported */ bool tensor_core_available() const; - /*! \brief Return cublas handle supporting Tensor Core. If Tensor Core is - * not supported, return the same handle as cublas_handle(). */ - cublasHandle_t possible_cublas_tensor_core_handle() const; + /*! \brief Call cublas function with Tensor Core safely. If + Tensor Core is not available, use DEFAULT_MATH instead. */ + template + inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const { + if (cublas_tensor_core_handle_) { + cublas_tensor_core_handle_->Call(std::forward(callback)); + } else { + cublas_handle_->Call(std::forward(callback)); + } + } /*! \brief Return cudnn handle in the device context. */ cudnnHandle_t cudnn_handle() const; @@ -256,7 +267,6 @@ class CUDADeviceContext : public DeviceContext { template void RecordEvent(cudaEvent_t ev, Callback callback) { - std::lock_guard guard(mtx_); callback(); PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); } @@ -275,8 +285,9 @@ class CUDADeviceContext : public DeviceContext { std::unique_ptr eigen_stream_; std::unique_ptr cudnn_holder_; cudaStream_t stream_; - cublasHandle_t cublas_handle_; - std::unique_ptr cublas_tensor_core_handle_; + + std::unique_ptr cublas_handle_; + std::unique_ptr cublas_tensor_core_handle_; int compute_capability_; int runtime_version_; @@ -284,12 +295,10 @@ class CUDADeviceContext : public DeviceContext { int multi_process_; int max_threads_per_mp_; - mutable std::mutex mtx_; - // StreamCallbackManager is thread-safe std::unique_ptr callback_manager_; - mutable std::mutex cublas_mtx_; + DISABLE_COPY_AND_ASSIGN(CUDADeviceContext); }; template <> diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index 171d2979a0..5b3aa98efb 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -43,9 +43,6 @@ TEST(Device, CUDADeviceContext) { ASSERT_NE(nullptr, gpu_device); cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); ASSERT_NE(nullptr, cudnn_handle); - cublasHandle_t cublas_handle = device_context->cublas_handle(); - ASSERT_NE(nullptr, cublas_handle); - ASSERT_NE(nullptr, device_context->stream()); delete device_context; } } From af615825432a1f5417b6b1065e0fab52e3afc120 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Wed, 2 Jan 2019 19:21:33 +0800 Subject: [PATCH 107/197] test=develop --- paddle/scripts/paddle_build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index d7ab36223c..57e059bcf9 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -918,11 +918,11 @@ function main() { cmake_gen ${PYTHON_ABI:-""} build 
assert_api_not_changed ${PYTHON_ABI:-""} - assert_api_spec_approvals run_test gen_capi_package gen_fluid_lib test_fluid_lib + assert_api_spec_approvals ;; assert_api) assert_api_not_changed ${PYTHON_ABI:-""} From db603398b780660c683a9aeac2e4e8b8374e0094 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 2 Jan 2019 19:43:50 +0800 Subject: [PATCH 108/197] disable parallel graph executor by default --- paddle/fluid/framework/parallel_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 5a3f5e9e69..450fe1508f 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -35,7 +35,7 @@ limitations under the License. */ DEFINE_string(pe_profile_fname, "", "Profiler filename for PE, which generated by gperftools." "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable."); -DEFINE_bool(enable_parallel_graph, true, +DEFINE_bool(enable_parallel_graph, false, "Force disable parallel graph execution mode if set false."); namespace paddle { From 449bf58ea6f23490b903c13dde31b1015090ed61 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 2 Jan 2019 19:48:23 +0800 Subject: [PATCH 109/197] disable parallelgraph mode by default test=develop --- paddle/fluid/pybind/pybind.cc | 2 +- python/paddle/fluid/__init__.py | 3 ++- python/paddle/fluid/parallel_executor.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c803864e61..3b81d59ad9 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -825,7 +825,7 @@ All parameter, weight, gradient are variables in Paddle. If :math:`num\_threads=1`, all the operators will execute one by one, but the order maybe difference between iterations. If it is not set, it will be set in ParallelExecutor according to the - device type and device count, for GPU, :math:`num\_threads=device\_count`, for CPU, + device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU, :math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor. if it is not set, ParallelExecutor will get the cpu count by calling `multiprocessing.cpu_count()`. Default 0.)DOC") diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 999065f8aa..a8643bc542 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -135,7 +135,8 @@ def __bootstrap__(): 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'fast_eager_deletion_mode', 'allocator_strategy', 'reader_queue_speed_test_mode', - 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir' + 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir', + 'enable_parallel_graph' ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 9709961286..c97a93ec36 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -117,7 +117,7 @@ class ParallelExecutor(object): if use_cuda: # Experiments on se-resnext shows that too many threads hurt # performance. Worth tunning for other models in the future. 
- exec_strategy.num_threads = len(self._places) + exec_strategy.num_threads = len(self._places) * 4 else: cpu_num = int( os.environ.get('CPU_NUM', multiprocessing.cpu_count())) From 4ad9de74ddf99cb35722dcec99690444a76b27af Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 3 Jan 2019 13:17:25 +0800 Subject: [PATCH 110/197] disable sync nccl by default test=develop --- paddle/fluid/framework/details/all_reduce_op_handle.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 6f8409d8fc..a24e3d3e48 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -19,10 +19,10 @@ #include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/platform/profiler.h" -// async nccl allreduce or sync issue: +// asynchronous nccl allreduce or synchronous issue: // https://github.com/PaddlePaddle/Paddle/issues/15049 DEFINE_bool( - sync_nccl_allreduce, true, + sync_nccl_allreduce, false, "If set true, will call `cudaStreamSynchronize(nccl_stream)`" "after allreduce, this mode can get better performance in some scenarios."); From 8bb513cad41ca10c1f69c5570fa03db308c0a0ea Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 3 Jan 2019 13:37:40 +0800 Subject: [PATCH 111/197] test=develop --- python/paddle/fluid/framework.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index c15d54a7f0..921a3ea183 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -28,6 +28,7 @@ from .. import compat as cpt from .proto import framework_pb2 try: if os.name == 'nt': + import sys third_lib_path = os.path.abspath(os.path.dirname( __file__)) + os.sep + '..' 
+ os.sep + 'libs' os.environ['path'] += ';' + third_lib_path From c981bf0f9dc7a70d807acc837d5ced65a6e33f44 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Thu, 3 Jan 2019 14:34:09 +0800 Subject: [PATCH 112/197] Fix compiling error with cuDNN v5 (#15148) test=develop --- paddle/fluid/operators/fused/CMakeLists.txt | 6 ++++-- paddle/fluid/operators/fused/fusion_conv_inception_op.cu | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 2bddba7db2..42ab8e9966 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -2,7 +2,9 @@ include(operators) register_operators(EXCLUDES fusion_transpose_flatten_concat_op fusion_conv_inception_op) if (WITH_GPU) op_library(fusion_transpose_flatten_concat_op) - op_library(fusion_conv_inception_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n") - file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_inception_fusion);\n") + if (NOT ${CUDNN_VERSION} VERSION_LESS 7100) + op_library(fusion_conv_inception_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_inception_fusion);\n") + endif() endif() diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index 3349b0b31e..6e13887866 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -21,7 +21,7 @@ DECLARE_uint64(conv_workspace_size_limit); namespace paddle { namespace operators { -#if CUDNN_VERSION >= 7001 +#if CUDNN_VERSION >= 7100 using Tensor = framework::Tensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; @@ -264,7 +264,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> { } // namespace operators } // namespace paddle -#if CUDNN_VERSION >= 7001 +#if CUDNN_VERSION >= 7100 namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(conv2d_inception_fusion, ops::CUDNNConvInceptionFusionOpKernel<float>, From 5e928e579a98cfa0badd3366c2a19a5f29c2d0ec Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 27 Dec 2018 18:57:22 +0800 Subject: [PATCH 113/197] try unify Executor and ParallelExecutor test=develop --- paddle/fluid/framework/parallel_executor.cc | 6 +- paddle/fluid/framework/parallel_executor.h | 3 +- paddle/fluid/pybind/pybind.cc | 3 +- python/paddle/fluid/compiler.py | 118 ++++++++++++++++++ python/paddle/fluid/executor.py | 104 +++++++++++++-- python/paddle/fluid/parallel_executor.py | 8 +- .../unittests/parallel_executor_test_base.py | 33 ++--- .../fluid/tests/unittests/test_dist_base.py | 23 ++-- 8 files changed, 248 insertions(+), 50 deletions(-) create mode 100644 python/paddle/fluid/compiler.py diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 450fe1508f..5c8776b62f 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -193,8 +193,7 @@ ParallelExecutor::ParallelExecutor( const std::unordered_set<std::string> &bcast_vars, const ProgramDesc &main_program, const std::string &loss_var_name, Scope *scope, const std::vector<Scope *> &local_scopes, - const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, - size_t num_trainers, size_t trainer_id) + const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy) : member_(new
ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; member_->use_cuda_ = exec_strategy.use_cuda_; @@ -253,7 +252,8 @@ ParallelExecutor::ParallelExecutor( } member_->nccl_ctxs_.reset(new platform::NCCLContextMap( - member_->places_, nccl_id, num_trainers, trainer_id)); + member_->places_, nccl_id, build_strategy.num_trainers_, + build_strategy.trainer_id_)); #else PADDLE_THROW("Not compiled with CUDA"); #endif diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 49d3f0d3f6..121bbd55ad 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -50,8 +50,7 @@ class ParallelExecutor { const std::string &loss_var_name, Scope *scope, const std::vector &local_scopes, const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy, - size_t num_trainers = 1, size_t trainer_id = 0); + const BuildStrategy &build_strategy); ~ParallelExecutor(); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3b81d59ad9..2d817bcb0d 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1022,8 +1022,7 @@ All parameter, weight, gradient are variables in Paddle. pe.def(py::init &, const std::unordered_set &, const ProgramDesc &, const std::string &, Scope *, std::vector &, - const ExecutionStrategy &, const BuildStrategy &, size_t, - size_t>()) + const ExecutionStrategy &, const BuildStrategy &>()) // NOTE: even we return a vec* to Python use reference policy. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. We can only return Scope* diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py new file mode 100644 index 0000000000..63331f5708 --- /dev/null +++ b/python/paddle/fluid/compiler.py @@ -0,0 +1,118 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import multiprocessing +import os +import six +from .. import compat as cpt + +from . import core + +ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy +BuildStrategy = core.ParallelExecutor.BuildStrategy + + +def _place_obj(place): + p = core.Place() + p.set_place(place) + return p + + +class _ProgramCompiler(object): + def __init__(self, program): + self._program = program + self._compiled = False + self._is_data_parallel = False + + def _with_data_parallel(self, + loss_name=None, + build_strategy=None, + exec_strategy=None): + assert not self._is_data_parallel, "Already compiled with parallel." 
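The unit-test refactors later in this patch show the intended call pattern for the CompiledProgram introduced here: compile the program once, then hand the compiled binary to the ordinary Executor.run(). Condensed into a self-contained sketch against the fluid 1.x API as it stands in this series, with a CPU place and CPU_NUM of 2 chosen purely for illustration:

import os
os.environ['CPU_NUM'] = '2'  # two simulated devices on CPU, as in the tests

import numpy as np
import paddle.fluid as fluid
from paddle.fluid import compiler

main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
    x = fluid.layers.data(name='x', shape=[4], dtype='float32')
    loss = fluid.layers.mean(fluid.layers.fc(input=x, size=2))
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup)

# Compile once; _with_data_parallel replicates the graph over all places.
binary = compiler.CompiledProgram(main)._with_data_parallel(
    loss_name=loss.name)

feed = {'x': np.random.random((8, 4)).astype('float32')}
loss_val, = exe.run(binary, feed=feed, fetch_list=[loss.name])
print(loss_val)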
+ self._is_data_parallel = True + self._build_strategy = build_strategy + self._exec_strategy = exec_strategy + self._loss_name = loss_name + return self + + def _compile_data_parallel(self): + self._places = [] + self._local_scopes = [] + + if self._exec_strategy is None: + self._exec_strategy = ExecutionStrategy() + if self._build_strategy is None: + self._build_strategy = BuildStrategy() + + self._exec_strategy.use_cuda = isinstance(self._place, core.CUDAPlace) + if self._exec_strategy.use_cuda: + gpus_env = os.getenv("FLAGS_selected_gpus") + if gpus_env: + gpus = [int(s) for s in gpus_env.split(",")] + else: + gpus = [ + i for i in six.moves.range(core.get_cuda_device_count()) + ] + self._places = [core.CUDAPlace(i) for i in gpus] + else: + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)] + assert self._places, "no place for execution" + + if self._exec_strategy.num_threads == 0: + if self._exec_strategy.use_cuda: + # Experiments on se-resnext shows that too many threads hurt + # performance. Worth tunning for other models in the future. + self._exec_strategy.num_threads = len(self._places) * 4 + else: + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + self._exec_strategy.num_threads = cpu_num * 2 + + trainers_endpoints = self._program._trainers_endpoints + if self._build_strategy.num_trainers > 1 and trainers_endpoints: + assert self._build_strategy.num_trainers == len( + trainers_endpoints), "num_trainers == len(end_points)" + self._build_strategy.trainers_endpoints = trainers_endpoints + + self._persistable_vars = set([ + cpt.to_text(v.name) + for v in [ + var for var in self._program.list_vars() + if var.persistable and var.type != core.VarDesc.VarType.RAW + ] + ]) + + places = list(map(_place_obj, self._places)) + return core.ParallelExecutor( + places, self._persistable_vars, self._program.desc, + cpt.to_text(self._loss_name) + if self._loss_name else six.u(''), self._scope, self._local_scopes, + self._exec_strategy, self._build_strategy) + + def _compile(self, scope, place): + if self._compiled: + return self + self._compiled = True + + self._scope = scope + self._place = place + + if self._is_data_parallel: + self._executor = self._compile_data_parallel() + else: + p = _place_obj(self._place) + self._executor = core.Executor(p) + return self diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 5a9e908b61..ee7df74007 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -14,11 +14,15 @@ from __future__ import print_function +import os +import multiprocessing import numpy as np import contextlib import six from .framework import Program, default_main_program, Variable from . import core +from . import compiler +from .. import compat as cpt __all__ = ['Executor', 'global_scope', 'scope_guard'] @@ -275,11 +279,8 @@ class Executor(object): def __init__(self, place): self.place = place - p = core.Place() - p.set_place(place) - self.executor = core.Executor(p) - self.program_caches = dict() + self.executor = None self._closed = False def _get_program_cache(self, program_cache_key): @@ -361,6 +362,7 @@ class Executor(object): You can no long use this executor after calling this method. For the distributed training, this method would free the resource on PServers related to the current Trainer. + TODO(panyx0718): Why ParallelExecutor doesn't have close? 
Example: >>> cpu = core.CPUPlace() @@ -368,10 +370,58 @@ class Executor(object): >>> ... >>> exe.close() """ - if not self._closed: + if not self._closed and self.executor: self.executor.close() self._closed = True + def _run_parallel(self, + exe, + scope, + feed=None, + fetch_list=None, + return_numpy=True): + if isinstance(feed, dict): + feed_tensor_dict = dict() + for feed_name in feed: + feed_tensor = feed[feed_name] + if not isinstance(feed_tensor, core.LoDTensor): + feed_tensor = core.LoDTensor() + # always set to CPU place, since the tensor need to be splitted + # it is fast in CPU + feed_tensor.set(feed[feed_name], core.CPUPlace()) + feed_tensor_dict[feed_name] = feed_tensor + + exe.feed_and_split_tensor_into_local_scopes(feed_tensor_dict) + elif isinstance(feed, list) or isinstance(feed, tuple): + if len(feed) != len(self._places): + raise ValueError( + "Feed a list of tensor, the list should be the same size as places" + ) + + res = list() + for i, each in enumerate(feed): + if not isinstance(each, dict): + raise TypeError( + "Each element of feed list should be a dict") + res_dict = dict() + for feed_name in each: + tensor = each[feed_name] + if not isinstance(tensor, core.LoDTensor): + tmp = core.LoDTensor() + tmp.set(tensor, self._places[i]) + tensor = tmp + res_dict[feed_name] = tensor + res.append(res_dict) + exe.feed_tensors_into_local_scopes(res) + + fetch_var_name = '@FETCHED_VAR_NAME@' + exe.run(fetch_list, fetch_var_name) + arr = scope.find_var(fetch_var_name).get_lod_tensor_array() + + if return_numpy: + return as_numpy(arr) + return [arr[i] for i in range(len(arr))] + def run(self, program=None, feed=None, @@ -428,6 +478,47 @@ class Executor(object): if self._closed: raise RuntimeError("Attempted to use a closed Executor") + if scope is None: + scope = global_scope() + + compiled = isinstance(program, compiler._ProgramCompiler) + if not compiled: + p = core.Place() + p.set_place(self.place) + self.executor = core.Executor(p) + return self._run( + program, + feed=feed, + fetch_list=fetch_list, + feed_var_name=feed_var_name, + fetch_var_name=fetch_var_name, + scope=scope, + return_numpy=return_numpy, + use_program_cache=use_program_cache) + + program._compile(scope, self.place) + self.executor = program._executor + if program._is_data_parallel: + return self._run_parallel( + exe=program._executor, + scope=scope, + feed=feed, + fetch_list=fetch_list, + return_numpy=return_numpy) + else: + return self._run( + program._program, + feed=feed, + fetch_list=fetch_list, + feed_var_name=feed_var_name, + fetch_var_name=fetch_var_name, + scope=scope, + return_numpy=return_numpy, + use_program_cache=use_program_cache) + + def _run(self, program, feed, fetch_list, feed_var_name, fetch_var_name, + scope, return_numpy, use_program_cache): + if feed is None: feed = {} if not isinstance(feed, dict): @@ -444,9 +535,6 @@ class Executor(object): "Executor requires Program as its Parameter. 
But you passed in %s" % (type(program))) - if scope is None: - scope = global_scope() - cache_key = _get_program_cache_key(feed, fetch_list) if use_program_cache: cached_program = self._get_program_cache(cache_key) diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index c97a93ec36..917db02bb8 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -167,9 +167,8 @@ class ParallelExecutor(object): # step7: init ParallelExecutor self.executor = core.ParallelExecutor( places, persistable_vars, main.desc, - cpt.to_text(loss_name) - if loss_name else six.u(''), scope, local_scopes, exec_strategy, - build_strategy, num_trainers, trainer_id) + cpt.to_text(loss_name) if loss_name else six.u(''), scope, + local_scopes, exec_strategy, build_strategy) self.scope = scope @@ -292,3 +291,6 @@ class ParallelExecutor(object): @property def device_count(self): return len(self._places) + + def close(self): + pass diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 2b0ab0cc3b..2038b57a6c 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -19,6 +19,7 @@ import os import unittest import paddle.fluid as fluid import paddle.fluid.core as core +from paddle.fluid import compiler import time import numpy as np import math @@ -44,15 +45,8 @@ class TestParallelExecutorBase(unittest.TestCase): optimizer=fluid.optimizer.Adam, use_fast_executor=False, enable_sequential_execution=False): - def run_executor(exe, feed, fetch_list, program=None): - if isinstance(exe, fluid.ParallelExecutor): - res = exe.run(fetch_list=fetch_list, feed=feed) - elif isinstance(exe, fluid.Executor): - if program is None: - program = fluid.default_main_program() - res = exe.run(program=program, feed=feed, fetch_list=fetch_list) - else: - raise ValueError('Unkown type exe') + def run_executor(exe, binary, feed, fetch_list): + res = exe.run(binary, feed=feed, fetch_list=fetch_list) return res main = fluid.Program() @@ -72,8 +66,8 @@ class TestParallelExecutorBase(unittest.TestCase): fluid.memory_optimize(main) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - startup_exe = fluid.Executor(place) - startup_exe.run(startup) + exe = fluid.Executor(place) + exe.run(startup) exec_strategy = fluid.ExecutionStrategy() exec_strategy.allow_op_delay = allow_op_delay if use_fast_executor: @@ -86,15 +80,13 @@ class TestParallelExecutorBase(unittest.TestCase): build_strategy.enable_sequential_execution = enable_sequential_execution if use_cuda and core.is_compiled_with_cuda(): build_strategy.remove_unnecessary_lock = True - if use_parallel_executor: - exe = fluid.ParallelExecutor( - use_cuda, + binary = compiler._ProgramCompiler(main)._with_data_parallel( loss_name=loss.name, - exec_strategy=exec_strategy, - build_strategy=build_strategy) + build_strategy=build_strategy, + exec_strategy=exec_strategy) else: - exe = fluid.Executor(place=place) + binary = compiler._ProgramCompiler(main) if batch_size is not None: batch_size *= fluid.core.get_cuda_device_count( @@ -102,13 +94,14 @@ class TestParallelExecutorBase(unittest.TestCase): os.environ.get('CPU_NUM', multiprocessing.cpu_count())) begin = time.time() first_loss, = run_executor( - exe=exe, feed=feed_dict, fetch_list=[loss.name]) + exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name]) for i 
in range(iter): - run_executor(exe=exe, feed=feed_dict, fetch_list=[]) + run_executor( + exe=exe, binary=binary, feed=feed_dict, fetch_list=[]) last_loss, = run_executor( - exe=exe, feed=feed_dict, fetch_list=[loss.name]) + exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name]) end = time.time() if batch_size is not None: diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 0caab08f0d..5cc5d9f3d3 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -26,6 +26,7 @@ import pickle import numpy as np import paddle.fluid as fluid +from paddle.fluid import compiler RUN_STEP = 10 DEFAULT_BATCH_SIZE = 2 @@ -104,8 +105,8 @@ class TestDistRunnerBase(object): else: place = fluid.CPUPlace() - startup_exe = fluid.Executor(place) - startup_exe.run(fluid.default_startup_program()) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) strategy = fluid.ExecutionStrategy() strategy.num_threads = 1 @@ -125,19 +126,16 @@ class TestDistRunnerBase(object): mypass.set_int("num_repeats", args.batch_merge_repeat) if args.update_method == "nccl2": - num_trainers = len(args.endpoints.split(",")) - trainer_id = args.trainer_id + build_stra.num_trainers = len(args.endpoints.split(",")) + build_stra.trainer_id = args.trainer_id else: - num_trainers = 1 - trainer_id = 0 + build_stra.num_trainers = 1 + build_stra.trainer_id = 0 - exe = fluid.ParallelExecutor( - args.use_cuda, + binary = compiler._ProgramCompiler(trainer_prog)._with_data_parallel( loss_name=avg_cost.name, - exec_strategy=strategy, build_strategy=build_stra, - num_trainers=num_trainers, - trainer_id=trainer_id) + exec_strategy=strategy) feed_var_list = [ var for var in trainer_prog.global_block().vars.values() @@ -160,7 +158,8 @@ class TestDistRunnerBase(object): out_losses = [] for _ in six.moves.xrange(RUN_STEP): - loss, = exe.run(fetch_list=[avg_cost.name], + loss, = exe.run(binary, + fetch_list=[avg_cost.name], feed=feeder.feed(get_data())) out_losses.append(loss[0]) if six.PY2: From beaae61a163412826776088d9974775470bcfd27 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 4 Jan 2019 10:41:35 +0800 Subject: [PATCH 114/197] polish test=develop --- python/paddle/fluid/compiler.py | 38 ++++++++++++++++--- python/paddle/fluid/executor.py | 10 +++-- .../unittests/parallel_executor_test_base.py | 4 +- .../fluid/tests/unittests/test_dist_base.py | 2 +- ...test_parallel_executor_test_while_train.py | 29 +++++++------- 5 files changed, 56 insertions(+), 27 deletions(-) diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index 63331f5708..e5b1ab351e 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -15,6 +15,7 @@ import multiprocessing import os import six +import sys from .. import compat as cpt from . import core @@ -29,27 +30,50 @@ def _place_obj(place): return p -class _ProgramCompiler(object): +class CompiledProgram(object): def __init__(self, program): self._program = program + self._scope = None + self._place = None + self._executor = None self._compiled = False self._is_data_parallel = False def _with_data_parallel(self, loss_name=None, build_strategy=None, - exec_strategy=None): + exec_strategy=None, + share_vars_from=None): assert not self._is_data_parallel, "Already compiled with parallel." 
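For reference, the _run_parallel feed handling above accepts two shapes of feed. A short numpy sketch of the two forms, with illustrative names only (it builds the feeds but does not run anything):

import numpy as np

num_places = 2
batch = np.random.random((8, 4)).astype('float32')

# Form 1: a single dict; feed_and_split_tensor_into_local_scopes splits the
# batch dimension across all places (tensors are staged on CPUPlace first,
# since the split is fast on CPU).
whole_batch_feed = {'x': batch}

# Form 2: a list with exactly one dict per place, passed through
# feed_tensors_into_local_scopes; a length mismatch raises ValueError and a
# non-dict element raises TypeError.
per_place_feed = [{'x': batch[:4]}, {'x': batch[4:]}]
assert len(per_place_feed) == num_places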
self._is_data_parallel = True self._build_strategy = build_strategy self._exec_strategy = exec_strategy self._loss_name = loss_name + self._share_vars_from = share_vars_from return self + def _with_distributed(self): + raise NotImplementedError() + + def _with_inference_optimize(self): + raise NotImplementedError() + def _compile_data_parallel(self): - self._places = [] - self._local_scopes = [] + if self._share_vars_from: + if self._scope: + sys.stderr.write("share_vars_from is set, scope is ignored.\n") + if not self._share_vars_from._is_data_parallel: + raise ValueError("share_vars_from is not data parallel. Cannot " + "share vars from it.") + if self._share_vars_from._executor is None: + raise ValueError( + "share_vars_from is not compiled and run, so there is no " + "var to share.") + self._local_scopes = self._share_vars_from._executor.local_scopes() + else: + self._local_scopes = [] + self._places = [] if self._exec_strategy is None: self._exec_strategy = ExecutionStrategy() if self._build_strategy is None: @@ -104,12 +128,14 @@ class _ProgramCompiler(object): def _compile(self, scope, place): if self._compiled: + if scope and self._scope != scope: + raise ValueError("Cannot compile with different scope") + if place and self._place != place: + raise ValueError("Cannot compile with different place") return self - self._compiled = True self._scope = scope self._place = place - if self._is_data_parallel: self._executor = self._compile_data_parallel() else: diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index ee7df74007..7c417cd828 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -481,11 +481,13 @@ class Executor(object): if scope is None: scope = global_scope() - compiled = isinstance(program, compiler._ProgramCompiler) + compiled = isinstance(program, compiler.CompiledProgram) + # For backward compatibility, run directly. 
if not compiled: - p = core.Place() - p.set_place(self.place) - self.executor = core.Executor(p) + if not self.executor: + p = core.Place() + p.set_place(self.place) + self.executor = core.Executor(p) return self._run( program, feed=feed, diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 2038b57a6c..784fe64c4e 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -81,12 +81,12 @@ class TestParallelExecutorBase(unittest.TestCase): if use_cuda and core.is_compiled_with_cuda(): build_strategy.remove_unnecessary_lock = True if use_parallel_executor: - binary = compiler._ProgramCompiler(main)._with_data_parallel( + binary = compiler.CompiledProgram(main)._with_data_parallel( loss_name=loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy) else: - binary = compiler._ProgramCompiler(main) + binary = compiler.CompiledProgram(main) if batch_size is not None: batch_size *= fluid.core.get_cuda_device_count( diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 5cc5d9f3d3..aacf52e011 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -132,7 +132,7 @@ class TestDistRunnerBase(object): build_stra.num_trainers = 1 build_stra.trainer_id = 0 - binary = compiler._ProgramCompiler(trainer_prog)._with_data_parallel( + binary = compiler.CompiledProgram(trainer_prog)._with_data_parallel( loss_name=avg_cost.name, build_strategy=build_stra, exec_strategy=strategy) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py index db2826653e..3cc954a77a 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle.fluid as fluid +from paddle.fluid import compiler import paddle.fluid.core as core import numpy as np import unittest @@ -61,22 +62,22 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): exe.run(startup) feed_dict = {'image': image, 'label': label} - train_exe = fluid.ParallelExecutor( - use_cuda=use_cuda, - loss_name=loss.name, - main_program=main, - build_strategy=build_strategy) - - test_exe = fluid.ParallelExecutor( - use_cuda=use_cuda, - main_program=test_program, - share_vars_from=train_exe, - build_strategy=build_strategy) + train_cp = compiler.CompiledProgram(main)._with_data_parallel( + loss_name=loss.name, build_strategy=build_strategy) + test_cp = compiler.CompiledProgram( + test_program)._with_data_parallel( + loss_name=loss.name, + build_strategy=build_strategy, + share_vars_from=train_cp) for i in range(5): - test_loss, = test_exe.run([loss.name], feed=feed_dict) - - train_loss, = train_exe.run([loss.name], feed=feed_dict) + exe.run(train_cp, feed=feed_dict, fetch_list=[loss.name]) + test_loss, = exe.run(test_cp, + feed=feed_dict, + fetch_list=[loss.name]) + train_loss, = exe.run(train_cp, + feed=feed_dict, + fetch_list=[loss.name]) avg_test_loss_val = np.array(test_loss).mean() if math.isnan(float(avg_test_loss_val)): From 8ae9094e0759db04bfd80cbda0ead703c053ebdf Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 4 
Jan 2019 11:32:34 +0800 Subject: [PATCH 115/197] polish and resolve conflicts test=develop --- paddle/fluid/framework/parallel_executor.cc | 2 +- python/paddle/fluid/executor.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 5c8776b62f..f61c9e3a91 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -200,7 +200,7 @@ ParallelExecutor::ParallelExecutor( member_->build_strategy_ = build_strategy; member_->use_all_reduce_ = build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; - member_->nranks_ = num_trainers * places.size(); + member_->nranks_ = build_strategy.num_trainers_ * places.size(); if (!member_->use_all_reduce_) { PADDLE_ENFORCE(places.size() > 1, diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 7c417cd828..4003e988f2 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -375,7 +375,6 @@ class Executor(object): self._closed = True def _run_parallel(self, - exe, scope, feed=None, fetch_list=None, @@ -391,7 +390,8 @@ class Executor(object): feed_tensor.set(feed[feed_name], core.CPUPlace()) feed_tensor_dict[feed_name] = feed_tensor - exe.feed_and_split_tensor_into_local_scopes(feed_tensor_dict) + self.executor.feed_and_split_tensor_into_local_scopes( + feed_tensor_dict) elif isinstance(feed, list) or isinstance(feed, tuple): if len(feed) != len(self._places): raise ValueError( @@ -412,10 +412,10 @@ class Executor(object): tensor = tmp res_dict[feed_name] = tensor res.append(res_dict) - exe.feed_tensors_into_local_scopes(res) + self.executor.feed_tensors_into_local_scopes(res) fetch_var_name = '@FETCHED_VAR_NAME@' - exe.run(fetch_list, fetch_var_name) + self.executor.run(fetch_list, fetch_var_name) arr = scope.find_var(fetch_var_name).get_lod_tensor_array() if return_numpy: @@ -502,12 +502,13 @@ class Executor(object): self.executor = program._executor if program._is_data_parallel: return self._run_parallel( - exe=program._executor, scope=scope, feed=feed, fetch_list=fetch_list, return_numpy=return_numpy) else: + # TODO(panyx0718): Can compile program to optimize executor + # performance. 
return self._run( program._program, feed=feed, From bbc9336878f73026ece222f2b9d85740408852f1 Mon Sep 17 00:00:00 2001 From: xiaolil1 <39753926+xiaolil1@users.noreply.github.com> Date: Fri, 4 Jan 2019 11:34:57 +0800 Subject: [PATCH 116/197] Enable basic MKL-DNN INT8 Conv OP (#15124) * Enable basic MKL-DNN INT8 Conv OP test=develop * Modify test case test=develop * Clean unittest code test=develop * Fix test test=develop * Modify test test=develop * Modify basic INT8 Conv test=develop --- paddle/fluid/operators/conv_mkldnn_op.cc | 340 +++++++++++++++++- paddle/fluid/operators/conv_op.cc | 33 +- paddle/fluid/operators/conv_op.h | 1 + paddle/fluid/platform/mkldnn_reuse.h | 110 +++++- .../tests/unittests/test_conv2d_fusion_op.py | 5 +- .../unittests/test_conv2d_int8_mkldnn_op.py | 228 ++++++++++++ .../fluid/tests/unittests/test_conv2d_op.py | 7 +- 7 files changed, 696 insertions(+), 28 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 8c116c4abf..0f2bb8c65c 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -12,6 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/conv_op.h" @@ -68,13 +69,22 @@ inline mkldnn::memory::format GetWeightsFormat(mkldnn::memory::format format, } } -template +template class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); + bool is_INT8 = + std::is_same::value || std::is_same::value; + if (!is_INT8) { + ComputeFP32(ctx); + } else { + ComputeINT8(ctx); + } + } + void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const { const bool is_test = ctx.Attr("is_test"); auto& dev_ctx = @@ -274,6 +284,257 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { output->set_layout(DataLayout::kMKLDNN); output->set_format(GetMKLDNNFormat(*dst_memory_p)); } + void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const { + const bool is_test = ctx.Attr("is_test"); + + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; + auto* output = ctx.Output("Output"); + + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); + PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5, + "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW"); + PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5, + "Filter must be with 4 or 5 dimensions, i.e. 
OIHW or OIDHW"); + if (bias) { + PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN && + bias->format() != memory::format::format_undef, + "Wrong layout/format set for Bias tensor"); + PADDLE_ENFORCE(bias->dims().size() == 1, + "Bias must only have 1 dimension, i.e. X"); + } + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + + bool force_fp32_output = ctx.Attr("force_fp32_output"); + + bool is_conv3d = strides.size() == 3U; + // TODO(tpatejko): add support for dilation + PADDLE_ENFORCE( + is_conv3d + ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 && + dilations[2] == 1 + : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, + "dilation in convolution is not implemented yet"); + PADDLE_ENFORCE(is_conv3d != true, "int8 does not support conv3d currently"); + + const T* input_data = input->data(); + + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector weights_tz = + paddle::framework::vectorize2int(filter->dims()); + int g = std::max(groups, 1); + GetWeightsTz(weights_tz, g, is_conv3d); + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + + // Get unique name for storing MKLDNN primitives + std::string key; + key.reserve(MaxKeyLength); + mkldnn::memory::data_type src_dt = + paddle::framework::ToMKLDNNDataType(input->type()); + platform::ConvMKLDNNHandler::AppendKey( + &key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt, + input->format(), ctx.op().Output("Output")); + + const std::string key_conv_pd = key + "@conv_pd"; + + std::shared_ptr conv_p = nullptr; + std::shared_ptr src_memory_p = nullptr; + std::shared_ptr user_src_memory_p = nullptr; + std::shared_ptr dst_memory_p = nullptr; + std::vector pipeline; + std::shared_ptr conv_pd = + nullptr; + std::shared_ptr handler = nullptr; + + auto prim_key = key + "@conv_p"; + auto dst_key = key + "@dst_mem_p"; + auto src_key = key + "@src_mem_p"; + auto user_src_key = key + "@user_src_mem_p"; + auto src_reorder_key = key + "@src_mem_preorder_p"; + conv_p = std::static_pointer_cast( + dev_ctx.GetBlob(prim_key)); + if (conv_p == nullptr || !is_test) { + const K* filter_data = filter->data(); + auto scale_in_data = ctx.Attr("Scale_in"); + auto scale_weights_data = ctx.Attr>("Scale_weights"); + auto scale_out_data = + force_fp32_output ? 1.0f : ctx.Attr("Scale_out"); + + bool is_multi_channel = scale_weights_data.size() > 1; + + int count = is_multi_channel ? (g > 1 ? (weights_tz)[1] * (weights_tz)[0] + : (weights_tz)[0]) + : 1; + std::vector output_shift_scale(count); +#pragma omp parallel for if (count > 1) + for (int i = 0; i < count; i++) { + if (scale_weights_data[i] == 0.0) + output_shift_scale[i] = + scale_out_data; // weights data will contain 0 + // in some models, then weights + // scale couldn't be calculated + else + output_shift_scale[i] = + scale_out_data / (scale_in_data * scale_weights_data[i]); + } + + auto user_src_md = + platform::MKLDNNMemDesc({src_tz}, src_dt, input->format()); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), + ((g) == 1) ? 
mkldnn::memory::format::oihw + : mkldnn::memory::format::goihw); + + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + std::string data_format = ctx.Attr("data_format"); + auto chosen_memory_format = + platform::data_format_to_memory_format(data_format); + + std::vector bias_tz; + + auto src_md = + platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, memory::data_type::s8, chosen_memory_format); + + auto dst_dt = force_fp32_output + ? paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType) + : paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType); + + auto dst_md = + platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format); + // create a conv primitive descriptor and save it for usage in backward + if (bias) { + bias_tz = paddle::framework::vectorize2int(bias->dims()); + auto bias_md = platform::MKLDNNMemDesc(bias_tz, memory::data_type::s32, + memory::format::x); + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, + strides, paddings, mkldnn_engine, + output_shift_scale, is_test); + } else { + conv_pd = + ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, + mkldnn_engine, output_shift_scale, is_test); + } + // Save conv_pd/src_memory/weights_memory for backward pass + dev_ctx.SetBlob(key_conv_pd, conv_pd); + + handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, + mkldnn_engine, key)); + + // create mkldnn memory from input tensors (data/weights) + user_src_memory_p = + handler->AcquireSrcMemory(user_src_md, to_void_cast(input_data)); + auto user_weights_memory_p = handler->AcquireWeightsMemory( + user_weights_md, to_void_cast(filter_data)); + + // create reorder primitive if the input format is not the preferred one + src_memory_p = + handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); + + std::shared_ptr weights_memory_p; + int mask_reorder = + is_multi_channel ? ((g != 1) ? (1 << 1) + (1 << 0) : 1 << 0) : 0; + weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive( + user_weights_memory_p, pipeline, is_test, true, scale_weights_data, + mask_reorder); + + if (!force_fp32_output) { + dst_memory_p = platform::SetDstMemory(ctx, output, handler); + } else { + dst_memory_p = platform::SetDstMemory(ctx, output, handler); + } + + // create convolution op primitive + auto scale_bias_key = key + "@scale_bias"; + if (bias) { + const float* bias_data = bias->data(); + auto user_bias_md = platform::MKLDNNMemDesc( + {bias_tz}, platform::MKLDNNGetDataType(), memory::format::x); + auto user_bias_memory_p = handler->AcquireBiasMemory( + user_bias_md, to_void_cast(bias_data)); + std::shared_ptr bias_memory_p; + int mask_reorder = is_multi_channel ? 1 << 0 : 1; + int count = + is_multi_channel + ? (g > 1 ? 
(weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) + : 1; + std::vector scale_bias_data(count); +#pragma omp parallel for if (count > 1) + for (int i = 0; i < count; i++) { + scale_bias_data[i] = scale_in_data * scale_weights_data[i]; + } + bias_memory_p = handler->AcquireBiasMemoryFromPrimitive( + user_bias_memory_p, pipeline, is_test, true, scale_bias_data, + mask_reorder); + conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, + bias_memory_p, dst_memory_p); + } else { + conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, + dst_memory_p); + } + + // push primitive to stream and wait until it's executed + pipeline.push_back(*conv_p); + } else { + auto src_memory_reorder_p = std::static_pointer_cast( + dev_ctx.GetBlob(src_reorder_key)); + src_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(src_key)); + if (src_memory_reorder_p) { + user_src_memory_p = std::static_pointer_cast( + dev_ctx.GetBlob(user_src_key)); + user_src_memory_p->set_data_handle(to_void_cast(input_data)); + } else if (src_memory_p) { + src_memory_p->set_data_handle(to_void_cast(input_data)); + } + + dst_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(dst_key)); + conv_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_conv_pd)); + if (conv_pd) { + handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, + mkldnn_engine, key)); + } + if (!force_fp32_output) { + dst_memory_p = + platform::SetDstMemoryHandler(ctx, output, handler); + } else { + dst_memory_p = + platform::SetDstMemoryHandler(ctx, output, handler); + } + if (src_memory_reorder_p) { + pipeline.push_back(*src_memory_reorder_p); + } + pipeline.push_back(*conv_p); + } + // push primitive to stream and wait until it's executed + stream(stream::kind::eager).submit(pipeline).wait(); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetMKLDNNFormat(*dst_memory_p)); + } private: mkldnn::primitive_attr CreatePostOps(bool fuse_relu, @@ -301,6 +562,16 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { return conv_attr; } + mkldnn::primitive_attr CreatePostOps( + const std::vector output_shift_scale) const { + mkldnn::primitive_attr conv_attr; + mkldnn::post_ops post_operations; + int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0; + conv_attr.set_output_scales(mask, output_shift_scale); + conv_attr.set_post_ops(post_operations); + return conv_attr; + } + std::unique_ptr ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, const memory::desc& dst, const std::vector& strides, @@ -325,6 +596,32 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { p_conv_pd); } + std::unique_ptr + ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, + const memory::desc& dst, const std::vector& strides, + const std::vector& paddings, + const mkldnn::engine& engine, + const std::vector output_shift_scale, + bool is_test) const { + memory::dims stride_dims = {strides[0], strides[1]}; + memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto propagation = is_test ? 
mkldnn::prop_kind::forward_scoring + : mkldnn::prop_kind::forward_training; + + auto conv_desc = mkldnn::convolution_forward::desc( + propagation, mkldnn::convolution_direct, src, weights, dst, stride_dims, + padding_dims, padding_dims, mkldnn::padding_kind::zero); + + mkldnn::primitive_attr conv_attr = CreatePostOps(output_shift_scale); + + auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( + conv_desc, conv_attr, engine); + + return std::unique_ptr( + p_conv_pd); + } + std::unique_ptr ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, const memory::desc& bias, const memory::desc& dst, @@ -349,6 +646,33 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { return std::unique_ptr( p_conv_pd); } + + std::unique_ptr + ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, + const memory::desc& bias, const memory::desc& dst, + const std::vector& strides, + const std::vector& paddings, + const mkldnn::engine& engine, + const std::vector output_shift_scale, + bool is_test) const { + memory::dims stride_dims = {strides[0], strides[1]}; + memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto propagation = is_test ? mkldnn::prop_kind::forward_scoring + : mkldnn::prop_kind::forward_training; + + auto conv_desc = mkldnn::convolution_forward::desc( + propagation, mkldnn::convolution_direct, src, weights, bias, dst, + stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); + + mkldnn::primitive_attr conv_attr = CreatePostOps(output_shift_scale); + + auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( + conv_desc, conv_attr, engine); + + return std::unique_ptr( + p_conv_pd); + } }; template @@ -555,7 +879,17 @@ namespace ops = paddle::operators; REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, ::paddle::platform::CPUPlace, FP32, ops::kConvMKLDNNFP32, - ops::ConvMKLDNNOpKernel); + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, + ::paddle::platform::CPUPlace, U8, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, + ::paddle::platform::CPUPlace, S8, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, FP32, @@ -565,7 +899,7 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN, ::paddle::platform::CPUPlace, FP32, ops::kConvMKLDNNFP32, - ops::ConvMKLDNNOpKernel); + ops::ConvMKLDNNOpKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d_grad, MKLDNN, ::paddle::platform::CPUPlace, FP32, diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 8e0d282495..c8b33b8932 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -98,10 +98,12 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( #endif auto input_data_type = ctx.Input("Input")->type(); - auto filter_data_type = ctx.Input("Filter")->type(); - PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, - "input and filter data type should be consistent"); - + if (input_data_type != framework::proto::VarType::INT8 && + input_data_type != framework::proto::VarType::UINT8) { + auto filter_data_type = ctx.Input("Filter")->type(); + PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, + "input and filter data type should be consistent"); + } if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN, 
"float16 can only be used when CUDNN is used"); @@ -179,6 +181,26 @@ void Conv2DOpMaker::Make() { "whenever convolution output is as an input to residual " "connection.") .SetDefault(false); + AddAttr("Scale_in", + "Scale_in to be used for int8 input data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr("Scale_out", + "Scale_out to be used for int8 output data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr("Scale_in_eltwise", + "Scale_in_eltwise to be used for int8 eltwise input data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr>("Scale_weights", + "Scale_weights to be used for int8 weights data." + "Only used with MKL-DNN INT8.") + .SetDefault({1.0f}); + AddAttr("force_fp32_output", + "(bool, default false) Force INT8 kernel output FP32, only " + "used in MKL-DNN INT8") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " @@ -303,6 +325,9 @@ void Conv3DOpMaker::Make() { "Defaults to \"NHWC\". Specify the data format of the output data, " "the input will be transformed automatically. ") .SetDefault("AnyLayout"); + AddAttr("force_fp32_output", + "(bool, default false) Only used in mkldnn INT8 kernel") + .SetDefault(false); // TODO(dzhwinter): need to registered layout transform function AddAttr("workspace_size_MB", "Only used in cudnn kernel. workspace size for cudnn, in MB, " diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 24b8e23879..eaa288edc5 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -29,6 +29,7 @@ namespace operators { using Tensor = framework::Tensor; constexpr int kConvMKLDNNFP32 = 1; constexpr int kConvMKLDNNINT8 = 2; +constexpr int MaxKeyLength = 256; // Base convolution operator definations for other conv // like operators to reuse the implementation. diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 584df85e80..98d1242a16 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -145,7 +145,8 @@ class MKLDNNHandler { const std::shared_ptr user_memory_p, const std::string& suffix, std::vector& pipeline, // NOLINT - bool is_persistent = false) { + bool is_persistent = false, bool is_INT8 = false, + std::vector scale_data = {1.0f}, int mask = 0) { // create reorder primitive if the input format is not the preferred one auto local_key = key_ + suffix; auto key_reorder_p = key_ + suffix + "reorder_p"; @@ -159,8 +160,20 @@ class MKLDNNHandler { std::shared_ptr reorder_p; if (mpd != user_mpd) { target_memory_p = std::make_shared(mpd); - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); + std::shared_ptr reorder_p; + if (is_INT8) { + mkldnn::primitive_attr + attri; // attribute for int8 weights and bias data reorder. 
+ attri.set_output_scales(mask, scale_data); + + auto reorder_pd = std::shared_ptr( + new mkldnn::reorder::primitive_desc(user_mpd, mpd, attri)); + reorder_p = std::shared_ptr(new mkldnn::reorder( + *reorder_pd, *user_memory_p, *target_memory_p)); + } else { + reorder_p = std::make_shared(*user_memory_p, + *target_memory_p); + } dev_ctx_.SetBlob(key_reorder_p, reorder_p); pipeline.push_back(*reorder_p); } @@ -182,22 +195,56 @@ class MKLDNNHandler { return dims2str(operand_dims) + suffix; } - template + template static void SetDstMemory( const framework::ExecutionContext& ctx, framework::Tensor* output, std::vector dst_tz, const mkldnn::engine& engine, std::shared_ptr& dst_pd, // NOLINT std::shared_ptr& dst_memory) { // NOLINT - M* output_data = output->mutable_data(ctx.GetPlace()); + T* output_data = output->mutable_data(ctx.GetPlace()); auto dst_md = platform::MKLDNNMemDesc( {dst_tz}, paddle::framework::ToMKLDNNDataType( - framework::DataTypeTrait::DataType), + framework::DataTypeTrait::DataType), mkldnn::memory::format::nhwc); dst_pd.reset(new mkldnn::memory::primitive_desc(dst_md, engine)); - dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast(output_data))); + dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast(output_data))); + } + + static void AppendKey( + std::string* key, const mkldnn::memory::dims& input_dims, + const mkldnn::memory::dims& weights_dims, const std::vector& strides, + const std::vector& paddings, const std::vector& dilations, + const int& groups, const mkldnn::memory::data_type& type, + const mkldnn::memory::format& format, const std::string& suffix) { + AppendKeyDims(key, input_dims); + AppendKeyDims(key, weights_dims); + AppendKeyVec(key, strides); + AppendKeyVec(key, paddings); + AppendKeyVec(key, dilations); + AppendKey(key, std::to_string(groups)); + AppendKey(key, std::to_string(type)); + AppendKey(key, std::to_string(format)); + AppendKey(key, suffix); } protected: + static void AppendKeyDims(std::string* key, + const mkldnn::memory::dims& dims) { + for (unsigned int i = 0; i < dims.size(); i++) { + AppendKey(key, std::to_string(dims[i])); + } + } + + static void AppendKeyVec(std::string* key, const std::vector& dims) { + for (unsigned int i = 0; i < dims.size(); i++) { + AppendKey(key, std::to_string(dims[i])); + } + } + + static void AppendKey(std::string* key, const std::string& s) { + key->append(s); + } + static std::string dims2str(const mkldnn::memory::dims& operand_dims) { std::string dstr = ""; for (size_t i = 0; i < operand_dims.size(); ++i) { @@ -215,7 +262,8 @@ class MKLDNNHandler { class TransposeMKLDNNHandler : public MKLDNNHandler { public: - TransposeMKLDNNHandler(std::vector& dims, std::vector& axis, + TransposeMKLDNNHandler(std::vector& dims, // NOLINT + std::vector& axis, // NOLINT const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, const std::string& base_key) : platform::MKLDNNHandler(dev_ctx, engine, base_key), @@ -303,8 +351,9 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { } protected: - mkldnn_memory_desc_t Axis2MemoryDesc(std::vector& nchw_tz, - std::vector& axis) { + mkldnn_memory_desc_t Axis2MemoryDesc(std::vector& nchw_tz, // NOLINT + std::vector& axis // NOLINT + ) { mkldnn_memory_desc_t mem_fmt; mem_fmt.primitive_kind = mkldnn_memory; @@ -462,21 +511,26 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { std::shared_ptr AcquireWeightsMemoryFromPrimitive( const std::shared_ptr user_weights_memory_p, std::vector& pipeline, // NOLINT - bool is_persistent = false) { + bool is_persistent 
= false, bool is_INT8 = false, + std::vector scale_data = {1.0f}, int mask = 0) { auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); auto weights_pd = conv_pd_->weights_primitive_desc(); - return this->AcquireMemory(weights_pd, user_weights_pd, - user_weights_memory_p, "@weights_mem_p", - pipeline, is_persistent); + return this->AcquireMemory( + weights_pd, user_weights_pd, user_weights_memory_p, "@weights_mem_p", + pipeline, is_persistent, is_INT8, scale_data, mask); } std::shared_ptr AcquireBiasMemoryFromPrimitive( const std::shared_ptr user_bias_memory_p, - std::vector& pipeline) { // NOLINT + std::vector& pipeline, // NOLINT + bool is_persistent = false, bool is_INT8 = false, + std::vector scale_data = {1.0f}, + int mask = 0) { // NOLINT auto user_bias_pd = user_bias_memory_p->get_primitive_desc(); auto bias_pd = conv_pd_->bias_primitive_desc(); return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p, - "@bias_mem_p", pipeline); + "@bias_mem_p", pipeline, is_persistent, is_INT8, + scale_data, mask); } std::shared_ptr AcquireConvolution( @@ -594,5 +648,29 @@ using ConvTransposeMKLDNNHandler = ConvMKLDNNTemplateHandler; + +template +static std::shared_ptr SetDstMemory( + const framework::ExecutionContext& ctx, framework::Tensor* output, + const std::shared_ptr& handler) { + T* output_data = output->mutable_data( + ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, + handler->GetDstMemorySize()); + std::shared_ptr dst_memory_p = + handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + return dst_memory_p; +} + +template +static std::shared_ptr SetDstMemoryHandler( + const framework::ExecutionContext& ctx, framework::Tensor* output, + const std::shared_ptr& handler) { + T* output_data = output->mutable_data( + ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, + handler->GetDstMemorySize()); + std::shared_ptr dst_memory_p; + dst_memory_p->set_data_handle(to_void_cast(output_data)); + return dst_memory_p; +} } // namespace platform } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py index a27212f38f..ab34a51dd9 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py @@ -51,8 +51,9 @@ class TestConv2dFusionOp(OpTest): input = np.random.random(self.input_size).astype(self.dtype) filter = np.random.random(self.filter_size).astype(self.dtype) - self.output = conv2d_forward_naive(input, filter, self.groups, - conv2d_param).astype(self.dtype) + self.output, _, _, _, _ = conv2d_forward_naive( + input, filter, self.groups, conv2d_param) + self.output = self.output.astype(self.dtype) self.inputs = { 'Input': OpTest.np_dtype_to_fluid_dtype(input), diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py new file mode 100644 index 0000000000..ca35adc1a3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py @@ -0,0 +1,228 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle.fluid.core as core +from op_test import OpTest +from test_conv2d_op import conv2d_forward_naive, TestConv2dOp + + +def conv2d_forward_refer(input, filter, group, conv_param): + out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group, + conv_param) + out_tmp = np.zeros((in_n, out_h, out_w, out_c)) + for n in range(in_n): + for i in range(out_h): + for j in range(out_w): + for m in range(out_c): + out_tmp[n, i, j, m] = out[n, m, i, j] + return out_tmp.reshape(in_n, out_c, out_h, out_w) + + +class TestConv2dInt8Op(TestConv2dOp): + def setUp(self): + self.op_type = "conv2d" + self.use_cudnn = False + self.exhaustive_search = False + self.use_cuda = False + self.use_mkldnn = False + self.data_format = "AnyLayout" + self.weighttype = np.float32 + self.use_mkldnn = True + self.init_group() + self.init_dilation() + self.init_test_case() + self.init_dtype() + + conv2d_param = { + 'stride': self.stride, + 'pad': self.pad, + 'dilation': self.dilations + } + + filter = np.random.random(self.filter_size).astype(self.weighttype) + if self.srctype == np.uint8: + input = np.random.randint(0, 10, + self.input_size).astype(self.srctype) + else: + input = np.random.randint(-5, 5, + self.input_size).astype(self.srctype) + input_shift = (np.ones(self.input_size) * 128).astype(np.uint8) + + if self.srctype == np.int8: + filter_int = np.round(filter * self.scale_weights[0] * + 0.5).astype(np.int32) + scale_output_shift = self.scale_out / (self.scale_in * + self.scale_weights[0] * 0.5) + output1 = conv2d_forward_refer( + np.round((input.astype(np.int32) + input_shift) * + self.scale_in).astype(np.int32), filter_int, + self.groups, + conv2d_param).astype(np.float32) * scale_output_shift + output2 = conv2d_forward_refer( + np.round((input_shift) * self.scale_in).astype(np.int32), + filter_int, self.groups, + conv2d_param).astype(np.float32) * scale_output_shift + output = np.round(output1 - output2).astype(self.dsttype) + else: + filter_int = np.round(filter * + self.scale_weights[0]).astype(np.int32) + scale_output_shift = self.scale_out / (self.scale_in * + self.scale_weights[0]) + output1 = conv2d_forward_refer( + input.astype(np.int32), filter_int, self.groups, + conv2d_param).astype(np.float32) + output = np.round(output1 * scale_output_shift).astype(self.dsttype) + + self.inputs = { + 'Input': + OpTest.np_dtype_to_fluid_dtype(input.astype(self.srctype)), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter) + } + self.attrs = { + 'strides': self.stride, + 'paddings': self.pad, + 'groups': self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format, + 'exhaustive_search': self.exhaustive_search, + 'Scale_in': self.scale_in, + 'Scale_out': self.scale_out, + 'Scale_weights': self.scale_weights, + } + self.outputs = {'Output': output} + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace(), atol=0) + + def test_check_grad(self): + pass + + def test_check_grad_no_filter(self): + pass + + def 
test_check_grad_no_input(self): + pass + + def init_test_case(self): + TestConv2dOp.init_test_case(self) + f_c = self.input_size[1] // self.groups + self.filter_size = [1, f_c, 3, 3] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [10.0] + + def init_dtype(self): + self.srctype = np.uint8 + self.dsttype = np.int8 + + +#--------------------test conv2d u8 in and s8 out-------------------- + + +class TestConv2d(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [10.0] + + +class TestWithPad(TestConv2d): + def init_test_case(self): + TestConv2d.init_test_case(self) + self.pad = [1, 1] + + +class TestWithGroup(TestConv2d): + def init_group(self): + self.groups = 3 + + +class TestWithStride(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 6, 6] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.scale_in = 1.0 + self.scale_out = 0.8 + self.scale_weights = [10.0] + + +class TestWith1x1(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [1, 3, 5, 5] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [12.0] + + +class TestWithInput1x1Filter1x1(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 1, 1] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [10.0] + + def init_group(self): + self.groups = 3 + + +#--------------------test conv2d s8 in and s8 out-------------------- + + +def create_test_int8_class(parent): + class TestInt8Case(parent): + def init_dtype(self): + self.srctype = np.int8 + self.dsttype = np.int8 + + cls_name = "{0}_{1}".format(parent.__name__, "s8s8") + TestInt8Case.__name__ = cls_name + globals()[cls_name] = TestInt8Case + + +create_test_int8_class(TestConv2dInt8Op) +create_test_int8_class(TestWithPad) +create_test_int8_class(TestWithStride) +create_test_int8_class(TestWithGroup) +create_test_int8_class(TestWith1x1) +create_test_int8_class(TestWithInput1x1Filter1x1) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index bcb79f232b..25a9e8d46e 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -60,7 +60,7 @@ def conv2d_forward_naive(input, filter, group, conv_param): np.sum(input_pad_masked * f_sub[k, :, :, :], axis=(1, 2, 3)) - return out + return out, in_n, out_h, out_w, out_c class TestConv2dOp(OpTest): @@ -85,8 +85,9 @@ class TestConv2dOp(OpTest): input = np.random.random(self.input_size).astype(self.dtype) filter = np.random.random(self.filter_size).astype(self.dtype) - output = conv2d_forward_naive(input, filter, self.groups, - conv2d_param).astype(self.dtype) + output, _, _, _, _ = conv2d_forward_naive(input, 
filter, self.groups, + conv2d_param) + output = output.astype(self.dtype) self.inputs = { 'Input': OpTest.np_dtype_to_fluid_dtype(input), From cb1891f97bb005651f36284ad3050c12c8753d9f Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 4 Jan 2019 12:19:32 +0800 Subject: [PATCH 117/197] polish test=develop --- python/paddle/fluid/compiler.py | 18 ++++++++++++++++++ python/paddle/fluid/parallel_executor.py | 3 --- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index e5b1ab351e..a4b2ea837f 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -31,6 +31,24 @@ def _place_obj(place): class CompiledProgram(object): + """ + Compiles a Program for execution. + + The CompiledProgram is used to transform a program for various + optimizations, for example. + * Pre-compute some logic once so that each run is faster. + * Transform the program so that it can run in multiple devices. + * TODO: transform the program for optimized inference or distributed + training. + + Example: + + + Args: + program: Program instance that contains the model logic. + + """ + def __init__(self, program): self._program = program self._scope = None diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 917db02bb8..a0b6392ebc 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -291,6 +291,3 @@ class ParallelExecutor(object): @property def device_count(self): return len(self._places) - - def close(self): - pass From 7526ac14e37f6b22ec36fd9f4a3d3558dcc582d9 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 4 Jan 2019 12:39:19 +0800 Subject: [PATCH 118/197] add comments test=develop --- python/paddle/fluid/compiler.py | 57 ++++++++++++++++--- .../unittests/parallel_executor_test_base.py | 2 +- .../fluid/tests/unittests/test_dist_base.py | 2 +- ...test_parallel_executor_test_while_train.py | 11 ++-- 4 files changed, 57 insertions(+), 15 deletions(-) diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index a4b2ea837f..1e6714479d 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -34,6 +34,10 @@ class CompiledProgram(object): """ Compiles a Program for execution. + 1. Users first create the program with layers. + 2. Optionally, users use CompiledProgram to optimize the program before run. + 3. The original program or CompiledProgram is run by executor. + The CompiledProgram is used to transform a program for various optimizations, for example. * Pre-compute some logic once so that each run is faster. @@ -42,11 +46,19 @@ class CompiledProgram(object): training. Example: - + .. code-block:: python + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup) + compiled_prog = compiler.CompiledProgram(main).with_data_parallel( + loss_name=loss.name) + for i in range(5): + test_loss, = exe.run(compiled_prog, + feed=feed_dict, + fetch_list=[loss.name]) Args: program: Program instance that contains the model logic. 
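+
+    A compiled program can also lend its variables to a second compiled
+    program; a minimal sketch with illustrative names, mirroring the
+    parallel-executor unit test updated below:
+
+        .. code-block:: python
+
+            train_cp = compiler.CompiledProgram(main).with_data_parallel(
+                loss_name=loss.name, build_strategy=build_strategy)
+            test_cp = compiler.CompiledProgram(test_program).with_data_parallel(
+                loss_name=loss.name, build_strategy=build_strategy,
+                share_vars_from=train_cp)
+            # train_cp must be run by the executor before test_cp.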
- """ def __init__(self, program): @@ -57,11 +69,32 @@ class CompiledProgram(object): self._compiled = False self._is_data_parallel = False - def _with_data_parallel(self, - loss_name=None, - build_strategy=None, - exec_strategy=None, - share_vars_from=None): + def with_data_parallel(self, + loss_name=None, + build_strategy=None, + exec_strategy=None, + share_vars_from=None): + """Configs the program to run in data parallel way. + + Args: + loss_name (str): The loss name must set in training. Default None. + build_strategy(BuildStrategy): build_strategy is used to + build the graph so it can run on multiple devices/cores with + optimized topology. + For more information, please refer to fluid.BuildStrategy. + Default None. + exec_strategy(ExecutionStrategy): exec_strategy is used to + to select the a way to execute the graph, for example how many + threads are used, how many iterations to clean up the temp + variables. For more information, please refer + to fluid.ExecutionStrategy. Default None. + share_vars_from(CompiledProgram): If provide, this CompiledProgram + will share variables from `share_vars_from`. `share_vars_from` + must be run by the executor before this CompiledProgram so that + vars are ready. + Returns: + self + """ assert not self._is_data_parallel, "Already compiled with parallel." self._is_data_parallel = True self._build_strategy = build_strategy @@ -145,6 +178,16 @@ class CompiledProgram(object): self._exec_strategy, self._build_strategy) def _compile(self, scope, place): + """Compile the program based on the configs. + + Args: + scope: The variables (resources) that are associated with + this compiled program. + place: The location that the compiled program will be run on. + + Returns: + self + """ if self._compiled: if scope and self._scope != scope: raise ValueError("Cannot compile with different scope") diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 784fe64c4e..1ba47d5a57 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -81,7 +81,7 @@ class TestParallelExecutorBase(unittest.TestCase): if use_cuda and core.is_compiled_with_cuda(): build_strategy.remove_unnecessary_lock = True if use_parallel_executor: - binary = compiler.CompiledProgram(main)._with_data_parallel( + binary = compiler.CompiledProgram(main).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index aacf52e011..3fcdc57906 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -132,7 +132,7 @@ class TestDistRunnerBase(object): build_stra.num_trainers = 1 build_stra.trainer_id = 0 - binary = compiler.CompiledProgram(trainer_prog)._with_data_parallel( + binary = compiler.CompiledProgram(trainer_prog).with_data_parallel( loss_name=avg_cost.name, build_strategy=build_stra, exec_strategy=strategy) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py index 3cc954a77a..d89fd87a38 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py +++ 
b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py @@ -62,13 +62,12 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): exe.run(startup) feed_dict = {'image': image, 'label': label} - train_cp = compiler.CompiledProgram(main)._with_data_parallel( + train_cp = compiler.CompiledProgram(main).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy) - test_cp = compiler.CompiledProgram( - test_program)._with_data_parallel( - loss_name=loss.name, - build_strategy=build_strategy, - share_vars_from=train_cp) + test_cp = compiler.CompiledProgram(test_program).with_data_parallel( + loss_name=loss.name, + build_strategy=build_strategy, + share_vars_from=train_cp) for i in range(5): exe.run(train_cp, feed=feed_dict, fetch_list=[loss.name]) From 3e01a4048f28ad5cf4b33fb808b07965d9e7ff5d Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 28 Dec 2018 16:34:13 +0000 Subject: [PATCH 119/197] add refer seqpool jitkernel --- paddle/fluid/operators/jit/kernel_base.h | 20 +++++++++++++++++++ paddle/fluid/operators/jit/kernel_key.cc | 6 ++++++ .../fluid/operators/jit/refer/CMakeLists.txt | 1 + paddle/fluid/operators/jit/refer/refer.cc | 2 ++ paddle/fluid/operators/jit/refer/refer.h | 16 +++++++++++++++ 5 files changed, 45 insertions(+) diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index b4a2d5d473..8f13fbb16e 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -41,6 +41,7 @@ typedef enum { kCRFDecoding, kLayerNorm, kNCHW16CMulNC, + kSeqPool, } KernelType; template @@ -112,6 +113,25 @@ struct GRUTuples { typedef void (*func_type)(gru_t*, const gru_attr_t*); }; +typedef enum { + non = 0, + sum, + avg, + sqrt, +} SeqPoolType; + +typedef struct { + int h, w; + SeqPoolType type; +} seq_pool_attr_t; + +template +struct SeqPoolTuples { + typedef T data_type; + typedef seq_pool_attr_t attr_type; + typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*); +}; + template struct CRFDecodingTuples { typedef T data_type; diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc index 4e6a19f04f..6b0025a75a 100644 --- a/paddle/fluid/operators/jit/kernel_key.cc +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -42,6 +42,12 @@ size_t JitCodeKey(const gru_attr_t& attr) { (static_cast(attr.act_cand) << act_type_shift); } +template <> +size_t JitCodeKey(const seq_pool_attr_t& attr) { + size_t key = static_cast(attr.type); + return key + (attr.w << act_type_shift); +} + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index 07497b7320..0f626bb3bf 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -26,3 +26,4 @@ USE_JITKERNEL_REFER(kGRUHtPart2) USE_JITKERNEL_REFER(kCRFDecoding) USE_JITKERNEL_REFER(kLayerNorm) USE_JITKERNEL_REFER(kNCHW16CMulNC) +USE_JITKERNEL_REFER(kSeqPool) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index d196266326..85381daa47 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -47,4 +47,6 @@ REGISTER_REFER_KERNEL(kLayerNorm, LayerNorm); REGISTER_REFER_KERNEL(kNCHW16CMulNC, NCHW16CMulNC); +REGISTER_REFER_KERNEL(kSeqPool, SeqPool); + #undef REGISTER_REFER_KERNEL diff --git 
a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 0fd1b89dfd..52fe2de02a 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -332,6 +332,20 @@ void NCHW16CMulNC(const T* x, const T* y, T* z, int height, int width) { } } +template +void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { + PADDLE_ENFORCE(attr->type == SeqPoolType::sum, "Only support sum yet"); + for (int w = 0; w < attr->w; ++w) { + const T* src = x + w; + T* dst = y + w; + *dst = static_cast(0); + for (int h = 0; h < attr->h; ++h) { + *dst = *dst + *src; + src += attr->w; + } + } +} + #define DECLARE_REFER_KERNEL(name, tuples) \ template \ class name##Kernel : public ReferKernel> { \ @@ -370,6 +384,8 @@ DECLARE_REFER_KERNEL(LayerNorm, LayerNormTuples); DECLARE_REFER_KERNEL(NCHW16CMulNC, NCHW16CMulNCTuples); +DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples); + #undef DECLARE_REFER_KERNEL } // namespace refer From e58a569c6cdb8ab66c7dff69395518cee224fe67 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 28 Dec 2018 16:35:00 +0000 Subject: [PATCH 120/197] use seqpool jitkernel --- paddle/fluid/operators/math/CMakeLists.txt | 2 +- .../fluid/operators/math/sequence_pooling.cc | 32 ++++++++++++------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index ea6aebd291..600ab14d37 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -51,7 +51,7 @@ math_library(pooling) math_library(selected_rows_functor DEPS selected_rows math_function blas) math_library(sequence2batch) math_library(sequence_padding) -math_library(sequence_pooling DEPS math_function) +math_library(sequence_pooling DEPS math_function jit_kernel_helper) math_library(sequence_scale) math_library(softmax DEPS math_function) diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 6d491dbf1e..23dc516933 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include +#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_pooling.h" @@ -239,15 +240,33 @@ class SequencePoolFunctor { last_pool(context, input, output); return; } - if (pooltype == "FIRST") { math::FirstSeqPoolFunctor first_pool; first_pool(context, input, output); return; } + auto lod = input.lod()[0]; + if (pooltype == "SUM") { + auto place = context.GetPlace(); + PADDLE_ENFORCE(platform::is_cpu_place(place)); + const T* src = input.data(); + T* dst = output->mutable_data(place); + jit::seq_pool_attr_t attr; + attr.w = input.numel() / input.dims()[0]; + attr.type = jit::SeqPoolType::sum; + auto seqpool = + jit::Get, platform::CPUPlace>( + attr); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { + attr.h = static_cast(lod[i + 1] - lod[i]); + seqpool(src, dst, &attr); + dst += attr.w; + src += attr.h * attr.w; + } + return; + } auto& place = *context.eigen_device(); - auto blas = math::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { Tensor in_t = input.Slice(static_cast(lod[i]), static_cast(lod[i + 1])); @@ -258,15 +277,6 @@ class SequencePoolFunctor { auto out_e = EigenVector::Flatten(out_t); if (pooltype == "AVERAGE") { out_e.device(place) = in_e.mean(Eigen::array({{0}})); - } else if (pooltype == "SUM") { - if (h > 0) { - const T* in_data = in_t.data(); - T* out_data = out_t.mutable_data(context.GetPlace()); - blas.VCOPY(w, in_data, out_data); - for (int64_t r = 1; r != h; ++r) { - blas.AXPY(w, 1., in_data + r * w, out_data); - } - } } else if (pooltype == "SQRT") { out_e.device(place) = in_e.sum(Eigen::array({{0}})) / std::sqrt(static_cast(h)); From 142bb417483f9e0e71a26d24d30eb01c6d2f7754 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Sat, 29 Dec 2018 05:13:08 +0000 Subject: [PATCH 121/197] add seqpool jitkernel test and benchmark --- paddle/fluid/operators/jit/benchmark.cc | 21 ++++++++ paddle/fluid/operators/jit/helper.cc | 15 ++++++ paddle/fluid/operators/jit/helper.h | 6 +++ paddle/fluid/operators/jit/kernel_base.h | 19 ++++---- paddle/fluid/operators/jit/refer/refer.h | 2 +- paddle/fluid/operators/jit/test.cc | 48 +++++++++++++++++++ .../fluid/operators/math/sequence_pooling.cc | 2 +- 7 files changed, 103 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 437005825d..f64e43389a 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -190,6 +190,24 @@ void BenchGRUKernel() { } } +template +void BenchSeqPoolKernel() { + std::vector pool_types = {jit::SeqPoolType::kSum}; + for (auto type : pool_types) { + for (int h : TestSizes()) { + for (int w : TestSizes()) { + const jit::seq_pool_attr_t attr(h, w, type); + std::vector x(h * w), y(w); + RandomVec(h * w, x.data(), -2.f, 2.f); + const T* x_data = x.data(); + T* y_data = y.data(); + BenchAllImpls, PlaceType>(attr, x_data, + y_data, &attr); + } + } + } +} + // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] 
// Options: @@ -228,4 +246,7 @@ int main(int argc, char* argv[]) { BenchGRUKernel(); BenchGRUKernel(); BenchGRUKernel(); + + // seq pool function + BenchSeqPoolKernel(); } diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index d00584baa0..7d02590f2e 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -26,6 +26,7 @@ namespace jit { const char* to_string(KernelType kt) { switch (kt) { + ONE_CASE(kNone); ONE_CASE(kVMul); ONE_CASE(kVAdd); ONE_CASE(kVAddRelu); @@ -45,12 +46,26 @@ const char* to_string(KernelType kt) { ONE_CASE(kCRFDecoding); ONE_CASE(kLayerNorm); ONE_CASE(kNCHW16CMulNC); + ONE_CASE(kSeqPool); default: PADDLE_THROW("Not support type: %d, or forget to add it.", kt); return "NOT JITKernel"; } return nullptr; } + +const char* to_string(SeqPoolType tp) { + switch (tp) { + ONE_CASE(kNonePoolType); + ONE_CASE(kSum); + ONE_CASE(kAvg); + ONE_CASE(kSqrt); + default: + PADDLE_THROW("Not support type: %d, or forget to add it.", tp); + return "NOT PoolType"; + } + return nullptr; +} #undef ONE_CASE KernelType to_kerneltype(const std::string& act) { diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 412df86aa1..fbf34fc4b3 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -119,6 +119,7 @@ typename KernelTuples::func_type Get( } const char* to_string(KernelType kt); +const char* to_string(SeqPoolType kt); KernelType to_kerneltype(const std::string& act); @@ -134,6 +135,11 @@ inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) { << "],act_cand[" << to_string(attr.act_cand) << "]"; return os; } +inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) { + os << "height_size[" << attr.h << "],width_size[" << attr.w << "],pool_type[" + << to_string(attr.type) << "]"; + return os; +} } // namespace jit } // namespace operators diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 8f13fbb16e..2659374650 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -44,6 +44,13 @@ typedef enum { kSeqPool, } KernelType; +typedef enum { + kNonePoolType = 0, + kSum, + kAvg, + kSqrt, +} SeqPoolType; + template struct XYZNTuples { typedef T data_type; @@ -113,16 +120,12 @@ struct GRUTuples { typedef void (*func_type)(gru_t*, const gru_attr_t*); }; -typedef enum { - non = 0, - sum, - avg, - sqrt, -} SeqPoolType; - -typedef struct { +typedef struct seq_pool_attr_s { int h, w; SeqPoolType type; + seq_pool_attr_s() = default; + explicit seq_pool_attr_s(int height, int width, SeqPoolType pool_type) + : h(height), w(width), type(pool_type) {} } seq_pool_attr_t; template diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 52fe2de02a..c2aa922528 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -334,7 +334,7 @@ void NCHW16CMulNC(const T* x, const T* y, T* z, int height, int width) { template void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { - PADDLE_ENFORCE(attr->type == SeqPoolType::sum, "Only support sum yet"); + PADDLE_ENFORCE(attr->type == SeqPoolType::kSum, "Only support sum yet"); for (int w = 0; w < attr->w; ++w) { const T* src = x + w; T* dst = y + w; diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index a73e2a60ae..0f1776507a 100644 --- 
a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -211,6 +211,24 @@ struct TestFuncWithRefer, std::vector, std::vector, } }; +template +struct TestFuncWithRefer, std::vector, + std::vector> { + void operator()(const typename jit::SeqPoolTuples::func_type tgt, + const std::vector& x, const std::vector& yref, + const typename jit::SeqPoolTuples::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(x.size() % yref.size(), 0); + int w = yref.size(); + std::vector y(w); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + T* y_data = y.data(); + tgt(x_data, y_data, &attr); + ExpectEQ(y_data, yref_data, w); + } +}; + template void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { @@ -415,6 +433,30 @@ void TestGRUKernel() { } } +template +void TestSeqPoolKernel() { + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + // TODO(TJ): support more + std::vector pool_types = {jit::SeqPoolType::kSum}; + for (auto type : pool_types) { + for (int h : TestSizes()) { + for (int w : TestSizes()) { + const jit::seq_pool_attr_t attr(h, w, type); + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + std::vector x(h * w), yref(w); + RandomVec(h * w, x.data(), -2.f, 2.f); + const T* x_data = x.data(); + T* yref_data = yref.data(); + ref(x_data, yref_data, &attr); + VLOG(10) << attr; + TestAllImpls, PlaceType, std::vector, + std::vector>(attr, x, yref, attr); + } + } + } +} + template void TestNCHW16CMulNCKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); @@ -569,6 +611,12 @@ TEST(JITKernel, kGRUHtPart2) { TestGRUKernel(); } +TEST(JITKernel, kSeqPool) { + namespace jit = paddle::operators::jit; + TestSeqPoolKernel(); + TestSeqPoolKernel(); +} + TEST(JITKernel, kNCHW16CMulNC) { namespace jit = paddle::operators::jit; TestNCHW16CMulNCKernel { T* dst = output->mutable_data(place); jit::seq_pool_attr_t attr; attr.w = input.numel() / input.dims()[0]; - attr.type = jit::SeqPoolType::sum; + attr.type = jit::SeqPoolType::kSum; auto seqpool = jit::Get, platform::CPUPlace>( attr); From c50060bb264a3e70ef55abfdd8ab74416cb14121 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Sat, 29 Dec 2018 06:26:02 +0000 Subject: [PATCH 122/197] add jitcode impl and use it --- paddle/fluid/operators/jit/gen/CMakeLists.txt | 1 + paddle/fluid/operators/jit/gen/seqpool.cc | 132 ++++++++++++++++++ paddle/fluid/operators/jit/gen/seqpool.h | 98 +++++++++++++ paddle/fluid/operators/jit/kernel_key.cc | 7 +- .../fluid/operators/math/sequence_pooling.cc | 6 +- 5 files changed, 239 insertions(+), 5 deletions(-) create mode 100644 paddle/fluid/operators/jit/gen/seqpool.cc create mode 100644 paddle/fluid/operators/jit/gen/seqpool.h diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index 8a54010830..2b8c758a03 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -26,3 +26,4 @@ USE_JITKERNEL_GEN(kGRUH1) USE_JITKERNEL_GEN(kGRUHtPart1) USE_JITKERNEL_GEN(kGRUHtPart2) USE_JITKERNEL_GEN(kNCHW16CMulNC) +USE_JITKERNEL_GEN(kSeqPool) diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc new file mode 100644 index 0000000000..ce6801b030 --- /dev/null +++ b/paddle/fluid/operators/jit/gen/seqpool.cc @@ -0,0 +1,132 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/gen/seqpool.h" +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void SeqPoolJitCode::genCode() { + constexpr int block = YMM_FLOAT_BLOCK; + constexpr int max_num_regs = 8; + const int num_block = w_ / block; + const int num_groups = num_block / max_num_regs; + int rest_num_regs = num_block % max_num_regs; + if (type_ == SeqPoolType::kAvg) { + float scalar = 1.f / h_; + mov(reg32_scalar, scalar); + } else if (type_ == SeqPoolType::kSqrt) { + float scalar = 1.f / std::sqrt(static_cast(h_)); + mov(reg32_scalar, scalar); + } + + // TODO(TJ): make height load from params + const int group_len = max_num_regs * block * sizeof(float); + for (int g = 0; g < num_groups; ++g) { + pool_height(g * group_len, block, max_num_regs); + } + if (rest_num_regs > 0) { + pool_height(num_groups * group_len, block, rest_num_regs); + } + + // rest part + const int rest = w_ % block; + const bool has_block4 = rest / 4 > 0; + const bool has_block2 = (rest % 4) / 2 > 0; + const bool has_block1 = (rest % 2) == 1; + const int w_offset = num_block * YMM_FLOAT_BLOCK * sizeof(float); + for (int h = 0; h < h_; ++h) { + int offset = h * w_ * sizeof(float) + w_offset; + const int shift_regs = (h == 0) ? 0 : max_num_regs; + int reg_idx = 0; + if (has_block4) { + vmovups(xmm_t(reg_idx + shift_regs), ptr[param1 + offset]); + offset += sizeof(float) * 4; + reg_idx++; + } + if (has_block2) { + vmovq(xmm_t(reg_idx + shift_regs), ptr[param1 + offset]); + offset += sizeof(float) * 2; + reg_idx++; + } + if (has_block1) { + vmovss(xmm_t(reg_idx + shift_regs), ptr[param1 + offset]); + reg_idx++; + } + rest_num_regs = reg_idx; + if (h > 0) { + for (int i = 0; i < reg_idx; ++i) { + vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs)); + } + } + } + // save right now + int offset = w_offset; + if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { + vbroadcastss(xmm_t(max_num_regs - 1), reg32_scalar); + for (int i = 0; i < rest_num_regs; ++i) { + vmulps(xmm_t(i), xmm_t(i), xmm_t(max_num_regs - 1)); + } + } + int reg_idx = 0; + if (has_block4) { + vmovups(ptr[param2 + offset], xmm_t(reg_idx)); + offset += sizeof(float) * 4; + reg_idx++; + } + if (has_block2) { + vmovq(ptr[param2 + offset], xmm_t(reg_idx)); + offset += sizeof(float) * 2; + reg_idx++; + } + if (has_block1) { + vmovss(ptr[param2 + offset], xmm_t(reg_idx)); + } + ret(); +} + +class SeqPoolCreator : public JitCodeCreator { + public: + bool UseMe(const seq_pool_attr_t& attr) const override { + return platform::MayIUse(platform::avx); + } + size_t CodeSize(const seq_pool_attr_t& attr) const override { + // TODO(TJ): remove attr.h when enabled height + bool yes = + attr.type == SeqPoolType::kAvg || attr.type == SeqPoolType::kSqrt; + return 96 /* basic */ + + ((attr.w / YMM_FLOAT_BLOCK + 4 /* rest */) * 2 /* for sum */ + * (attr.h + (yes ? 
3 : 1 /*for avg or sqrt*/))) * + 8; + } + std::unique_ptr CreateJitCode( + const seq_pool_attr_t& attr) const override { + PADDLE_ENFORCE_GT(attr.w, 0); + PADDLE_ENFORCE_GT(attr.h, 0); + return make_unique(attr, CodeSize(attr)); + } +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(kSeqPool, gen::SeqPoolCreator); diff --git a/paddle/fluid/operators/jit/gen/seqpool.h b/paddle/fluid/operators/jit/gen/seqpool.h new file mode 100644 index 0000000000..eb2d191382 --- /dev/null +++ b/paddle/fluid/operators/jit/gen/seqpool.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +class SeqPoolJitCode : public JitCode { + public: + explicit SeqPoolJitCode(const seq_pool_attr_t& attr, + size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), h_(attr.h), w_(attr.w), type_(attr.type) { + if (type_ != SeqPoolType::kSum) { + LOG(FATAL) << "Only support sum pool yet "; + } + this->genCode(); + } + + virtual const char* name() const { + std::string base = "SeqPoolJitCode"; + if (type_ == SeqPoolType::kSum) { + base += "_Sum"; + } else if (type_ == SeqPoolType::kAvg) { + base += "_Avg"; + } else if (type_ == SeqPoolType::kSqrt) { + base += "_Sqrt"; + } + base += ("_W" + std::to_string(w_)); + // TODO(TJ): make h load from params + base += ("_H" + std::to_string(h_)); + return base.c_str(); + } + void genCode() override; + + protected: + template + void pool_height(int w_offset, int block, int max_num_regs) { + for (int h = 0; h < h_; ++h) { + int offset = h * w_ * sizeof(float) + w_offset; + const int shift_regs = (h == 0) ? 
0 : max_num_regs; + for (int i = 0; i < max_num_regs; ++i) { + vmovups(JMM(i + shift_regs), ptr[param1 + offset]); + offset += sizeof(float) * block; + } + if (h > 0) { + // sum anyway + for (int i = 0; i < max_num_regs; ++i) { + vaddps(JMM(i), JMM(i), JMM(i + max_num_regs)); + } + } + } + // save right now + if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { + vbroadcastss(JMM(max_num_regs), reg32_scalar); + } + int offset = w_offset; + for (int i = 0; i < max_num_regs; ++i) { + if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { + vmulps(JMM(i), JMM(i), JMM(max_num_regs)); + } + vmovups(ptr[param2 + offset], JMM(i)); + offset += sizeof(float) * block; + } + } + + private: + int h_; + int w_; + SeqPoolType type_; + reg64_t param1{abi_param1}; + reg64_t param2{abi_param2}; + reg64_t param3{abi_param3}; + reg32_t reg32_scalar{r8d}; +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc index 6b0025a75a..db78ed8ad8 100644 --- a/paddle/fluid/operators/jit/kernel_key.cc +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -44,8 +44,11 @@ size_t JitCodeKey(const gru_attr_t& attr) { template <> size_t JitCodeKey(const seq_pool_attr_t& attr) { - size_t key = static_cast(attr.type); - return key + (attr.w << act_type_shift); + size_t key = attr.w; + // TODO(TJ): support height, then removed it from key + constexpr int w_shift = 30; + return (key << act_type_shift) + static_cast(attr.type) + + (static_cast(attr.h) << (act_type_shift + w_shift)); } } // namespace jit diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 98707c936d..283e2e251a 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -255,11 +255,11 @@ class SequencePoolFunctor { jit::seq_pool_attr_t attr; attr.w = input.numel() / input.dims()[0]; attr.type = jit::SeqPoolType::kSum; - auto seqpool = - jit::Get, platform::CPUPlace>( - attr); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { attr.h = static_cast(lod[i + 1] - lod[i]); + auto seqpool = + jit::Get, platform::CPUPlace>( + attr); seqpool(src, dst, &attr); dst += attr.w; src += attr.h * attr.w; From 92201d3956a4f64615baf5bc9e979bcfc6bd09bd Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 4 Jan 2019 06:41:40 +0000 Subject: [PATCH 123/197] support avg and sqrt pool and add mkl impl test=develop --- .../operators/jit/more/mkl/CMakeLists.txt | 1 + paddle/fluid/operators/jit/more/mkl/mkl.cc | 31 +++++++++++++++++++ paddle/fluid/operators/jit/more/mkl/mkl.h | 26 ++++++++++++++++ paddle/fluid/operators/jit/refer/refer.h | 9 ++++++ 4 files changed, 67 insertions(+) diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt index 863cc720d6..f5ed2f0572 100644 --- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt @@ -9,3 +9,4 @@ USE_JITKERNEL_MORE(kVScal, mkl) USE_JITKERNEL_MORE(kVExp, mkl) USE_JITKERNEL_MORE(kVSigmoid, mkl) USE_JITKERNEL_MORE(kVTanh, mkl) +USE_JITKERNEL_MORE(kSeqPool, mkl) diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index a5b088d481..5a499ac2c0 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -72,6 +72,26 @@ void VExp(const double* x, double* y, 
int n) { platform::dynload::vdExp(n, x, y); } +template <> +void VCopy(const float* x, float* y, int n) { + platform::dynload::cblas_scopy(n, x, 1, y, 1); +} + +template <> +void VCopy(const double* x, double* y, int n) { + platform::dynload::cblas_dcopy(n, x, 1, y, 1); +} + +template <> +void VAXPY(float a, const float* x, float* y, int n) { + platform::dynload::cblas_saxpy(n, a, x, 1, y, 1); +} + +template <> +void VAXPY(double a, const double* x, double* y, int n) { + platform::dynload::cblas_daxpy(n, a, x, 1, y, 1); +} + // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 template <> bool VMulKernel::UseMe(const int& d) const { @@ -103,6 +123,16 @@ bool VTanhKernel::UseMe(const int& d) const { return d > 7; } +template <> +bool SeqPoolKernel::UseMe(const seq_pool_attr_t& attr) const { + return true; +} + +template <> +bool SeqPoolKernel::UseMe(const seq_pool_attr_t& attr) const { + return true; +} + #define AWALYS_USE_ME_WITH_DOUBLE(func) \ template <> \ bool func##Kernel::UseMe(const int& d) const { \ @@ -135,5 +165,6 @@ REGISTER_MKL_KERNEL(kVScal, VScal); REGISTER_MKL_KERNEL(kVExp, VExp); REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid); REGISTER_MKL_KERNEL(kVTanh, VTanh); +REGISTER_MKL_KERNEL(kSeqPool, SeqPool); #undef REGISTER_MKL_KERNEL diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index ee1031c028..0a3816db24 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "paddle/fluid/operators/jit/kernel_base.h" @@ -35,6 +36,12 @@ void VScal(const T* a, const T* x, T* y, int n); template void VExp(const T* x, T* y, int n); +template +void VCopy(const T* x, T* y, int n); + +template +void VAXPY(T a, const T* x, T* y, int n); + template void VSigmoid(const T* x, T* y, int n) { const T min = SIGMOID_THRESHOLD_MIN; @@ -60,6 +67,23 @@ void VTanh(const T* x, T* y, int n) { } } +template +void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { + VCopy(x, y, attr->w); + for (int h = 1; h != attr->h; ++h) { + VAXPY(static_cast(1), x + h * attr->w, y, attr->w); + } + if (attr->type == SeqPoolType::kAvg || attr->type == SeqPoolType::kSqrt) { + T scalar = static_cast(1); + if (attr->type == SeqPoolType::kAvg) { + scalar = scalar / static_cast(attr->h); + } else { + scalar = scalar / std::sqrt(static_cast(attr->h)); + } + VScal(&scalar, y, y, attr->w); + } +} + #define DECLARE_MKL_KERNEL(name, tuples) \ template \ class name##Kernel : public KernelMore> { \ @@ -81,6 +105,8 @@ DECLARE_MKL_KERNEL(VExp, XYNTuples); DECLARE_MKL_KERNEL(VSigmoid, XYNTuples); DECLARE_MKL_KERNEL(VTanh, XYNTuples); +DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples); + #undef DECLARE_MKL_KERNEL } // namespace mkl diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index c2aa922528..4e19783c86 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -344,6 +344,15 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { src += attr->w; } } + if (attr->type == SeqPoolType::kAvg || attr->type == SeqPoolType::kSqrt) { + T scalar = static_cast(1); + if (attr->type == SeqPoolType::kAvg) { + scalar = scalar / static_cast(attr->h); + } else { + scalar = scalar / std::sqrt(static_cast(attr->h)); + } + VScal(&scalar, y, y, attr->w); + } } #define DECLARE_REFER_KERNEL(name, tuples) \ From f0cde74564626f0991f13e1cbff59ec41a6fd0c1 Mon Sep 17 00:00:00 2001 From: baojun-nervana Date: Fri, 
4 Jan 2019 11:28:27 -0800 Subject: [PATCH 124/197] Update ngraph with elt-wise relu test=develop --- cmake/external/ngraph.cmake | 2 +- paddle/fluid/framework/ngraph_operator.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake index 9da657b7d7..799d9c309f 100644 --- a/cmake/external/ngraph.cmake +++ b/cmake/external/ngraph.cmake @@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs) INCLUDE(ExternalProject) SET(NGRAPH_PROJECT "extern_ngraph") -SET(NGRAPH_GIT_TAG "v0.10.1") +SET(NGRAPH_GIT_TAG "08851c2c45fcf9fa9c74871dd3dbc3fe38f37cc9") SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index 57345f12cc..7e174c7def 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -539,7 +539,7 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const { } } - backend_->call(ngraph_function_, t_out, t_in); + backend_->call(backend_->compile(ngraph_function_), t_out, t_in); } // NgraphEngine::RunImpl } // namespace framework } // namespace paddle From 8e2a592be29da1ee045b3c11ba4484a5f71957e0 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 6 Jan 2019 15:13:12 +0800 Subject: [PATCH 125/197] fix test=develop --- python/paddle/fluid/compiler.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index 1e6714479d..7e0ef8d150 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -101,6 +101,10 @@ class CompiledProgram(object): self._exec_strategy = exec_strategy self._loss_name = loss_name self._share_vars_from = share_vars_from + if self._exec_strategy is None: + self._exec_strategy = ExecutionStrategy() + if self._build_strategy is None: + self._build_strategy = BuildStrategy() return self def _with_distributed(self): @@ -124,12 +128,6 @@ class CompiledProgram(object): else: self._local_scopes = [] - self._places = [] - if self._exec_strategy is None: - self._exec_strategy = ExecutionStrategy() - if self._build_strategy is None: - self._build_strategy = BuildStrategy() - self._exec_strategy.use_cuda = isinstance(self._place, core.CUDAPlace) if self._exec_strategy.use_cuda: gpus_env = os.getenv("FLAGS_selected_gpus") @@ -194,6 +192,7 @@ class CompiledProgram(object): if place and self._place != place: raise ValueError("Cannot compile with different place") return self + self._compiled = True self._scope = scope self._place = place From 5f0a0286e0ba0410361dfd1e3027b923c999a8d2 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 6 Jan 2019 15:28:26 +0800 Subject: [PATCH 126/197] add doc test=develop --- python/paddle/fluid/executor.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 4003e988f2..67e569eac0 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -270,6 +270,29 @@ class Executor(object): But the global scope variables will be persistent through different runs. All of ops in program will be running in sequence. + + Example: + .. code-block:: python + # First create the Executor. 
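+            # (use_cuda, feed_dict and the initial `loss` variable used in
+            #  this example are placeholders, assumed to be defined by the
+            #  surrounding training script.)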
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+
+            # Run the startup program once and only once.
+            # No need to optimize/compile the startup program.
+            exe.run(fluid.default_startup_program())
+
+            # Run the main program directly without compile.
+            loss, = exe.run(fluid.default_main_program(),
+                            feed=feed_dict,
+                            fetch_list=[loss.name])
+            # Or, compile the program and run. See `CompiledProgram` for more detail.
+            compiled_prog = compiler.CompiledProgram(
+                fluid.default_main_program()).with_data_parallel(
+                    loss_name=loss.name)
+            loss, = exe.run(compiled_prog,
+                            feed=feed_dict,
+                            fetch_list=[loss.name])
+
     Args:
         place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
@@ -441,8 +464,9 @@
         operators in the program but not only the operators dependent by the fetch_list
         Args:
-            program(Program): the program that need to run, if not provied, then default_main_program will be used.
-            feed(dict): feed variable map, e.g. {"image": ImageData, "label": LableData}
+            program(Program|CompiledProgram): the program that needs to run,
+                if not provided, then default_main_program will be used.
+            feed(dict): feed variable map, e.g. {"image": ImageData, "label": LabelData}
             fetch_list(list): a list of variable or variable names that user want to get, run will return them according to this list.
             feed_var_name(str): the name for the input variable of feed Operator.
             fetch_var_name(str): the name for the output variable of fetch Operator.

From be425461a1a80ec8d397c00f186374fcd025aa5c Mon Sep 17 00:00:00 2001
From: sneaxiy
Date: Mon, 7 Jan 2019 02:27:50 +0000
Subject: [PATCH 127/197] fix crf grad lod share test=develop

---
 paddle/fluid/operators/linear_chain_crf_op.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc
index 998b7f09c3..1da14631e3 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
@@ -230,10 +230,12 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
     if (ctx->HasOutput(framework::GradVarName("Emission"))) {
       ctx->SetOutputDim(framework::GradVarName("Emission"),
                         emission_exps_dims);
+      ctx->ShareLoD("Emission", framework::GradVarName("Emission"));
     }
     if (ctx->HasOutput(framework::GradVarName("Transition"))) {
       ctx->SetOutputDim(framework::GradVarName("Transition"),
                         transition_exps_dims);
+      ctx->ShareLoD("Transition", framework::GradVarName("Transition"));
     }
   }

From dd768714aba5980a48466506a1aa38ccd26d1607 Mon Sep 17 00:00:00 2001
From: mozga-intel
Date: Mon, 7 Jan 2019 04:10:29 +0100
Subject: [PATCH 128/197] Enable scale operator for a ngraph test=develop

---
 paddle/fluid/framework/ngraph_bridge.cc       |  1 +
 paddle/fluid/operators/ngraph/ngraph_ops.h    |  1 +
 .../ngraph/ops/elementwise_scalar_op.h        | 61 +++++++++++++++++++
 paddle/fluid/operators/ngraph/ops/scale_op.h  | 41 +++++++++++++
 .../unittests/ngraph/test_scale_ngraph_op.py  | 40 ++++++++++++
 5 files changed, 144 insertions(+)
 create mode 100644 paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h
 create mode 100644 paddle/fluid/operators/ngraph/ops/scale_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py

diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc
index 42190b5228..af80f66ec7 100644
--- a/paddle/fluid/framework/ngraph_bridge.cc
+++ b/paddle/fluid/framework/ngraph_bridge.cc
@@ -34,6 +34,7 @@
std::map}, {"tanh", paddle::operators::ngraphs::BuildUnaryNode}, {"top_k", paddle::operators::ngraphs::BuildTopKNode}}; diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h index 8e7457dd56..be977f3c69 100644 --- a/paddle/fluid/operators/ngraph/ngraph_ops.h +++ b/paddle/fluid/operators/ngraph/ngraph_ops.h @@ -24,4 +24,5 @@ limitations under the License. */ #include "ops/binary_unnary_op.h" #include "ops/fill_constant_op.h" #include "ops/mul_op.h" +#include "ops/scale_op.h" #include "ops/top_k_op.h" diff --git a/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h b/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h new file mode 100644 index 0000000000..15fbd58b02 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h @@ -0,0 +1,61 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_NGRAPH +#pragma once + +#include +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +template +std::shared_ptr ElementwiseScalar( + float scale, std::shared_ptr node) { + auto node_shape = node->get_shape(); + auto scale_const = ngraph::op::Constant::create(node->get_element_type(), + node_shape, {scale}); + return std::make_shared(scale_const, node); +} + +template +std::shared_ptr ElementwiseScalar( + std::shared_ptr scale_1d, + std::shared_ptr node) { + auto scale_shape = scale_1d->get_shape(); + PADDLE_ENFORCE_EQ(scale_shape.size(), 1, "Supporting 1d scale node"); + PADDLE_ENFORCE_EQ(scale_shape.at(0), 1, "scale 1d in in shape {1}"); + + auto node_shape = node->get_shape(); + ngraph::AxisSet axis_set; + for (size_t i = 0; i < node_shape.size(); ++i) { + axis_set.insert(i); + } + node_shape.push_back(1); + + auto scale_bcast = + std::make_shared(scale_1d, node_shape, axis_set); + + auto scale_reshape = + paddle::platform::NgReshaper(scale_bcast, node->get_shape()); + + return std::make_shared(scale_reshape, node); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle +#endif diff --git a/paddle/fluid/operators/ngraph/ops/scale_op.h b/paddle/fluid/operators/ngraph/ops/scale_op.h new file mode 100644 index 0000000000..24ab0702aa --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/scale_op.h @@ -0,0 +1,41 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_NGRAPH +#pragma once + +#include +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildScaleNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + float scale = op_attrs.Get("scale"); + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto out = ElementwiseScalar(scale, x); + paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle +#endif diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py new file mode 100644 index 0000000000..b42a1f73fa --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py @@ -0,0 +1,40 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function +import unittest +from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows, TestScaleFp16Op, TestScaleFp16OpSelectedRows + + +class TestNGRAPHScaleOp(TestScaleOp): + def init_dtype_type(self): + pass + + +class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows): + def init_dtype_type(self): + pass + + +class TestNGRAPHScaleFp16Op(TestScaleFp16Op): + def init_dtype_type(self): + pass + + +class TestNGRAPHScaleFp16OpSelectedRows(TestScaleFp16OpSelectedRows): + def init_dtype_type(self): + pass + + +if __name__ == "__main__": + unittest.main() From e77956c92007bd8ec7f9956cc7e27519361a2723 Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Mon, 7 Jan 2019 04:17:13 +0100 Subject: [PATCH 129/197] Enable mean operator for a ngraph test=develop --- paddle/fluid/framework/ngraph_bridge.cc | 2 + paddle/fluid/operators/ngraph/ngraph_ops.h | 1 + .../ngraph/ops/elementwise_scalar_op.h | 61 +++++++++++++++++ paddle/fluid/operators/ngraph/ops/mean_op.h | 68 +++++++++++++++++++ .../unittests/ngraph/test_mean_ngraph_op.py | 31 +++++++++ 5 files changed, 163 insertions(+) create mode 100644 paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/mean_op.h create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc index 42190b5228..9f1eef376c 100644 --- a/paddle/fluid/framework/ngraph_bridge.cc +++ b/paddle/fluid/framework/ngraph_bridge.cc @@ -32,6 +32,8 @@ std::map>>)>> NgraphBridge::NG_NODE_MAP = { {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode}, + {"mean", paddle::operators::ngraphs::BuildMeanNode}, + {"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode}, {"mul", 
paddle::operators::ngraphs::BuildMulNode}, {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode}, {"relu", paddle::operators::ngraphs::BuildUnaryNode}, diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h index 8e7457dd56..eef475b73f 100644 --- a/paddle/fluid/operators/ngraph/ngraph_ops.h +++ b/paddle/fluid/operators/ngraph/ngraph_ops.h @@ -23,5 +23,6 @@ limitations under the License. */ #include "ops/binary_unnary_op.h" #include "ops/fill_constant_op.h" +#include "ops/mean_op.h" #include "ops/mul_op.h" #include "ops/top_k_op.h" diff --git a/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h b/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h new file mode 100644 index 0000000000..15fbd58b02 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h @@ -0,0 +1,61 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_NGRAPH +#pragma once + +#include +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +template +std::shared_ptr ElementwiseScalar( + float scale, std::shared_ptr node) { + auto node_shape = node->get_shape(); + auto scale_const = ngraph::op::Constant::create(node->get_element_type(), + node_shape, {scale}); + return std::make_shared(scale_const, node); +} + +template +std::shared_ptr ElementwiseScalar( + std::shared_ptr scale_1d, + std::shared_ptr node) { + auto scale_shape = scale_1d->get_shape(); + PADDLE_ENFORCE_EQ(scale_shape.size(), 1, "Supporting 1d scale node"); + PADDLE_ENFORCE_EQ(scale_shape.at(0), 1, "scale 1d in in shape {1}"); + + auto node_shape = node->get_shape(); + ngraph::AxisSet axis_set; + for (size_t i = 0; i < node_shape.size(); ++i) { + axis_set.insert(i); + } + node_shape.push_back(1); + + auto scale_bcast = + std::make_shared(scale_1d, node_shape, axis_set); + + auto scale_reshape = + paddle::platform::NgReshaper(scale_bcast, node->get_shape()); + + return std::make_shared(scale_reshape, node); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle +#endif diff --git a/paddle/fluid/operators/ngraph/ops/mean_op.h b/paddle/fluid/operators/ngraph/ops/mean_op.h new file mode 100644 index 0000000000..7fcf8f09cd --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/mean_op.h @@ -0,0 +1,68 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_NGRAPH +#pragma once + +#include +#include + +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildMeanNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map); + ngraph::AxisSet axes; + for (size_t i = 0; i < input->get_shape().size(); ++i) { + axes.insert(i); + } + + auto mean = ngraph::builder::mean(input, axes); + auto mean_1d = std::make_shared( + mean, ngraph::AxisVector{}, ngraph::Shape{1}); + paddle::platform::SetOutputNode(op, "Out", mean_1d, ngb_node_map); +} + +void BuildMeanGradNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto og = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map); + auto x_shape = x->get_shape(); + float x_size = std::accumulate(std::begin(x_shape), std::end(x_shape), 1, + std::multiplies()); + auto node_const = ngraph::op::Constant::create(og->get_element_type(), + ngraph::Shape{1}, {x_size}); + auto node_div = std::make_shared(og, node_const); + + auto result = ElementwiseScalar( + og / node_const, + ngraph::op::Constant::create(og->get_element_type(), x_shape, {0})); + paddle::platform::SetOutputNode(op, "X@GRAD", result, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle +#endif diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py new file mode 100644 index 0000000000..5535427ea8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py @@ -0,0 +1,31 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
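+#
+# Note: the cases below only subclass the reference CPU operator tests, so
+# the same inputs and tolerances are re-checked against the nGraph-enabled
+# build.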
+from __future__ import print_function + +import unittest +from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp, TestFP16MeanOp + + +class TestNGRAPHMeanOp(TestMeanOp): + def setUp(self): + super(TestNGRAPHMeanOp, self).setUp() + + +class TestNGRAPHFP16MeanOp(TestFP16MeanOp): + def setUp(self): + super(TestNGRAPHFP16MeanOp, self).setUp() + + +if __name__ == "__main__": + unittest.main() From 583f7ce173bb685dc0fc78bb94171b6f2f4b2cd4 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 7 Jan 2019 12:27:44 +0800 Subject: [PATCH 130/197] Add dynamic jemalloc modules test=develop --- CMakeLists.txt | 9 ++++++++- cmake/FindJeMalloc.cmake | 21 +++++++++++++++++++++ cmake/generic.cmake | 6 +++++- 3 files changed, 34 insertions(+), 2 deletions(-) create mode 100644 cmake/FindJeMalloc.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 66dcef0013..d6aa8f1b85 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,6 +55,7 @@ option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF) option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF) option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) +option(WITH_JEMALLOC "Compile PaddlePaddle with jemalloc" OFF) option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) @@ -261,6 +262,12 @@ if (WITH_PROFILER) add_definitions(-DWITH_GPERFTOOLS) endif() +if (WITH_JEMALLOC) + find_package(JeMalloc REQUIRED) + include_directories(${JEMALLOC_INCLUDE_DIR}) + add_definitions(-DWITH_JEMALLOC) +endif() + include(generic) # simplify cmake module include(package) # set paddle packages include(ccache) # set ccache for compilation @@ -290,7 +297,7 @@ if(WITH_PSLIB) list(APPEND EXTERNAL_LIBS pslib_brpc) list(APPEND EXTERNAL_LIBS libmct) endif(WITH_PSLIB) - + if(WITH_AMD_GPU) find_package(HIP) include(hip) diff --git a/cmake/FindJeMalloc.cmake b/cmake/FindJeMalloc.cmake new file mode 100644 index 0000000000..7911f77c4c --- /dev/null +++ b/cmake/FindJeMalloc.cmake @@ -0,0 +1,21 @@ +# - Find JeMalloc library +# Find the native JeMalloc includes and library +# +# JEMALLOC_INCLUDE_DIR - where to find jemalloc.h, etc. +# JEMALLOC_LIBRARIES - List of libraries when using jemalloc. +# JEMALLOC_FOUND - True if jemalloc found. 
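+#
+# A usage sketch (the install prefix below is hypothetical; JEMALLOC_ROOT_DIR
+# is only consumed as a hint by the find_path/find_library calls that follow):
+#   cmake .. -DWITH_JEMALLOC=ON -DJEMALLOC_ROOT_DIR=/opt/jemalloc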
+ +find_path(JEMALLOC_INCLUDE_DIR + NAMES jemalloc/jemalloc.h + HINTS ${JEMALLOC_ROOT_DIR}/include) + +find_library(JEMALLOC_LIBRARIES + NAMES jemalloc + HINTS ${JEMALLOC_ROOT_DIR}/lib) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIR) + +mark_as_advanced( + JEMALLOC_LIBRARIES + JEMALLOC_INCLUDE_DIR) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c6fe2e970d..4e31392b98 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -115,6 +115,10 @@ function(common_link TARGET_NAME) if (WITH_PROFILER) target_link_libraries(${TARGET_NAME} gperftools::profiler) endif() + + if (WITH_JEMALLOC) + target_link_libraries(${TARGET_NAME} ${JEMALLOC_LIBRARIES}) + endif() endfunction() @@ -228,7 +232,7 @@ function(merge_static_libs TARGET_NAME) # Get the file names of the libraries to be merged set(libfiles ${libfiles} $) endforeach() - # msvc will put libarary in directory of "/Release/xxxlib" by default + # msvc will put libarary in directory of "/Release/xxxlib" by default # COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib" add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND cmake -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}" From b2716909b41109a226d088800b9f0b37f3d42bd8 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 7 Jan 2019 12:30:33 +0800 Subject: [PATCH 131/197] Add changes to paddle_build test=develop --- paddle/scripts/paddle_build.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 57e059bcf9..50b7a63129 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -199,6 +199,7 @@ function cmake_gen() { -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON} -DPY_VERSION=${PY_VERSION:-2.7} -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} + -DWITH_JEMALLOC=${WITH_JEMALLOC:-OFF} ======================================== EOF # Disable UNITTEST_USE_VIRTUALENV in docker because @@ -232,7 +233,8 @@ EOF -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}\ -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON}\ -DPY_VERSION=${PY_VERSION:-2.7} \ - -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} \ + -DWITH_JEMALLOC=${WITH_JEMALLOC:-OFF} } @@ -447,7 +449,7 @@ EOF elif [ "$1" == "cp37-cp37m" ]; then pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl fi - + if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then paddle version fi From 39b98709b11a1031ce2e2c373bad9ce901d4cef0 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 7 Jan 2019 12:48:01 +0800 Subject: [PATCH 132/197] Move fused ops to fused dir test=develop --- paddle/fluid/operators/{ => fused}/fused_embedding_seq_pool_op.cc | 0 paddle/fluid/operators/{ => fused}/fused_embedding_seq_pool_op.h | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename paddle/fluid/operators/{ => fused}/fused_embedding_seq_pool_op.cc (100%) rename paddle/fluid/operators/{ => fused}/fused_embedding_seq_pool_op.h (100%) diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc similarity index 100% rename from paddle/fluid/operators/fused_embedding_seq_pool_op.cc rename to paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc diff --git 
a/paddle/fluid/operators/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h similarity index 100% rename from paddle/fluid/operators/fused_embedding_seq_pool_op.h rename to paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h From f4c990e7b8493304b61249417aaaca45d95e5174 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 7 Jan 2019 12:54:37 +0800 Subject: [PATCH 133/197] Add fused embedding ops --- .../fused/fused_embedding_seq_pool_op.cc | 194 ++++++++++++++++++ .../fused/fused_embedding_seq_pool_op.h | 142 +++++++++++++ 2 files changed, 336 insertions(+) create mode 100644 paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc create mode 100644 paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc new file mode 100644 index 0000000000..fe4c73f472 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc @@ -0,0 +1,194 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { + +class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input W of FusedEmbeddingSeqPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Ids"), + "Input Ids of FusedEmbeddingSeqPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output of FusedEmbeddingSeqPoolOp should not be null."); + + auto table_dims = ctx->GetInputDim("W"); + auto ids_dims = ctx->GetInputDim("Ids"); + const std::string& combiner = ctx->Attrs().Get("combiner"); + + PADDLE_ENFORCE_EQ(table_dims.size(), 2); + PADDLE_ENFORCE_GE(ids_dims.size(), 1, + "The dim size of the 'Ids' tensor must greater than 1."); + PADDLE_ENFORCE_EQ(ids_dims[ids_dims.size() - 1], 1, + "The last dimension of the 'Ids' tensor must be 1."); + // we only support sum now + PADDLE_ENFORCE_EQ(combiner, "sum"); + + int64_t last_dim = table_dims[1]; + for (int i = 1; i != ids_dims.size(); ++i) { + last_dim *= ids_dims[i]; + } + + if (ctx->IsRuntime()) { + framework::Variable* ids_var = + boost::get(ctx->GetInputVarPtrs("Ids")[0]); + const auto& ids_lod = ids_var->Get().lod(); + + // in run time, the LoD of ids must be 1 + PADDLE_ENFORCE(ids_lod.size(), 1u, + "The LoD level of Input(Ids) must be 1"); + PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty"); + + int64_t batch_size = ids_lod[0].size() - 1; + + // in run time, the shape from Ids -> output + // should be [seq_length, 1] -> [batch_size, embedding_size] + ctx->SetOutputDim("Out", framework::make_ddim({batch_size, last_dim})); + } else { + // in compile time, the 
lod level of ids must be 1 + framework::VarDesc* ids_desc = + boost::get(ctx->GetInputVarPtrs("Ids")[0]); + PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1); + + // in compile time, the shape from Ids -> output + // should be [-1, 1] -> [-1, embedding_size] + ctx->SetOutputDim("Out", framework::make_ddim({-1, last_dim})); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("W", + "(Tensor) The input represents embedding tensors, " + "which is a learnable parameter."); + AddInput("Ids", + "An input with type int32 or int64 " + "contains the ids to be looked up in W. " + "The last dimension size must be 1."); + AddOutput("Out", "The lookup results, which have the same type as W."); + AddAttr("combiner", + "(string, default sum) " + "A string specifying the reduction op. Currently sum " + "are supported, sum computes the weighted sum of the " + "embedding results for each row.") + .SetDefault("sum"); + // NOTE(minqiyang): grad_inplace is an temporal attribute, + // please do NOT set this attribute in python layer. + AddAttr("grad_inplace", + "(boolean, default false) " + "If the grad op reuse the input's variable.") + .SetDefault(false); + AddAttr("is_sparse", + "(boolean, default false) " + "Sparse update.") + .SetDefault(false); + AddComment(R"DOC( +FusedEmbeddingSeqPool Operator. + +Computes embeddings for the given ids and weights. + +This operator is used to perform lookups on the parameter W, +then computes the weighted sum of the lookups results for each row +and concatenated into a dense tensor. + +The input Ids should carry the LoD (Level of Details) information. +And the output will change the LoD information with input Ids. 
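+
+As a sketch of the shapes involved: with combiner="sum", a [V, D] parameter W
+and an Ids tensor holding 5 ids with LoD [[0, 2, 5]] (two sequences, of
+length 2 and 3), the output is a [2, D] tensor whose i-th row is the
+element-wise sum of the embeddings looked up for the i-th sequence.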
+ +)DOC"); + } +}; + +class FusedEmbeddingSeqPoolOpGradDescMaker + : public framework::DefaultGradOpDescMaker { + using ::paddle::framework::DefaultGradOpDescMaker< + true>::DefaultGradOpDescMaker; + + protected: + virtual std::string GradOpType() const { + return "fused_embedding_seq_pool_grad"; + } +}; + +class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto table_dims = ctx->GetInputDim("W"); + ctx->SetOutputDim(framework::GradVarName("W"), table_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class FusedEmbeddingSeqPoolOpGradVarTypeInference + : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); + auto attr = op_desc.GetAttr("is_sparse"); + bool is_sparse = boost::get(attr); + if (is_sparse) { + VLOG(3) << "fused_embedding_seq_pool_grad op " + << framework::GradVarName("W") << " is set to SelectedRows"; + block->Var(out_var_name) + ->SetType(framework::proto::VarType::SELECTED_ROWS); + } else { + VLOG(3) << "fused_embedding_seq_pool_grad op " + << framework::GradVarName("W") << " is set to LoDTensor"; + block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR); + } + block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_embedding_seq_pool, ops::FusedEmbeddingSeqPoolOp, + ops::FusedEmbeddingSeqPoolOpGradDescMaker, + ops::FusedEmbeddingSeqPoolOpMaker); +REGISTER_OPERATOR(fused_embedding_seq_pool_grad, + ops::FusedEmbeddingSeqPoolOpGrad, + ops::FusedEmbeddingSeqPoolOpGradVarTypeInference); + +REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool, + ops::FusedEmbeddingSeqPoolKernel, + ops::FusedEmbeddingSeqPoolKernel); +REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool_grad, + ops::FusedEmbeddingSeqPoolGradKernel, + ops::FusedEmbeddingSeqPoolGradKernel); diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h new file mode 100644 index 0000000000..38dfae8ad6 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +template +struct EmbeddingVSumFunctor { + void operator()(const framework::ExecutionContext &context, + const LoDTensor *table_t, const LoDTensor *ids_t, + LoDTensor *output_t) { + auto *table = table_t->data(); + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + int64_t last_dim = output_t->dims()[1]; + int64_t *ids = const_cast(ids_t->data()); + auto ids_lod = ids_t->lod()[0]; + int64_t ids_count = ids_t->numel() / ids_lod.back(); + + auto *output = output_t->mutable_data(context.GetPlace()); + + auto blas = math::GetBlas(context); + for (int64_t i = 0; i != ids_lod.size() - 1; ++i) { + size_t begin = ids_lod[i] * ids_count; + for (int64_t j = 0; j != ids_count; ++j) { + PADDLE_ENFORCE_LT(ids[begin], row_number); + PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); + blas.VCOPY(row_width, table + ids[begin + j] * row_width, + output + i * last_dim + j * row_width); + } + + for (int64_t r = (ids_lod[i] + 1) * ids_count; + r < ids_lod[i + 1] * ids_count; ++r) { + PADDLE_ENFORCE_LT(ids[r], row_number); + PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i); + blas.AXPY(row_width, 1., table + ids[r] * row_width, + output + i * last_dim + (r % ids_count) * row_width); + } + } + } +}; + +template +class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const LoDTensor *ids_t = context.Input("Ids"); // int tensor + LoDTensor *output_t = context.Output("Out"); // float tensor + const LoDTensor *table_var = context.Input("W"); + const std::string &combiner_type = context.Attr("combiner"); + + if (combiner_type == "sum") { + EmbeddingVSumFunctor functor; + functor(context, table_var, ids_t, output_t); + } + } +}; + +template +class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *table_var = context.InputVar("W"); + DDim table_dim; + if (table_var->IsType()) { + table_dim = context.Input("W")->dims(); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); + table_dim = table_t->value().dims(); + } else { + PADDLE_THROW( + "The parameter W of a LookupTable " + "must be either LoDTensor or SelectedRows"); + } + + bool is_sparse = context.Attr("is_sparse"); + // Since paddings are not trainable and fixed in forward, the gradient of + // paddings makes no sense and we don't deal with it in backward. 
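+    // Sparse path sketch: the gradient of W is emitted as SelectedRows whose
+    // rows are exactly the input ids; since the forward pass sum-pools each
+    // sequence, the backward pass copies a sequence's single output-gradient
+    // row into every id row belonging to that sequence.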
+ if (is_sparse) { + auto *ids = context.Input("Ids"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); + + auto *ids_data = ids->data(); + int64_t ids_num = ids->numel(); + auto lod = ids->lod()[0]; + int64_t row_width = d_output->dims()[1]; + + framework::Vector *new_rows = d_table->mutable_rows(); + new_rows->resize(ids_num); + std::memcpy(&(*new_rows)[0], ids_data, ids_num * sizeof(int64_t)); + + auto *d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_num, table_dim[1]}); + T *d_table_data = d_table_value->mutable_data(context.GetPlace()); + const T *d_output_data = d_output->data(); + + auto blas = math::GetBlas(context); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { + int64_t h = static_cast(lod[i + 1] - lod[i]); + int64_t in_offset = lod[i] * row_width; + const T *out_pos = d_output_data + i * row_width; + T *in_pos = d_table_data + in_offset; + for (int r = 0; r != h; ++r) { + blas.VCOPY(row_width, out_pos, in_pos + r * row_width); + } + } + } else { + LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now"; + } + } +}; + +} // namespace operators +} // namespace paddle From dc0ecffd6c4115019cfcbcc13b17a20511888c9b Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 7 Jan 2019 12:55:03 +0800 Subject: [PATCH 134/197] Add ut for fused ops --- .../unittests/test_fused_emb_seq_pool_op.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py diff --git a/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py new file mode 100644 index 0000000000..584e309bef --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py @@ -0,0 +1,51 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
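+#
+# A note on the expected output computed below: with lod [[3, 1]], the first
+# three rows of ids form one sequence and the last row the other; each output
+# row is the sum over its sequence of the concatenated lookups of a row's two
+# ids.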
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid.op import Operator +import paddle.compat as cpt + + +class TestFusedEmbeddingSeqPoolOp(OpTest): + def setUp(self): + self.op_type = "fused_embedding_seq_pool" + self.emb_size = 2 + table = np.random.random((17, self.emb_size)).astype("float32") + ids = np.array([[[4], [3]], [[4], [3]], [[2], [1]], + [[16], [1]]]).astype("int64") + merged_ids = np.array([4, 2, 16]).astype("int64") + ids_expand = np.expand_dims(ids, axis=1) + self.lod = [[3, 1]] + self.attrs = {'is_sparse': True} + self.inputs = {'W': table, 'Ids': (ids_expand, self.lod)} + self.outputs = { + 'Out': np.reshape( + np.array([ + table[[4, 3]] + table[[4, 3]] + table[[2, 1]], + table[[16, 1]] + ]), [len(self.lod[0]), 2 * self.emb_size]) + } + + def test_check_output(self): + self.check_output() + + +if __name__ == "__main__": + unittest.main() From db8eb9b6888d7d76ec0f5e5bc07c6388dd633840 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 7 Jan 2019 12:55:32 +0800 Subject: [PATCH 135/197] Polish code test=develop --- paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc index 966bdb4df5..fe4c73f472 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/fused_embedding_seq_pool_op.h" +#include "paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h" #include "paddle/fluid/framework/var_type_inference.h" namespace paddle { From e0591deebc02202c4ae8bfc95f31be606b8192b8 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 4 Jan 2019 14:40:43 +0000 Subject: [PATCH 136/197] enhance seqpool jitcode --- paddle/fluid/operators/jit/benchmark.cc | 4 +- paddle/fluid/operators/jit/gen/seqpool.cc | 55 +-------- paddle/fluid/operators/jit/gen/seqpool.h | 134 ++++++++++++++++++++-- 3 files changed, 126 insertions(+), 67 deletions(-) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index f64e43389a..37a552fb6d 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -194,8 +194,8 @@ template void BenchSeqPoolKernel() { std::vector pool_types = {jit::SeqPoolType::kSum}; for (auto type : pool_types) { - for (int h : TestSizes()) { - for (int w : TestSizes()) { + for (int w : TestSizes()) { + for (int h : TestSizes()) { const jit::seq_pool_attr_t attr(h, w, type); std::vector x(h * w), y(w); RandomVec(h * w, x.data(), -2.f, 2.f); diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc index ce6801b030..fd83f83436 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.cc +++ b/paddle/fluid/operators/jit/gen/seqpool.cc @@ -35,7 +35,6 @@ void SeqPoolJitCode::genCode() { mov(reg32_scalar, scalar); } - // TODO(TJ): make height load from params const int group_len = max_num_regs * block * sizeof(float); for (int g = 0; g < num_groups; ++g) { pool_height(g * group_len, block, max_num_regs); @@ -44,59 +43,9 @@ void SeqPoolJitCode::genCode() { pool_height(num_groups * group_len, block, rest_num_regs); } - // rest part + // part of rest_w * height const int rest = w_ % block; - const bool has_block4 = rest / 4 > 0; - const bool has_block2 = (rest % 4) / 2 > 0; - const bool has_block1 = (rest % 2) == 1; - const int w_offset = num_block * YMM_FLOAT_BLOCK * sizeof(float); - for (int h = 0; h < h_; ++h) { - int offset = h * w_ * sizeof(float) + w_offset; - const int shift_regs = (h == 0) ? 
0 : max_num_regs; - int reg_idx = 0; - if (has_block4) { - vmovups(xmm_t(reg_idx + shift_regs), ptr[param1 + offset]); - offset += sizeof(float) * 4; - reg_idx++; - } - if (has_block2) { - vmovq(xmm_t(reg_idx + shift_regs), ptr[param1 + offset]); - offset += sizeof(float) * 2; - reg_idx++; - } - if (has_block1) { - vmovss(xmm_t(reg_idx + shift_regs), ptr[param1 + offset]); - reg_idx++; - } - rest_num_regs = reg_idx; - if (h > 0) { - for (int i = 0; i < reg_idx; ++i) { - vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs)); - } - } - } - // save right now - int offset = w_offset; - if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { - vbroadcastss(xmm_t(max_num_regs - 1), reg32_scalar); - for (int i = 0; i < rest_num_regs; ++i) { - vmulps(xmm_t(i), xmm_t(i), xmm_t(max_num_regs - 1)); - } - } - int reg_idx = 0; - if (has_block4) { - vmovups(ptr[param2 + offset], xmm_t(reg_idx)); - offset += sizeof(float) * 4; - reg_idx++; - } - if (has_block2) { - vmovq(ptr[param2 + offset], xmm_t(reg_idx)); - offset += sizeof(float) * 2; - reg_idx++; - } - if (has_block1) { - vmovss(ptr[param2 + offset], xmm_t(reg_idx)); - } + pool_height_of_rest_width(rest, (w_ - rest) * sizeof(float), max_num_regs); ret(); } diff --git a/paddle/fluid/operators/jit/gen/seqpool.h b/paddle/fluid/operators/jit/gen/seqpool.h index eb2d191382..48288d8c2a 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.h +++ b/paddle/fluid/operators/jit/gen/seqpool.h @@ -17,6 +17,7 @@ #include #include "glog/logging.h" #include "paddle/fluid/operators/jit/gen/jitcode.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { @@ -45,8 +46,6 @@ class SeqPoolJitCode : public JitCode { base += "_Sqrt"; } base += ("_W" + std::to_string(w_)); - // TODO(TJ): make h load from params - base += ("_H" + std::to_string(h_)); return base.c_str(); } void genCode() override; @@ -54,25 +53,36 @@ class SeqPoolJitCode : public JitCode { protected: template void pool_height(int w_offset, int block, int max_num_regs) { - for (int h = 0; h < h_; ++h) { - int offset = h * w_ * sizeof(float) + w_offset; - const int shift_regs = (h == 0) ? 
0 : max_num_regs; - for (int i = 0; i < max_num_regs; ++i) { - vmovups(JMM(i + shift_regs), ptr[param1 + offset]); - offset += sizeof(float) * block; - } - if (h > 0) { - // sum anyway + int offset = w_offset; + for (int i = 0; i < max_num_regs; ++i) { + vmovups(JMM(i), ptr[param1 + offset]); + offset += sizeof(float) * block; + } + if (h_ > 1) { + Label l_next_h; + mov(reg_h, 1); + mov(reg_tmp, param1); + add(reg_tmp, w_ * sizeof(float) + w_offset); + L(l_next_h); + { + mov(reg_ptr_src_i, reg_tmp); for (int i = 0; i < max_num_regs; ++i) { + vmovups(JMM(i + max_num_regs), ptr[reg_ptr_src_i]); + // sum anyway vaddps(JMM(i), JMM(i), JMM(i + max_num_regs)); + add(reg_ptr_src_i, sizeof(float) * block); } + inc(reg_h); + add(reg_tmp, w_ * sizeof(float)); + cmp(reg_h, h_); + jl(l_next_h, T_NEAR); } } // save right now if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { vbroadcastss(JMM(max_num_regs), reg32_scalar); } - int offset = w_offset; + offset = w_offset; for (int i = 0; i < max_num_regs; ++i) { if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { vmulps(JMM(i), JMM(i), JMM(max_num_regs)); @@ -82,6 +92,102 @@ class SeqPoolJitCode : public JitCode { } } + void pool_height_of_rest_width(int rest, int w_offset, int max_num_regs) { + const int rest_used_num_regs = load_rest(rest, w_offset, 0); + const bool has_block4 = rest / 4 > 0; + const bool has_block2 = (rest % 4) / 2 > 0; + const bool has_block1 = (rest % 2) == 1; + if (h_ > 1) { + Label l_next_h; + mov(reg_h, 1); + mov(reg_tmp, param1); + add(reg_tmp, w_ * sizeof(float) + w_offset); + L(l_next_h); + { + // int used_regs =load_rest(rest, h * w_ * sizeof(float) + w_offset, + // max_num_regs); + int reg_idx = 0; + mov(reg_ptr_src_i, reg_tmp); + if (has_block4) { + vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); + add(reg_ptr_src_i, sizeof(float) * 4); + reg_idx++; + } + if (has_block2) { + vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); + add(reg_ptr_src_i, sizeof(float) * 2); + reg_idx++; + } + if (has_block1) { + vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); + reg_idx++; + } + PADDLE_ENFORCE_EQ(reg_idx, rest_used_num_regs, + "All heights should use same regs"); + for (int i = 0; i < reg_idx; ++i) { + vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs)); + } + inc(reg_h); + add(reg_tmp, w_ * sizeof(float)); + cmp(reg_h, h_); + jl(l_next_h, T_NEAR); + } + } + // save right now + if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { + vbroadcastss(xmm_t(max_num_regs - 1), reg32_scalar); + for (int i = 0; i < rest_used_num_regs; ++i) { + vmulps(xmm_t(i), xmm_t(i), xmm_t(max_num_regs - 1)); + } + } + save_rest(rest, w_offset); + } + + // return the number of used regs, use start from reg 0 + int load_rest(int rest, int w_offset, const int num_shift_regs, + const int reg_start = 0) { + const bool has_block4 = rest / 4 > 0; + const bool has_block2 = (rest % 4) / 2 > 0; + const bool has_block1 = (rest % 2) == 1; + int reg_idx = reg_start; + if (has_block4) { + vmovups(xmm_t(reg_idx + num_shift_regs), ptr[param1 + w_offset]); + w_offset += sizeof(float) * 4; + reg_idx++; + } + if (has_block2) { + vmovq(xmm_t(reg_idx + num_shift_regs), ptr[param1 + w_offset]); + w_offset += sizeof(float) * 2; + reg_idx++; + } + if (has_block1) { + vmovss(xmm_t(reg_idx + num_shift_regs), ptr[param1 + w_offset]); + reg_idx++; + } + return reg_idx; + } + + // use reg start from 0 + void save_rest(int rest, int w_offset, int reg_start = 0) { + const bool has_block4 = rest / 4 > 0; + const bool 
has_block2 = (rest % 4) / 2 > 0; + const bool has_block1 = (rest % 2) == 1; + int reg_idx = reg_start; + if (has_block4) { + vmovups(ptr[param2 + w_offset], xmm_t(reg_idx)); + w_offset += sizeof(float) * 4; + reg_idx++; + } + if (has_block2) { + vmovq(ptr[param2 + w_offset], xmm_t(reg_idx)); + w_offset += sizeof(float) * 2; + reg_idx++; + } + if (has_block1) { + vmovss(ptr[param2 + w_offset], xmm_t(reg_idx)); + } + } + private: int h_; int w_; @@ -90,6 +196,10 @@ class SeqPoolJitCode : public JitCode { reg64_t param2{abi_param2}; reg64_t param3{abi_param3}; reg32_t reg32_scalar{r8d}; + + reg64_t reg_h{r9}; + reg64_t reg_ptr_src_i{r10}; + reg64_t reg_tmp{r11}; }; } // namespace gen From 0145f40f4576fa035b92e3876ca9c4cfefbc5c52 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Sat, 5 Jan 2019 11:34:15 +0000 Subject: [PATCH 137/197] use height from params of jitcode --- paddle/fluid/operators/jit/benchmark.cc | 3 +- paddle/fluid/operators/jit/gen/seqpool.cc | 17 +- paddle/fluid/operators/jit/gen/seqpool.h | 162 ++++++++++-------- paddle/fluid/operators/jit/kernel_base.h | 6 +- paddle/fluid/operators/jit/kernel_key.cc | 6 +- paddle/fluid/operators/jit/refer/refer.h | 1 - paddle/fluid/operators/jit/test.cc | 7 +- .../fluid/operators/math/sequence_pooling.cc | 12 +- 8 files changed, 117 insertions(+), 97 deletions(-) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 37a552fb6d..4cbada4a5b 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -195,8 +195,9 @@ void BenchSeqPoolKernel() { std::vector pool_types = {jit::SeqPoolType::kSum}; for (auto type : pool_types) { for (int w : TestSizes()) { + jit::seq_pool_attr_t attr(w, type); for (int h : TestSizes()) { - const jit::seq_pool_attr_t attr(h, w, type); + attr.h = h; std::vector x(h * w), y(w); RandomVec(h * w, x.data(), -2.f, 2.f); const T* x_data = x.data(); diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc index fd83f83436..d651f282bf 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.cc +++ b/paddle/fluid/operators/jit/gen/seqpool.cc @@ -13,6 +13,7 @@ * limitations under the License. 
*/ #include "paddle/fluid/operators/jit/gen/seqpool.h" +#include // offsetof #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -21,20 +22,22 @@ namespace operators { namespace jit { namespace gen { +thread_local float ALIGN32_BEG float_h[1] ALIGN32_END = { + 1.f}; // TODO(TJ): try move to private + void SeqPoolJitCode::genCode() { constexpr int block = YMM_FLOAT_BLOCK; constexpr int max_num_regs = 8; const int num_block = w_ / block; const int num_groups = num_block / max_num_regs; int rest_num_regs = num_block % max_num_regs; - if (type_ == SeqPoolType::kAvg) { - float scalar = 1.f / h_; - mov(reg32_scalar, scalar); - } else if (type_ == SeqPoolType::kSqrt) { - float scalar = 1.f / std::sqrt(static_cast(h_)); - mov(reg32_scalar, scalar); + mov(reg32_int_h, dword[param_attr]); + if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { + mov(reg_tmp, reinterpret_cast(float_h)); + fild(dword[param_attr]); + fstp(dword[reg_tmp]); + mov(reg32_fp_h, dword[reg_tmp]); } - const int group_len = max_num_regs * block * sizeof(float); for (int g = 0; g < num_groups; ++g) { pool_height(g * group_len, block, max_num_regs); diff --git a/paddle/fluid/operators/jit/gen/seqpool.h b/paddle/fluid/operators/jit/gen/seqpool.h index 48288d8c2a..c61bf27cc1 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.h +++ b/paddle/fluid/operators/jit/gen/seqpool.h @@ -16,6 +16,7 @@ #include #include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/act.h" // for ones #include "paddle/fluid/operators/jit/gen/jitcode.h" #include "paddle/fluid/platform/enforce.h" @@ -29,7 +30,7 @@ class SeqPoolJitCode : public JitCode { explicit SeqPoolJitCode(const seq_pool_attr_t& attr, size_t code_size = 256 * 1024, void* code_ptr = nullptr) - : JitCode(code_size, code_ptr), h_(attr.h), w_(attr.w), type_(attr.type) { + : JitCode(code_size, code_ptr), w_(attr.w), type_(attr.type) { if (type_ != SeqPoolType::kSum) { LOG(FATAL) << "Only support sum pool yet "; } @@ -55,39 +56,48 @@ class SeqPoolJitCode : public JitCode { void pool_height(int w_offset, int block, int max_num_regs) { int offset = w_offset; for (int i = 0; i < max_num_regs; ++i) { - vmovups(JMM(i), ptr[param1 + offset]); + vmovups(JMM(i), ptr[param_src + offset]); offset += sizeof(float) * block; } - if (h_ > 1) { - Label l_next_h; - mov(reg_h, 1); - mov(reg_tmp, param1); - add(reg_tmp, w_ * sizeof(float) + w_offset); - L(l_next_h); - { - mov(reg_ptr_src_i, reg_tmp); - for (int i = 0; i < max_num_regs; ++i) { - vmovups(JMM(i + max_num_regs), ptr[reg_ptr_src_i]); - // sum anyway - vaddps(JMM(i), JMM(i), JMM(i + max_num_regs)); - add(reg_ptr_src_i, sizeof(float) * block); - } - inc(reg_h); - add(reg_tmp, w_ * sizeof(float)); - cmp(reg_h, h_); - jl(l_next_h, T_NEAR); + cmp(reg32_int_h, 1); + Label l_next_h, l_h_done; + jle(l_h_done, T_NEAR); + mov(reg_h_i, 1); + mov(reg_tmp, param_src); + add(reg_tmp, w_ * sizeof(float) + w_offset); + L(l_next_h); + { + mov(reg_ptr_src_i, reg_tmp); + for (int i = 0; i < max_num_regs; ++i) { + vmovups(JMM(i + max_num_regs), ptr[reg_ptr_src_i]); + // sum anyway + vaddps(JMM(i), JMM(i), JMM(i + max_num_regs)); + add(reg_ptr_src_i, sizeof(float) * block); } + inc(reg_h_i); + add(reg_tmp, w_ * sizeof(float)); + cmp(reg_h_i, reg32_int_h); + jl(l_next_h, T_NEAR); } + L(l_h_done); // save right now if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { - vbroadcastss(JMM(max_num_regs), reg32_scalar); + mov(reg_tmp, reinterpret_cast(exp_float_consts)); + vmovups(JMM(max_num_regs), 
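// Scaling sketch for the save path below (h is now read at runtime from
// the attribute; the fild/fstp pair above converts the int to float):
//   kAvg  : scale = 1.0f / h
//   kSqrt : scale = 1.0f / std::sqrt(static_cast<float>(h))
// The scale is broadcast to all lanes and applied with a single vmulps
// per output block.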
ptr[reg_tmp + OFFSET_EXP_ONE]); + movd(JMM(max_num_regs + 1), reg32_fp_h); + if (type_ == SeqPoolType::kSqrt) { + vsqrtps(JMM(max_num_regs + 1), JMM(max_num_regs + 1)); + } + vdivps(JMM(max_num_regs + 2), JMM(max_num_regs), JMM(max_num_regs + 1)); + vbroadcastss(JMM(max_num_regs), + JMM(max_num_regs + 2)); // TODO(TJ): fix me } offset = w_offset; for (int i = 0; i < max_num_regs; ++i) { if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { vmulps(JMM(i), JMM(i), JMM(max_num_regs)); } - vmovups(ptr[param2 + offset], JMM(i)); + vmovups(ptr[param_dst + offset], JMM(i)); offset += sizeof(float) * block; } } @@ -97,47 +107,54 @@ class SeqPoolJitCode : public JitCode { const bool has_block4 = rest / 4 > 0; const bool has_block2 = (rest % 4) / 2 > 0; const bool has_block1 = (rest % 2) == 1; - if (h_ > 1) { - Label l_next_h; - mov(reg_h, 1); - mov(reg_tmp, param1); - add(reg_tmp, w_ * sizeof(float) + w_offset); - L(l_next_h); - { - // int used_regs =load_rest(rest, h * w_ * sizeof(float) + w_offset, - // max_num_regs); - int reg_idx = 0; - mov(reg_ptr_src_i, reg_tmp); - if (has_block4) { - vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); - add(reg_ptr_src_i, sizeof(float) * 4); - reg_idx++; - } - if (has_block2) { - vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); - add(reg_ptr_src_i, sizeof(float) * 2); - reg_idx++; - } - if (has_block1) { - vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); - reg_idx++; - } - PADDLE_ENFORCE_EQ(reg_idx, rest_used_num_regs, - "All heights should use same regs"); - for (int i = 0; i < reg_idx; ++i) { - vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs)); - } - inc(reg_h); - add(reg_tmp, w_ * sizeof(float)); - cmp(reg_h, h_); - jl(l_next_h, T_NEAR); + cmp(reg32_int_h, 1); + Label l_next_h, l_h_done; + jle(l_h_done, T_NEAR); + mov(reg_h_i, 1); + mov(reg_tmp, param_src); + add(reg_tmp, w_ * sizeof(float) + w_offset); + L(l_next_h); + { + int reg_idx = 0; + mov(reg_ptr_src_i, reg_tmp); + if (has_block4) { + vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); + add(reg_ptr_src_i, sizeof(float) * 4); + reg_idx++; + } + if (has_block2) { + vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); + add(reg_ptr_src_i, sizeof(float) * 2); + reg_idx++; + } + if (has_block1) { + vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); + reg_idx++; } + PADDLE_ENFORCE_EQ(reg_idx, rest_used_num_regs, + "All heights should use same regs"); + for (int i = 0; i < reg_idx; ++i) { + vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs)); + } + inc(reg_h_i); + add(reg_tmp, w_ * sizeof(float)); + cmp(reg_h_i, reg32_int_h); + jl(l_next_h, T_NEAR); } + L(l_h_done); // save right now if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { - vbroadcastss(xmm_t(max_num_regs - 1), reg32_scalar); + mov(reg_tmp, reinterpret_cast(exp_float_consts)); + vmovups(xmm_t(max_num_regs), ptr[reg_tmp + OFFSET_EXP_ONE]); + movd(xmm_t(max_num_regs + 1), reg32_fp_h); + if (type_ == SeqPoolType::kSqrt) { + vsqrtps(xmm_t(max_num_regs + 1), xmm_t(max_num_regs + 1)); + } + vdivps(xmm_t(max_num_regs + 2), xmm_t(max_num_regs), + xmm_t(max_num_regs + 1)); + vbroadcastss(xmm_t(max_num_regs), xmm_t(max_num_regs + 2)); for (int i = 0; i < rest_used_num_regs; ++i) { - vmulps(xmm_t(i), xmm_t(i), xmm_t(max_num_regs - 1)); + vmulps(xmm_t(i), xmm_t(i), xmm_t(max_num_regs)); } } save_rest(rest, w_offset); @@ -151,17 +168,17 @@ class SeqPoolJitCode : public JitCode { const bool has_block1 = (rest % 2) == 1; int reg_idx = reg_start; if (has_block4) { - 
vmovups(xmm_t(reg_idx + num_shift_regs), ptr[param1 + w_offset]); + vmovups(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]); w_offset += sizeof(float) * 4; reg_idx++; } if (has_block2) { - vmovq(xmm_t(reg_idx + num_shift_regs), ptr[param1 + w_offset]); + vmovq(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]); w_offset += sizeof(float) * 2; reg_idx++; } if (has_block1) { - vmovss(xmm_t(reg_idx + num_shift_regs), ptr[param1 + w_offset]); + vmovss(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]); reg_idx++; } return reg_idx; @@ -174,32 +191,33 @@ class SeqPoolJitCode : public JitCode { const bool has_block1 = (rest % 2) == 1; int reg_idx = reg_start; if (has_block4) { - vmovups(ptr[param2 + w_offset], xmm_t(reg_idx)); + vmovups(ptr[param_dst + w_offset], xmm_t(reg_idx)); w_offset += sizeof(float) * 4; reg_idx++; } if (has_block2) { - vmovq(ptr[param2 + w_offset], xmm_t(reg_idx)); + vmovq(ptr[param_dst + w_offset], xmm_t(reg_idx)); w_offset += sizeof(float) * 2; reg_idx++; } if (has_block1) { - vmovss(ptr[param2 + w_offset], xmm_t(reg_idx)); + vmovss(ptr[param_dst + w_offset], xmm_t(reg_idx)); } } private: - int h_; int w_; SeqPoolType type_; - reg64_t param1{abi_param1}; - reg64_t param2{abi_param2}; - reg64_t param3{abi_param3}; - reg32_t reg32_scalar{r8d}; + reg64_t param_src{abi_param1}; + reg64_t param_dst{abi_param2}; + reg64_t param_attr{abi_param3}; + reg64_t reg_tmp{rax}; + + reg32_t reg32_int_h{r8d}; + reg32_t reg32_fp_h{r9d}; - reg64_t reg_h{r9}; - reg64_t reg_ptr_src_i{r10}; - reg64_t reg_tmp{r11}; + reg64_t reg_h_i{r10}; + reg64_t reg_ptr_src_i{r11}; }; } // namespace gen diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 2659374650..2a7697a6f2 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -46,7 +46,7 @@ typedef enum { typedef enum { kNonePoolType = 0, - kSum, + kSum = 1, kAvg, kSqrt, } SeqPoolType; @@ -121,10 +121,10 @@ struct GRUTuples { }; typedef struct seq_pool_attr_s { - int h, w; + int h, w; // h should always be the first one SeqPoolType type; seq_pool_attr_s() = default; - explicit seq_pool_attr_s(int height, int width, SeqPoolType pool_type) + explicit seq_pool_attr_s(int width, SeqPoolType pool_type, int height = 1) : h(height), w(width), type(pool_type) {} } seq_pool_attr_t; diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc index db78ed8ad8..61de386886 100644 --- a/paddle/fluid/operators/jit/kernel_key.cc +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -45,10 +45,8 @@ size_t JitCodeKey(const gru_attr_t& attr) { template <> size_t JitCodeKey(const seq_pool_attr_t& attr) { size_t key = attr.w; - // TODO(TJ): support height, then removed it from key - constexpr int w_shift = 30; - return (key << act_type_shift) + static_cast(attr.type) + - (static_cast(attr.h) << (act_type_shift + w_shift)); + constexpr int pool_type_shift = 3; + return (key << pool_type_shift) + static_cast(attr.type); } } // namespace jit diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 4e19783c86..b4e9c8dd10 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -334,7 +334,6 @@ void NCHW16CMulNC(const T* x, const T* y, T* z, int height, int width) { template void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { - PADDLE_ENFORCE(attr->type == SeqPoolType::kSum, "Only support sum yet"); for (int w = 0; w 
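// Reference semantics of this kernel, as a standalone float sketch
// (mirrors the surrounding loop; kAvg/kSqrt only rescale the sum):
//   for (int w = 0; w < attr->w; ++w) {
//     float s = 0.f;
//     for (int h = 0; h < attr->h; ++h) s += x[h * attr->w + w];
//     y[w] = s;  // kAvg divides by h, kSqrt by sqrt(h)
//   }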
< attr->w; ++w) { const T* src = x + w; T* dst = y + w; diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 0f1776507a..5e05c71f40 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -439,9 +439,10 @@ void TestSeqPoolKernel() { // TODO(TJ): support more std::vector pool_types = {jit::SeqPoolType::kSum}; for (auto type : pool_types) { - for (int h : TestSizes()) { - for (int w : TestSizes()) { - const jit::seq_pool_attr_t attr(h, w, type); + for (int w : TestSizes()) { + jit::seq_pool_attr_t attr(w, type); + for (int h : TestSizes()) { + attr.h = h; auto ref = jit::GetRefer>(); EXPECT_TRUE(ref != nullptr); std::vector x(h * w), yref(w); diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 283e2e251a..2a47502614 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -252,14 +252,14 @@ class SequencePoolFunctor { PADDLE_ENFORCE(platform::is_cpu_place(place)); const T* src = input.data(); T* dst = output->mutable_data(place); - jit::seq_pool_attr_t attr; - attr.w = input.numel() / input.dims()[0]; - attr.type = jit::SeqPoolType::kSum; + jit::seq_pool_attr_t attr( + static_cast(input.numel() / input.dims()[0]), + jit::SeqPoolType::kSum); + auto seqpool = + jit::Get, platform::CPUPlace>( + attr); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { attr.h = static_cast(lod[i + 1] - lod[i]); - auto seqpool = - jit::Get, platform::CPUPlace>( - attr); seqpool(src, dst, &attr); dst += attr.w; src += attr.h * attr.w; From 123b98f417d064e780412f316f4ca43988f4d0d2 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 7 Jan 2019 06:07:23 +0000 Subject: [PATCH 138/197] refine heigth and codesize and support all pool test=develop --- paddle/fluid/operators/jit/benchmark.cc | 3 ++- paddle/fluid/operators/jit/gen/seqpool.cc | 27 +++++++++++----------- paddle/fluid/operators/jit/gen/seqpool.h | 28 +++++++---------------- paddle/fluid/operators/jit/test.cc | 4 ++-- 4 files changed, 26 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 4cbada4a5b..bde2791add 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -192,7 +192,8 @@ void BenchGRUKernel() { template void BenchSeqPoolKernel() { - std::vector pool_types = {jit::SeqPoolType::kSum}; + std::vector pool_types = { + jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; for (auto type : pool_types) { for (int w : TestSizes()) { jit::seq_pool_attr_t attr(w, type); diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc index d651f282bf..530d24ee1f 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.cc +++ b/paddle/fluid/operators/jit/gen/seqpool.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ #include "paddle/fluid/operators/jit/gen/seqpool.h" -#include // offsetof +#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -22,9 +22,6 @@ namespace operators { namespace jit { namespace gen { -thread_local float ALIGN32_BEG float_h[1] ALIGN32_END = { - 1.f}; // TODO(TJ): try move to private - void SeqPoolJitCode::genCode() { constexpr int block = YMM_FLOAT_BLOCK; constexpr int max_num_regs = 8; @@ -33,10 +30,17 @@ void SeqPoolJitCode::genCode() { int rest_num_regs = num_block % max_num_regs; mov(reg32_int_h, dword[param_attr]); if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { - mov(reg_tmp, reinterpret_cast(float_h)); + mov(reg_tmp, reinterpret_cast(exp_float_consts)); + vmovups(xmm_t(1), ptr[reg_tmp + OFFSET_EXP_ONE]); + mov(reg_tmp, reinterpret_cast(fp_h_)); fild(dword[param_attr]); fstp(dword[reg_tmp]); - mov(reg32_fp_h, dword[reg_tmp]); + vmovss(xmm_t(0), ptr[reg_tmp]); + if (type_ == SeqPoolType::kSqrt) { + vsqrtps(xmm_t(0), xmm_t(0)); + } + vdivps(xmm_t(1), xmm_t(1), xmm_t(0)); + vmovss(ptr[reg_tmp], xmm_t(1)); } const int group_len = max_num_regs * block * sizeof(float); for (int g = 0; g < num_groups; ++g) { @@ -45,7 +49,6 @@ void SeqPoolJitCode::genCode() { if (rest_num_regs > 0) { pool_height(num_groups * group_len, block, rest_num_regs); } - // part of rest_w * height const int rest = w_ % block; pool_height_of_rest_width(rest, (w_ - rest) * sizeof(float), max_num_regs); @@ -58,12 +61,10 @@ class SeqPoolCreator : public JitCodeCreator { return platform::MayIUse(platform::avx); } size_t CodeSize(const seq_pool_attr_t& attr) const override { - // TODO(TJ): remove attr.h when enabled height - bool yes = - attr.type == SeqPoolType::kAvg || attr.type == SeqPoolType::kSqrt; - return 96 /* basic */ + - ((attr.w / YMM_FLOAT_BLOCK + 4 /* rest */) * 2 /* for sum */ - * (attr.h + (yes ? 
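// Worked size-estimate example for the new bound below (illustrative):
// with w = 64 and YMM_FLOAT_BLOCK = 8,
//   96 + ((64 / 8 + 4) * 4 + 256) * 8 = 96 + (48 + 256) * 8 = 2528 bytes,
// and the estimate no longer scales with h now that the height is a
// runtime parameter.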
3 : 1 /*for avg or sqrt*/))) * + return 96 + + ((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) * + 4 /* load, mul and save */ + + 256) * 8; } std::unique_ptr CreateJitCode( diff --git a/paddle/fluid/operators/jit/gen/seqpool.h b/paddle/fluid/operators/jit/gen/seqpool.h index c61bf27cc1..fcbbb3c84c 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.h +++ b/paddle/fluid/operators/jit/gen/seqpool.h @@ -16,7 +16,6 @@ #include #include "glog/logging.h" -#include "paddle/fluid/operators/jit/gen/act.h" // for ones #include "paddle/fluid/operators/jit/gen/jitcode.h" #include "paddle/fluid/platform/enforce.h" @@ -31,9 +30,11 @@ class SeqPoolJitCode : public JitCode { size_t code_size = 256 * 1024, void* code_ptr = nullptr) : JitCode(code_size, code_ptr), w_(attr.w), type_(attr.type) { - if (type_ != SeqPoolType::kSum) { + if (!(type_ == SeqPoolType::kSum || type_ == SeqPoolType::kAvg || + type_ == SeqPoolType::kSqrt)) { LOG(FATAL) << "Only support sum pool yet "; } + fp_h_[0] = 1.f; this->genCode(); } @@ -82,15 +83,8 @@ class SeqPoolJitCode : public JitCode { L(l_h_done); // save right now if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { - mov(reg_tmp, reinterpret_cast(exp_float_consts)); - vmovups(JMM(max_num_regs), ptr[reg_tmp + OFFSET_EXP_ONE]); - movd(JMM(max_num_regs + 1), reg32_fp_h); - if (type_ == SeqPoolType::kSqrt) { - vsqrtps(JMM(max_num_regs + 1), JMM(max_num_regs + 1)); - } - vdivps(JMM(max_num_regs + 2), JMM(max_num_regs), JMM(max_num_regs + 1)); - vbroadcastss(JMM(max_num_regs), - JMM(max_num_regs + 2)); // TODO(TJ): fix me + mov(reg_tmp, reinterpret_cast(fp_h_)); + vbroadcastss(JMM(max_num_regs), ptr[reg_tmp]); } offset = w_offset; for (int i = 0; i < max_num_regs; ++i) { @@ -144,15 +138,8 @@ class SeqPoolJitCode : public JitCode { L(l_h_done); // save right now if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { - mov(reg_tmp, reinterpret_cast(exp_float_consts)); - vmovups(xmm_t(max_num_regs), ptr[reg_tmp + OFFSET_EXP_ONE]); - movd(xmm_t(max_num_regs + 1), reg32_fp_h); - if (type_ == SeqPoolType::kSqrt) { - vsqrtps(xmm_t(max_num_regs + 1), xmm_t(max_num_regs + 1)); - } - vdivps(xmm_t(max_num_regs + 2), xmm_t(max_num_regs), - xmm_t(max_num_regs + 1)); - vbroadcastss(xmm_t(max_num_regs), xmm_t(max_num_regs + 2)); + mov(reg_tmp, reinterpret_cast(fp_h_)); + vbroadcastss(xmm_t(max_num_regs), ptr[reg_tmp]); for (int i = 0; i < rest_used_num_regs; ++i) { vmulps(xmm_t(i), xmm_t(i), xmm_t(max_num_regs)); } @@ -206,6 +193,7 @@ class SeqPoolJitCode : public JitCode { } private: + float ALIGN32_BEG fp_h_[1] ALIGN32_END; int w_; SeqPoolType type_; reg64_t param_src{abi_param1}; diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 5e05c71f40..30291bfef3 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -436,8 +436,8 @@ void TestGRUKernel() { template void TestSeqPoolKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - // TODO(TJ): support more - std::vector pool_types = {jit::SeqPoolType::kSum}; + std::vector pool_types = { + jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; for (auto type : pool_types) { for (int w : TestSizes()) { jit::seq_pool_attr_t attr(w, type); From c09a3790151e82ac51c419ae41cfd40bd449bafb Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 7 Jan 2019 14:56:46 +0800 Subject: [PATCH 139/197] remove const_cast test=develop --- paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index 38dfae8ad6..2d60b9e96c 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -40,7 +40,7 @@ struct EmbeddingVSumFunctor { int64_t row_number = table_t->dims()[0]; int64_t row_width = table_t->dims()[1]; int64_t last_dim = output_t->dims()[1]; - int64_t *ids = const_cast(ids_t->data()); + int64_t *ids = ids_t->mutable_data(platform::CPUPlace()); auto ids_lod = ids_t->lod()[0]; int64_t ids_count = ids_t->numel() / ids_lod.back(); From 875a07c32d3e9034e6472d3eb57d16e4c1a4b15e Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Mon, 7 Jan 2019 15:23:44 +0800 Subject: [PATCH 140/197] refactor inference analysis api (#14634) --- cmake/configure.cmake | 1 + paddle/fluid/framework/naive_executor.cc | 16 +- paddle/fluid/inference/api/analysis_config.cc | 220 ++++++++++++------ .../fluid/inference/api/analysis_predictor.cc | 83 ++++--- .../api/analysis_predictor_tester.cc | 30 +-- .../fluid/inference/api/api_anakin_engine.h | 2 - paddle/fluid/inference/api/api_impl.cc | 2 +- paddle/fluid/inference/api/api_impl_tester.cc | 3 +- .../api/demo_ci/trt_mobilenet_demo.cc | 9 +- .../fluid/inference/api/demo_ci/vis_demo.cc | 13 +- .../inference/api/paddle_analysis_config.h | 109 +++++++-- .../inference/api/paddle_inference_api.h | 5 +- .../fluid/inference/api/paddle_pass_builder.h | 12 +- .../tests/api/analyzer_dam_tester.cc | 9 +- .../tests/api/analyzer_lac_tester.cc | 9 +- .../tests/api/analyzer_mm_dnn_tester.cc | 9 +- .../tests/api/analyzer_ner_tester.cc | 11 +- .../tests/api/analyzer_resnet50_tester.cc | 10 +- .../tests/api/analyzer_rnn1_tester.cc | 28 +-- .../tests/api/analyzer_rnn2_tester.cc | 10 +- .../tests/api/analyzer_seq_conv1_tester.cc | 9 +- .../tests/api/analyzer_seq_pool1_tester.cc | 9 +- .../analyzer_text_classification_tester.cc | 9 +- .../tests/api/analyzer_vis_tester.cc | 11 +- .../inference/tests/api/config_printer.h | 16 +- .../fluid/inference/tests/api/tester_helper.h | 5 +- .../inference/tests/api/trt_models_tester.cc | 24 +- 27 files changed, 418 insertions(+), 256 deletions(-) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 4ee2fdcf2d..e3d856fb30 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -134,6 +134,7 @@ if(WITH_GPU) message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF") set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." 
FORCE) endif() + add_definitions(-DWITH_ANAKIN) endif() if(WITH_ANAKIN) # NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index f1642bc0d2..86e6b1f7d9 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -40,14 +40,14 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, void NaiveExecutor::Run() { #ifndef PADDLE_ON_INFERENCE - LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the " - "cmake flag ON_INFER is not set."; - LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and " - "variables will be reused to save the allocation " - "overhead."; - LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by " - "setting the cmake flag ON_INFER=ON if you are " - "running Paddle Inference"; + LOG_FIRST_N(WARNING, 5) << "The NaiveExecutor can not work properly if the " + "cmake flag ON_INFER is not set."; + LOG_FIRST_N(WARNING, 5) << "Unlike the training phase, all the scopes and " + "variables will be reused to save the allocation " + "overhead."; + LOG_FIRST_N(WARNING, 5) << "Please re-compile the inference library by " + "setting the cmake flag ON_INFER=ON if you are " + "running Paddle Inference"; #endif // PADDLE_ON_INFERENCE for (auto &op : ops_) { VLOG(3) << std::this_thread::get_id() << " run " << op->Type() diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 6d6e799fde..211c691504 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -14,86 +14,101 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_pass_builder.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle_pass_builder.h" // NOLINT +#include "paddle/fluid/platform/gpu_info.h" namespace paddle { PassStrategy *contrib::AnalysisConfig::pass_builder() const { - PADDLE_ENFORCE( - pass_builder_.get(), - "Should call constructor first, that will init the pass_builder_."); + if (!pass_builder_.get()) { + if (use_gpu_) { + LOG(INFO) << "Create GPU IR passes"; + pass_builder_.reset(new GpuPassStrategy); + } else { + LOG(INFO) << "Create CPU IR passes"; + pass_builder_.reset(new CpuPassStrategy); + } + } else if (pass_builder_->use_gpu() ^ use_gpu()) { + LOG(WARNING) << "The use_gpu flag is not compatible between Config and " + "PassBuilder, the flags are " + << use_gpu() << " " << pass_builder_->use_gpu(); + LOG(WARNING) << "Please make them compatible, still use the existing " + "PassBuilder."; + } + return pass_builder_.get(); } -contrib::AnalysisConfig::AnalysisConfig(bool use_gpu) { - this->use_gpu = use_gpu; - if (use_gpu) { - pass_builder_.reset(new GpuPassStrategy); - } else { - pass_builder_.reset(new CpuPassStrategy); - } +contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) { + model_dir_ = model_dir; +} +contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file, + const std::string ¶ms_file) { + prog_file_ = prog_file; + params_file_ = params_file; +} +void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path, + const std::string ¶ms_file_path) { + prog_file_ = prog_file_path; + params_file_ = 
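// A typical caller of the refactored config (hypothetical paths,
// mirroring the updated demos further down):
//   contrib::AnalysisConfig cfg("/m/__model__", "/m/__params__");
//   cfg.EnableUseGpu(100, 0);  // 100 MB initial pool on device 0
//   auto predictor = CreatePaddlePredictor(cfg);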
params_file_path;
+}
+
+void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
+                                           int device_id) {
+#ifdef PADDLE_WITH_CUDA
+  use_gpu_ = true;
+  memory_pool_init_size_mb_ = memory_pool_init_size_mb;
+  device_id_ = device_id;
+#else
+  LOG(ERROR) << "Please compile with gpu to EnableGpu";
+  use_gpu_ = false;
+#endif
+}
+
+void contrib::AnalysisConfig::DisableGpu() { use_gpu_ = false; }

 contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
-  // fields from Config
-  model_dir = other.model_dir;
-  // fields from NativeConfig
-  use_gpu = other.use_gpu;
-  device = other.device;
-  fraction_of_gpu_memory = other.fraction_of_gpu_memory;
-  prog_file = other.prog_file;
-  param_file = other.param_file;
-  specify_input_name = other.specify_input_name;
-  cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
-  // fields from this.
-  enable_ir_optim = other.enable_ir_optim;
-  // For mkldnn
-  use_mkldnn_ = other.use_mkldnn_;
-  mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_;
-
-  use_feed_fetch_ops = other.use_feed_fetch_ops;
-  use_tensorrt_ = other.use_tensorrt_;
-  tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
-  tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
-  tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_;
-  model_from_memory_ = other.model_from_memory_;
-
-  if (use_gpu) {
+#define CP_MEMBER(member__) member__ = other.member__;
+
+  // Model related.
+  CP_MEMBER(model_dir_);
+  CP_MEMBER(prog_file_);
+  CP_MEMBER(params_file_);
+  CP_MEMBER(model_from_memory_);  // the memory model reuses prog_file_ and
+                                  // params_file_ fields.
+  // GPU related.
+  CP_MEMBER(use_gpu_);
+  CP_MEMBER(device_id_);
+  CP_MEMBER(memory_pool_init_size_mb_);
+  // TensorRT related.
+  CP_MEMBER(use_tensorrt_);
+  CP_MEMBER(tensorrt_workspace_size_);
+  CP_MEMBER(tensorrt_max_batchsize_);
+  CP_MEMBER(tensorrt_min_subgraph_size_);
+  // MKLDNN related.
+  CP_MEMBER(use_mkldnn_);
+  CP_MEMBER(mkldnn_enabled_op_types_);
+
+  // Ir related.
+  CP_MEMBER(enable_ir_optim_);
+  CP_MEMBER(use_feed_fetch_ops_);
+  CP_MEMBER(ir_debug_);
+  CP_MEMBER(specify_input_name_);
+
+  CP_MEMBER(cpu_math_library_num_threads_);
+
+  CP_MEMBER(serialized_info_cache_);
+
+  if (use_gpu_) {
     pass_builder_.reset(new GpuPassStrategy(
         *static_cast<GpuPassStrategy *>(other.pass_builder())));
   } else {
     pass_builder_.reset(new CpuPassStrategy(
         *static_cast<CpuPassStrategy *>(other.pass_builder())));
   }
-}
-contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
-  // fields from Config
-  model_dir = other.model_dir;
-  // fields from NativeConfig
-  use_gpu = other.use_gpu;
-  device = other.device;
-  fraction_of_gpu_memory = other.fraction_of_gpu_memory;
-  prog_file = other.prog_file;
-  param_file = other.param_file;
-  specify_input_name = other.specify_input_name;
-  cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
-  // fields from this.
-  enable_ir_optim = other.enable_ir_optim;
-  // For mkldnn
-  use_mkldnn_ = other.use_mkldnn_;
-  mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_;
-
-  use_feed_fetch_ops = other.use_feed_fetch_ops;
-  use_tensorrt_ = other.use_tensorrt_;
-  tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
-  tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
-  tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_;
-  model_from_memory_ = other.model_from_memory_;
-
-  pass_builder_ = std::move(other.pass_builder_);
+#undef CP_MEMBER
 }

 void contrib::AnalysisConfig::EnableMKLDNN() {
@@ -112,17 +127,90 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
   use_tensorrt_ = true;
   tensorrt_workspace_size_ = workspace_size;
   tensorrt_max_batchsize_ = max_batch_size;
-  tensorrt_min_subgraph_size_ = min_subgraph_size;
-  // Append after the conv+affine_channel fuse pass.
-  pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
+}
+
+void contrib::AnalysisConfig::Update() {
+  auto info = SerializeInfoCache();
+  if (info == serialized_info_cache_) return;
+
+  if (use_gpu_) {
+    pass_builder_.reset(new GpuPassStrategy);
+  } else {
+    pass_builder_.reset(new CpuPassStrategy);
+  }
+
+  if (use_tensorrt_) {
+    if (!use_gpu_) {
+      LOG(ERROR)
+          << "TensorRT engine is not available when EnableUseGpu() is not "
+             "activated.";
+    } else {
+      // Append after the infer_clean pass.
+      pass_builder()->InsertPass(1, "tensorrt_subgraph_pass");
+    }
+  }
+
+  if (use_mkldnn_) {
+    if (!enable_ir_optim_) {
+      LOG(ERROR)
+          << "EnableMKLDNN() only works when IR optimization is enabled.";
+    }
+#ifdef PADDLE_WITH_MKLDNN
+    pass_builder()->EnableMKLDNN();
+    use_mkldnn_ = true;
+#else
+    LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
+    use_mkldnn_ = false;
+#endif
+  }
+
+  if (ir_debug_) {
+    pass_builder()->TurnOnDebug();
+  }
+}
+
+std::string contrib::AnalysisConfig::SerializeInfoCache() {
+  std::stringstream ss;
+  ss << use_gpu_;
+  ss << memory_pool_init_size_mb_;
+
+  ss << use_tensorrt_;
+  ss << tensorrt_workspace_size_;
+  ss << tensorrt_max_batchsize_;
+
+  ss << use_mkldnn_;
+  ss << enable_ir_optim_;
+  ss << use_feed_fetch_ops_;
+  ss << ir_debug_;
+
+  return ss.str();
+}
+
+void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads(
+    int cpu_math_library_num_threads) {
+  cpu_math_library_num_threads_ = cpu_math_library_num_threads;
+}
+
+float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
+#ifdef PADDLE_WITH_CUDA
+  // Get the GPU memory details and calculate the fraction of memory for the
+  // GPU memory pool.
+  size_t gpu_used, gpu_available;
+  platform::GpuMemoryUsage(&gpu_used, &gpu_available);
+  double total_gpu_memory = (gpu_used + gpu_available) / 1024.
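// Worked example (illustrative numbers): on a device reporting 16384 MB
// total and the default memory_pool_init_size_mb_ of 100, the fraction
// is 100 / 16384 ~= 0.0061, which is what later feeds the
// --fraction_of_gpu_memory_to_use flag in CreatePaddlePredictor.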
/ 1024.; + float fraction_of_gpu_memory = + static_cast(memory_pool_init_size_mb()) / total_gpu_memory; + return fraction_of_gpu_memory; +#else + return 0.; +#endif } void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, size_t prog_buffer_size, const char *param_buffer, size_t param_buffer_size) { - prog_file = std::string(prog_buffer, prog_buffer + prog_buffer_size); - param_file = std::string(param_buffer, param_buffer + param_buffer_size); + prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size); + params_file_ = std::string(param_buffer, param_buffer + param_buffer_size); model_from_memory_ = true; } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3aaec10ee2..585634fae9 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -33,6 +33,7 @@ #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(profile); @@ -59,8 +60,8 @@ bool AnalysisPredictor::Init( if (FLAGS_profile) { LOG(WARNING) << "Profiler is actived, might affect the performance"; LOG(INFO) << "You can turn off by set gflags '-profile false'"; - auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll - : platform::ProfilerState::kCPU; + auto tracking_device = config_.use_gpu() ? platform::ProfilerState::kAll + : platform::ProfilerState::kCPU; platform::EnableProfiler(tracking_device); } @@ -112,7 +113,7 @@ bool AnalysisPredictor::PrepareProgram( // Optimize the program, and load parameters and modify them in the // scope_. // This will change the scope_ address. 
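// Summary of the mechanical renames applied below (old public field ->
// new accessor/mutator):
//   enable_ir_optim       -> ir_optim() / SwitchIrOptim()
//   use_gpu, device       -> use_gpu(), gpu_device_id() / EnableUseGpu()
//   prog_file, param_file -> prog_file(), params_file() / SetModel()
//   use_feed_fetch_ops    -> use_feed_fetch_ops_enabled() / SwitchUseFeedFetchOps()
//   specify_input_name    -> specify_input_name() / SwitchSpecifyInputNames()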
- if (config_.enable_ir_optim) { + if (config_.ir_optim()) { status_ir_optim_enabled_ = true; OptimizeInferenceProgram(); } else { @@ -140,9 +141,9 @@ bool AnalysisPredictor::PrepareProgram( return true; } bool AnalysisPredictor::CreateExecutor() { - if (config_.use_gpu) { + if (config_.use_gpu_) { status_use_gpu_ = true; - place_ = paddle::platform::CUDAPlace(config_.device); + place_ = paddle::platform::CUDAPlace(config_.device_id_); } else { place_ = paddle::platform::CPUPlace(); } @@ -151,7 +152,7 @@ bool AnalysisPredictor::CreateExecutor() { } bool AnalysisPredictor::PrepareExecutor() { executor_->Prepare(sub_scope_, *inference_program_, 0, - config_.use_feed_fetch_ops); + config_.use_feed_fetch_ops_); PADDLE_ENFORCE_NOT_NULL(sub_scope_); @@ -250,7 +251,7 @@ bool AnalysisPredictor::SetFeed(const std::vector &inputs, } input.set_lod(lod); int idx = -1; - if (config_.specify_input_name) { + if (config_.specify_input_name_) { auto name = inputs[i].name; if (feed_names_.find(name) == feed_names_.end()) { LOG(ERROR) << "feed names from program do not have name: [" << name @@ -314,22 +315,22 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, void AnalysisPredictor::OptimizeInferenceProgram() { status_program_optimized_ = true; - argument_.SetUseGPU(config_.use_gpu); - argument_.SetGPUDeviceId(config_.device); + argument_.SetUseGPU(config_.use_gpu()); + argument_.SetGPUDeviceId(config_.gpu_device_id()); argument_.SetModelFromMemory(config_.model_from_memory_); // Analyze inference_program - if (!config_.model_dir.empty()) { - argument_.SetModelDir(config_.model_dir); + if (!config_.model_dir().empty()) { + argument_.SetModelDir(config_.model_dir()); } else { PADDLE_ENFORCE( - !config_.param_file.empty(), + !config_.params_file().empty(), "Either model_dir or (param_file, prog_file) should be set."); - PADDLE_ENFORCE(!config_.prog_file.empty()); - argument_.SetModelProgramPath(config_.prog_file); - argument_.SetModelParamsPath(config_.param_file); + PADDLE_ENFORCE(!config_.prog_file().empty()); + argument_.SetModelProgramPath(config_.prog_file()); + argument_.SetModelParamsPath(config_.params_file()); } - if (config_.use_gpu && config_.use_tensorrt_) { + if (config_.use_gpu() && config_.tensorrt_engine_enabled()) { argument_.SetUseTensorRT(true); argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); @@ -341,7 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } auto passes = config_.pass_builder()->AllPasses(); - if (!config_.enable_ir_optim) passes.clear(); + if (!config_.ir_optim()) passes.clear(); argument_.SetIrAnalysisPasses(passes); argument_.SetScopeNotOwned(const_cast(scope_.get())); Analyzer().Run(&argument_); @@ -358,18 +359,26 @@ template <> std::unique_ptr CreatePaddlePredictor< AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { VLOG(3) << "create AnalysisConfig"; - if (config.use_gpu) { + if (config.use_gpu()) { // 1. 
GPU memory
-    PADDLE_ENFORCE_GT(
-        config.fraction_of_gpu_memory, 0.f,
-        "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
-    PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
+    PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f);
+    PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
+                      config.gpu_device_id());
     std::vector<std::string> flags;
-    if (config.fraction_of_gpu_memory >= 0.0f ||
-        config.fraction_of_gpu_memory <= 0.95f) {
+
+    float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool();
+    if (fraction_of_gpu_memory > 0.95f) {
+      LOG(ERROR)
+          << "Allocate too much memory for the GPU memory pool, assigned "
+          << config.memory_pool_init_size_mb() << " MB";
+      LOG(ERROR)
+          << "Try to shrink the value by setting "
+             "AnalysisConfig::EnableUseGpu(...)";
+    }
+
+    if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) {
       flags.push_back("dummy");
       std::string flag = "--fraction_of_gpu_memory_to_use=" +
-                         std::to_string(config.fraction_of_gpu_memory);
+                         std::to_string(fraction_of_gpu_memory);
       flags.push_back(flag);
       VLOG(3) << "set flag: " << flag;
       framework::InitGflags(flags);
@@ -443,22 +452,22 @@ bool AnalysisPredictor::ZeroCopyRun() {
 bool AnalysisPredictor::LoadProgramDesc() {
   // Initialize the inference program
   std::string filename;
-  if (!config_.model_dir.empty()) {
-    filename = config_.model_dir + "/__model__";
-  } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
+  if (!config_.model_dir().empty()) {
+    filename = config_.model_dir() + "/__model__";
+  } else if (!config_.prog_file().empty() && !config_.params_file().empty()) {
     // All parameters are saved in a single file.
    // The file names should be consistent with that used
    // in Python API `fluid.io.save_inference_model`.
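// Path-resolution sketch (hypothetical paths): SetModel("/m") makes the
// loader read "/m/__model__" plus one file per parameter, while
// SetModel("/m/model", "/m/params") reads the two combined files written
// by fluid.io.save_inference_model.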
- filename = config_.prog_file; + filename = config_.prog_file(); } else { - if (config_.model_dir.empty() && config_.prog_file.empty()) { + if (config_.model_dir().empty() && config_.prog_file().empty()) { LOG(ERROR) << "Either model_dir or (prog_file, param_file) should be set."; return false; } LOG(ERROR) << string::Sprintf( - "not valid model path '%s' or program path '%s'.", config_.model_dir, - config_.param_file); + "not valid model path '%s' or program path '%s'.", config_.model_dir(), + config_.params_file()); return false; } @@ -478,7 +487,7 @@ bool AnalysisPredictor::LoadProgramDesc() { proto.ParseFromString(pb_content); } else { - proto.ParseFromString(config_.prog_file); + proto.ParseFromString(config_.prog_file()); } inference_program_.reset(new framework::ProgramDesc(proto)); return true; @@ -508,27 +517,27 @@ bool AnalysisPredictor::LoadParameters() { new_var->SetLoDLevel(var->GetLoDLevel()); new_var->SetPersistable(true); - if (!config_.param_file.empty()) { + if (!config_.params_file().empty()) { params.push_back(new_var->Name()); } else { // append_op framework::OpDesc *op = load_block->AppendOp(); op->SetType("load"); op->SetOutput("Out", {new_var->Name()}); - op->SetAttr("file_path", {config_.model_dir + "/" + new_var->Name()}); + op->SetAttr("file_path", {config_.model_dir() + "/" + new_var->Name()}); op->CheckAttrs(); } } } - if (!config_.param_file.empty()) { + if (!config_.params_file().empty()) { // sort paramlist to have consistent ordering std::sort(params.begin(), params.end()); // append just the load_combine op framework::OpDesc *op = load_block->AppendOp(); op->SetType("load_combine"); op->SetOutput("Out", params); - op->SetAttr("file_path", {config_.param_file}); + op->SetAttr("file_path", {config_.params_file()}); op->CheckAttrs(); } diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index a361b34437..6169e60541 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -25,9 +25,9 @@ namespace paddle { using contrib::AnalysisConfig; TEST(AnalysisPredictor, analysis_off) { - AnalysisConfig config(false); - config.model_dir = FLAGS_dirname; - config.enable_ir_optim = false; + AnalysisConfig config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(false); auto _predictor = CreatePaddlePredictor(config); auto* predictor = static_cast(_predictor.get()); @@ -55,14 +55,14 @@ TEST(AnalysisPredictor, analysis_off) { } TEST(AnalysisPredictor, analysis_on) { + AnalysisConfig config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(true); #ifdef PADDLE_WITH_CUDA - AnalysisConfig config(true); - config.fraction_of_gpu_memory = 0.15; + config.EnableUseGpu(100, 0); #else - AnalysisConfig config; + config.DisableGpu(); #endif - config.model_dir = FLAGS_dirname; - config.enable_ir_optim = true; auto _predictor = CreatePaddlePredictor(config); auto* predictor = static_cast(_predictor.get()); @@ -89,7 +89,8 @@ TEST(AnalysisPredictor, analysis_on) { } // compare with NativePredictor - auto naive_predictor = CreatePaddlePredictor(config); + auto naive_predictor = + CreatePaddlePredictor(config.ToNativeConfig()); std::vector naive_outputs; ASSERT_TRUE(naive_predictor->Run(inputs, &naive_outputs)); ASSERT_EQ(naive_outputs.size(), 1UL); @@ -98,9 +99,8 @@ TEST(AnalysisPredictor, analysis_on) { TEST(AnalysisPredictor, ZeroCopy) { AnalysisConfig config; - config.model_dir = FLAGS_dirname; - config.use_feed_fetch_ops = 
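// The zero-copy flow this test exercises, condensed (tensor names come
// from the word2vec test model; shapes are illustrative):
//   config.SwitchUseFeedFetchOps(false);   // required for zero-copy
//   auto predictor = CreatePaddlePredictor(config);
//   auto w0 = predictor->GetInputTensor("firstw");
//   w0->Reshape({4, 1});
//   auto* d = w0->mutable_data<int64_t>(PaddlePlace::kCPU);
//   // ... fill d, then:
//   predictor->ZeroCopyRun();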
false; - + config.SetModel(FLAGS_dirname); + config.SwitchUseFeedFetchOps(false); auto predictor = CreatePaddlePredictor(config); auto w0 = predictor->GetInputTensor("firstw"); @@ -137,9 +137,9 @@ TEST(AnalysisPredictor, ZeroCopy) { TEST(AnalysisPredictor, Clone) { AnalysisConfig config; - config.model_dir = FLAGS_dirname; - config.use_feed_fetch_ops = true; - config.enable_ir_optim = true; + config.SetModel(FLAGS_dirname); + config.SwitchUseFeedFetchOps(true); + config.SwitchIrOptim(true); std::vector> predictors; predictors.emplace_back(CreatePaddlePredictor(config)); diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h index 6a8b81cc57..e14d93de2c 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -19,8 +19,6 @@ limitations under the License. */ #pragma once -#define WITH_ANAKIN - #include #include "framework/core/net/net.h" diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 102147a493..85e250aaaf 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -288,7 +288,7 @@ std::unique_ptr CreatePaddlePredictor< VLOG(3) << "create NativePaddlePredictor"; if (config.use_gpu) { // 1. GPU memeroy - PADDLE_ENFORCE_GT( + PADDLE_ENFORCE_GE( config.fraction_of_gpu_memory, 0.f, "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 7839639739..54895679ca 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -295,7 +295,8 @@ TEST(inference_api_native, image_classification_gpu) { #endif TEST(PassBuilder, Delete) { - contrib::AnalysisConfig config(false); + contrib::AnalysisConfig config; + config.DisableGpu(); config.pass_builder()->DeletePass("attention_lstm_fuse_pass"); const auto& passes = config.pass_builder()->AllPasses(); auto it = std::find(passes.begin(), passes.end(), "attention_lstm_fuse_pass"); diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc index 61ecd7bce6..30215e480f 100644 --- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -36,12 +36,11 @@ namespace demo { */ void Main() { std::unique_ptr predictor; - paddle::contrib::AnalysisConfig config(true); - config.param_file = FLAGS_modeldir + "/__params__"; - config.prog_file = FLAGS_modeldir + "/__model__"; - config.device = 0; + paddle::contrib::AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.SetModel(FLAGS_modeldir + "/__params__", + FLAGS_modeldir + "/__model__"); config.EnableTensorRtEngine(); - config.fraction_of_gpu_memory = 0.1; // set by yourself predictor = CreatePaddlePredictor(config); VLOG(3) << "begin to process data"; diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index bc8891455d..5320992b7e 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -40,15 +40,14 @@ using contrib::AnalysisConfig; */ void Main(bool use_gpu) { std::unique_ptr predictor, analysis_predictor; - AnalysisConfig config(use_gpu); - config.param_file = FLAGS_modeldir + "/__params__"; - 
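// EnableTensorRtEngine defaults to (workspace_size = 1 << 20,
// max_batch_size = 1, min_subgraph_size = 3) per the header change
// below; a caller tuning it might write, e.g. (illustrative values):
//   config.EnableTensorRtEngine(1 << 30, 16);
// (Note SetModel's parameter order is (prog_file, params_file); the call
// below passes __params__ first, which looks swapped relative to the
// header.)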
config.prog_file = FLAGS_modeldir + "/__model__"; - config.device = 0; - if (FLAGS_use_gpu) { - config.fraction_of_gpu_memory = 0.1; // set by yourself + AnalysisConfig config; + if (use_gpu) { + config.EnableUseGpu(100, 0); } + config.SetModel(FLAGS_modeldir + "/__model__", + FLAGS_modeldir + "/__params__"); - predictor = CreatePaddlePredictor(config); + predictor = CreatePaddlePredictor(config.ToNativeConfig()); analysis_predictor = CreatePaddlePredictor(config); // Just a single batch of data. diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index e7ccea6587..2d61098f93 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -34,26 +34,67 @@ class AnalysisPredictor; namespace contrib { // NOTE WIP, not stable yet. -struct AnalysisConfig : public NativeConfig { - explicit AnalysisConfig(bool use_gpu = false); +struct AnalysisConfig { + AnalysisConfig() = default; explicit AnalysisConfig(const AnalysisConfig& other); - explicit AnalysisConfig(AnalysisConfig&& other); + explicit AnalysisConfig(const std::string& model_dir); + explicit AnalysisConfig(const std::string& prog_file, + const std::string& params_file); + + // Model path related. + void SetModel(const std::string& model_dir) { model_dir_ = model_dir; } + void SetModel(const std::string& prog_file_path, + const std::string& params_file_path); + void SetProgFile(const std::string& x) { prog_file_ = x; } + void SetParamsFile(const std::string& x) { params_file_ = x; } + const std::string& model_dir() const { return model_dir_; } + const std::string& prog_file() const { return prog_file_; } + const std::string& params_file() const { return params_file_; } + + // GPU related. + void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0); + void DisableGpu(); + bool use_gpu() const { return use_gpu_; } + int gpu_device_id() const { return device_id_; } + int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; } + float fraction_of_gpu_memory_for_pool() const; // Determine whether to perform graph optimization. - bool enable_ir_optim = true; + void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; } + bool ir_optim() const { return enable_ir_optim_; } - // Get a pass builder for customize the passes in IR analysis phase. - PassStrategy* pass_builder() const; + void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; } + bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; } - // NOT stable yet. - bool use_feed_fetch_ops{true}; + void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; } + bool specify_input_name() const { return specify_input_name_; } void EnableTensorRtEngine(int workspace_size = 1 << 20, int max_batch_size = 1, int min_subgraph_size = 3); - bool use_tensorrt() const { return use_tensorrt_; } + bool tensorrt_engine_enabled() const { return use_tensorrt_; } + + void SwitchIrDebug(int x = true) { ir_debug_ = x; } void EnableMKLDNN(); - bool use_mkldnn() const { return use_mkldnn_; } + bool mkldnn_enabled() const { return use_mkldnn_; } + + // Set and get the number of cpu math library threads. 
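// ToNativeConfig() (declared just below) lets one AnalysisConfig drive
// the legacy engine too; the updated tests use it as:
//   auto native = CreatePaddlePredictor(cfg.ToNativeConfig());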
+ void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads); + int cpu_math_library_num_threads() const { + return cpu_math_library_num_threads_; + } + + NativeConfig ToNativeConfig() const { + NativeConfig config; + config.model_dir = model_dir_; + config.prog_file = prog_file_; + config.param_file = params_file_; + config.use_gpu = use_gpu_; + config.device = device_id_; + config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool(); + config.specify_input_name = specify_input_name_; + return config; + } void SetMKLDNNOp(std::unordered_set op_list) { mkldnn_enabled_op_types_ = op_list; } @@ -65,10 +106,29 @@ struct AnalysisConfig : public NativeConfig { friend class ::paddle::AnalysisPredictor; + // NOTE just for developer, not an official API, easily to be broken. + // Get a pass builder for customize the passes in IR analysis phase. + PassStrategy* pass_builder() const; + + protected: + // Update the config. + void Update(); + + std::string SerializeInfoCache(); + protected: + // Model pathes. + std::string model_dir_; + std::string prog_file_; + std::string params_file_; + + // GPU releated. + bool use_gpu_{false}; + int device_id_{0}; + uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. + + // TensorRT releated. bool use_tensorrt_{false}; - bool use_mkldnn_{false}; - std::unordered_set mkldnn_enabled_op_types_; // For workspace_size, refer it from here: // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting int tensorrt_workspace_size_; @@ -82,17 +142,24 @@ struct AnalysisConfig : public NativeConfig { // We set this variable to control the minimum number of nodes in the // subgraph, 3 as default value. int tensorrt_min_subgraph_size_{3}; - std::unique_ptr pass_builder_; + + bool use_mkldnn_{false}; + std::unordered_set mkldnn_enabled_op_types_; + bool model_from_memory_{false}; -}; -// Configurations for Anakin engine. -struct AnakinConfig : public PaddlePredictor::Config { - enum TargetType { NVGPU = 0, X86 }; - int device; - std::string model_file; - int max_batch_size{-1}; - TargetType target_type; + bool enable_ir_optim_{true}; + bool use_feed_fetch_ops_{true}; + bool ir_debug_{false}; + + bool specify_input_name_{false}; + + int cpu_math_library_num_threads_{1}; + + // A runtime cache, shouldn't be transferred to others. + std::string serialized_info_cache_; + + mutable std::unique_ptr pass_builder_; }; } // namespace contrib diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 92fb51d647..1785bd520a 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -26,9 +26,8 @@ limitations under the License. */ #include #include -#include "paddle_api.h" // NOLINT -#ifndef WITH_ANAKIN #include "paddle_analysis_config.h" // NOLINT -#else +#include "paddle_api.h" // NOLINT +#ifdef WITH_ANAKIN #include "paddle_anakin_config.h" // NOLINT #endif diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 1062ac5f58..b4cbc40e0f 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -62,7 +62,12 @@ class PassStrategy : public PaddlePassBuilder { // still some CPU kernels running in CPU mode. 
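// Customizing the pass list through the strategy (not an official API
// yet, as paddle_analysis_config.h notes); e.g. api_impl_tester.cc does:
//   config.pass_builder()->DeletePass("attention_lstm_fuse_pass");
// and several analyzer tests call pass_builder()->TurnOnDebug().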
virtual void EnableMKLDNN() = 0; + bool use_gpu() const { return use_gpu_; } + virtual ~PassStrategy() = default; + + protected: + bool use_gpu_{false}; }; /* @@ -88,6 +93,7 @@ class CpuPassStrategy : public PassStrategy { "conv_eltwiseadd_bn_fuse_pass", // "is_test_pass", // }); + use_gpu_ = false; } virtual ~CpuPassStrategy() = default; @@ -126,10 +132,14 @@ class GpuPassStrategy : public PassStrategy { "conv_elementwise_add2_act_fuse_pass", // "conv_elementwise_add_fuse_pass", // }); + + use_gpu_ = true; } GpuPassStrategy(const GpuPassStrategy &other) - : PassStrategy(other.AllPasses()) {} + : PassStrategy(other.AllPasses()) { + use_gpu_ = true; + } void EnableMKLDNN() override; diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index 12d61d06ce..5ad6e4a857 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -165,12 +165,9 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } void SetConfig(contrib::AnalysisConfig *cfg) { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(true); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 2213971c17..b9666e01ad 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -105,11 +105,10 @@ void GetOneBatch(std::vector *input_slots, DataRecord *data, } void SetConfig(AnalysisConfig *cfg) { - cfg->model_dir = FLAGS_infer_model; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc index 9d3c751943..1318fbcbc4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc @@ -76,11 +76,10 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } void SetConfig(contrib::AnalysisConfig *cfg) { - cfg->model_dir = FLAGS_infer_model; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 04f8b3ffe8..6fef79dc46 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -84,13 +84,12 @@ void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) { cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0], buffer_param.size()); } else { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; + 
cfg->SetModel(FLAGS_infer_model + "/__model__", + FLAGS_infer_model + "/param"); } - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc index 764ae5ed85..629981d565 100644 --- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -21,12 +21,10 @@ namespace inference { namespace analysis { void SetConfig(AnalysisConfig *cfg) { - cfg->param_file = FLAGS_infer_model + "/params"; - cfg->prog_file = FLAGS_infer_model + "/model"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->enable_ir_optim = true; - cfg->specify_input_name = true; + cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); + cfg->DisableGpu(); + cfg->SwitchIrOptim(); + cfg->SwitchSpecifyInputNames(); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); } diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 17f4587a50..3c52afbfb8 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -204,12 +204,10 @@ void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor, } void SetConfig(AnalysisConfig *cfg) { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { @@ -225,10 +223,10 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. TEST(Analyzer_rnn1, profile) { - contrib::AnalysisConfig cfg(false); + contrib::AnalysisConfig cfg; SetConfig(&cfg); - cfg.fraction_of_gpu_memory = 0.1; - cfg.pass_builder()->TurnOnDebug(); + cfg.DisableGpu(); + cfg.SwitchIrDebug(); std::vector outputs; std::vector> input_slots_all; @@ -293,16 +291,18 @@ TEST(Analyzer_rnn1, multi_thread) { TEST(Analyzer_rnn1, ZeroCopy) { AnalysisConfig config; SetConfig(&config); - config.use_feed_fetch_ops = false; + config.SwitchUseFeedFetchOps(false); PaddlePlace place; auto predictor = CreatePaddlePredictor(config); - config.use_feed_fetch_ops = true; - auto native_predictor = CreatePaddlePredictor(config); + config.SwitchUseFeedFetchOps(true); + auto native_predictor = + CreatePaddlePredictor(config.ToNativeConfig()); - config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch. + config.SwitchUseFeedFetchOps( + true); // the analysis predictor needs feed/fetch. 
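The testers above all migrate to the same fluent configuration surface. A condensed, illustrative sketch of that surface, with placeholder paths; every call shown is taken from the hunks in this patch:

    #include <string>
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    paddle::contrib::AnalysisConfig MakeCpuConfig(const std::string &model_dir) {
      paddle::contrib::AnalysisConfig cfg;
      cfg.SetModel(model_dir);        // or SetModel(prog_file, params_file)
      cfg.DisableGpu();               // replaces use_gpu = false / device = 0
      cfg.SwitchSpecifyInputNames();  // replaces specify_input_name = true
      cfg.SwitchIrOptim();            // replaces enable_ir_optim = true
      return cfg;
    }
    // A native baseline can still be derived from the same object:
    //   auto native = MakeCpuConfig(dir).ToNativeConfig();
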
auto analysis_predictor = CreatePaddlePredictor(config); #define NEW_TENSOR(name__) \ @@ -362,7 +362,7 @@ TEST(Analyzer_rnn1, ZeroCopy) { TEST(Analyzer_rnn1, ZeroCopyMultiThread) { AnalysisConfig config; SetConfig(&config); - config.use_feed_fetch_ops = false; + config.SwitchUseFeedFetchOps(false); #define NEW_TENSOR(name__) \ auto name__##_tensor = predictor->GetInputTensor(#name__); diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc index f8354e7687..007f9f0b66 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc @@ -105,12 +105,10 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } void SetConfig(AnalysisConfig *cfg) { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index e6d6cd2960..47c1d73758 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -89,11 +89,10 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } void SetConfig(AnalysisConfig *cfg) { - cfg->model_dir = FLAGS_infer_model; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index 1c251e0c22..a1742f6068 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -122,12 +122,9 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data) { } void SetConfig(AnalysisConfig *cfg) { - cfg->param_file = FLAGS_infer_model + "/params"; - cfg->prog_file = FLAGS_infer_model + "/model"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->enable_ir_optim = true; - cfg->specify_input_name = true; + cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); cfg->pass_builder()->TurnOnDebug(); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); } diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 79f3c81ade..7b448a3200 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -47,11 +47,10 @@ struct DataReader { }; void SetConfig(AnalysisConfig *cfg) { - cfg->model_dir = FLAGS_infer_model; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> 
*inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index d73bccefd5..5a77b53a85 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -51,12 +51,11 @@ Record ProcessALine(const std::string &line) { } void SetConfig(AnalysisConfig *cfg) { - cfg->param_file = FLAGS_infer_model + "/__params__"; - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->enable_ir_optim = true; - cfg->specify_input_name = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", + FLAGS_infer_model + "/__params__"); + cfg->DisableGpu(); + cfg->SwitchIrDebug(); + cfg->SwitchSpecifyInputNames(); // TODO(TJ): fix fusion gru cfg->pass_builder()->DeletePass("fc_gru_fuse_pass"); } diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index 7046bce303..cf0f1d5c18 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -64,19 +64,23 @@ std::ostream &operator<<(std::ostream &os, num_spaces++; os << *reinterpret_cast(&config); if (!config.model_from_memory()) { - os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n"; - os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n"; + os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file() << "\n"; + os << GenSpaces(num_spaces) << "param_file: " << config.params_file() + << "\n"; } else { os << GenSpaces(num_spaces) << "prog_file and param_file: load from memory \n"; } - os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim + os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim() << "\n"; + os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim() + << "\n"; + os << GenSpaces(num_spaces) + << "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n"; os << GenSpaces(num_spaces) - << "use_feed_fetch_ops: " << config.use_feed_fetch_ops << "\n"; - os << GenSpaces(num_spaces) << "use_tensorrt: " << config.use_tensorrt() + << "use_tensorrt: " << config.tensorrt_engine_enabled() << "\n"; + os << GenSpaces(num_spaces) << "use_mkldnn: " << config.mkldnn_enabled() << "\n"; - os << GenSpaces(num_spaces) << "use_mkldnn: " << config.use_mkldnn() << "\n"; num_spaces--; os << GenSpaces(num_spaces) << "}\n"; return os; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 7eb44d9f4e..41d033df85 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -328,7 +328,10 @@ void CompareNativeAndAnalysis( const std::vector> &inputs) { PrintConfig(config, true); std::vector native_outputs, analysis_outputs; - TestOneThreadPrediction(config, inputs, &native_outputs, false); + const auto *analysis_config = + reinterpret_cast(config); + auto native_config = analysis_config->ToNativeConfig(); + TestOneThreadPrediction(&native_config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &analysis_outputs, true); CompareResult(analysis_outputs, native_outputs); } diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index d3bd035c1c..21df6eab81 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ 
b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -46,22 +46,20 @@ void SetConfig(contrib::AnalysisConfig* config, std::string model_dir, bool use_gpu, bool use_tensorrt, int batch_size) { if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { - config->prog_file = model_dir + "/" + FLAGS_prog_filename; - config->param_file = model_dir + "/" + FLAGS_param_filename; + config->SetModel(model_dir + "/" + FLAGS_prog_filename, + model_dir + "/" + FLAGS_param_filename); } else { - config->model_dir = model_dir; + config->SetModel(model_dir); } if (use_gpu) { - config->use_gpu = true; - config->device = 0; - config->fraction_of_gpu_memory = 0.15; + config->EnableUseGpu(100, 0); if (use_tensorrt) { config->EnableTensorRtEngine(1 << 10, batch_size); config->pass_builder()->DeletePass("conv_bn_fuse_pass"); config->pass_builder()->DeletePass("fc_fuse_pass"); config->pass_builder()->TurnOnDebug(); } else { - config->enable_ir_optim = true; + config->SwitchIrOptim(); } } } @@ -77,7 +75,8 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { std::vector outputs; if (use_analysis || use_tensorrt) { - contrib::AnalysisConfig config(true); + contrib::AnalysisConfig config; + config.EnableUseGpu(100, 0); config.pass_builder()->TurnOnDebug(); SetConfig(&config, model_dir, true, use_tensorrt, FLAGS_batch_size); @@ -109,7 +108,8 @@ void compare(std::string model_dir, bool use_tensorrt) { &native_outputs, false); std::vector analysis_outputs; - contrib::AnalysisConfig analysis_config(true); + contrib::AnalysisConfig analysis_config; + analysis_config.EnableUseGpu(50, 0); SetConfig(&analysis_config, model_dir, true, use_tensorrt, FLAGS_batch_size); TestOneThreadPrediction( @@ -154,9 +154,9 @@ TEST(TensorRT_mobilenet, analysis) { TEST(AnalysisPredictor, use_gpu) { std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; - AnalysisConfig config(true); - config.model_dir = model_dir; - config.fraction_of_gpu_memory = 0.15; + AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.SetModel(model_dir); config.pass_builder()->TurnOnDebug(); std::vector> inputs_all; From eabb2105fae03db056dd85e50bf4e959417f4c63 Mon Sep 17 00:00:00 2001 From: chengduo Date: Mon, 7 Jan 2019 02:11:01 -0600 Subject: [PATCH 141/197] Refactor MultiDevSSAGraphBuilder (#15090) * Refactor ParallelExecutor test=develop * extract Reduce and AllReduce mode from MultiDevSSAGraphBuilder test=develop * Refactor MultiDevSSAGraphBuilder test=developt * Remove enable_data_balance test=develop * code refine test=develop * remove data balance test=develop * refine ScaleLossGradOp test=develop * remove uncessary file test=develop * code refine test=develop * modify function name test=develop * follow comments test=develop * add is_distribution field test=develop * set is_distribution test=develop * fix DistSSAGraphBuilder test=develop --- .../fluid/framework/details/build_strategy.cc | 54 +- .../fluid/framework/details/build_strategy.h | 8 +- .../details/multi_devices_graph_check_pass.cc | 104 ++- .../details/multi_devices_graph_check_pass.h | 38 - .../details/multi_devices_graph_pass.cc | 864 ++++++++++-------- .../details/multi_devices_graph_pass.h | 144 ++- paddle/fluid/pybind/pybind.cc | 11 +- python/paddle/fluid/parallel_executor.py | 14 + .../tests/unittests/test_reader_reset.py | 2 - 9 files changed, 701 insertions(+), 538 deletions(-) delete mode 100644 paddle/fluid/framework/details/multi_devices_graph_check_pass.h diff --git a/paddle/fluid/framework/details/build_strategy.cc 
b/paddle/fluid/framework/details/build_strategy.cc index 43c2eb7178..a68b69e026 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/details/memory_reuse_types.h" -#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" +#include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/sequential_execution_pass.h" @@ -86,10 +86,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { if (strategy.memory_optimize_) { auto analysis_var_pass = AppendPass("analysis_var_pass"); } - // Convert graph to run on multi-devices. - auto multi_devices_pass = AppendPass("multi_devices_pass"); - multi_devices_pass->SetNotOwned("strategy", - &strategy_); + + AppendMultiDevPass(strategy); // Add a graph print pass to record a graph with device info. if (!strategy_.debug_graphviz_path_.empty()) { @@ -115,6 +113,25 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { } } + // Convert graph to run on multi-devices. + void AppendMultiDevPass(const BuildStrategy &strategy) { + ir::Pass *multi_devices_pass; + if (strategy_.is_distribution_) { + multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); + } else { + if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + multi_devices_pass = + AppendPass("allreduce_mode_multi_devices_pass").get(); + } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { + multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get(); + } else { + PADDLE_THROW("Unknown reduce strategy."); + } + } + multi_devices_pass->SetNotOwned("strategy", + &strategy_); + } + private: BuildStrategy strategy_; }; @@ -131,6 +148,10 @@ std::shared_ptr BuildStrategy::CreatePassesFromStrategy( return pass_builder_; } +bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const { + return framework::details::MultiDevSSAGraphBuilder().count(pass_name) > 0; +} + std::unique_ptr BuildStrategy::Apply( const ProgramDesc &main_program, const std::vector &places, const std::string &loss_var_name, const std::vector &local_scopes, @@ -145,22 +166,23 @@ std::unique_ptr BuildStrategy::Apply( std::unique_ptr graph(new ir::Graph(main_program)); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { - if (pass->Type() == "multi_devices_pass") { - pass->Erase("places"); - pass->SetNotOwned>("places", &places); - pass->Erase("loss_var_name"); - pass->SetNotOwned("loss_var_name", &loss_var_name); - pass->Erase("local_scopes"); - pass->SetNotOwned>("local_scopes", + if (IsMultiDevPass(pass->Type())) { + pass->Erase(kPlaces); + pass->SetNotOwned>(kPlaces, &places); + pass->Erase(kLossVarName); + pass->SetNotOwned(kLossVarName, &loss_var_name); + pass->Erase(kLocalScopes); + pass->SetNotOwned>(kLocalScopes, &local_scopes); - pass->Erase("nranks"); - pass->Set("nranks", new size_t(nranks)); + pass->Erase(kNRanks); + pass->Set(kNRanks, new size_t(nranks)); #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) platform::NCCLContextMap *nctx = use_cuda ? 
nccl_ctxs : nullptr; pass->Erase("nccl_ctxs"); pass->SetNotOwned("nccl_ctxs", nctx); #endif + } else if (pass->Type() == "analysis_var_pass") { const std::vector *all_op_descs = new std::vector(main_program.Block(0).AllOps()); @@ -201,7 +223,9 @@ std::unique_ptr BuildStrategy::Apply( USE_PASS(fuse_elewise_add_act_pass); USE_PASS(graph_viz_pass); USE_PASS(multi_batch_merge_pass); -USE_PASS(multi_devices_pass); +USE_PASS(reduce_mode_multi_devices_pass); +USE_PASS(allreduce_mode_multi_devices_pass); +USE_PASS(dist_multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); USE_PASS(analysis_var_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index b75c01c485..15c2e01b61 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -74,8 +74,6 @@ struct BuildStrategy { bool fuse_elewise_add_act_ops_{false}; - bool enable_data_balance_{false}; - bool memory_optimize_{false}; bool memory_early_delete_{false}; @@ -84,6 +82,10 @@ struct BuildStrategy { bool fuse_broadcast_op_{false}; + // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, + // num_trainers is 1, so the current fields of build_strategy doesn't tell if + // it's distributed model. + bool is_distribution_{false}; int num_trainers_{1}; int trainer_id_{0}; std::vector trainers_endpoints_; @@ -104,6 +106,8 @@ struct BuildStrategy { bool IsFinalized() const { return is_finalized_; } + bool IsMultiDevPass(const std::string &pass_name) const; + // Apply the passes built by the pass_builder_. The passes will be // applied to the Program and output an ir::Graph. std::unique_ptr Apply(const ProgramDesc &main_program, diff --git a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc index c8ea188046..a4bb1e26d9 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
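For orientation, an illustrative restatement of the handshake set up in BuildStrategy::Apply above: Apply stores the inputs under the shared kPlaces/kLossVarName/kLocalScopes/kNRanks keys, and each registered multi-devices pass reads the same keys back in its Init(). The explicit template arguments below are assumptions inferred from context, not quoted from the headers:

    // Sketch: statements executed inside a builder pass's Init().
    loss_var_name_ = Get<const std::string>(kLossVarName);
    places_ = Get<const std::vector<platform::Place>>(kPlaces);
    local_scopes_ = Get<const std::vector<Scope *>>(kLocalScopes);
    // nranks is consulted later via Get<size_t>(kNRanks).
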
-#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" #include +#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" @@ -21,68 +21,78 @@ namespace paddle { namespace framework { namespace details { -bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { - std::unordered_map pending_ops; - std::unordered_set pending_vars; - std::unordered_set ready_vars; - std::unordered_set ready_ops; +class SSAGraghBuilderWithChecker : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override { + PADDLE_ENFORCE(IsValidGraph(graph.get())); + return graph; + } - auto insert_pending_var = [&](VarHandleBase *var) { - pending_vars.insert(var); - if (var->GeneratedOp() == nullptr) { - ready_vars.emplace(var); - } - }; + bool IsValidGraph(const ir::Graph *graph) const { + std::unordered_map pending_ops; + std::unordered_set pending_vars; + std::unordered_set ready_vars; + std::unordered_set ready_ops; - for (auto &var_map : graph->Get(kGraphVars)) { - for (auto &name_pair : var_map) { - for (auto &version_pair : name_pair.second) { - insert_pending_var(version_pair); + auto insert_pending_var = [&](VarHandleBase *var) { + pending_vars.insert(var); + if (var->GeneratedOp() == nullptr) { + ready_vars.emplace(var); } - } - } + }; - for (auto &var : graph->Get(kGraphDepVars)) { - insert_pending_var(var); - } + for (auto &var_map : graph->Get(kGraphVars)) { + for (auto &name_pair : var_map) { + for (auto &version_pair : name_pair.second) { + insert_pending_var(version_pair); + } + } + } - for (OpHandleBase *op : ir::FilterByNodeWrapper(*graph)) { - if (op->Inputs().empty()) { - ready_ops.insert(op); - } else { - pending_ops.insert({op, op->NoDupInputSize()}); + for (auto &var : graph->Get(kGraphDepVars)) { + insert_pending_var(var); } - } - auto run_all_ops = [&](std::unordered_set &set) { - for (auto *op : set) { - for (auto out : op->Outputs()) { - ready_vars.emplace(out); + for (OpHandleBase *op : ir::FilterByNodeWrapper(*graph)) { + if (op->Inputs().empty()) { + ready_ops.insert(op); + } else { + pending_ops.insert({op, op->NoDupInputSize()}); } } - set.clear(); - }; - while (!pending_vars.empty()) { - run_all_ops(ready_ops); + auto run_all_ops = [&](std::unordered_set &set) { + for (auto *op : set) { + for (auto out : op->Outputs()) { + ready_vars.emplace(out); + } + } + set.clear(); + }; - if (ready_vars.empty()) { - return false; - } + while (!pending_vars.empty()) { + run_all_ops(ready_ops); - for (auto ready_var : ready_vars) { - pending_vars.erase(ready_var); - for (auto *op : ready_var->PendingOps()) { - auto &deps = --pending_ops[op]; - if (deps == 0) { - ready_ops.insert(op); + if (ready_vars.empty()) { + return false; + } + + for (auto ready_var : ready_vars) { + pending_vars.erase(ready_var); + for (auto *op : ready_var->PendingOps()) { + auto &deps = --pending_ops[op]; + if (deps == 0) { + ready_ops.insert(op); + } } } + ready_vars.clear(); } - ready_vars.clear(); + return true; } - return true; -} +}; + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_check_pass.h b/paddle/fluid/framework/details/multi_devices_graph_check_pass.h deleted file mode 100644 index 1e2b1867c3..0000000000 --- a/paddle/fluid/framework/details/multi_devices_graph_check_pass.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/framework/details/multi_devices_helper.h" - -#include - -namespace paddle { -namespace framework { -namespace details { - -class SSAGraghBuilderWithChecker : public ir::Pass { - protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override { - PADDLE_ENFORCE(IsValidGraph(graph.get())); - return graph; - } - - bool IsValidGraph(const ir::Graph* graph) const; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 761c9ab904..d91993bd4f 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -134,15 +134,8 @@ void AddOutputToLeafOps(ir::Graph *graph) { } } // namespace -static const char kLossVarName[] = "loss_var_name"; -static const char kPlaces[] = "places"; -static const char kLocalScopes[] = "local_scopes"; -static const char kStrategy[] = "strategy"; -static const char kNRanks[] = "nranks"; - -void MultiDevSSAGraphBuilder::Init() const { +void MultiDevSSAGraphBuilderBase::Init() const { all_vars_.clear(); - balance_vars_.clear(); loss_var_name_ = Get(kLossVarName); places_ = Get>(kPlaces); @@ -151,31 +144,16 @@ void MultiDevSSAGraphBuilder::Init() const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) nccl_ctxs_ = &Get("nccl_ctxs"); #endif - - balance_vars_.resize(places_.size(), 0); - - if (strategy_.enable_data_balance_ && places_.size() == 1) { - LOG(WARNING) << "It is no need to enable data balance when there is only " - "one place. enable_data_balance is set to False."; - strategy_.enable_data_balance_ = false; - } } -std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( +std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( std::unique_ptr graph) const { Init(); - // Give the topology sort order and rebuild the graph structure. 
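The checker that now lives entirely in the .cc file performs a readiness sweep over the SSA graph: seed the ready set with variables that have no generating op, repeatedly run every ready op, and accept only if no pending variable remains. The same invariant in a self-contained form, with plain integers standing in for the real op and var handles:

    #include <cstddef>
    #include <queue>
    #include <vector>

    // Returns true iff every op can eventually run, i.e. the dependency
    // graph is complete and acyclic (plain-int stand-in for IsValidGraph).
    bool AllOpsRunnable(const std::vector<std::vector<int>> &consumers,
                        std::vector<int> indegree) {
      std::queue<int> ready;
      for (int op = 0; op < static_cast<int>(indegree.size()); ++op) {
        if (indegree[op] == 0) ready.push(op);
      }
      std::size_t executed = 0;
      while (!ready.empty()) {
        int op = ready.front();
        ready.pop();
        ++executed;
        for (int next : consumers[op]) {
          if (--indegree[next] == 0) ready.push(next);
        }
      }
      return executed == indegree.size();  // false: some op never becomes ready
    }
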
- std::vector sorted_ops = ir::TopologySortOperations(*graph); - - if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { - sorted_ops = SortForReduceMode(sorted_ops); - } + std::vector sorted_ops = SortOperations(*graph); auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; - size_t nranks = Get(kNRanks); - for (auto &node : nodes) { if (node->IsVar() && node->Var()) { all_vars_.emplace(node->Name(), node->Var()); @@ -187,146 +165,61 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( result.Set(kGraphDepVars, new GraphDepVars); result.Set(kGraphOps, new GraphOps); - std::vector> bcast_var_name_set; - bcast_var_name_set.resize(places_.size()); - bool is_forwarding = true; - bool is_dist_train = false; - - std::unordered_map sharded_var_device; + bool insert_collection_ops = NeedCollectiveOps(); for (ir::Node *node : sorted_ops) { - if (OpHaveRole(*node, OpRole::kRPC)) { - int op_dev_id = CreateRPCOp(&result, node, &sharded_var_device); - PADDLE_ENFORCE(op_dev_id != -1, - "Can not schedule the RPC operator to the right place."); - if (node->Op()->Type() == "recv") { - auto recv_vars_attr = - boost::get>(node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient] - if (recv_vars_attr[0].find(".block") == std::string::npos) { - bcast_var_name_set[op_dev_id].emplace(recv_vars_attr[0]); - } - } - is_dist_train = true; - } else if (OpHaveRole(*node, OpRole::kDist)) { - int op_dev_id = CreateDistTrainOp(&result, node, &sharded_var_device); - if (node->Op()->Type() == "concat") { - auto origin_param_name = node->Op()->OutputArgumentNames()[0]; - bcast_var_name_set[op_dev_id].emplace(origin_param_name); - } - } else if (IsScaleLossOp(node)) { - // user can customize loss@grad if not use_default_grad_scale_ - if (strategy_.gradient_scale_ != - BuildStrategy::GradientScaleStrategy::kCustomized) { - // TODO(paddle-dev): Why is there no input for this op_handle? - auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; - auto out_dtype = all_vars_.at(loss_grad_name)->GetDataType(); - CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0], - out_dtype); - } - // This assumes the backward generating code will ensure IsScaleLossOp - // is true only for the op that scale the final scalar loss. - // It also assumes backward op will always follow the forward op in - // the block. - is_forwarding = false; + if (DealWithSpecialOp(&result, node)) { + continue; } else { - int op_dev_id = GetOpDeviceID(node, sharded_var_device); - if (op_dev_id != -1) { // This op only runs on one specific device. - CreateComputationalOp(&result, node, op_dev_id); - for (ir::Node *n : node->outputs) { - sharded_var_device.emplace(n->Name(), op_dev_id); - } + // This op runs on all devices + if (IsScaleLossOp(node)) { + // user can customize loss@grad if not use_default_grad_scale_ + InsertScaleLossGradOp(&result, node); + // This assumes the backward generating code will ensure IsScaleLossOp + // is true only for the op that scale the final scalar loss. + // It also assumes backward op will always follow the forward op in + // the block. + is_forwarding = false; } else { - // This op runs on all devices, and its output may have parameter's - // gradients. - // TODO(paddle-dev): Why is so special about "read" op? 
- if (node->Op()->Type() == "read" && strategy_.enable_data_balance_) { - node->Op()->SetAttr("throw_eof_exp", false); - CreateComputationalOps(&result, node, places_.size()); - const auto &data_var_names = node->Op()->Output("Out"); - InsertDataBalanceOp(&result, data_var_names); - } else { - CreateComputationalOps(&result, node, places_.size()); - } + CreateComputationalOps(&result, node, places_.size()); + } - if (!is_forwarding && nranks > 1UL) { + // Insert collection ops + if (!is_forwarding && insert_collection_ops) { + try { bool is_bk_op = static_cast(boost::get(node->Op()->GetAttr( OpProtoAndCheckerMaker::OpRoleAttrName())) & static_cast(OpRole::kBackward)); if (!is_bk_op) continue; + // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. - try { - auto backward_vars = boost::get>( - node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); - - for (size_t i = 0; i < backward_vars.size(); i += 2) { - auto &p_name = backward_vars[i]; - auto &g_name = backward_vars[i + 1]; - VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; - size_t cur_device_id = -1; - switch (strategy_.reduce_) { - case BuildStrategy::ReduceStrategy::kReduce: - cur_device_id = GetAppropriateDeviceID({g_name}); - CreateReduceOp(&result, g_name, cur_device_id); - sharded_var_device.emplace(g_name, cur_device_id); - if (!is_dist_train) { - bcast_var_name_set[cur_device_id].emplace(p_name); - } - break; - case BuildStrategy::ReduceStrategy::kAllReduce: - if (IsSparseGradient(g_name)) { - CreateReduceOp(&result, g_name, 0); - CreateBroadcastOp(&result, g_name, 0); - } else { - InsertAllReduceOp(&result, g_name); - } - break; - default: - LOG(FATAL) << "Unknown reduce strategy "; - break; - } - } - } catch (boost::bad_get e) { + auto backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &p_name = backward_vars[i]; + auto &g_name = backward_vars[i + 1]; + VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; + + InsertCollectiveOp(&result, p_name, g_name); } + } catch (boost::bad_get e) { } } } } - bool use_gpu = false; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - use_gpu = nccl_ctxs_ != nullptr; -#endif - // Insert broadcast operators principle: - // 1. Broadcast optimized parameters in Reduce strategy; - // 2. No need broadcast optimized parameters in AllReduce strategy because of - // the optimization sub-graph would be run on every GPU; - // 3. Allways broadcast received parameters in Distribute Training. - if ((use_gpu && - strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) || - is_dist_train) { - if (strategy_.fuse_broadcast_op_) { - CreateFusedBroadcastOp(&result, bcast_var_name_set); - } else { - for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { - auto &to_bcast_set = bcast_var_name_set[dev_id]; - for (auto &bcast_name : to_bcast_set) { - CreateBroadcastOp(&result, bcast_name, dev_id); - } - } - } - } + InsertPostprocessOps(&result); + /* Dependency graph has been constructed. However, there are still data hazards need to be handled. 
- */ + */ PolishGraphToSupportDataHazards(&result); /* @@ -337,67 +230,54 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( return graph; } -std::vector MultiDevSSAGraphBuilder::SortForReduceMode( - const std::vector &topo_ops) const { - std::unordered_map sharded_var_device; - std::vector sorted_ops; - std::unordered_map> delayed_op; - sorted_ops.reserve(topo_ops.size()); - - auto insert_delayed_op = [&](const std::string &var_name, int dev_id) { - sharded_var_device.emplace(var_name, dev_id); - if (delayed_op.count(var_name)) { - auto &ops = delayed_op.at(var_name); - sorted_ops.insert(sorted_ops.end(), ops.begin(), ops.end()); - delayed_op.at(var_name).clear(); - } - }; +void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp( + ir::Graph *result, const ir::Node *node) const { + // user can customize loss@grad if not use_default_grad_scale_ + size_t loss_scale = 0; + switch (this->strategy_.gradient_scale_) { + case BuildStrategy::GradientScaleStrategy::kOne: + loss_scale = 1; + break; + case BuildStrategy::GradientScaleStrategy::kCoeffNumDevice: + loss_scale = Get(kNRanks); + break; + case BuildStrategy::GradientScaleStrategy::kCustomized: + loss_scale = 0; + break; + default: + LOG(FATAL) << "Unknown gradient scale strategy."; + break; + } + + if (loss_scale) { + // TODO(paddle-dev): Why is there no input for this op_handle? + auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; + auto out_dtype = this->all_vars_.at(loss_grad_name)->GetDataType(); + this->CreateScaleLossGradOp(result, loss_grad_name, node->outputs[0], + loss_scale, out_dtype); + } +} - for (ir::Node *node : topo_ops) { - int op_dev_id = GetOpDeviceID(node, sharded_var_device, &delayed_op); - if (op_dev_id > -1) { - // This op only runs on one specific device. - sorted_ops.emplace_back(node); - for (ir::Node *n : node->outputs) { - insert_delayed_op(n->Name(), op_dev_id); - } - } else if (op_dev_id == -1) { - // This op runs on all devices, and its output may have parameter's - // gradients. - sorted_ops.emplace_back(node); - bool is_bk_op = - static_cast(boost::get(node->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kBackward)); - if (!is_bk_op) continue; - // Currently, we assume that once gradient is generated, it can be - // broadcast, and each gradient is only broadcast once. - std::vector backward_vars; - try { - backward_vars = - boost::get>(node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - } catch (boost::bad_get e) { - } - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); +std::vector MultiDevSSAGraphBuilderBase::SortOperations( + const ir::Graph &graph) const { + return ir::TopologySortOperations(graph); +} - for (size_t i = 0; i < backward_vars.size(); i += 2) { - auto &g_name = backward_vars[i + 1]; - size_t cur_device_id = GetAppropriateDeviceID({g_name}); - insert_delayed_op(g_name, static_cast(cur_device_id)); - } - } else if (op_dev_id == -2) { - // The Op on which the Op depends has not yet been generated. 
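The -1/-2 return values threaded through GetOpDeviceID keep their old meaning across this refactor. Restated as named constants purely for readability; the patch itself keeps the raw integers:

    constexpr int kRunOnAllDevices = -1;   // replicate the op on every place
    constexpr int kDevicePending = -2;     // deciding variable not generated yet
    // Any id >= 0 pins the op to places_[id].
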
- } - } +bool MultiDevSSAGraphBuilderBase::UseGPU() const { + bool use_gpu = false; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + use_gpu = nccl_ctxs_ != nullptr; +#endif + return use_gpu; +} - PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size()); - return sorted_ops; +bool MultiDevSSAGraphBuilderBase::NeedCollectiveOps() const { + return Get(kNRanks) > 1; } -void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, - ir::Node *node, - size_t place_id) const { +void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result, + ir::Node *node, + size_t place_id) const { auto p = places_[place_id]; auto *op_handle = result->Get(kGraphOps).back(); op_handle->SetDeviceContext(p, @@ -420,28 +300,7 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, } } -size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( - const std::vector &var_names) const { - int64_t numel_sum = 0; - for (auto var_name : var_names) { - if (all_vars_.find(var_name) == all_vars_.end()) continue; - auto var_desc = all_vars_.at(var_name); - PADDLE_ENFORCE_NOT_NULL(var_desc); - auto dim = framework::make_ddim(var_desc->GetShape()); - int64_t numel = framework::product(dim); - PADDLE_ENFORCE_GT(numel, 0); - numel_sum += numel; - } - - auto smallest = - std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); - size_t dev_id = - static_cast(std::distance(std::begin(balance_vars_), smallest)); - balance_vars_[dev_id] += numel_sum; - return dev_id; -} - -void MultiDevSSAGraphBuilder::SetCommunicationContext( +void MultiDevSSAGraphBuilderBase::SetCommunicationContext( OpHandleBase *op_handle, const platform::Place &p) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (nccl_ctxs_ == nullptr) { @@ -454,9 +313,9 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext( #endif } -void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, - const std::string &p_name, - size_t src_dev_id) const { +void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result, + const std::string &p_name, + size_t src_dev_id) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *op_handle = new BroadcastOpHandle( result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), @@ -484,7 +343,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, } } -void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp( +void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp( ir::Graph *result, const std::vector> &bcast_varnames) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) @@ -522,17 +381,17 @@ void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp( } } -void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, - ir::Node *node, - int dev_id) const { +void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result, + ir::Node *node, + int dev_id) const { result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), local_scopes_[dev_id], places_[dev_id], dev_id)); CreateOpHandleIOs(result, node, dev_id); } -void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, - const std::string &og) const { +void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( + ir::Graph *result, const std::string &og) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), @@ -560,102 +419,15 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, } } -void 
MultiDevSSAGraphBuilder::InsertDataBalanceOp( - ir::Graph *result, const std::vector &datas) const { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( - result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), - local_scopes_, places_, nccl_ctxs_)); -#else - result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( - result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), - local_scopes_, places_)); -#endif - auto *op_handle = result->Get(kGraphOps).back(); - for (size_t i = 0; i < places_.size(); ++i) { - auto &p = places_[i]; - SetCommunicationContext(op_handle, p); - for (const std::string &d_name : datas) { - auto &vars = result->Get(kGraphVars)[i][d_name]; - PADDLE_ENFORCE(!vars.empty()); - op_handle->AddInput(vars.back()); - auto var = new VarHandle( - result->CreateEmptyNode(d_name, ir::Node::Type::kVariable), - vars.size(), i, d_name, p); - vars.emplace_back(var); - op_handle->AddOutput(var); - } - } -} - -int MultiDevSSAGraphBuilder::GetOpDeviceID( - ir::Node *node, - const std::unordered_map &sharded_var_device, - std::unordered_map> *delay_ops) const { - if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { - return -1; - } - - if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { - return -1; - } - - auto param_grad = boost::get>( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(param_grad.size(), 2U); - int dev_id = GetVarDeviceID(param_grad[1], sharded_var_device); - - if (dev_id == -1) { - (*delay_ops)[param_grad[1]].push_back(node); - return -2; - } - return dev_id; -} - -int MultiDevSSAGraphBuilder::GetOpDeviceID( - ir::Node *node, - const std::unordered_map &sharded_var_device) const { - if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { - return -1; - } - - if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { - return -1; - } - auto param_grad = boost::get>( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(param_grad.size(), 2U); - int dev_id = GetVarDeviceID(param_grad[1], sharded_var_device); - PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", - node->Op()->Type(), param_grad[0], param_grad[1]); - return dev_id; -} - -int MultiDevSSAGraphBuilder::GetVarDeviceID( - const std::string &varname, - const std::unordered_map &sharded_var_device) const { - auto got = sharded_var_device.find(varname); - if (got == sharded_var_device.end()) { - auto pos = varname.find(framework::kNewGradSuffix); - if (pos != std::string::npos) { - got = sharded_var_device.find(varname.substr(0, pos)); - } - } - return got == sharded_var_device.end() ? 
-1 : got->second; -} - -void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( +void MultiDevSSAGraphBuilderBase::CreateScaleLossGradOp( ir::Graph *result, const std::string &loss_grad_name, - ir::Node *out_var_node, proto::VarType::Type dtype) const { - size_t nranks = Get("nranks"); + ir::Node *out_var_node, size_t loss_scale, + proto::VarType::Type dtype) const { for (size_t i = 0; i < places_.size(); ++i) { - // Insert ScaleCost OpHandle auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]); auto *op_handle = new ScaleLossGradOpHandle( result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), - nranks, local_scopes_[i], places_[i], dev_ctx, dtype); + loss_scale, local_scopes_[i], places_[i], dev_ctx, dtype); result->Get(kGraphOps).emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale @@ -669,9 +441,8 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( } } -void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, - ir::Node *node, - size_t num_places) const { +void MultiDevSSAGraphBuilderBase::CreateComputationalOps( + ir::Graph *result, ir::Node *node, size_t num_places) const { for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { auto p = places_[scope_idx]; auto s = local_scopes_[scope_idx]; @@ -681,9 +452,9 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, } } -VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, - const std::string &og, - int dst_dev_id) const { +VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(ir::Graph *result, + const std::string &og, + int dst_dev_id) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), @@ -712,51 +483,273 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, return var; } -int MultiDevSSAGraphBuilder::CreateDistTrainOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const { - int op_dev_id = -1; - std::vector input_var_names; - std::vector output_var_names; - for (ir::Node *input : node->inputs) { - input_var_names.push_back(input->Name()); +bool MultiDevSSAGraphBuilderBase::IsScaleLossOp(ir::Node *node) const { + return boost::get( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == + (static_cast(OpRole::kBackward) | + static_cast(OpRole::kLoss)) && + !loss_var_name_.empty(); // If loss_var is empty. This is test mode +} + +bool MultiDevSSAGraphBuilderBase::IsSparseGradient( + const std::string &og) const { + PADDLE_ENFORCE(all_vars_.count(og) != 0); + if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { + return true; } - for (ir::Node *output : node->outputs) { - output_var_names.push_back(output->Name()); + return false; +} + +void AllReduceSSAGraphBuilder::InsertCollectiveOp( + ir::Graph *result, const std::string &p_name, + const std::string &g_name) const { + if (IsSparseGradient(g_name)) { + CreateReduceOp(result, g_name, 0); + CreateBroadcastOp(result, g_name, 0); + } else { + CreateAllReduceOp(result, g_name); } +} - if (node->Op()->Type() == "split_byref" || - node->Op()->Type() == "split_selected_rows" || - node->Op()->Type() == "split_ids") { - // TODO(paddle-dev): getting the first var is not safe. 
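CreateScaleLossGradOp now receives the scale explicitly instead of re-deriving nranks inside. The selection it inherits from InsertScaleLossGradOp boils down to the following equivalent restatement, not new behavior:

    size_t SelectLossScale(BuildStrategy::GradientScaleStrategy s,
                           size_t nranks) {
      switch (s) {
        case BuildStrategy::GradientScaleStrategy::kOne:
          return 1;        // fixed scaling
        case BuildStrategy::GradientScaleStrategy::kCoeffNumDevice:
          return nranks;   // default: scale by rank count
        case BuildStrategy::GradientScaleStrategy::kCustomized:
        default:
          return 0;        // 0 = skip the op; user supplies loss@grad
      }
    }
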
- op_dev_id = GetVarDeviceID(input_var_names[0], *sharded_var_device); - if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { - op_dev_id = GetAppropriateDeviceID(input_var_names); - for (auto &varname : input_var_names) { - sharded_var_device->emplace(varname, op_dev_id); +int BalanceVarSSAGraphBuilder::GetVarDeviceID( + const std::string &varname) const { + auto got = sharded_var_device_.find(varname); + if (got == sharded_var_device_.end()) { + auto pos = varname.find(framework::kNewGradSuffix); + if (pos != std::string::npos) { + got = sharded_var_device_.find(varname.substr(0, pos)); + } + } + return got == sharded_var_device_.end() ? -1 : got->second; +} + +int BalanceVarSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const { + if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { + return -1; + } + if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { + return -1; + } + auto param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1]); + PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", + node->Op()->Type(), param_grad[0], param_grad[1]); + return dev_id; +} + +size_t BalanceVarSSAGraphBuilder::GetAppropriateDeviceID( + const std::vector &var_names) const { + int64_t numel_sum = 0; + for (auto var_name : var_names) { + if (all_vars_.find(var_name) == all_vars_.end()) continue; + auto var_desc = all_vars_.at(var_name); + PADDLE_ENFORCE_NOT_NULL(var_desc); + auto dim = framework::make_ddim(var_desc->GetShape()); + int64_t numel = framework::product(dim); + PADDLE_ENFORCE_GT(numel, 0); + numel_sum += numel; + } + + auto smallest = + std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); + size_t dev_id = + static_cast(std::distance(std::begin(balance_vars_), smallest)); + balance_vars_[dev_id] += numel_sum; + return dev_id; +} + +void BalanceVarSSAGraphBuilder::ResetState() const { + balance_vars_.clear(); + sharded_var_device_.clear(); + + balance_vars_.resize(places_.size(), 0); +} + +void ReduceSSAGraphBuilder::Init() const { + MultiDevSSAGraphBuilderBase::Init(); + ResetState(); +} + +void ReduceSSAGraphBuilder::ResetState() const { + BalanceVarSSAGraphBuilder::ResetState(); + bcast_var_name_set_.clear(); + bcast_var_name_set_.resize(places_.size()); +} + +void ReduceSSAGraphBuilder::InsertCollectiveOp( + ir::Graph *result, const std::string &p_name, + const std::string &g_name) const { + size_t cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(result, g_name, cur_device_id); + sharded_var_device_.emplace(g_name, cur_device_id); + bcast_var_name_set_[cur_device_id].emplace(p_name); +} + +bool ReduceSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, + ir::Node *node) const { + int op_dev_id = BalanceVarSSAGraphBuilder::GetOpDeviceID(node); + if (op_dev_id != -1) { + // This op only runs on one specific device. 
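GetAppropriateDeviceID, moved above into BalanceVarSSAGraphBuilder, is a greedy balancer: sum the element counts of the candidate variables, then charge them to the currently lightest device. Its core, isolated into a small stand-alone function:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::size_t PickLightestDevice(std::vector<int64_t> *balance,
                                   int64_t numel_sum) {
      auto lightest = std::min_element(balance->begin(), balance->end());
      std::size_t dev_id =
          static_cast<std::size_t>(lightest - balance->begin());
      (*balance)[dev_id] += numel_sum;  // charge the new variables to it
      return dev_id;
    }
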
+ CreateComputationalOp(result, node, op_dev_id); + for (ir::Node *n : node->outputs) { + sharded_var_device_.emplace(n->Name(), op_dev_id); + } + return true; + } + return false; +} + +void ReduceSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { + if (UseGPU()) { + if (strategy_.fuse_broadcast_op_) { + CreateFusedBroadcastOp(result, bcast_var_name_set_); + } else { + for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { + auto &to_bcast_set = bcast_var_name_set_[dev_id]; + for (auto &bcast_name : to_bcast_set) { + CreateBroadcastOp(result, bcast_name, dev_id); + } } } - for (auto &varname : output_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + } +} + +int ReduceSSAGraphBuilder::GetOpDeviceID( + ir::Node *node, + std::unordered_map> *delay_ops) const { + if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { + return -1; + } + + auto param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1]); + + if (dev_id == -1) { + (*delay_ops)[param_grad[1]].push_back(node); + return -2; + } + return dev_id; +} + +std::vector ReduceSSAGraphBuilder::SortOperations( + const ir::Graph &graph) const { + std::vector sorted_ops = ir::TopologySortOperations(graph); + return SortForReduceMode(sorted_ops); +} + +std::vector ReduceSSAGraphBuilder::SortForReduceMode( + const std::vector &topo_ops) const { + std::vector sorted_ops; + std::unordered_map> delayed_op; + sorted_ops.reserve(topo_ops.size()); + ResetState(); + + auto insert_delayed_op = [&](const std::string &var_name, int dev_id) { + sharded_var_device_.emplace(var_name, dev_id); + if (delayed_op.count(var_name)) { + auto &ops = delayed_op.at(var_name); + sorted_ops.insert(sorted_ops.end(), ops.begin(), ops.end()); + delayed_op.at(var_name).clear(); } - } else if (node->Op()->Type() == "concat") { - op_dev_id = GetVarDeviceID(input_var_names[0], *sharded_var_device); - for (auto &varname : output_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + }; + + for (ir::Node *node : topo_ops) { + int op_dev_id = GetOpDeviceID(node, &delayed_op); + if (op_dev_id > -1) { + // This op only runs on one specific device. + sorted_ops.emplace_back(node); + for (ir::Node *n : node->outputs) { + insert_delayed_op(n->Name(), op_dev_id); + } + } else if (op_dev_id == -1) { + // This op runs on all devices, and its output may have parameter's + // gradients. + sorted_ops.emplace_back(node); + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. + std::vector backward_vars; + try { + backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + } catch (boost::bad_get e) { + } + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &g_name = backward_vars[i + 1]; + size_t cur_device_id = GetAppropriateDeviceID({g_name}); + insert_delayed_op(g_name, static_cast(cur_device_id)); + } + } else if (op_dev_id == -2) { + // The Op on which the Op depends has not yet been generated. 
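SortForReduceMode above relies on a small delay-and-flush pattern: an op whose device is decided by a not-yet-placed variable is parked, then emitted as soon as that variable gets a device. The pattern in isolation, with generic ints for op handles and illustrative names:

    #include <string>
    #include <unordered_map>
    #include <vector>

    struct DelayedOps {
      std::unordered_map<std::string, std::vector<int>> waiting;  // var -> ops
      std::vector<int> sorted;

      void Park(const std::string &var, int op) { waiting[var].push_back(op); }

      // Called once `var` is assigned a device: flush everything parked on it.
      void OnVarPlaced(const std::string &var) {
        auto it = waiting.find(var);
        if (it == waiting.end()) return;
        sorted.insert(sorted.end(), it->second.begin(), it->second.end());
        it->second.clear();
      }
    };
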
} - } else { - LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type(); - PADDLE_THROW( - "the distribute training related op should be in [split_byref, " - "concat]."); } - PADDLE_ENFORCE(op_dev_id != -1, - "can not find right place for distributed op: %s", - node->Op()->Type()); + PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size()); - CreateComputationalOp(result, node, op_dev_id); - return op_dev_id; + ResetState(); + return sorted_ops; +} + +void DistSSAGraphBuilder::Init() const { + MultiDevSSAGraphBuilderBase::Init(); + ResetState(); +} + +void DistSSAGraphBuilder::ResetState() const { + BalanceVarSSAGraphBuilder::ResetState(); + bcast_var_name_set_.clear(); + bcast_var_name_set_.resize(places_.size()); +} + +bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, + ir::Node *node) const { + bool insert_op = false; + if (OpHaveRole(*node, OpRole::kRPC)) { + int op_dev_id = CreateRPCOp(result, node); + PADDLE_ENFORCE(op_dev_id != -1, + "Can not schedule the RPC operator to the right place."); + if (node->Op()->Type() == "recv") { + auto recv_vars_attr = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient] + if (recv_vars_attr[0].find(".block") == std::string::npos) { + bcast_var_name_set_[op_dev_id].emplace(recv_vars_attr[0]); + } + } + insert_op = true; + need_broadcast_var_ = true; + } else if (OpHaveRole(*node, OpRole::kDist)) { + int op_dev_id = CreateDistTrainOp(result, node); + if (node->Op()->Type() == "concat") { + auto origin_param_name = node->Op()->OutputArgumentNames()[0]; + bcast_var_name_set_[op_dev_id].emplace(origin_param_name); + } + insert_op = true; + } else { + int op_dev_id = GetOpDeviceID(node); + if (op_dev_id != -1) { // This op only runs on one specific device. + CreateComputationalOp(result, node, op_dev_id); + for (ir::Node *n : node->outputs) { + sharded_var_device_.emplace(n->Name(), op_dev_id); + } + insert_op = true; + } + } + return insert_op; } void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { @@ -775,13 +768,11 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { } // Create RPC related op handles that connects its in ops and out ops. -int MultiDevSSAGraphBuilder::CreateRPCOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const { +int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. 
- op_dev_id = GetVarDeviceID(node->inputs[0]->Name(), *sharded_var_device); + op_dev_id = GetVarDeviceID(node->inputs[0]->Name()); PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by @@ -799,9 +790,9 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( VLOG(10) << "send grad " << input_var_names[0] << " origin " << send_param_grad[1] << " place: " << op_dev_id; for (auto &varname : input_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + sharded_var_device_.emplace(varname, op_dev_id); } - sharded_var_device->emplace(send_param_grad[1], op_dev_id); + sharded_var_device_.emplace(send_param_grad[1], op_dev_id); } } else if (node->Op()->Type() == "recv") { std::vector output_var_names; @@ -811,7 +802,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( auto recv_param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); if (recv_param_grad.size() == 2U) { - op_dev_id = GetVarDeviceID(recv_param_grad[1], *sharded_var_device); + op_dev_id = GetVarDeviceID(recv_param_grad[1]); VLOG(10) << "recv param " << recv_param_grad[0] << " get grad place: " << recv_param_grad[1] << " place: " << op_dev_id; @@ -819,7 +810,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( op_dev_id = GetAppropriateDeviceID(output_var_names); } for (auto &varname : output_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + sharded_var_device_.emplace(varname, op_dev_id); } } else { // send_barrier, fetch_barrier will run on place 0; @@ -846,7 +837,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( for (ir::Node *output : node->outputs) { int outvar_dev_id = op_dev_id; if (node->Op()->Type() == "fetch_barrier") { - outvar_dev_id = GetVarDeviceID(output->Name(), *sharded_var_device); + outvar_dev_id = GetVarDeviceID(output->Name()); PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name()); } p = places_[outvar_dev_id]; @@ -863,29 +854,124 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( return op_dev_id; } -bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { - PADDLE_ENFORCE(all_vars_.count(og) != 0); - if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { - return true; +int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, + ir::Node *node) const { + int op_dev_id = -1; + std::vector input_var_names; + std::vector output_var_names; + for (ir::Node *input : node->inputs) { + input_var_names.push_back(input->Name()); } - return false; + for (ir::Node *output : node->outputs) { + output_var_names.push_back(output->Name()); + } + + if (node->Op()->Type() == "split_byref" || + node->Op()->Type() == "split_selected_rows" || + node->Op()->Type() == "split_ids") { + // TODO(paddle-dev): getting the first var is not safe. 
+ op_dev_id = GetVarDeviceID(input_var_names[0]); + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + op_dev_id = GetAppropriateDeviceID(input_var_names); + for (auto &varname : input_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + } + for (auto &varname : output_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + } else if (node->Op()->Type() == "concat") { + op_dev_id = GetVarDeviceID(input_var_names[0]); + for (auto &varname : output_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + } else { + LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type(); + PADDLE_THROW( + "the distribute training related op should be in [split_byref, " + "concat]."); + } + + PADDLE_ENFORCE(op_dev_id != -1, + "can not find right place for distributed op: %s", + node->Op()->Type()); + + CreateComputationalOp(result, node, op_dev_id); + return op_dev_id; } -bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { - return boost::get( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - (static_cast(OpRole::kBackward) | - static_cast(OpRole::kLoss)) && - !loss_var_name_.empty(); // If loss_var is empty. This is test mode +void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, + const std::string &p_name, + const std::string &g_name) const { + size_t cur_device_id = 0; + switch (strategy_.reduce_) { + case BuildStrategy::ReduceStrategy::kReduce: + cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(result, g_name, cur_device_id); + sharded_var_device_.emplace(g_name, cur_device_id); + break; + case BuildStrategy::ReduceStrategy::kAllReduce: + if (IsSparseGradient(g_name)) { + CreateReduceOp(result, g_name, 0); + CreateBroadcastOp(result, g_name, 0); + } else { + CreateAllReduceOp(result, g_name); + } + break; + default: + LOG(FATAL) << "Unknown reduce strategy."; + break; + } +} + +void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { + if (need_broadcast_var_ || + (UseGPU() && + strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce)) { + if (strategy_.fuse_broadcast_op_) { + CreateFusedBroadcastOp(result, bcast_var_name_set_); + } else { + for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { + auto &to_bcast_set = bcast_var_name_set_[dev_id]; + for (auto &bcast_name : to_bcast_set) { + CreateBroadcastOp(result, bcast_name, dev_id); + } + } + } + } +} + +std::unordered_set &MultiDevSSAGraphBuilder() { + static std::unordered_set regs; + return regs; } + +static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) { + MultiDevSSAGraphBuilder().insert(builder_mode); + return 0; +} + } // namespace details } // namespace framework } // namespace paddle -REGISTER_PASS(multi_devices_pass, - paddle::framework::details::MultiDevSSAGraphBuilder) - .RequirePassAttr(paddle::framework::details::kLossVarName) - .RequirePassAttr(paddle::framework::details::kPlaces) - .RequirePassAttr(paddle::framework::details::kLocalScopes) - .RequirePassAttr(paddle::framework::details::kStrategy) - .RequirePassAttr(paddle::framework::details::kNRanks); +#define REGISTER_MULTI_DEVICES_PASS(pass_name, pass_class) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + _reg_ssa_graph_builder_##pass_name, \ + "REGISTER_MULTI_DEVICES_PASS must be called in global namespace."); \ + int _reg_ssa_graph_builder_entry_##pass_name = \ + paddle::framework::details::MultiDevSSAGraphBuilderRegister(#pass_name); \ + REGISTER_PASS(pass_name, pass_class) \ + 
.RequirePassAttr(paddle::framework::details::kLossVarName) \ + .RequirePassAttr(paddle::framework::details::kPlaces) \ + .RequirePassAttr(paddle::framework::details::kLocalScopes) \ + .RequirePassAttr(paddle::framework::details::kStrategy) \ + .RequirePassAttr(paddle::framework::details::kNRanks) + +REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass, + paddle::framework::details::ReduceSSAGraphBuilder); +REGISTER_MULTI_DEVICES_PASS( + allreduce_mode_multi_devices_pass, + paddle::framework::details::AllReduceSSAGraphBuilder); +REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass, + paddle::framework::details::DistSSAGraphBuilder); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 7029e9dc18..6d4386538e 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include #include #include @@ -30,78 +31,70 @@ namespace framework { class Scope; namespace details { -class MultiDevSSAGraphBuilder : public ir::Pass { +constexpr char kLossVarName[] = "loss_var_name"; +constexpr char kPlaces[] = "places"; +constexpr char kLocalScopes[] = "local_scopes"; +constexpr char kStrategy[] = "strategy"; +constexpr char kNRanks[] = "nranks"; + +class MultiDevSSAGraphBuilderBase : public ir::Pass { protected: std::unique_ptr ApplyImpl( std::unique_ptr graph) const override; - private: - void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, - size_t device_id) const; - void Init() const; + virtual void Init() const; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - mutable platform::NCCLContextMap *nccl_ctxs_; -#endif + virtual std::vector SortOperations(const ir::Graph &graph) const; - int GetVarDeviceID( - const std::string &varname, - const std::unordered_map &sharded_var_device) const; + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const = 0; - bool IsScaleLossOp(ir::Node *node) const; + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const = 0; + + virtual void InsertPostprocessOps(ir::Graph *result) const = 0; - int CreateRPCOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const; - int CreateDistTrainOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const; + bool UseGPU() const; + + bool NeedCollectiveOps() const; + + bool IsScaleLossOp(ir::Node *node) const; void CreateComputationalOps(ir::Graph *result, ir::Node *node, size_t num_places) const; void CreateScaleLossGradOp(ir::Graph *result, const std::string &loss_grad_name, - ir::Node *out_var_node, + ir::Node *out_var_node, size_t loss_scale, proto::VarType::Type dtype) const; VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const; + void CreateComputationalOp(ir::Graph *result, ir::Node *node, int dev_id) const; - int GetOpDeviceID( - ir::Node *node, - const std::unordered_map &sharded_var_device) const; - - void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; + bool IsSparseGradient(const std::string &og) const; - void InsertDataBalanceOp(ir::Graph *result, - const std::vector &datas) const; + void CreateAllReduceOp(ir::Graph *result, const std::string &og) const; void CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) const; + void InsertScaleLossGradOp(ir::Graph *result, const 
ir::Node *node) const; + void CreateFusedBroadcastOp( ir::Graph *result, const std::vector> &bcast_varnames) const; - bool IsSparseGradient(const std::string &og) const; - - size_t GetAppropriateDeviceID( - const std::vector &var_names) const; - void SetCommunicationContext(OpHandleBase *op_handle, const platform::Place &p) const; - std::vector SortForReduceMode( - const std::vector &) const; + void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, + size_t device_id) const; - int GetOpDeviceID( - ir::Node *node, - const std::unordered_map &shared_var_device, - std::unordered_map> *delay_ops) - const; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + mutable platform::NCCLContextMap *nccl_ctxs_; +#endif mutable std::string loss_var_name_; mutable std::vector places_; @@ -109,8 +102,83 @@ class MultiDevSSAGraphBuilder : public ir::Pass { mutable BuildStrategy strategy_; mutable std::unordered_map all_vars_; +}; + +class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { + protected: + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const; + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const { + return false; + } + + virtual void InsertPostprocessOps(ir::Graph *result) const {} +}; + +class BalanceVarSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { + protected: + int GetVarDeviceID(const std::string &varname) const; + + int GetOpDeviceID(ir::Node *node) const; + + size_t GetAppropriateDeviceID( + const std::vector &var_names) const; + + virtual void ResetState() const; + + mutable std::unordered_map sharded_var_device_; mutable std::vector balance_vars_; }; + +class ReduceSSAGraphBuilder : public BalanceVarSSAGraphBuilder { + protected: + virtual void Init() const; + + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const; + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const; + + virtual void InsertPostprocessOps(ir::Graph *result) const; + + virtual std::vector SortOperations(const ir::Graph &graph) const; + + virtual void ResetState() const; + + int GetOpDeviceID(ir::Node *node, + std::unordered_map> + *delay_ops) const; + + std::vector SortForReduceMode( + const std::vector &topo_ops) const; + + mutable std::vector> bcast_var_name_set_; +}; + +class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder { + protected: + virtual void Init() const; + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const; + + virtual void InsertPostprocessOps(ir::Graph *result) const; + + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const; + + virtual void ResetState() const; + + int CreateRPCOp(ir::Graph *result, ir::Node *node) const; + + int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; + + mutable std::vector> bcast_var_name_set_; + mutable bool need_broadcast_var_{false}; +}; + +std::unordered_set &MultiDevSSAGraphBuilder(); + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3b81d59ad9..dce755c91a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -946,13 +946,6 @@ All parameter, weight, gradient are variables in Paddle. R"DOC(The type is STR, debug_graphviz_path indicate the path that writing the SSA Graph to file in the form of graphviz, you. It is useful for debugging. 
Default "")DOC") - .def_property( - "enable_data_balance", - [](const BuildStrategy &self) { return self.enable_data_balance_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); - self.enable_data_balance_ = b; - }) // FIXME(chengudo): enable_data_balance seems not important .def_property( "enable_sequential_execution", [](const BuildStrategy &self) { @@ -1007,6 +1000,10 @@ All parameter, weight, gradient are variables in Paddle. "memory_optimize", [](const BuildStrategy &self) { return self.memory_optimize_; }, [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; }) + .def_property( + "is_distribution", + [](const BuildStrategy &self) { return self.is_distribution_; }, + [](BuildStrategy &self, bool b) { self.is_distribution_ = b; }) .def_property( "memory_early_delete", [](const BuildStrategy &self) { return self.memory_early_delete_; }, diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index c97a93ec36..3b066eda11 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -29,6 +29,15 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy BuildStrategy = core.ParallelExecutor.BuildStrategy +def _is_pserver_mode(main_program): + main = main_program if main_program \ + else framework.default_main_program() + for op in main.global_block().ops: + if op.type in ["send", "recv"]: + return True + return False + + class ParallelExecutor(object): """ ParallelExecutor is designed for data parallelism, which focuses on distributing @@ -128,6 +137,11 @@ class ParallelExecutor(object): build_strategy = BuildStrategy() build_strategy.num_trainers = num_trainers build_strategy.trainer_id = trainer_id + # FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, + # num_trainers is 1, so the current fields of build_strategy doesn't tell if + # it's distributed model. 
+ build_strategy.is_distribution = _is_pserver_mode( + main_program) or num_trainers > 1 # step4: get main_program, scope, local_scopes main = main_program if main_program \ diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py index e97a05b6f9..7eeffa1039 100644 --- a/python/paddle/fluid/tests/unittests/test_reader_reset.py +++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py @@ -75,8 +75,6 @@ class TestReaderReset(unittest.TestCase): exe.run(startup_prog) build_strategy = fluid.BuildStrategy() - if with_double_buffer: - build_strategy.enable_data_balance = True exec_strategy = fluid.ExecutionStrategy() parallel_exe = fluid.ParallelExecutor( use_cuda=self.use_cuda, From 4bfa110fd893ee402ba1b052ddce7f26b257b442 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 7 Jan 2019 16:28:44 +0800 Subject: [PATCH 142/197] Add no lock optimize pass test=develop --- CMakeLists.txt | 2 + cmake/FindJeMalloc.cmake | 7 + cmake/generic.cmake | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../fluid/framework/details/build_strategy.cc | 1 + paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../framework/ir/lock_free_optimize_pass.cc | 360 ++++++++++++++++++ .../framework/ir/lock_free_optimize_pass.h | 130 +++++++ 8 files changed, 503 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/framework/ir/lock_free_optimize_pass.cc create mode 100644 paddle/fluid/framework/ir/lock_free_optimize_pass.h diff --git a/CMakeLists.txt b/CMakeLists.txt index d6aa8f1b85..74d869307d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License +set(CMAKE_VERBOSE_MAKEFILE on) + cmake_minimum_required(VERSION 3.0) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/cmake/FindJeMalloc.cmake b/cmake/FindJeMalloc.cmake index 7911f77c4c..b95287160b 100644 --- a/cmake/FindJeMalloc.cmake +++ b/cmake/FindJeMalloc.cmake @@ -19,3 +19,10 @@ find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALL mark_as_advanced( JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIR) + +if (JEMALLOC_FOUND) + add_library(jemalloc::jemalloc UNKNOWN IMPORTED) + set_target_properties(jemalloc::jemalloc PROPERTIES + IMPORTED_LOCATION ${JEMALLOC_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES "${JEMALLOC_INCLUDE_DIR}") +endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 4e31392b98..05293b8b06 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -117,7 +117,7 @@ function(common_link TARGET_NAME) endif() if (WITH_JEMALLOC) - target_link_libraries(${TARGET_NAME} ${JEMALLOC_LIBRARIES}) + target_link_libraries(${TARGET_NAME} jemalloc::jemalloc) endif() endfunction() diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 179aa14528..c1ba6606f1 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -94,4 +94,4 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass fuse_elewise_add_act_pass multi_batch_merge_pass - memory_optimize_pass) + memory_optimize_pass lock_free_optimize_pass) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 
43c2eb7178..f65b3598b0 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -208,3 +208,4 @@ USE_PASS(analysis_var_pass); USE_PASS(sequential_execution_pass); USE_PASS(all_reduce_deps_pass); USE_PASS(modify_op_lock_and_record_event_pass); +USE_PASS(lock_free_optimize_pass); diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 6d795e1e2d..6e6db3d3ef 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -31,6 +31,7 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass) pass_library(graph_to_program_pass base) pass_library(graph_viz_pass base) +pass_library(lock_free_optimize_pass base) pass_library(fc_fuse_pass inference) pass_library(attention_lstm_fuse_pass inference) pass_library(infer_clean_graph_pass inference) diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.cc b/paddle/fluid/framework/ir/lock_free_optimize_pass.cc new file mode 100644 index 0000000000..96e7060aac --- /dev/null +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.cc @@ -0,0 +1,360 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/lock_free_optimize_pass.h" + +#include +#include +#include + +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +const char kSumGradOpName[] = "sum"; +// TODO(minqiyang): only support sgd at current time, please add +// other optimizers later. +const char kOptimizerType[] = "sgd"; + +std::unique_ptr LockFreeOptimizePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + + // We could collect all weights' name from SGD, where + // W1 <- SGD(W0, Grad0) + std::unordered_set weight_var_set; + for (auto* node : graph->Nodes()) { + if (IsOpNamed(node, kOptimizerType)) { + auto& param_out_vars = node->Op()->Output("ParamOut"); + PADDLE_ENFORCE(param_out_vars.size() == 1u); + weight_var_set.insert(param_out_vars[0]); + } + } + + // find all grad's merge op via weight name, where + // Grad0 <- SUM(Grad1, Grad2, Grad3 ...) 
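// As a sketch, the suffix test spelled out in the loop below can be read as
// a tiny helper (kGradVarSuffix is "@GRAD", per framework/operator.h):
//
//   static bool EndsWith(const std::string &str, const std::string &suffix) {
//     return str.size() > suffix.size() &&
//            str.compare(str.size() - suffix.size(), suffix.size(),
//                        suffix) == 0;
//   }
//
// A sum node is collected iff EndsWith(output->Name(), kGradVarSuffix) and
// the name with the suffix stripped matches a known weight.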
+ std::unordered_set grad_sum_op_set; + for (ir::Node* node : graph->Nodes()) { + if (IsOpNamed(node, kSumGradOpName)) { + for (ir::Node* output : node->outputs) { + // strip the last grad suffix @GRAD + std::string var_name = output->Name(); + const std::string suffix(kGradVarSuffix); + if (var_name != suffix && var_name.size() > suffix.size() && + var_name.substr(var_name.size() - suffix.size()) == suffix) { + // if so then strip them off + var_name = var_name.substr(0, var_name.size() - suffix.size()); + if (weight_var_set.find(var_name) != weight_var_set.end()) { + grad_sum_op_set.insert(node); + break; + } + } + } + } + } + + // get the forward op and backward op pairs, where + // out <- forward(X, W) + // Grad1 <- backward(out, X') + // Grad0 <- SUM(Grad1, Grad2, Grad3 ...) + // W0 <- SGD(W1, Grad0) + for (ir::Node* node : grad_sum_op_set) { + for (ir::Node* merged_grad_var : node->outputs) { + // find the optimizers connected with sum op + if (IsVarNameEndsWith(merged_grad_var, kGradVarSuffix) && + merged_grad_var->outputs.size() == 1u) { + ir::Node* opt_node = merged_grad_var->outputs[0]; + LOG(ERROR) << "Found opt node " << opt_node->Name(); + + // find the backward op connected with sum op + for (ir::Node* unmerged_grad_var : node->inputs) { + if (IsVarNameContains(unmerged_grad_var, kGradVarSuffix) && + unmerged_grad_var->inputs.size() == 1u) { + ir::Node* backward_op = unmerged_grad_var->inputs[0]; + + LOG(ERROR) << "Found backward_op " << backward_op->Name(); + + // find the forward op related to the backward op + ir::Node* forward_op = + FindForwardOpViaBackwardOp(graph.get(), backward_op); + + LOG(ERROR) << "Found forward_op " << forward_op->Name(); + + PADDLE_ENFORCE(forward_op); + + Node* new_optimizer_node = CreateNewSGDNode( + graph.get(), forward_op, backward_op, node, opt_node); + + PADDLE_ENFORCE(new_optimizer_node); + } + } + } + } + } + + // Remove the sum_op and its' outputs and connected Optimizers + for (Node* sum_op : grad_sum_op_set) { + for (Node* sum_op_output : sum_op->outputs) { + for (Node* optimize_op : sum_op_output->outputs) { + if (optimize_op->NodeType() == Node::Type::kOperation && + optimize_op->Name() == kOptimizerType) { + LOG(ERROR) << "remove optimize_op: " << optimize_op->Name() << "_" + << optimize_op->id(); + graph->RemoveNode(optimize_op); + } + } + LOG(ERROR) << "remove sum_op_output: " << sum_op_output->Name() << "_" + << sum_op_output->id(); + graph->RemoveNode(sum_op_output); + } + LOG(ERROR) << "remove sum_op: " << sum_op->Name() << "_" << sum_op->id(); + graph->RemoveNode(sum_op); + } + + for (auto* node : graph->Nodes()) { + for (Node* output_node : node->outputs) { + if (output_node->Name() == "sgd") { + LOG(ERROR) << "Node link to SGD: " << node->Name() << "_" << node->id() + << " --> " << output_node->Name() << "_" + << output_node->id(); + for (Node* input_node : node->inputs) { + LOG(ERROR) << "SGD Input link: " << input_node->Name() << "_" + << input_node->id() << " --> " << node->Name() << "_" + << node->id(); + } + } + } + } + + return graph; +} + +ir::Node* LockFreeOptimizePass::CreateNewSGDNode( + ir::Graph* graph, ir::Node* forward_node, ir::Node* backward_node, + ir::Node* grad_sum_node, ir::Node* optimize_node) const { + PADDLE_ENFORCE(graph); + PADDLE_ENFORCE(forward_node); + PADDLE_ENFORCE(backward_node); + PADDLE_ENFORCE(grad_sum_node); + PADDLE_ENFORCE(optimize_node); + + // find the grad var node between the grad sum node and backward_node + std::vector grad_vars = + FindConnectedNode(backward_node, grad_sum_node); + 
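// FindConnectedNode can return both the real gradient variable and the
// control-dependency placeholder vars that link the two ops, so keep a
// non-control-dep node as the gradient (if several data vars connect the
// ops, the last one visited wins).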
ir::Node* grad_node = nullptr; + for (ir::Node* node : grad_vars) { + if (!ir::IsControlDepVar(*node)) { + grad_node = node; + } + } + PADDLE_ENFORCE(grad_node); + + // create a new SGD node + OpDesc* old_desc = optimize_node->Op(); + // keep with the same block between new optimizer and the old one + OpDesc new_desc(*old_desc, old_desc->Block()); + new_desc.SetInput("Param", old_desc->Input("Param")); + new_desc.SetInput("LearningRate", old_desc->Input("LearningRate")); + new_desc.SetInput("Grad", std::vector({grad_node->Name()})); + new_desc.SetOutput("ParamOut", old_desc->Output("ParamOut")); + + std::vector op_role_vars = boost::get>( + new_desc.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName())); + // replace the second op role var, because the grad name was + // changed in new optimizer + op_role_vars.pop_back(); + op_role_vars.push_back(grad_node->Name()); + new_desc.SetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), + op_role_vars); + new_desc.SetType(kOptimizerType); + + // set backward op's op role var, this will be used to + // set device_id in multi_device_pass + backward_node->Op()->SetAttr( + framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), op_role_vars); + // backward_node->Op()->SetAttr( + // framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), {}); + + // keep with the same output nodes between new optimizer and the + // old one + Node* sgd_node = graph->CreateOpNode(&new_desc); + + // change all outputs of the optimize_node to the new one + ReplaceAllDownstreamNode(optimize_node, sgd_node); + + // find connected node between forward node and optimize node + // and replace the optimize node to new sgd node + std::vector forward_opt_connected_nodes = + FindConnectedNode(forward_node, optimize_node); + for (ir::Node* node : forward_opt_connected_nodes) { + ReplaceUpstreamNode(node, optimize_node, sgd_node); + } + + // find connected node between backward node and optimize node + // and replace the optimize node to new sgd node + std::vector backward_opt_connected_nodes = + FindConnectedNode(backward_node, optimize_node); + for (ir::Node* node : backward_opt_connected_nodes) { + ReplaceUpstreamNode(node, optimize_node, sgd_node); + } + + // SGD must have only one param and LR in + PADDLE_ENFORCE(old_desc->Input("LearningRate").size() == 1u); + PADDLE_ENFORCE(old_desc->Input("Param").size() == 1u); + + // LR and weight nodes should be copied + for (Node* upstream_node : optimize_node->inputs) { + if (upstream_node->Name() == old_desc->Input("LearningRate")[0] || + upstream_node->Name() == old_desc->Input("Param")[0]) { + ReplaceUpstreamNode(upstream_node, optimize_node, sgd_node); + } + } + + LOG(ERROR) << "Create new opt node" << sgd_node->Name() << "_" + << sgd_node->id(); + + return sgd_node; +} + +std::vector LockFreeOptimizePass::FindConnectedNode( + ir::Node* upstream_node, ir::Node* downstream_node) const { + std::vector result; + for (ir::Node* out_node : upstream_node->outputs) { + for (ir::Node* in_node : downstream_node->inputs) { + if (in_node == out_node) { + result.push_back(in_node); + } + } + } + + return result; +} + +void LockFreeOptimizePass::ReplaceUpstreamNode( + ir::Node* upstream_node, ir::Node* old_optimizer_node, + ir::Node* new_optimizer_node) const { + PADDLE_ENFORCE(upstream_node); + PADDLE_ENFORCE(old_optimizer_node); + PADDLE_ENFORCE(new_optimizer_node); + + // Remove the old_optimizer_node from upstream_node's outputs vector + auto& output_node_vec = upstream_node->outputs; + for (auto output_node_iter = 
output_node_vec.begin(); + output_node_iter != output_node_vec.end();) { + if (*output_node_iter == old_optimizer_node) { + output_node_vec.erase(output_node_iter); + break; + } else { + ++output_node_iter; + } + } + + // Add the new_optimizer_node to upstream_node's outputs vector + output_node_vec.emplace_back(new_optimizer_node); + new_optimizer_node->inputs.emplace_back(upstream_node); +} + +void LockFreeOptimizePass::ReplaceAllDownstreamNode( + ir::Node* old_optimizer_node, ir::Node* new_optimizer_node) const { + PADDLE_ENFORCE(old_optimizer_node); + PADDLE_ENFORCE(new_optimizer_node); + + for (ir::Node* downstream_node : old_optimizer_node->outputs) { + // Remove the old_optimizer_node from downstream_node's inputs vector + auto& input_node_vec = downstream_node->inputs; + for (auto input_node_iter = input_node_vec.begin(); + input_node_iter != input_node_vec.end();) { + if (*input_node_iter == old_optimizer_node) { + input_node_vec.erase(input_node_iter); + break; + } else { + ++input_node_iter; + } + } + + // Add the new_optimizer_node to downstream_node's inputs vector + input_node_vec.emplace_back(new_optimizer_node); + new_optimizer_node->outputs.emplace_back(downstream_node); + } +} + +ir::Node* LockFreeOptimizePass::FindForwardOpViaBackwardOp( + ir::Graph* graph, ir::Node* backward_node) const { + PADDLE_ENFORCE(graph); + PADDLE_ENFORCE(backward_node); + + // strip the suffix _grad of backward_node's name + std::string forward_op_name = backward_node->Name(); + const std::string suffix("_grad"); + if (forward_op_name != suffix && forward_op_name.size() > suffix.size() && + forward_op_name.substr(forward_op_name.size() - suffix.size()) == + suffix) { + // if so then strip them off + forward_op_name = + forward_op_name.substr(0, forward_op_name.size() - suffix.size()); + } else { + LOG(WARNING) << "Illegal backward node's name " << backward_node->Name() + << " id " << backward_node->id(); + + return nullptr; + } + + for (ir::Node* node : graph->Nodes()) { + if (node->Name() == forward_op_name) { + if (node->outputs.size() == 0u) { + // if forward_node has no output, then it has NO grad op + continue; + } + + // check whether all inputs of the backward_op that ends_with @GRAD + // comes from the output of forward_op is the input of the backward_op + bool is_related_forward_node = true; + for (ir::Node* backward_input : backward_node->inputs) { + if (IsVarNameEndsWith(backward_input, kGradVarSuffix)) { + bool meets_correct_output = false; + for (ir::Node* forward_output : node->outputs) { + if (forward_output->Name() + kGradVarSuffix == + backward_input->Name()) { + meets_correct_output = true; + break; + } + } + + if (!meets_correct_output) { + is_related_forward_node = false; + break; + } + } + } + + if (is_related_forward_node) { + return node; + } + } + } + + return nullptr; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(lock_free_optimize_pass, + paddle::framework::ir::LockFreeOptimizePass); diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h new file mode 100644 index 0000000000..7310f596f8 --- /dev/null +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h @@ -0,0 +1,130 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_ +#define PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_ + +#include +#include + +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Node; + +/* +* Remove the sum op of all gradients of the backward op. +* And remove the dependecies of the optimizer related to the +* same backward op. +* +* Before this pass: +* +* forward_op1 forward_op2 +* | | +* grad_op1 grad_op2 +* \ / +* \ / +* sum_op +* | +* sgd_op +* +* After this pass: +* forward_op1 forward_op2 +* | | +* grad_op1 grad_op2 +* | | +* sgd_op1 sgd_op2 +* +* sgd_op1 and sgd_op2 will update the same weight which holds the same +* memory, so we could benefits from the acceleration +*/ +class LockFreeOptimizePass : public Pass { + public: + virtual ~LockFreeOptimizePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + + private: + // Create a new sgd node via current optimizer node + ir::Node* CreateNewSGDNode(ir::Graph* graph, ir::Node* forward_node, + ir::Node* backward_node, ir::Node* grad_sum_node, + ir::Node* optimize_node) const; + + // Replace the input weight's optimizers + void ReplaceUpstreamNode(ir::Node* upstream_node, + ir::Node* old_optimizer_node, + ir::Node* new_optimizer_node) const; + + // Replace the output weight's optimizers + void ReplaceAllDownstreamNode(ir::Node* old_optimizer_node, + ir::Node* new_optimizer_node) const; + + // Find all weight variables in graph + bool FindAllWeightVars(ir::Graph* graph) const; + + // Find the forward_op node via the backward_op node + ir::Node* FindForwardOpViaBackwardOp(ir::Graph* graph, + ir::Node* backward_node) const; + + std::vector FindConnectedNode(ir::Node* upstream_node, + ir::Node* downstream_node) const; + + inline bool IsOpNamed(ir::Node* node, const std::string& name) const { + PADDLE_ENFORCE(node); + + return node->NodeType() == Node::Type::kOperation && node->Name() == name; + } + + inline bool IsVarNamed(ir::Node* node, const std::string& name) const { + PADDLE_ENFORCE(node); + + return node->NodeType() == Node::Type::kVariable && node->Name() == name; + } + + inline bool IsVarNameEndsWith(ir::Node* node, const std::string& name) const { + PADDLE_ENFORCE(node); + + return node->NodeType() == Node::Type::kVariable && + boost::algorithm::ends_with(node->Name(), name); + } + + inline bool IsVarNameContains(ir::Node* node, const std::string& name) const { + PADDLE_ENFORCE(node); + + return node->NodeType() == Node::Type::kVariable && + node->Name().find(name) != std::string::npos; + } + + inline bool IsControlDepFrom(ir::Node* ctrl_dep_node, ir::Node* node) const { + PADDLE_ENFORCE(ctrl_dep_node); + PADDLE_ENFORCE(node); + + return IsControlDepVar(*ctrl_dep_node) && + ctrl_dep_node->inputs.size() >= 1u && + ctrl_dep_node->inputs[0] == node; + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle + +#endif // PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_ From 00e4de04bfa0ab0b90d153694fc7c597378bac16 Mon Sep 
17 00:00:00 2001 From: minqiyang Date: Mon, 7 Jan 2019 16:44:07 +0800 Subject: [PATCH 143/197] Polish code --- paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index 38dfae8ad6..758432fd9e 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -40,7 +40,7 @@ struct EmbeddingVSumFunctor { int64_t row_number = table_t->dims()[0]; int64_t row_width = table_t->dims()[1]; int64_t last_dim = output_t->dims()[1]; - int64_t *ids = const_cast(ids_t->data()); + const int64_t *ids = ids_t->data(); auto ids_lod = ids_t->lod()[0]; int64_t ids_count = ids_t->numel() / ids_lod.back(); From 0f94c1ac14a62372e0e5a35d5d0a393ca92472a5 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 7 Jan 2019 16:45:03 +0800 Subject: [PATCH 144/197] Polish code test=develop --- paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index 2d60b9e96c..758432fd9e 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -40,7 +40,7 @@ struct EmbeddingVSumFunctor { int64_t row_number = table_t->dims()[0]; int64_t row_width = table_t->dims()[1]; int64_t last_dim = output_t->dims()[1]; - int64_t *ids = ids_t->mutable_data(platform::CPUPlace()); + const int64_t *ids = ids_t->data(); auto ids_lod = ids_t->lod()[0]; int64_t ids_count = ids_t->numel() / ids_lod.back(); From ee59e60f779749a3d431a54f68a32ebc5624df02 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 7 Jan 2019 16:59:48 +0800 Subject: [PATCH 145/197] update mklml version test=develop --- CMakeLists.txt | 5 ----- cmake/external/boost.cmake | 7 ++----- cmake/external/mklml.cmake | 24 +++++++++++------------- 3 files changed, 13 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 66dcef0013..8ba8554456 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,11 +126,6 @@ if(ANDROID OR IOS) add_definitions(-DPADDLE_MOBILE_INFERENCE) endif() -if (APPLE) - set(WITH_MKL OFF CACHE STRING - "Disable MKL for building on mac" FORCE) -endif() - if (WIN32) set(WITH_DISTRIBUTE OFF CACHE STRING "Disable DISTRIBUTE when compiling for Windows" FORCE) diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake index 5a78a1d1b7..12412a51a0 100644 --- a/cmake/external/boost.cmake +++ b/cmake/external/boost.cmake @@ -23,11 +23,8 @@ set(BOOST_PROJECT "extern_boost") # checked that the devtools package of CentOS 6 installs boost 1.41.0. # So we use 1.41.0 here. 
set(BOOST_VER "1.41.0") -if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL)) - message(STATUS "use pre defined download url") - set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE) - set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE) -endif() +set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE) +set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE) MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 96127e78d6..c94878b6c7 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -36,19 +36,17 @@ else() endif() SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") -IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL)) - MESSAGE(STATUS "use pre defined download url") - if(WIN32) - SET(MKLML_VER "mklml_win_2019.0.1.20180928" CACHE STRING "" FORCE) - SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE) - elseif(APPLE) - SET(MKLML_VER "mklml_mac_2019.0.1.20180928" CACHE STRING "" FORCE) - SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) - else() - SET(MKLML_VER "mklml_lnx_2019.0.1.20180928" CACHE STRING "" FORCE) - SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) - ENDIF() -endif() +SET(TIME_VERSION "2019.0.1.20181227") +if(WIN32) + SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE) + SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE) +elseif(APPLE) + SET(MKLML_VER "mklml_mac_${TIME_VERSION}" CACHE STRING "" FORCE) + SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) +else() + SET(MKLML_VER "mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE) + SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) +ENDIF() SET(MKLML_PROJECT "extern_mklml") MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") From d752177b8f1fafe3588fe7f77a4960813f1bab4f Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 7 Jan 2019 08:57:21 +0000 Subject: [PATCH 146/197] enforce_dim_check_in_data_feeder test=develop --- python/paddle/fluid/data_feeder.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index c280ff21ee..1301525914 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -71,10 +71,20 @@ class DataToLoDTensorConverter(object): for each_data in data: self._feed_impl_(each_data, lod[1:], lod_level - 1) + def _check_shape_(self, shape): + for s1, s2 in zip(self.shape, shape): + if s1 != s2 and s1 >= 0 and s2 >= 0: + raise ValueError( + "Shape not match. What is defined in data layer is {}, but receive {}". 
+ format(self.shape, shape)) + def done(self): arr = numpy.array(self.data, dtype=self.dtype) - if self.shape and len(arr.shape) != len(self.shape): - arr = arr.reshape(self.shape) + if self.shape: + if len(arr.shape) != len(self.shape): + arr = arr.reshape(self.shape) + else: + self._check_shape_(arr.shape) t = core.LoDTensor() t.set(arr, self.place) if self.lod_level > 0: @@ -152,17 +162,8 @@ class DataFeeder(object): raise TypeError("Feed list should contain a list of variable") self.feed_dtypes.append(each_var.dtype) self.feed_names.append(each_var.name) - shape = each_var.shape - batch_size_dim = -1 - for i, s in enumerate(shape): - if s < 0: - batch_size_dim = i - break - if batch_size_dim == -1: - raise ValueError("Variable {0} must has a batch size dimension", - each_var.name) self.feed_lod_level.append(each_var.lod_level) - self.feed_shapes.append(shape) + self.feed_shapes.append(each_var.shape) self.place = place From 7923d7271f5f36d0cd13a3270bd5683c26f78724 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 7 Jan 2019 07:52:50 +0000 Subject: [PATCH 147/197] add fusion seqpool concat op --- .../fused/fusion_seqpool_concat_op.cc | 128 ++++++++++++++++++ .../fused/fusion_seqpool_concat_op.h | 41 ++++++ 2 files changed, 169 insertions(+) create mode 100644 paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc create mode 100644 paddle/fluid/operators/fused/fusion_seqpool_concat_op.h diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc new file mode 100644 index 0000000000..bf4ae6db13 --- /dev/null +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc @@ -0,0 +1,128 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/fused/fusion_seqpool_concat_op.h" +#include +#include +#include "paddle/fluid/operators/jit/kernels.h" + +namespace paddle { +namespace operators { + +void FusionSeqPoolConcatOp::InferShape( + framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL, + "Inputs(X) of FusionSeqPoolConcatOp should be empty."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of FusionSeqPoolConcatOp should not be null."); + int axis = ctx->Attrs().Get("axis"); + PADDLE_ENFORCE_EQ(axis, 1, + "FusionSeqPoolConcatOp only supports concat axis=1 yet."); + PADDLE_ENFORCE_EQ(ctx->Attrs().Get("pooltype"), "SUM", + "FusionSeqPoolConcatOp only supports sum pool type yet."); + + auto ins_dims = ctx->GetInputsDim("X"); + const size_t n = ins_dims.size(); + PADDLE_ENFORCE_GT(n, 0UL, "Input tensors count should > 0."); + if (n == 1) { + LOG(WARNING) << "Only have one input, may waste memory"; + } + + // The output height should be confirmed in Compute, + // since input lod is not accessible here. 
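// Worked example (illustrative only): two inputs of width 3 sharing the
// lod {0, 2, 5} give batch size 2, so Compute below resizes Out to
// {2, 3 * 2}: one sum-pooled row per sequence, with the two inputs laid
// side by side along axis 1.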
+ PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2UL, + "The dims size of first input should be 2."); + ctx->SetOutputDim("Out", {-1, ins_dims[0][axis] * static_cast(n)}); +} + +framework::OpKernelType FusionSeqPoolConcatOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + framework::GetDataTypeOfVar(ctx.MultiInputVar("X")[0]), ctx.GetPlace()); +} + +void FusionSeqPoolConcatOpMaker::Make() { + AddInput("X", "(LoDTensor) Input tensors of this operator.").AsDuplicable(); + AddOutput("Out", "(LoDTensor) Output tensor of concat operator."); + AddAttr("pooltype", + "(string, default 'AVERAGE') some of the pooling " + "pooltype of SequencePoolOp.") + .SetDefault("SUM") + .InEnum({"AVERAGE", "SUM", "SQRT"}); + AddAttr("axis", + "The axis along which the input tensors will be concatenated.") + .SetDefault(1); + AddComment(R"DOC( +Fusion Sequence Pool of pooltype(sum, average and sqrt) and Concat Operator. +)DOC"); +} + +template +class FusionSeqPoolConcatKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + auto* out = ctx.Output("Out"); + auto x0_lod = ins[0]->lod(); + auto x0_dims = ins[0]->dims(); + auto y_dims = out->dims(); + size_t bs = x0_lod[0].size() - 1; + out->Resize({static_cast(bs), y_dims[1]}); + framework::LoD y_lod(1); + y_lod[0].resize(bs + 1); + for (size_t i = 0; i <= bs; ++i) { + y_lod[0][i] = i; + } + out->set_lod(y_lod); + auto place = ctx.GetPlace(); + T* y_data = out->mutable_data(place); + + int w = ins[0]->numel() / x0_dims[0]; + PADDLE_ENFORCE_EQ(y_dims[1] % w, 0, + "The output of dims[1] should be dividable of w"); + jit::seq_pool_attr_t attr(w, jit::SeqPoolType::kSum); + auto seqpool = + jit::Get, platform::CPUPlace>( + attr); + size_t n = ins.size(); + for (size_t i = 0; i < n; ++i) { + auto x_dims = ins[i]->dims(); + auto x_lod = ins[i]->lod()[0]; + const T* src = ins[i]->data(); + T* dst = y_data + i * w; + PADDLE_ENFORCE_EQ(static_cast(ins[i]->numel() / x_dims[0]), w, + "Width of all inputs should be equal."); + PADDLE_ENFORCE_EQ(x_lod.size(), bs + 1, + "Batchsize of all inputs should be equal."); + for (size_t j = 0; j < bs; ++j) { + attr.h = static_cast(x_lod[j + 1] - x_lod[j]); + seqpool(src, dst, &attr); + dst += n * w; + src += attr.h * attr.w; + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fusion_seqpool_concat, ops::FusionSeqPoolConcatOp, + ops::FusionSeqPoolConcatOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OP_CPU_KERNEL(fusion_seqpool_concat, + ops::FusionSeqPoolConcatKernel, + ops::FusionSeqPoolConcatKernel); diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h new file mode 100644 index 0000000000..9f882a59d3 --- /dev/null +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class FusionSeqPoolConcatOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusionSeqPoolConcatOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle From 9793a0b6a6e26816a089dfaa65a3cf7a37f1e693 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 7 Jan 2019 18:52:21 +0800 Subject: [PATCH 148/197] fix_cudnn_compatible_check --- paddle/fluid/platform/device_context.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index be7f4949d6..09f3d3de54 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -292,7 +292,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) if (dynload::HasCUDNN()) { auto local_cudnn_version = cudnn_dso_ver / 100; auto compile_cudnn_version = CUDNN_VERSION / 100; - if (local_cuda_version < compile_cuda_version) { + if (local_cudnn_version < compile_cudnn_version) { LOG_FIRST_N(WARNING, 1) << "WARNING: device: " << place_.device << ". The installed Paddle is compiled with CUDNN " From c8f101e5da3497bfa12688d90d84cad52deee2f0 Mon Sep 17 00:00:00 2001 From: xiaolil1 <39753926+xiaolil1@users.noreply.github.com> Date: Mon, 7 Jan 2019 19:55:08 +0800 Subject: [PATCH 149/197] Conv int8 relu (#15130) * Enable basic MKL-DNN INT8 Conv OP test=develop * Modify test case test=develop * Clean unittest code test=develop * Fix test test=develop * Modify test test=develop * Enable MKL-DNN INT8 Conv with Relu Fusion OP test=develop * Modify basic INT8 Conv test=develop * fix type test=develop * Modify test test=develop --- paddle/fluid/operators/conv_mkldnn_op.cc | 69 ++++++++++++------ paddle/fluid/platform/mkldnn_reuse.h | 8 ++- .../unittests/test_conv2d_int8_mkldnn_op.py | 70 +++++++++++++++---- 3 files changed, 107 insertions(+), 40 deletions(-) diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 0f2bb8c65c..03d9d466c3 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -319,6 +319,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); + bool fuse_relu = ctx.Attr("fuse_relu"); + bool force_fp32_output = ctx.Attr("force_fp32_output"); bool is_conv3d = strides.size() == 3U; @@ -329,6 +331,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { dilations[2] == 1 : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, "dilation in convolution is not implemented yet"); + PADDLE_ENFORCE(is_conv3d != true, "int8 does not support conv3d currently"); const T* input_data = input->data(); @@ -340,15 +343,24 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { GetWeightsTz(weights_tz, g, is_conv3d); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + 
mkldnn::memory::data_type src_dt = + paddle::framework::ToMKLDNNDataType(input->type()); + auto dst_dt = fuse_relu ? paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType) + : paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType); + + if (force_fp32_output) { + dst_dt = paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType); + } + // Get unique name for storing MKLDNN primitives std::string key; key.reserve(MaxKeyLength); - mkldnn::memory::data_type src_dt = - paddle::framework::ToMKLDNNDataType(input->type()); platform::ConvMKLDNNHandler::AppendKey( &key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt, - input->format(), ctx.op().Output("Output")); - + input->format(), dst_dt, ctx.op().Output("Output")); const std::string key_conv_pd = key + "@conv_pd"; std::shared_ptr conv_p = nullptr; @@ -413,13 +425,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format); auto weights_md = platform::MKLDNNMemDesc( weights_tz, memory::data_type::s8, chosen_memory_format); - - auto dst_dt = force_fp32_output - ? paddle::framework::ToMKLDNNDataType( - framework::DataTypeTrait::DataType) - : paddle::framework::ToMKLDNNDataType( - framework::DataTypeTrait::DataType); - auto dst_md = platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format); // create a conv primitive descriptor and save it for usage in backward @@ -429,11 +434,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { memory::format::x); conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine, - output_shift_scale, is_test); + fuse_relu, output_shift_scale, is_test); } else { - conv_pd = - ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, - mkldnn_engine, output_shift_scale, is_test); + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, + paddings, mkldnn_engine, fuse_relu, + output_shift_scale, is_test); } // Save conv_pd/src_memory/weights_memory for backward pass dev_ctx.SetBlob(key_conv_pd, conv_pd); @@ -459,7 +464,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { mask_reorder); if (!force_fp32_output) { - dst_memory_p = platform::SetDstMemory(ctx, output, handler); + if (fuse_relu) { + dst_memory_p = platform::SetDstMemory(ctx, output, handler); + } else { + dst_memory_p = platform::SetDstMemory(ctx, output, handler); + } } else { dst_memory_p = platform::SetDstMemory(ctx, output, handler); } @@ -518,8 +527,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { mkldnn_engine, key)); } if (!force_fp32_output) { - dst_memory_p = - platform::SetDstMemoryHandler(ctx, output, handler); + if (fuse_relu) { + dst_memory_p = + platform::SetDstMemoryHandler(ctx, output, handler); + } else { + dst_memory_p = + platform::SetDstMemoryHandler(ctx, output, handler); + } } else { dst_memory_p = platform::SetDstMemoryHandler(ctx, output, handler); @@ -563,11 +577,18 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } mkldnn::primitive_attr CreatePostOps( - const std::vector output_shift_scale) const { + bool fuse_relu, const std::vector output_shift_scale) const { mkldnn::primitive_attr conv_attr; mkldnn::post_ops post_operations; int mask = output_shift_scale.size() > 1 ? 
1 << 1 : 0; conv_attr.set_output_scales(mask, output_shift_scale); + if (fuse_relu) { + constexpr float scale = 1.0f; + constexpr float negative_slope = 0.0f; + constexpr float placeholder = 1.0f; // beta + post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, + negative_slope, placeholder); + } conv_attr.set_post_ops(post_operations); return conv_attr; } @@ -600,7 +621,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, const memory::desc& dst, const std::vector& strides, const std::vector& paddings, - const mkldnn::engine& engine, + const mkldnn::engine& engine, const bool fuse_relu, const std::vector output_shift_scale, bool is_test) const { memory::dims stride_dims = {strides[0], strides[1]}; @@ -613,7 +634,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { propagation, mkldnn::convolution_direct, src, weights, dst, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - mkldnn::primitive_attr conv_attr = CreatePostOps(output_shift_scale); + mkldnn::primitive_attr conv_attr = + CreatePostOps(fuse_relu, output_shift_scale); auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( conv_desc, conv_attr, engine); @@ -652,7 +674,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const memory::desc& bias, const memory::desc& dst, const std::vector& strides, const std::vector& paddings, - const mkldnn::engine& engine, + const mkldnn::engine& engine, const bool fuse_relu, const std::vector output_shift_scale, bool is_test) const { memory::dims stride_dims = {strides[0], strides[1]}; @@ -665,7 +687,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { propagation, mkldnn::convolution_direct, src, weights, bias, dst, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - mkldnn::primitive_attr conv_attr = CreatePostOps(output_shift_scale); + mkldnn::primitive_attr conv_attr = + CreatePostOps(fuse_relu, output_shift_scale); auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( conv_desc, conv_attr, engine); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 98d1242a16..b3d20736a8 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -214,16 +214,18 @@ class MKLDNNHandler { std::string* key, const mkldnn::memory::dims& input_dims, const mkldnn::memory::dims& weights_dims, const std::vector& strides, const std::vector& paddings, const std::vector& dilations, - const int& groups, const mkldnn::memory::data_type& type, - const mkldnn::memory::format& format, const std::string& suffix) { + const int& groups, const mkldnn::memory::data_type& srcdt, + const mkldnn::memory::format& format, + const mkldnn::memory::data_type& dstdt, const std::string& suffix) { AppendKeyDims(key, input_dims); AppendKeyDims(key, weights_dims); AppendKeyVec(key, strides); AppendKeyVec(key, paddings); AppendKeyVec(key, dilations); AppendKey(key, std::to_string(groups)); - AppendKey(key, std::to_string(type)); + AppendKey(key, std::to_string(srcdt)); AppendKey(key, std::to_string(format)); + AppendKey(key, std::to_string(dstdt)); AppendKey(key, suffix); } diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py index ca35adc1a3..def188bfa6 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py @@ -47,7 +47,8 @@ class TestConv2dInt8Op(TestConv2dOp): self.init_group() self.init_dilation() self.init_test_case() - self.init_dtype() + self.init_fuse_relu() + self.init_data_type() conv2d_param = { 'stride': self.stride, @@ -78,7 +79,11 @@ class TestConv2dInt8Op(TestConv2dOp): np.round((input_shift) * self.scale_in).astype(np.int32), filter_int, self.groups, conv2d_param).astype(np.float32) * scale_output_shift - output = np.round(output1 - output2).astype(self.dsttype) + if self.fuse_relu: + output = np.maximum(np.round(output1 - output2), + 0).astype(self.dsttype) + else: + output = np.round(output1 - output2).astype(self.dsttype) else: filter_int = np.round(filter * self.scale_weights[0]).astype(np.int32) @@ -87,7 +92,15 @@ class TestConv2dInt8Op(TestConv2dOp): output1 = conv2d_forward_refer( input.astype(np.int32), filter_int, self.groups, conv2d_param).astype(np.float32) - output = np.round(output1 * scale_output_shift).astype(self.dsttype) + if self.fuse_relu: + output = np.maximum( + np.round(output1 * (self.scale_out / ( + self.scale_in * self.scale_weights[0]))), + 0).astype(self.dsttype) + else: + output = np.round(output1 * (self.scale_out / ( + self.scale_in * + self.scale_weights[0]))).astype(self.dsttype) self.inputs = { 'Input': @@ -106,6 +119,7 @@ class TestConv2dInt8Op(TestConv2dOp): 'Scale_in': self.scale_in, 'Scale_out': self.scale_out, 'Scale_weights': self.scale_weights, + 'fuse_relu': self.fuse_relu } self.outputs = {'Output': output} @@ -129,12 +143,15 @@ class TestConv2dInt8Op(TestConv2dOp): self.scale_out = 0.5 self.scale_weights = [10.0] - def init_dtype(self): + def init_data_type(self): self.srctype = np.uint8 self.dsttype = np.int8 + def init_fuse_relu(self): + self.fuse_relu = True -#--------------------test conv2d u8 in and s8 out-------------------- + +#--------------------test conv2d u8 in and u8 out-------------------- class TestConv2d(TestConv2dInt8Op): @@ -203,18 +220,43 @@ class TestWithInput1x1Filter1x1(TestConv2dInt8Op): self.groups = 3 -#--------------------test conv2d s8 in and s8 out-------------------- +def init_data_type_with_fusion(self, input_dt, fuse_relu): + self.srctype = input_dt + self.dsttype = np.uint8 if fuse_relu else np.int8 + + def init_fuse_relu(self): + self.fuse_relu = fuse_relu def create_test_int8_class(parent): - class TestInt8Case(parent): - def init_dtype(self): - self.srctype = np.int8 - self.dsttype = np.int8 - - cls_name = "{0}_{1}".format(parent.__name__, "s8s8") - TestInt8Case.__name__ = cls_name - globals()[cls_name] = TestInt8Case + + #--------------------test conv2d s8 in and u8 out-------------------- + + class TestS8U8Case(parent): + def init_data_type(self): + init_data_type_with_fusion(self, np.int8, True) + + #--------------------test conv2d s8 in and s8 out-------------------- + + class TestS8S8Case(parent): + def init_data_type(self): + init_data_type_with_fusion(self, np.int8, False) + + #--------------------test conv2d u8 in and s8 out-------------------- + + class TestU8S8Case(parent): + def init_data_type(self): + init_data_type_with_fusion(self, np.uint8, False) + + cls_name_s8u8 = "{0}_relu_{1}".format(parent.__name__, "1") + cls_name_s8s8 = "{0}_relu_{1}".format(parent.__name__, "0") + cls_name_u8s8 = "{0}_relu_{1}".format(parent.__name__, "0") + TestS8U8Case.__name__ = cls_name_s8u8 + TestS8S8Case.__name__ = cls_name_s8s8 + TestU8S8Case.__name__ = cls_name_u8s8 + globals()[cls_name_s8u8] = TestS8U8Case + globals()[cls_name_s8s8] = 
TestS8S8Case + globals()[cls_name_u8s8] = TestU8S8Case create_test_int8_class(TestConv2dInt8Op) From 7dc0181c46d0833a9b951dda84c886d697accac9 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 7 Jan 2019 19:56:32 +0800 Subject: [PATCH 150/197] run analyzer_tester serial in multi-thread test=develop --- paddle/fluid/inference/tests/api/CMakeLists.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index a1a79c6885..131712ca88 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -41,7 +41,7 @@ endfunction() if(NOT APPLE AND WITH_MKLML) set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz") - inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc) + inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc SERIAL) else() # TODO: fix this test on MACOS and OPENBLAS, the reason is that # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS @@ -56,14 +56,14 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2 # normal DAM set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") -inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc) +inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc SERIAL) # small DAM set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz") inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1) + ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1 SERIAL) # chinese_ner set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") @@ -111,11 +111,11 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose # resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 - "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") + "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL) # mobilenet with depthwise_conv op inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv - "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz") + "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL) # anakin if (WITH_ANAKIN AND WITH_MKL) # only needed in CI From 6ccf8685f781153baca5ce14412de4263ab64bef Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Mon, 7 Jan 2019 19:59:01 +0800 Subject: [PATCH 151/197] refactor tensorrt node teller (#15181) --- paddle/fluid/inference/analysis/argument.h | 2 - .../inference/analysis/ir_pass_manager.cc | 10 --- .../analysis/ir_passes/CMakeLists.txt | 18 +++-- .../ir_passes/tensorrt_subgraph_pass.cc | 8 ++- .../passes/ir_analysis_compose_pass.cc | 23 ------- .../passes/ir_analysis_compose_pass.h | 
2 - .../fluid/inference/tensorrt/CMakeLists.txt | 1 + paddle/fluid/inference/tensorrt/op_teller.cc | 49 +++++++++++++ paddle/fluid/inference/tensorrt/op_teller.h | 68 +++++++++++++++++++ 9 files changed, 134 insertions(+), 47 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/op_teller.cc create mode 100644 paddle/fluid/inference/tensorrt/op_teller.h diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 2db5705d09..2d8980b1d1 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -123,8 +123,6 @@ struct Argument { DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool); - DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller, - std::function); DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index b8c9426ed3..e37fea38bc 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -49,13 +49,6 @@ void IRPassManager::CreatePasses(Argument *argument, for (const std::string &pass_name : passes) { auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); - // Set some pass attributes. - if (pass_name == "ir_analysis_pass") { - pass->Set("tensorrt_node_teller", - new SubgraphDetector::NodeInsideSubgraphTeller( - argument->tensorrt_node_teller())); - } - if (pass_name == "graph_viz_pass") { std::string dot_file_path = std::to_string(pass_num) + "_ir_" + (pre_pass.empty() ? 
"origin" : pre_pass) + @@ -70,9 +63,6 @@ void IRPassManager::CreatePasses(Argument *argument, } if (pass_name == "tensorrt_subgraph_pass") { - PADDLE_ENFORCE(argument->tensorrt_node_teller_valid()); - pass->SetNotOwned("tensorrt_node_teller", - argument->tensorrt_node_teller_ptr()); pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("min_subgraph_size", diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index 822c7799bb..9ae5b8aa17 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -1,9 +1,13 @@ cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) -cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector) -set(analysis_deps ${analysis_deps} - subgraph_detector tensorrt_subgraph_pass - CACHE INTERNAL "") -set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) -file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") -set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") +if (TENSORRT_FOUND) + cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller) + + set(analysis_deps ${analysis_deps} + subgraph_detector tensorrt_subgraph_pass + CACHE INTERNAL "") + + set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) + file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") + set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") +endif() diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index ad10010e42..bc06e78ae6 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" +#include "paddle/fluid/inference/tensorrt/op_teller.h" namespace paddle { namespace inference { @@ -35,8 +36,10 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( std::unique_ptr graph) const { framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); - auto teller = - Get("tensorrt_node_teller"); + auto teller = [](const framework::ir::Node *node) { + if (!node->IsOp() || !node->Op()) return false; + return tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op()); + }; SubGraphFuser fuser(graph.get(), teller, Get("min_subgraph_size") /*min subgraph size*/); @@ -232,7 +235,6 @@ std::vector ExtractParameters( REGISTER_PASS(tensorrt_subgraph_pass, paddle::inference::analysis::TensorRtSubgraphPass) - .RequirePassAttr("tensorrt_node_teller") .RequirePassAttr("max_batch_size") .RequirePassAttr("workspace_size") .RequirePassAttr("min_subgraph_size"); diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc index c3a2b3ca1d..490189e550 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc @@ -27,9 +27,6 @@ 
namespace analysis { void IrAnalysisComposePass::RunImpl(Argument *argument) { ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); - if (argument->use_tensorrt_valid() && argument->use_tensorrt()) { - InitTensorRTAttrs(argument); - } ApplyIrPasses(argument); CollectFusionStatis(argument); } @@ -38,26 +35,6 @@ std::string IrAnalysisComposePass::repr() const { return "ir-analysis-compose-pass"; } -void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) { - if (argument->use_tensorrt_valid() && argument->use_tensorrt()) { - LOG(INFO) << "Initing TensorRT pass"; - argument->SetTensorRtNodeTeller([](const framework::ir::Node *node) { - std::unordered_set teller_set( - {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", - "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", - "elementwise_add", "elementwise_mul", "dropout", "split", "prelu", - "conv2d_transpose", "leaky_relu"}); - if (!node->IsOp()) return false; - - if (teller_set.count(node->Op()->Type())) { - return true; - } else { - return false; - } - }); - } -} - void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) { std::vector passes({ "ir_graph_build_pass", "ir_analysis_pass", diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h index 53e2ebb003..16c6b7d84d 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h @@ -33,8 +33,6 @@ class IrAnalysisComposePass : public AnalysisPass { std::string repr() const override; private: - void InitTensorRTAttrs(Argument* argument); - void ApplyIrPasses(Argument* argument); void CollectFusionStatis(Argument* argument); diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index 17f6c6d9f1..9afeafd176 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,5 @@ nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) +nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) add_subdirectory(plugin) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc new file mode 100644 index 0000000000..9fecad6eb3 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/op_teller.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +// Just tell by the op_types. 
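+// A teller answers whether a single op can be handed to TensorRT; this
+// default implementation ignores the OpDesc and only checks that the op
+// type appears in a fixed whitelist.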
+struct SimpleOpTypeSetTeller : public Teller {
+  SimpleOpTypeSetTeller() {}
+
+  bool operator()(const std::string& op_type,
+                  const framework::OpDesc& desc) override {
+    return teller_set.count(op_type);
+  }
+
+ private:
+  std::unordered_set<std::string> teller_set{
+      {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
+       "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
+       "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
+       "conv2d_transpose", "leaky_relu"}};
+};
+
+bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
+  for (auto& teller : tellers_) {
+    if ((*teller)(op_type, desc)) return true;
+  }
+  return false;
+}
+
+OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); }
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h
new file mode 100644
index 0000000000..b98f052bf2
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/op_teller.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_desc.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * Single op teller definition.
+ * One can override this and define a more complex tell logic, considering
+ * more issues such as the op_desc.
+ */
+struct Teller {
+  virtual bool operator()(const std::string& op_type,
+                          const framework::OpDesc& desc) = 0;
+
+  virtual ~Teller() = default;
+};
+/*
+ * A real example:
+ *
+ * struct SomeTeller : public Teller {
+ *   bool operator()(const std::string& op_type,
+ *                   const framework::OpDesc& desc) override {
+ *     return op_type == "fc" && desc.Inputs().size() == 2;
+ *   }
+ * };
+ */
+
+/*
+ * class OpTeller helps to tell whether a fluid
+ * operator can be transformed to a TensorRT layer.
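+ *
+ * Typical use, as in tensorrt_subgraph_pass above:
+ *   tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());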
+ */ +class OpTeller { + public: + static OpTeller& Global() { + static std::unique_ptr x(new OpTeller); + return *x; + } + + bool Tell(const std::string& op_type, const framework::OpDesc& desc); + + private: + OpTeller(); + + private: + std::vector> tellers_; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle From 316636404ff8294890668ce1ae55f0b0ec4ec621 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 7 Jan 2019 10:30:47 +0000 Subject: [PATCH 152/197] add seqpool concat unit test --- .../fused/fusion_seqpool_concat_op.cc | 8 +- .../test_fusion_seqpool_concat_op.py | 118 ++++++++++++++++++ .../unittests/test_reorder_lod_tensor.py | 15 +-- .../fluid/tests/unittests/test_seq_pool.py | 49 ++++---- 4 files changed, 159 insertions(+), 31 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc index bf4ae6db13..578ff6b2d0 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc @@ -29,8 +29,6 @@ void FusionSeqPoolConcatOp::InferShape( int axis = ctx->Attrs().Get("axis"); PADDLE_ENFORCE_EQ(axis, 1, "FusionSeqPoolConcatOp only supports concat axis=1 yet."); - PADDLE_ENFORCE_EQ(ctx->Attrs().Get("pooltype"), "SUM", - "FusionSeqPoolConcatOp only supports sum pool type yet."); auto ins_dims = ctx->GetInputsDim("X"); const size_t n = ins_dims.size(); @@ -74,6 +72,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto ins = ctx.MultiInput("X"); auto* out = ctx.Output("Out"); + std::string pooltype = ctx.Attr("pooltype"); auto x0_lod = ins[0]->lod(); auto x0_dims = ins[0]->dims(); auto y_dims = out->dims(); @@ -92,6 +91,11 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(y_dims[1] % w, 0, "The output of dims[1] should be dividable of w"); jit::seq_pool_attr_t attr(w, jit::SeqPoolType::kSum); + if (pooltype == "AVERAGE") { + attr.type = jit::SeqPoolType::kAvg; + } else if (pooltype == "SQRT") { + attr.type = jit::SeqPoolType::kSqrt; + } auto seqpool = jit::Get, platform::CPUPlace>( attr); diff --git a/python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py b/python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py new file mode 100644 index 0000000000..8a6837dae2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py @@ -0,0 +1,118 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
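+#
+# These tests run fusion_seqpool_concat with SUM, AVERAGE and SQRT pooling
+# and check the result against compute_seqpool_{sum,avg,sqrt}. A LoD such
+# as [[2, 3, 5]] lists sequence lengths; convert_to_offset turns it into
+# [[0, 2, 5, 10]], so the slices pooled per input are x[0:2], x[2:5] and
+# x[5:10].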
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +from test_reorder_lod_tensor import convert_to_offset +from test_seq_pool import compute_seqpool_sum, compute_seqpool_avg, compute_seqpool_sqrt + + +class TestFusionSeqPoolConcatOp(OpTest): + def setUp(self): + self.w = 11 + self.lods = [[[2, 3, 5]], [[1, 5, 2]]] + self.set_conf() + self.set_pooltype() + self.op_type = 'fusion_seqpool_concat' + self.axis = 1 + bs = len(self.lods[0][0]) + inputs = [] + outs = [] + i = 0 + for lod in self.lods: + assert bs == len(lod[0]), 'All lod size should be equal' + x = np.random.uniform(0.1, 1, + [sum(lod[0]), self.w]).astype('float32') + offset = convert_to_offset(lod) + out = np.zeros((bs, self.w)).astype('float32') + if self.pooltype == "SUM": + compute_seqpool_sum(x, offset, out) + elif self.pooltype == "AVERAGE": + compute_seqpool_avg(x, offset, out) + elif self.pooltype == "SQRT": + compute_seqpool_sqrt(x, offset, out) + else: + raise Exception("Unsupported pool type!") + inputs.append(('x_{0}'.format(i), (x, lod))) + outs.append(out) + i = i + 1 + + self.inputs = {'X': inputs} + self.outputs = {'Out': np.concatenate(outs, axis=self.axis)} + self.attrs = { + 'pooltype': self.pooltype, + 'axis': self.axis, + } + + def set_pooltype(self): + self.pooltype = "SUM" + + def set_conf(self): + pass + + def test_check_output(self): + self.check_output() + + +class TestFusionSeqPoolConcatOpCase1(TestFusionSeqPoolConcatOp): + def set_conf(self): + self.lods = [[[1]]] + + +class TestFusionSeqPoolConcatOpCase2(TestFusionSeqPoolConcatOp): + def set_conf(self): + self.lods = [[[1]], [[1]], [[1]]] + + +class TestFusionSeqPoolConcatOpCase3(TestFusionSeqPoolConcatOp): + def set_conf(self): + self.lods = [[[1, 3, 4, 6]]] + self.w = 10 + + +class TestFusionSeqPoolConcatOpCase4(TestFusionSeqPoolConcatOp): + def set_conf(self): + self.lods = [[[2, 13, 4]], [[1, 1, 1]], [[5, 3, 1]], [[9, 10, 3]]] + self.w = 3 + + +## test avg pool and sqrt +def create_test_avg_sqrt_class(parent): + class TestSeqPoolAvgCase(parent): + def set_pooltype(self): + self.pooltype = "AVERAGE" + + class TestSeqPoolSqrtCase(parent): + def set_pooltype(self): + self.pooltype = "SQRT" + + cls_name_avg = "{0}_{1}".format(parent.__name__, "avg") + cls_name_sqrt = "{0}_{1}".format(parent.__name__, "sqrt") + TestSeqPoolAvgCase.__name__ = cls_name_avg + TestSeqPoolSqrtCase.__name__ = cls_name_sqrt + globals()[cls_name_avg] = TestSeqPoolAvgCase + globals()[cls_name_sqrt] = TestSeqPoolSqrtCase + + +create_test_avg_sqrt_class(TestFusionSeqPoolConcatOp) +create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase1) +create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase2) +create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase3) +create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase4) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py index 28c8c4699a..a7fd271ae7 100644 --- a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py @@ -22,6 +22,14 @@ import numpy import functools +def convert_to_offset(lod): + offset = [[0] for i in lod] + for i, level in enumerate(lod): + for seq_len in level: + offset[i].append(offset[i][-1] + seq_len) + return offset + + class TestReorderLoDTensor(unittest.TestCase): num_seq = 5 # [name, shape, lod_level] pair indicating data info of source and 
target @@ -91,13 +99,6 @@ class TestReorderLoDTensor(unittest.TestCase): self.inputs[desc[0]] = tensor def reorder(self): - def convert_to_offset(lod): - offset_lod = [[0] for i in lod] - for i, level in enumerate(lod): - for seq_len in level: - offset_lod[i].append(offset_lod[i][-1] + seq_len) - return offset_lod - level = 0 # compute the rank_table according to ref_lod ref_lod = self.data[self.data_desc[1][0]][1][level] diff --git a/python/paddle/fluid/tests/unittests/test_seq_pool.py b/python/paddle/fluid/tests/unittests/test_seq_pool.py index a80ad5b079..176265428c 100644 --- a/python/paddle/fluid/tests/unittests/test_seq_pool.py +++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py @@ -17,33 +17,43 @@ from __future__ import print_function import unittest import numpy as np from op_test import OpTest +from test_reorder_lod_tensor import convert_to_offset -class TestSeqAvgPool(OpTest): - def convert_to_offset(self, lod): - offset = [[0] for i in lod] - for i, level in enumerate(lod): - for seq_len in level: - offset[i].append(offset[i][-1] + seq_len) - return offset +def compute_seqpool_sum(x, offset, out): + for i in range(len(offset[0]) - 1): + sub_x = x[offset[0][i]:offset[0][i + 1], :] + out[i] = sub_x.sum(axis=0) + + +def compute_seqpool_avg(x, offset, out): + for i in range(len(offset[0]) - 1): + sub_x = x[offset[0][i]:offset[0][i + 1], :] + out[i] = sub_x.mean(axis=0) + +def compute_seqpool_sqrt(x, offset, out): + for i in range(len(offset[0]) - 1): + sub_x = x[offset[0][i]:offset[0][i + 1], :] + seq_len = offset[0][i + 1] - offset[0][i] + out[i] = sub_x.sum(axis=0) / np.sqrt(seq_len) + + +class TestSeqAvgPool(OpTest): def set_data(self): self.op_type = 'sequence_pool' # one level, batch size is 4 x = np.random.uniform(0.1, 1, [11, 23]).astype('float32') lod = [[11]] self.inputs = {'X': (x, lod)} - offset = self.convert_to_offset(lod) - + offset = convert_to_offset(lod) out = np.zeros((len(lod[0]), 23)).astype('float32') self.outputs = {'Out': out} return x, offset, out def compute(self, x, offset, out): self.attrs = {'pooltype': "AVERAGE"} - for i in range(len(offset[0]) - 1): - sub_x = x[offset[0][i]:offset[0][i + 1], :] - out[i] = sub_x.mean(axis=0) + compute_seqpool_avg(x, offset, out) def setUp(self): x, offset, out = self.set_data() @@ -62,9 +72,7 @@ class TestSeqAvgPool(OpTest): class TestSeqSumPool(TestSeqAvgPool): def compute(self, x, offset, out): self.attrs = {'pooltype': "SUM"} - for i in range(len(offset[0]) - 1): - sub_x = x[offset[0][i]:offset[0][i + 1], :] - out[i] = sub_x.sum(axis=0) + compute_seqpool_sum(x, offset, out) class TestSeqMaxPool(TestSeqAvgPool): @@ -72,7 +80,7 @@ class TestSeqMaxPool(TestSeqAvgPool): self.op_type = 'sequence_pool' x = np.random.uniform(0.1, 1, [13, 23]).astype('float32') lod = [[13]] - offset = self.convert_to_offset(lod) + offset = convert_to_offset(lod) for i in range(len(offset[0]) - 1): l = offset[0][i + 1] - offset[0][i] x[offset[0][i] + np.random.randint(l), :] += 2.0 @@ -93,10 +101,7 @@ class TestSeqMaxPool(TestSeqAvgPool): class TestSeqSqrtPool(TestSeqAvgPool): def compute(self, x, offset, out): self.attrs = {'pooltype': "SQRT"} - for i in range(len(offset[0]) - 1): - sub_x = x[offset[0][i]:offset[0][i + 1], :] - seq_len = offset[0][i + 1] - offset[0][i] - out[i] = sub_x.sum(axis=0) / np.sqrt(seq_len) + compute_seqpool_sqrt(x, offset, out) class TestSeqLastPool(TestSeqAvgPool): @@ -122,7 +127,7 @@ class TestSeqAvgPool2D(TestSeqAvgPool): x = np.random.uniform(0.1, 1, [13, 3, 17]).astype('float32') lod = [[4, 1, 3, 5]] 
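+        # lod [[4, 1, 3, 5]] converts to offset [[0, 4, 5, 8, 13]]: four
+        # sequences covering the 13 input rows.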
self.inputs = {'X': (x, lod)} - offset = self.convert_to_offset(lod) + offset = convert_to_offset(lod) out = np.zeros((4, 3, 17)).astype('float32') self.outputs = {'Out': out} @@ -167,7 +172,7 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D): x = np.random.uniform(0.1, 1, [13, 3, 11]).astype('float32') lod = [[4, 1, 3, 5]] self.inputs = {'X': (x, lod)} - offset = self.convert_to_offset(lod) + offset = convert_to_offset(lod) for i in range(len(offset[0]) - 1): l = offset[0][i + 1] - offset[0][i] x[offset[0][i] + np.random.randint(l), :] += 1.0 From 7f45b9511aa1cf18f36709627a01a59bc1d3e661 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 7 Jan 2019 22:54:01 +0800 Subject: [PATCH 153/197] Polish code --- paddle/fluid/framework/operator.cc | 1 + paddle/fluid/operators/hash_op.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f10da22aec..afece8e3d2 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -29,6 +29,7 @@ DECLARE_bool(benchmark); DEFINE_bool(check_nan_inf, false, "Checking whether operator produce NAN/INF or not. It will be " "extremely slow so please use this flag wisely."); +DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op"); namespace paddle { namespace framework { diff --git a/paddle/fluid/operators/hash_op.h b/paddle/fluid/operators/hash_op.h index 9781bb0f45..1ed3ffe9aa 100644 --- a/paddle/fluid/operators/hash_op.h +++ b/paddle/fluid/operators/hash_op.h @@ -45,7 +45,7 @@ class HashKerel : public framework::OpKernel { for (int idx = 0; idx < seq_length; ++idx) { for (int ihash = 0; ihash != num_hash; ++ihash) { output[idx * num_hash + ihash] = - XXH64(input, sizeof(int) * last_dim, ihash) % mod_by; + XXH32(input, sizeof(int) * last_dim, ihash) % mod_by; } input += last_dim; } From 1bfbc0d963db26fcf72b9b53d568e0b102d50a5d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 7 Jan 2019 22:54:47 +0800 Subject: [PATCH 154/197] Polish code test=develop --- paddle/fluid/framework/operator.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index afece8e3d2..f10da22aec 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -29,7 +29,6 @@ DECLARE_bool(benchmark); DEFINE_bool(check_nan_inf, false, "Checking whether operator produce NAN/INF or not. 
It will be " "extremely slow so please use this flag wisely."); -DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op"); namespace paddle { namespace framework { From b76695418ad6cfe16f5fe54f9768fdf3b467a241 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 7 Jan 2019 22:55:59 +0800 Subject: [PATCH 155/197] Polish log test=develop --- .../framework/ir/lock_free_optimize_pass.cc | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.cc b/paddle/fluid/framework/ir/lock_free_optimize_pass.cc index 96e7060aac..92e897ca9c 100644 --- a/paddle/fluid/framework/ir/lock_free_optimize_pass.cc +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.cc @@ -80,7 +80,7 @@ std::unique_ptr LockFreeOptimizePass::ApplyImpl( if (IsVarNameEndsWith(merged_grad_var, kGradVarSuffix) && merged_grad_var->outputs.size() == 1u) { ir::Node* opt_node = merged_grad_var->outputs[0]; - LOG(ERROR) << "Found opt node " << opt_node->Name(); + VLOG(3) << "Found opt node " << opt_node->Name(); // find the backward op connected with sum op for (ir::Node* unmerged_grad_var : node->inputs) { @@ -88,13 +88,13 @@ std::unique_ptr LockFreeOptimizePass::ApplyImpl( unmerged_grad_var->inputs.size() == 1u) { ir::Node* backward_op = unmerged_grad_var->inputs[0]; - LOG(ERROR) << "Found backward_op " << backward_op->Name(); + VLOG(3) << "Found backward_op " << backward_op->Name(); // find the forward op related to the backward op ir::Node* forward_op = FindForwardOpViaBackwardOp(graph.get(), backward_op); - LOG(ERROR) << "Found forward_op " << forward_op->Name(); + VLOG(3) << "Found forward_op " << forward_op->Name(); PADDLE_ENFORCE(forward_op); @@ -114,29 +114,28 @@ std::unique_ptr LockFreeOptimizePass::ApplyImpl( for (Node* optimize_op : sum_op_output->outputs) { if (optimize_op->NodeType() == Node::Type::kOperation && optimize_op->Name() == kOptimizerType) { - LOG(ERROR) << "remove optimize_op: " << optimize_op->Name() << "_" - << optimize_op->id(); + VLOG(3) << "remove optimize_op: " << optimize_op->Name() << "_" + << optimize_op->id(); graph->RemoveNode(optimize_op); } } - LOG(ERROR) << "remove sum_op_output: " << sum_op_output->Name() << "_" - << sum_op_output->id(); + VLOG(3) << "remove sum_op_output: " << sum_op_output->Name() << "_" + << sum_op_output->id(); graph->RemoveNode(sum_op_output); } - LOG(ERROR) << "remove sum_op: " << sum_op->Name() << "_" << sum_op->id(); + VLOG(3) << "remove sum_op: " << sum_op->Name() << "_" << sum_op->id(); graph->RemoveNode(sum_op); } for (auto* node : graph->Nodes()) { for (Node* output_node : node->outputs) { if (output_node->Name() == "sgd") { - LOG(ERROR) << "Node link to SGD: " << node->Name() << "_" << node->id() - << " --> " << output_node->Name() << "_" - << output_node->id(); + VLOG(3) << "Node link to SGD: " << node->Name() << "_" << node->id() + << " --> " << output_node->Name() << "_" << output_node->id(); for (Node* input_node : node->inputs) { - LOG(ERROR) << "SGD Input link: " << input_node->Name() << "_" - << input_node->id() << " --> " << node->Name() << "_" - << node->id(); + VLOG(3) << "SGD Input link: " << input_node->Name() << "_" + << input_node->id() << " --> " << node->Name() << "_" + << node->id(); } } } @@ -226,8 +225,7 @@ ir::Node* LockFreeOptimizePass::CreateNewSGDNode( } } - LOG(ERROR) << "Create new opt node" << sgd_node->Name() << "_" - << sgd_node->id(); + VLOG(3) << "Create new opt node" << sgd_node->Name() << "_" << sgd_node->id(); return sgd_node; } From 
5979953720ce35e5607f227d7b4c2400df0b8a35 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 7 Jan 2019 22:56:41 +0800 Subject: [PATCH 156/197] Remove debug info test=develop --- CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 74d869307d..d6aa8f1b85 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License -set(CMAKE_VERBOSE_MAKEFILE on) - cmake_minimum_required(VERSION 3.0) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) From c4b09a713f2b21e239085d67943e0da0e493d711 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 7 Jan 2019 14:15:13 +0800 Subject: [PATCH 157/197] polish test=develop --- paddle/fluid/imperative/layer.h | 1 + python/paddle/fluid/executor.py | 40 +++++++++++------------- python/paddle/fluid/parallel_executor.py | 2 +- 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 2abda933cf..34cffd1aa3 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -77,6 +77,7 @@ class PreparedOp { framework::OperatorWithKernel::OpKernelFunc func; platform::DeviceContext* dev_ctx; }; + class OpBase; class VarBase { diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 67e569eac0..1a940b30c1 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -208,20 +208,20 @@ def _fetch_var(name, scope=None, return_numpy=True): return tensor -def _get_program_cache_key(feed, fetch_list): - feed_var_names = list(feed.keys()) +def _to_name_str(var): + if isinstance(var, Variable): + return var.desc.name() + elif isinstance(var, str): + return var + elif isinstance(var, six.string_types): + return str(var) + else: + raise TypeError(str(var) + " should be Variable or str") - def to_name_str(var): - if isinstance(var, Variable): - return var.desc.name() - elif isinstance(var, str): - return var - elif isinstance(var, six.string_types): - return str(var) - else: - raise TypeError(str(var) + " should be Variable or str") - fetch_var_names = list(map(to_name_str, fetch_list)) +def _get_program_cache_key(feed, fetch_list): + feed_var_names = list(feed.keys()) + fetch_var_names = list(map(_to_name_str, fetch_list)) return str(feed_var_names + fetch_var_names) @@ -397,11 +397,8 @@ class Executor(object): self.executor.close() self._closed = True - def _run_parallel(self, - scope, - feed=None, - fetch_list=None, - return_numpy=True): + def _run_parallel(self, scope, feed, fetch_list, fetch_var_name, + return_numpy): if isinstance(feed, dict): feed_tensor_dict = dict() for feed_name in feed: @@ -437,8 +434,8 @@ class Executor(object): res.append(res_dict) self.executor.feed_tensors_into_local_scopes(res) - fetch_var_name = '@FETCHED_VAR_NAME@' - self.executor.run(fetch_list, fetch_var_name) + fetch_var_names = list(map(_to_name_str, fetch_list)) + self.executor.run(fetch_var_names, fetch_var_name) arr = scope.find_var(fetch_var_name).get_lod_tensor_array() if return_numpy: @@ -504,6 +501,8 @@ class Executor(object): if scope is None: scope = global_scope() + if fetch_list is None: + fetch_list = [] compiled = isinstance(program, compiler.CompiledProgram) # For backward compatibility, run directly. 
@@ -529,6 +528,7 @@ class Executor(object): scope=scope, feed=feed, fetch_list=fetch_list, + fetch_var_name=fetch_var_name, return_numpy=return_numpy) else: # TODO(panyx0718): Can compile program to optimize executor @@ -552,8 +552,6 @@ class Executor(object): raise TypeError( "feed requires dict as its Parameter. But you passed in %s" % (type(feed))) - if fetch_list is None: - fetch_list = [] if program is None: program = default_main_program() diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index a0b6392ebc..ef75f4802a 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -279,7 +279,7 @@ class ParallelExecutor(object): res.append(res_dict) self.executor.feed_tensors_into_local_scopes(res) - fetch_var_name = '@FETCHED_VAR_NAME@' + fetch_var_name = 'fetch' self.executor.run(fetch_list, fetch_var_name) arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() From dacfaaa966b5e8d0b809e1f38600b30d44b1f7f0 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Tue, 8 Jan 2019 10:08:33 +0800 Subject: [PATCH 158/197] Revert "Remove op handle lock" test=develop --- paddle/fluid/operators/math/blas_impl.cu.h | 134 ++++++++++--------- paddle/fluid/platform/cuda_helper.h | 58 -------- paddle/fluid/platform/device_context.cc | 18 +-- paddle/fluid/platform/device_context.h | 76 +++++++---- paddle/fluid/platform/device_context_test.cu | 3 + 5 files changed, 130 insertions(+), 159 deletions(-) delete mode 100644 paddle/fluid/platform/cuda_helper.h diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 58f7be12ce..d35073029a 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -62,19 +62,27 @@ struct CUBlas { cudaDataType_t Atype, int lda, const void *B, cudaDataType_t Btype, int ldb, const float *beta, void *C, cudaDataType_t Ctype, int ldc) { -// Because the gcc 4.8 doesn't expand template parameter pack that -// appears in a lambda-expression, I can not use template parameter pack -// here. + // Because the gcc 4.8 doesn't expand template parameter pack that + // appears in a lambda-expression, I can not use template parameter pack + // here. + auto cublas_call = [&]() { #if CUDA_VERSION >= 8000 - VLOG(5) << "use_tensor_op_math: " - << (dev_ctx->tensor_core_available() ? "True" : "False"); - dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + VLOG(5) << "use_tensor_op_math: " + << (platform::TensorCoreAvailable() ? "True" : "False"); PADDLE_ENFORCE(platform::dynload::cublasSgemmEx( - handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, - beta, C, Ctype, ldc)); - }); + dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, + lda, B, Btype, ldb, beta, C, Ctype, ldc)); #else - PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); + PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); +#endif + }; + +#if CUDA_VERSION >= 9000 + // NOTES: To use Tensor Core, we should change the cublas config, + // but the cublas may be hold by multi-thread. 
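+      // CublasCall takes the handle lock and scopes the math-mode change,
+      // restoring the previous mode once cublas_call has run.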
+ dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); +#else + cublas_call(); #endif } }; @@ -162,24 +170,32 @@ struct CUBlas { cudaDataType_t Btype, int ldb, const void *beta, void *C, cudaDataType_t Ctype, int ldc, cudaDataType_t computeType) { + auto cublas_call = [&]() { #if CUDA_VERSION >= 8000 - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = dev_ctx->tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); + bool use_tensor_op_math = platform::TensorCoreAvailable(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); #endif // CUDA_VERSION >= 9000 - dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE(platform::dynload::cublasGemmEx( - handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, - beta, C, Ctype, ldc, computeType, algo)); - }); + dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, + lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); #else - PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); + PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); +#endif + }; + +#if CUDA_VERSION >= 9000 + // NOTES: To use Tensor Core, we should change the cublas config, + // but the cublas may be hold by multi-thread. + dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); +#else + cublas_call(); #endif } }; @@ -207,10 +223,9 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, CUDA_R_32F, N); } else { #endif // CUDA_VERSION >= 8000 - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, - lda, &beta, C, N); - }); + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, + &alpha, B, ldb, A, lda, &beta, C, N); #if CUDA_VERSION >= 8000 } @@ -251,12 +266,9 @@ inline void Blas::GEMM( CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, - &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, - N); - }); + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, + N, M, K, &h_alpha, h_B, ldb, h_A, lda, + &h_beta, h_C, N); #endif // CUDA_VERSION >= 8000 } @@ -280,10 +292,8 @@ void Blas::GEMM(bool transA, bool transB, int M, } else { #endif // CUDA_VERSION >= 8000 - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, - lda, &beta, C, ldc); - }); + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, + &alpha, B, ldb, A, lda, &beta, C, ldc); #if CUDA_VERSION >= 8000 } @@ -301,19 +311,16 @@ inline void Blas::GEMM( cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransB = transB ? 
CUBLAS_OP_T : CUBLAS_OP_N; - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, - B, ldb, A, lda, &beta, C, ldc); - }); + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, A, lda, &beta, C, + ldc); } template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); - }); + CUBlas::AXPY(context_.cublas_handle(), n, &alpha, x, 1, y, 1); } template <> @@ -323,9 +330,8 @@ void Blas::GEMV(bool trans_a, int M, int N, T beta, T *C) const { cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); - }); + CUBlas::GEMV(context_.cublas_handle(), cuTransA, N, M, &alpha, A, N, B, 1, + &beta, C, 1); } template <> @@ -347,28 +353,28 @@ void Blas::BatchedGEMM( #if CUDA_VERSION >= 9010 if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = context_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); - - context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + auto cublas_call = [&]() { + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = platform::TensorCoreAvailable(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); + PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx( - handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_32F, ldb, - strideB, A, CUDA_R_32F, lda, strideA, &beta, C, CUDA_R_32F, ldc, - strideC, batchCount, CUDA_R_32F, algo)); - }); + context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, strideB, A, CUDA_R_32F, lda, strideA, &beta, C, + CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, algo)); + }; + auto &dev_ctx = const_cast(context_); + dev_ctx.CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); } else { #endif // CUDA_VERSION >= 9010 - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha, - B, ldb, strideB, A, lda, strideA, &beta, C, - ldc, strideC, batchCount); - }); + CUBlas::GEMM_STRIDED_BATCH(context_.cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, strideB, A, lda, + strideA, &beta, C, ldc, strideC, batchCount); #if CUDA_VERSION >= 9010 } diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h deleted file mode 100644 index 122de72e15..0000000000 --- a/paddle/fluid/platform/cuda_helper.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include // NOLINT - -#include "paddle/fluid/platform/dynload/cublas.h" -#include "paddle/fluid/platform/macros.h" - -#if CUDA_VERSION < 9000 -enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 }; -#endif - -namespace paddle { -namespace platform { - -class CublasHandleHolder { - public: - CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) { - PADDLE_ENFORCE(dynload::cublasCreate(&handle_)); - PADDLE_ENFORCE(dynload::cublasSetStream(handle_, stream)); -#if CUDA_VERSION >= 9000 - if (math_type == CUBLAS_TENSOR_OP_MATH) { - PADDLE_ENFORCE( - dynload::cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH)); - } -#endif - } - - ~CublasHandleHolder() { PADDLE_ENFORCE(dynload::cublasDestroy(handle_)); } - - template - inline void Call(Callback &&callback) const { - std::lock_guard guard(mtx_); - callback(handle_); - } - - private: - DISABLE_COPY_AND_ASSIGN(CublasHandleHolder); - - cublasHandle_t handle_; - mutable std::mutex mtx_; -}; - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index be7f4949d6..022afb686b 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -245,15 +245,8 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_->Reinitialize(&stream_, place); eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); - cublas_handle_.reset(new CublasHandleHolder(stream_, CUBLAS_DEFAULT_MATH)); - - if (TensorCoreAvailable()) { -#if CUDA_VERSION >= 9000 - cublas_tensor_core_handle_.reset( - new CublasHandleHolder(stream_, CUBLAS_TENSOR_OP_MATH)); -#endif - } - + PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); + PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); if (dynload::HasCUDNN()) { cudnn_holder_.reset(new CudnnHolder(&stream_, place)); } @@ -313,8 +306,7 @@ CUDADeviceContext::~CUDADeviceContext() { SetDeviceId(place_.device); Wait(); WaitStreamCallback(); - cublas_handle_.reset(); - cublas_tensor_core_handle_.reset(); + PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); eigen_stream_.reset(); eigen_device_.reset(); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); @@ -343,8 +335,8 @@ Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { return eigen_device_.get(); } -bool CUDADeviceContext::tensor_core_available() const { - return cublas_tensor_core_handle_ != nullptr; +cublasHandle_t CUDADeviceContext::cublas_handle() const { + return cublas_handle_; } cudnnHandle_t CUDADeviceContext::cudnn_handle() const { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index c81d17380c..7e87580189 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -20,7 +20,6 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/temporary_allocator.h" #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/gpu_info.h" @@ -210,6 +209,39 @@ class CudnnWorkspaceHandle { std::unique_ptr> guard_; }; +#if CUDA_VERSION >= 9000 +class ScopedCublasMathMode { + public: + ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode) + : handle_(handle) { + need_reset = false; + PADDLE_ENFORCE( + platform::dynload::cublasGetMathMode(handle_, &old_math_mode_), + "Failed to get old cublas math mode"); + if (old_math_mode_ != new_math_mode) { + PADDLE_ENFORCE( + platform::dynload::cublasSetMathMode(handle_, new_math_mode), + "Failed to set old cublas math mode"); + need_reset = true; + } + } + + ~ScopedCublasMathMode() { + if (need_reset) { + PADDLE_ENFORCE( + platform::dynload::cublasSetMathMode(handle_, old_math_mode_), + "Failed to set old cublas math mode"); + } + } + + private: + cublasHandle_t handle_; + cublasMath_t old_math_mode_; + bool need_reset; +}; + +#endif + class CUDADeviceContext : public DeviceContext { public: explicit CUDADeviceContext(CUDAPlace place); @@ -230,25 +262,8 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return eigen device in the device context. */ Eigen::GpuDevice* eigen_device() const; - /*! \brief Call cublas function safely. */ - template - inline void CublasCall(Callback&& callback) const { - cublas_handle_->Call(std::forward(callback)); - } - - /*! \brief Check whether tensor core is supported */ - bool tensor_core_available() const; - - /*! \brief Call cublas function with Tensor Core safely. If - Tensor Core is not available, use DEFAULT_MATH instead. */ - template - inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const { - if (cublas_tensor_core_handle_) { - cublas_tensor_core_handle_->Call(std::forward(callback)); - } else { - cublas_handle_->Call(std::forward(callback)); - } - } + /*! \brief Return cublas handle in the device context. */ + cublasHandle_t cublas_handle() const; /*! \brief Return cudnn handle in the device context. */ cudnnHandle_t cudnn_handle() const; @@ -267,6 +282,7 @@ class CUDADeviceContext : public DeviceContext { template void RecordEvent(cudaEvent_t ev, Callback callback) { + std::lock_guard guard(mtx_); callback(); PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); } @@ -278,6 +294,18 @@ class CUDADeviceContext : public DeviceContext { void WaitStreamCallback() const { callback_manager_->Wait(); } +#if CUDA_VERSION >= 9000 + /*! \brief CublasCall may need to change cublas's config, + * but the cublas may be hold by multi-thread, so we should + * add lock here. 
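+   * The lock used below is cublas_mtx_, and ScopedCublasMathMode restores
+   * the previous math mode when the call returns.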
*/ + template + void CublasCall(Callback callback, cublasMath_t new_math) { + std::lock_guard guard(cublas_mtx_); + ScopedCublasMathMode scoped_cublas_math(cublas_handle_, new_math); + callback(); + } +#endif + private: CUDAPlace place_; @@ -285,9 +313,7 @@ class CUDADeviceContext : public DeviceContext { std::unique_ptr eigen_stream_; std::unique_ptr cudnn_holder_; cudaStream_t stream_; - - std::unique_ptr cublas_handle_; - std::unique_ptr cublas_tensor_core_handle_; + cublasHandle_t cublas_handle_; int compute_capability_; int runtime_version_; @@ -295,10 +321,12 @@ class CUDADeviceContext : public DeviceContext { int multi_process_; int max_threads_per_mp_; + mutable std::mutex mtx_; + // StreamCallbackManager is thread-safe std::unique_ptr callback_manager_; - DISABLE_COPY_AND_ASSIGN(CUDADeviceContext); + mutable std::mutex cublas_mtx_; }; template <> diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index 5b3aa98efb..171d2979a0 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -43,6 +43,9 @@ TEST(Device, CUDADeviceContext) { ASSERT_NE(nullptr, gpu_device); cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); ASSERT_NE(nullptr, cudnn_handle); + cublasHandle_t cublas_handle = device_context->cublas_handle(); + ASSERT_NE(nullptr, cublas_handle); + ASSERT_NE(nullptr, device_context->stream()); delete device_context; } } From b1ea335f60c4e7270231c4ff33d3bf334b01ba9d Mon Sep 17 00:00:00 2001 From: chengduo Date: Mon, 7 Jan 2019 21:11:54 -0600 Subject: [PATCH 159/197] add sm_75 support (#15198) test=develop --- cmake/cuda.cmake | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 5be7be6413..10ecdf0ea8 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -2,7 +2,7 @@ if(NOT WITH_GPU) return() endif() -set(paddle_known_gpu_archs "30 35 50 52 60 61 70") +set(paddle_known_gpu_archs "30 35 50 52 60 61 70 75") set(paddle_known_gpu_archs7 "30 35 50 52") set(paddle_known_gpu_archs8 "30 35 50 52 60 61") @@ -59,7 +59,7 @@ endfunction() # select_nvcc_arch_flags(out_variable) function(select_nvcc_arch_flags out_variable) # List of arch names - set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual") + set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual") set(archs_name_default "All") if(NOT CMAKE_CROSSCOMPILING) list(APPEND archs_names "Auto") @@ -93,6 +93,8 @@ function(select_nvcc_arch_flags out_variable) set(cuda_arch_bin "60 61") elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") set(cuda_arch_bin "70") + elseif(${CUDA_ARCH_NAME} STREQUAL "Turing") + set(cuda_arch_bin "75") elseif(${CUDA_ARCH_NAME} STREQUAL "All") set(cuda_arch_bin ${paddle_known_gpu_archs}) elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") From 7c7342bf125ef2859d1dd7628ad5a494ffe315b9 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 8 Jan 2019 03:33:13 +0000 Subject: [PATCH 160/197] fix scope.var() test=develop --- paddle/fluid/framework/scope.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index a5742dbd3d..9536185609 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -87,11 +87,12 @@ Variable* Scope::Var(const std::string& name) { } Variable* Scope::Var(std::string* name) { - auto new_name = string::Sprintf("%p.%d", this, vars_.size()); + SCOPE_VARS_WRITER_LOCK + auto new_name = 
std::to_string(reinterpret_cast(this)) + "." + + std::to_string(vars_.size()); if (name != nullptr) { *name = new_name; } - SCOPE_VARS_WRITER_LOCK return VarInternal(new_name); } From ed409ac9f4fa57dbf8785f24dde4b55714555fc4 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 8 Jan 2019 03:37:59 +0000 Subject: [PATCH 161/197] Revert "Revert "Remove op handle lock"" test=develop --- paddle/fluid/operators/math/blas_impl.cu.h | 134 +++++++++---------- paddle/fluid/platform/cuda_helper.h | 58 ++++++++ paddle/fluid/platform/device_context.cc | 18 ++- paddle/fluid/platform/device_context.h | 76 ++++------- paddle/fluid/platform/device_context_test.cu | 3 - 5 files changed, 159 insertions(+), 130 deletions(-) create mode 100644 paddle/fluid/platform/cuda_helper.h diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index d35073029a..58f7be12ce 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -62,27 +62,19 @@ struct CUBlas { cudaDataType_t Atype, int lda, const void *B, cudaDataType_t Btype, int ldb, const float *beta, void *C, cudaDataType_t Ctype, int ldc) { - // Because the gcc 4.8 doesn't expand template parameter pack that - // appears in a lambda-expression, I can not use template parameter pack - // here. - auto cublas_call = [&]() { +// Because the gcc 4.8 doesn't expand template parameter pack that +// appears in a lambda-expression, I can not use template parameter pack +// here. #if CUDA_VERSION >= 8000 - VLOG(5) << "use_tensor_op_math: " - << (platform::TensorCoreAvailable() ? "True" : "False"); + VLOG(5) << "use_tensor_op_math: " + << (dev_ctx->tensor_core_available() ? "True" : "False"); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE(platform::dynload::cublasSgemmEx( - dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, - lda, B, Btype, ldb, beta, C, Ctype, ldc)); + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc)); + }); #else - PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); -#endif - }; - -#if CUDA_VERSION >= 9000 - // NOTES: To use Tensor Core, we should change the cublas config, - // but the cublas may be hold by multi-thread. - dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); -#else - cublas_call(); + PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); #endif } }; @@ -170,32 +162,24 @@ struct CUBlas { cudaDataType_t Btype, int ldb, const void *beta, void *C, cudaDataType_t Ctype, int ldc, cudaDataType_t computeType) { - auto cublas_call = [&]() { #if CUDA_VERSION >= 8000 - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = platform::TensorCoreAvailable(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? 
"True" : "False"); #endif // CUDA_VERSION >= 9000 + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE(platform::dynload::cublasGemmEx( - dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, - lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, computeType, algo)); + }); #else - PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); -#endif - }; - -#if CUDA_VERSION >= 9000 - // NOTES: To use Tensor Core, we should change the cublas config, - // but the cublas may be hold by multi-thread. - dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); -#else - cublas_call(); + PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); #endif } }; @@ -223,9 +207,10 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, CUDA_R_32F, N); } else { #endif // CUDA_VERSION >= 8000 - - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, - &alpha, B, ldb, A, lda, &beta, C, N); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N); + }); #if CUDA_VERSION >= 8000 } @@ -266,9 +251,12 @@ inline void Blas::GEMM( CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, - N, M, K, &h_alpha, h_B, ldb, h_A, lda, - &h_beta, h_C, N); + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, + &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, + N); + }); #endif // CUDA_VERSION >= 8000 } @@ -292,8 +280,10 @@ void Blas::GEMM(bool transA, bool transB, int M, } else { #endif // CUDA_VERSION >= 8000 - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, - &alpha, B, ldb, A, lda, &beta, C, ldc); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc); + }); #if CUDA_VERSION >= 8000 } @@ -311,16 +301,19 @@ inline void Blas::GEMM( cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, - ldc); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, A, lda, &beta, C, ldc); + }); } template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { - CUBlas::AXPY(context_.cublas_handle(), n, &alpha, x, 1, y, 1); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); + }); } template <> @@ -330,8 +323,9 @@ void Blas::GEMV(bool trans_a, int M, int N, T beta, T *C) const { cublasOperation_t cuTransA = !trans_a ? 
CUBLAS_OP_T : CUBLAS_OP_N; - CUBlas::GEMV(context_.cublas_handle(), cuTransA, N, M, &alpha, A, N, B, 1, - &beta, C, 1); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); + }); } template <> @@ -353,28 +347,28 @@ void Blas::BatchedGEMM( #if CUDA_VERSION >= 9010 if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { - auto cublas_call = [&]() { - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = platform::TensorCoreAvailable(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); - + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx( - context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, - CUDA_R_32F, ldb, strideB, A, CUDA_R_32F, lda, strideA, &beta, C, - CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, algo)); - }; - auto &dev_ctx = const_cast(context_); - dev_ctx.CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); + handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_32F, ldb, + strideB, A, CUDA_R_32F, lda, strideA, &beta, C, CUDA_R_32F, ldc, + strideC, batchCount, CUDA_R_32F, algo)); + }); } else { #endif // CUDA_VERSION >= 9010 - CUBlas::GEMM_STRIDED_BATCH(context_.cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, strideB, A, lda, - strideA, &beta, C, ldc, strideC, batchCount); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, strideB, A, lda, strideA, &beta, C, + ldc, strideC, batchCount); + }); #if CUDA_VERSION >= 9010 } diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h new file mode 100644 index 0000000000..122de72e15 --- /dev/null +++ b/paddle/fluid/platform/cuda_helper.h @@ -0,0 +1,58 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
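+//
+// CublasHandleHolder (defined below) owns a cublasHandle_t created on one
+// stream; on CUDA >= 9.0 it can enable CUBLAS_TENSOR_OP_MATH, and Call()
+// invokes its callback while holding a mutex so the handle is never used
+// from two threads at once.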
+ +#pragma once + +#include // NOLINT + +#include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/macros.h" + +#if CUDA_VERSION < 9000 +enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 }; +#endif + +namespace paddle { +namespace platform { + +class CublasHandleHolder { + public: + CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) { + PADDLE_ENFORCE(dynload::cublasCreate(&handle_)); + PADDLE_ENFORCE(dynload::cublasSetStream(handle_, stream)); +#if CUDA_VERSION >= 9000 + if (math_type == CUBLAS_TENSOR_OP_MATH) { + PADDLE_ENFORCE( + dynload::cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH)); + } +#endif + } + + ~CublasHandleHolder() { PADDLE_ENFORCE(dynload::cublasDestroy(handle_)); } + + template + inline void Call(Callback &&callback) const { + std::lock_guard guard(mtx_); + callback(handle_); + } + + private: + DISABLE_COPY_AND_ASSIGN(CublasHandleHolder); + + cublasHandle_t handle_; + mutable std::mutex mtx_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 022afb686b..be7f4949d6 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -245,8 +245,15 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_->Reinitialize(&stream_, place); eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); - PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); - PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); + cublas_handle_.reset(new CublasHandleHolder(stream_, CUBLAS_DEFAULT_MATH)); + + if (TensorCoreAvailable()) { +#if CUDA_VERSION >= 9000 + cublas_tensor_core_handle_.reset( + new CublasHandleHolder(stream_, CUBLAS_TENSOR_OP_MATH)); +#endif + } + if (dynload::HasCUDNN()) { cudnn_holder_.reset(new CudnnHolder(&stream_, place)); } @@ -306,7 +313,8 @@ CUDADeviceContext::~CUDADeviceContext() { SetDeviceId(place_.device); Wait(); WaitStreamCallback(); - PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); + cublas_handle_.reset(); + cublas_tensor_core_handle_.reset(); eigen_stream_.reset(); eigen_device_.reset(); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); @@ -335,8 +343,8 @@ Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { return eigen_device_.get(); } -cublasHandle_t CUDADeviceContext::cublas_handle() const { - return cublas_handle_; +bool CUDADeviceContext::tensor_core_available() const { + return cublas_tensor_core_handle_ != nullptr; } cudnnHandle_t CUDADeviceContext::cudnn_handle() const { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 7e87580189..c81d17380c 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/temporary_allocator.h" #ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/gpu_info.h" @@ -209,39 +210,6 @@ class CudnnWorkspaceHandle { std::unique_ptr> guard_; }; -#if CUDA_VERSION >= 9000 -class ScopedCublasMathMode { - public: - ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode) - : handle_(handle) { - need_reset = false; - PADDLE_ENFORCE( - platform::dynload::cublasGetMathMode(handle_, &old_math_mode_), - "Failed to get old cublas math mode"); - if (old_math_mode_ != new_math_mode) { - PADDLE_ENFORCE( - platform::dynload::cublasSetMathMode(handle_, new_math_mode), - "Failed to set old cublas math mode"); - need_reset = true; - } - } - - ~ScopedCublasMathMode() { - if (need_reset) { - PADDLE_ENFORCE( - platform::dynload::cublasSetMathMode(handle_, old_math_mode_), - "Failed to set old cublas math mode"); - } - } - - private: - cublasHandle_t handle_; - cublasMath_t old_math_mode_; - bool need_reset; -}; - -#endif - class CUDADeviceContext : public DeviceContext { public: explicit CUDADeviceContext(CUDAPlace place); @@ -262,8 +230,25 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return eigen device in the device context. */ Eigen::GpuDevice* eigen_device() const; - /*! \brief Return cublas handle in the device context. */ - cublasHandle_t cublas_handle() const; + /*! \brief Call cublas function safely. */ + template + inline void CublasCall(Callback&& callback) const { + cublas_handle_->Call(std::forward(callback)); + } + + /*! \brief Check whether tensor core is supported */ + bool tensor_core_available() const; + + /*! \brief Call cublas function with Tensor Core safely. If + Tensor Core is not available, use DEFAULT_MATH instead. */ + template + inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const { + if (cublas_tensor_core_handle_) { + cublas_tensor_core_handle_->Call(std::forward(callback)); + } else { + cublas_handle_->Call(std::forward(callback)); + } + } /*! \brief Return cudnn handle in the device context. */ cudnnHandle_t cudnn_handle() const; @@ -282,7 +267,6 @@ class CUDADeviceContext : public DeviceContext { template void RecordEvent(cudaEvent_t ev, Callback callback) { - std::lock_guard guard(mtx_); callback(); PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); } @@ -294,18 +278,6 @@ class CUDADeviceContext : public DeviceContext { void WaitStreamCallback() const { callback_manager_->Wait(); } -#if CUDA_VERSION >= 9000 - /*! \brief CublasCall may need to change cublas's config, - * but the cublas may be hold by multi-thread, so we should - * add lock here. 
*/ - template - void CublasCall(Callback callback, cublasMath_t new_math) { - std::lock_guard guard(cublas_mtx_); - ScopedCublasMathMode scoped_cublas_math(cublas_handle_, new_math); - callback(); - } -#endif - private: CUDAPlace place_; @@ -313,7 +285,9 @@ class CUDADeviceContext : public DeviceContext { std::unique_ptr eigen_stream_; std::unique_ptr cudnn_holder_; cudaStream_t stream_; - cublasHandle_t cublas_handle_; + + std::unique_ptr cublas_handle_; + std::unique_ptr cublas_tensor_core_handle_; int compute_capability_; int runtime_version_; @@ -321,12 +295,10 @@ class CUDADeviceContext : public DeviceContext { int multi_process_; int max_threads_per_mp_; - mutable std::mutex mtx_; - // StreamCallbackManager is thread-safe std::unique_ptr callback_manager_; - mutable std::mutex cublas_mtx_; + DISABLE_COPY_AND_ASSIGN(CUDADeviceContext); }; template <> diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index 171d2979a0..5b3aa98efb 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -43,9 +43,6 @@ TEST(Device, CUDADeviceContext) { ASSERT_NE(nullptr, gpu_device); cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); ASSERT_NE(nullptr, cudnn_handle); - cublasHandle_t cublas_handle = device_context->cublas_handle(); - ASSERT_NE(nullptr, cublas_handle); - ASSERT_NE(nullptr, device_context->stream()); delete device_context; } } From 49c31e5da409f9af01182ea74a91d605e3ca9747 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 7 Jan 2019 20:31:20 +0800 Subject: [PATCH 162/197] disable mkl for mac test=develop --- CMakeLists.txt | 5 +++++ cmake/external/mklml.cmake | 30 +++++++++++++++--------------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ba8554456..66dcef0013 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,6 +126,11 @@ if(ANDROID OR IOS) add_definitions(-DPADDLE_MOBILE_INFERENCE) endif() +if (APPLE) + set(WITH_MKL OFF CACHE STRING + "Disable MKL for building on mac" FORCE) +endif() + if (WIN32) set(WITH_DISTRIBUTE OFF CACHE STRING "Disable DISTRIBUTE when compiling for Windows" FORCE) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index c94878b6c7..43322a257a 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -16,6 +16,12 @@ IF(NOT ${WITH_MKLML}) return() ENDIF(NOT ${WITH_MKLML}) +IF(APPLE) + MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. 
Force WITH_MKLML=OFF.") + SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE) + return() +ENDIF() + INCLUDE(ExternalProject) SET(MKLML_DST_DIR "mklml") SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") @@ -23,29 +29,23 @@ SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}) SET(MKLML_ROOT ${MKLML_INSTALL_DIR}) SET(MKLML_INC_DIR ${MKLML_ROOT}/include) SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) -if(WIN32) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") + +SET(TIME_VERSION "2019.0.1.20181227") +IF(WIN32) + SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE) + SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE) SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) -else() +ELSE() + SET(MKLML_VER "mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE) + SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) -endif() -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") - -SET(TIME_VERSION "2019.0.1.20181227") -if(WIN32) - SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE) - SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE) -elseif(APPLE) - SET(MKLML_VER "mklml_mac_${TIME_VERSION}" CACHE STRING "" FORCE) - SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) -else() - SET(MKLML_VER "mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE) - SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) ENDIF() SET(MKLML_PROJECT "extern_mklml") From 7b7d0d0caf85fc2d104ac285cfa367ff46490fa1 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 8 Jan 2019 13:30:09 +0800 Subject: [PATCH 163/197] Change hash function back test=develop --- paddle/fluid/operators/hash_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/hash_op.h b/paddle/fluid/operators/hash_op.h index 1ed3ffe9aa..9781bb0f45 100644 --- a/paddle/fluid/operators/hash_op.h +++ b/paddle/fluid/operators/hash_op.h @@ -45,7 +45,7 @@ class HashKerel : public framework::OpKernel { for (int idx = 0; idx < seq_length; ++idx) { for (int ihash = 0; ihash != num_hash; ++ihash) { output[idx * num_hash + ihash] = - XXH32(input, sizeof(int) * last_dim, ihash) % mod_by; + XXH64(input, sizeof(int) * last_dim, ihash) % mod_by; } input += last_dim; } From d09d6eadc0c875cd7f703593d37fb46216ca4400 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Tue, 8 Jan 2019 15:05:00 +0800 Subject: [PATCH 164/197] make inference api work with Doxygen (#15195) --- .../fluid/inference/api/analysis_predictor.h | 7 +- paddle/fluid/inference/api/api_impl.h | 1 - .../inference/api/paddle_analysis_config.h | 103 +++++++++- paddle/fluid/inference/api/paddle_api.h | 176 +++++++++++------- .../fluid/inference/api/paddle_pass_builder.h | 37 ++-- 5 files changed, 227 insertions(+), 97 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 12ecb7c15e..a6e126c5d5 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ 
b/paddle/fluid/inference/api/analysis_predictor.h
@@ -35,8 +35,11 @@ using framework::proto::ProgramDesc;
 using framework::NaiveExecutor;
 using contrib::AnalysisConfig;
 
-/* This predictor is based on the original native predictor with IR and Analysis
- * support. It will optimize IR and Parameters in the runtime.
+/** \brief This predictor is based on the original native predictor with IR and
+ * Analysis support.
+ *
+ * It will optimize IR and Parameters in the runtime.
+ *
  * TODO(Superjomn) Replace the Navive predictor?
  */
 class AnalysisPredictor : public PaddlePredictor {
diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h
index c1fcd198cc..d2133bd467 100644
--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include
 #include
 #include
-
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 2d61098f93..ae6ac69854 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -19,6 +19,8 @@
 #include
 #include
 
+/*! \file */
+
 // Here we include some header files with relative paths, for that in deploy,
 // the abstract path of this header file will be changed.
 #include "paddle_api.h"  // NOLINT
@@ -41,49 +43,125 @@ struct AnalysisConfig {
   explicit AnalysisConfig(const std::string& prog_file,
                           const std::string& params_file);
 
-  // Model path related.
+  /** Set model with a directory.
+   */
   void SetModel(const std::string& model_dir) { model_dir_ = model_dir; }
+  /** Set model with two specific paths for program and parameters.
+   */
   void SetModel(const std::string& prog_file_path,
                 const std::string& params_file_path);
+  /** Set program file path.
+   */
   void SetProgFile(const std::string& x) { prog_file_ = x; }
+  /** Set parameter composed file path.
+   */
   void SetParamsFile(const std::string& x) { params_file_ = x; }
+  /** Get the model directory path.
+   */
   const std::string& model_dir() const { return model_dir_; }
+  /** Get the program file path.
+   */
   const std::string& prog_file() const { return prog_file_; }
+  /** Get the composed parameters file.
+   */
   const std::string& params_file() const { return params_file_; }
 
   // GPU related.
+
+  /**
+   * \brief Turn on GPU.
+   * @param memory_pool_init_size_mb initial size of the GPU memory pool in MB.
+   * @param device_id the GPU card to use (default is 0).
+   */
   void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0);
+  /** Turn off the GPU.
+   */
   void DisableGpu();
+  /** A bool state telling whether the GPU is turned on.
+   */
   bool use_gpu() const { return use_gpu_; }
+  /** Get the GPU device id.
+   */
   int gpu_device_id() const { return device_id_; }
+  /** Get the initial size in MB of the GPU memory pool.
+   */
   int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; }
+  /** Get the proportion of the initial memory pool size compared to the
+   * device.
+   */
   float fraction_of_gpu_memory_for_pool() const;
 
-  // Determine whether to perform graph optimization.
+  /** \brief Control whether to perform IR graph optimization.
+   *
+   * If turned off, the AnalysisConfig will act just like a NativeConfig.
+   */
   void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; }
+  /** A boolean state telling whether the IR graph optimization is activated.
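+   *
+   * A short illustrative sequence; the file paths are assumptions for this
+   * example only, not part of the patch:
+   * \code
+   * contrib::AnalysisConfig cfg("./prog_file", "./params_file");
+   * cfg.SwitchIrOptim(false);  // behave like a NativeConfig
+   * \endcode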
+   */
   bool ir_optim() const { return enable_ir_optim_; }
 
+  /** \brief INTERNAL Determine whether to use the feed and fetch operators.
+   * Just for internal development, not stable yet.
+   * When ZeroCopyTensor is used, this should be turned off.
+   */
   void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; }
+  /** A boolean state telling whether to use the feed and fetch operators.
+   */
   bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; }
 
+  /** \brief Control whether to specify the inputs' names.
+   *
+   * The PaddleTensor type has a `name` member, assign it with the corresponding
+   * variable name. This is used only when the input PaddleTensors passed to the
+   * `PaddlePredictor.Run(...)` cannot follow the order in the training phase.
+   */
   void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; }
+
+  /** A boolean state telling whether the input PaddleTensor names specified
+   * should be used to reorder the inputs in `PaddlePredictor.Run(...)`.
+   */
   bool specify_input_name() const { return specify_input_name_; }
 
+  /**
+   * \brief Turn on the TensorRT engine.
+   *
+   * The TensorRT engine will accelerate some subgraphs in the original Fluid
+   * computation graph. In some models such as ResNet50, GoogleNet and so on,
+   * it gains significant performance acceleration.
+   *
+   * @param workspace_size the memory size (in bytes) used for the TensorRT
+   * workspace.
+   * @param max_batch_size the maximum batch size of this prediction task;
+   * set it as small as possible to avoid performance loss.
+   * @param min_subgraph_size the minimum TensorRT subgraph size needed; if a
+   * subgraph is smaller than this, it will not be transferred to the TensorRT
+   * engine.
+   */
   void EnableTensorRtEngine(int workspace_size = 1 << 20,
                             int max_batch_size = 1, int min_subgraph_size = 3);
+  /** A boolean state telling whether the TensorRT engine is used.
+   */
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }
 
+  /** Control whether to debug the IR graph analysis phase.
+   */
   void SwitchIrDebug(int x = true) { ir_debug_ = x; }
 
+  /** Turn on MKLDNN.
+   */
   void EnableMKLDNN();
+  /** A boolean state telling whether to use MKLDNN.
+   */
   bool mkldnn_enabled() const { return use_mkldnn_; }
 
-  // Set and get the number of cpu math library threads.
+  /** Set and get the number of cpu math library threads.
+   */
   void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads);
+  /** An int state telling how many threads are used in the CPU math library.
+   */
   int cpu_math_library_num_threads() const {
     return cpu_math_library_num_threads_;
   }
 
+  /** Transform the AnalysisConfig to NativeConfig.
+   */
   NativeConfig ToNativeConfig() const {
     NativeConfig config;
     config.model_dir = model_dir_;
     config.prog_file = prog_file_;
     config.param_file = params_file_;
     config.use_gpu = use_gpu_;
     config.device = device_id_;
     config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
     config.specify_input_name = specify_input_name_;
     return config;
   }
+  /** Specify the operator type list to use MKLDNN acceleration.
+   * @param op_list the operator type list.
+   */
   void SetMKLDNNOp(std::unordered_set<std::string> op_list) {
     mkldnn_enabled_op_types_ = op_list;
   }
 
-  // Specify the memory buffer of program and parameter
+  /** Specify the memory buffer of program and parameters.
+   * @param prog_buffer the memory buffer of the program.
+   * @param prog_buffer_size the size of the data.
+   * @param params_buffer the memory buffer of the composed parameters file.
+   * @param params_buffer_size the size of the composed parameters data.
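+   *
+   * A minimal illustrative call; the file-loading helper is hypothetical and
+   * only sketches how the two buffers might be obtained:
+   * \code
+   * std::string prog = ReadFileToString("model/__model__");     // hypothetical
+   * std::string params = ReadFileToString("model/__params__");  // hypothetical
+   * config.SetModelBuffer(prog.data(), prog.size(),
+   *                       params.data(), params.size());
+   * \endcode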
+   */
   void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size,
-                      const char* program_buffer, size_t program_buffer_size);
+                      const char* params_buffer, size_t params_buffer_size);
+  /** A boolean state telling whether the model is set from the CPU memory.
+   */
   bool model_from_memory() const { return model_from_memory_; }
 
   friend class ::paddle::AnalysisPredictor;
 
-  // NOTE just for developer, not an official API, easily to be broken.
-  // Get a pass builder for customize the passes in IR analysis phase.
+  /** NOTE just for developers, not an official API, easily broken.
+   * Get a pass builder for customizing the passes in the IR analysis phase.
+   */
   PassStrategy* pass_builder() const;
 
  protected:
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index 1513a4b3b4..3642f36127 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -13,61 +13,76 @@
 // limitations under the License.
 
 #pragma once
+/*! \file paddle_api.h
+ */
+
 #include
 #include
 #include
 #include
 
+/*! \namespace paddle
+ */
 namespace paddle {
 
-// Data type.
+/** Paddle data type.
+ */
 enum PaddleDType {
   FLOAT32,
   INT64,
   // TODO(Superjomn) support more data types if needed.
 };
 
-/*
- * Memory menage for PaddleTensor.
- * The PaddleBuf holds a buffer for data input or output. The memory can be
- * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
- * should be reused for better performance.
+/**
+ *\brief Memory manager for PaddleTensor.
 *
- * For user allocated memory, the following API can be used:
- * - PaddleBuf(void* data, size_t length) to set an external memory by
- * specifying
- * the memory address and length.
- * - Reset(void* data, size_t length) to reset the PaddleBuf with an external
- * memory.
- * ATTENTION, for user allocated memory, deallocation should be done by users
- * externally after the program finished. The PaddleBuf won't do any allocation
- * or deallocation.
+ *The PaddleBuf holds a buffer for data input or output. The memory can be
+ *allocated by the user or by PaddleBuf itself, but in any case, the PaddleBuf
+ *should be reused for better performance.
 *
- * To have the PaddleBuf allocate and manage the memory:
- * - PaddleBuf(size_t length) will allocate a memory of size `length`.
- * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION
- * if the allocated memory is larger than `length`, nothing will done.
+ *For user allocated memory, the following API can be used:
+ *- PaddleBuf(void* data, size_t length) to set an external memory by
+ *specifying
+ * the memory address and length.
+ *- Reset(void* data, size_t length) to reset the PaddleBuf with an external
+ *memory.
+ *ATTENTION, for user allocated memory, deallocation should be done by users
+ *externally after the program finishes. The PaddleBuf won't do any allocation
+ *or deallocation.
+ *
+ *To have the PaddleBuf allocate and manage the memory:
+ *- PaddleBuf(size_t length) will allocate a memory of size `length`.
+ *- Resize(size_t length) resizes the memory to no less than `length`. ATTENTION:
+ * if the allocated memory is larger than `length`, nothing will be done.
 */
 class PaddleBuf {
  public:
-  // PaddleBuf allocate memory internally, and manage it.
+  /** PaddleBuf allocates memory internally, and manages it.
+   */
   explicit PaddleBuf(size_t length)
       : data_(new char[length]), length_(length), memory_owned_(true) {}
-  // Set external memory, the PaddleBuf won't manage it.
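Stepping out of the hunk for a moment: the class comment above distinguishes externally owned from internally owned memory. A minimal sketch of both modes, using only the PaddleBuf API shown in this diff (the array contents and sizes are illustrative only):

    #include "paddle/fluid/inference/api/paddle_api.h"

    // Illustrative only: exercises both ownership modes documented above.
    void PaddleBufModes() {
      float external[4] = {0.f, 1.f, 2.f, 3.f};                // caller-owned data
      paddle::PaddleBuf borrowed(external, sizeof(external));  // external, not freed
      paddle::PaddleBuf owned(1024);   // PaddleBuf allocates and later frees this
      owned.Resize(2048);              // grow to at least 2048 bytes
      borrowed.Reset(external, sizeof(external));  // rebind external memory
    }

The remaining PaddleBuf hunks continue below.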
+  /** Set external memory, the PaddleBuf won't manage it.
+   */
   PaddleBuf(void* data, size_t length)
       : data_(data), length_(length), memory_owned_{false} {}
-  // Copy only available when memory is managed externally.
+  /** Copy only available when memory is managed externally.
+   */
   explicit PaddleBuf(const PaddleBuf&);
-  // Resize the memory.
+  /** Resize the memory.
+   */
   void Resize(size_t length);
-  // Reset to external memory, with address and length set.
+  /** Reset to external memory, with address and length set.
+   */
   void Reset(void* data, size_t length);
-  // Tell whether the buffer is empty.
+  /** Tell whether the buffer is empty.
+   */
   bool empty() const { return length_ == 0; }
-  // Get the memory address.
+  /** Get the memory address.
+   */
   void* data() const { return data_; }
-  // Get the memory length.
+  /** Get the memory length.
+   */
   size_t length() const { return length_; }
 
   ~PaddleBuf() { Free(); }
@@ -83,7 +98,8 @@ class PaddleBuf {
   bool memory_owned_{true};
 };
 
-// Basic input and output data structure for PaddlePredictor.
+/** Basic input and output data structure for PaddlePredictor.
+ */
 struct PaddleTensor {
   PaddleTensor() = default;
   std::string name;  // variable name.
@@ -94,19 +110,22 @@ struct PaddleTensor {
 };
 
 enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
-// Tensor without copy, currently only supports AnalysisPredictor.
+/** Tensor without copy, currently only supports AnalysisPredictor.
+ */
 class ZeroCopyTensor {
  public:
   void Reshape(const std::vector<int>& shape);
 
-  // Get the memory in CPU or GPU with specific data type, should Reshape first
-  // to tell the data size.
-  // Once can directly call this data to feed the data.
-  // This is for write the input tensor.
+  /** Get the memory in CPU or GPU with a specific data type; call Reshape first
+   * to tell the data size.
+   * One can directly call this to feed the data.
+   * This is for writing the input tensor.
+   */
   template <typename T>
   T* mutable_data(PaddlePlace place);
-  // Get the memory directly, will return the place and memory size by pointer.
-  // This is for reading the output tensor.
+  /** Get the memory directly; it will return the place and memory size by
+   * pointer.
+   * This is for reading the output tensor.
+   */
   template <typename T>
   T* data(PaddlePlace* place, int* size) const;
@@ -128,8 +147,7 @@ class ZeroCopyTensor {
   void* scope_{nullptr};
 };
 
-/*
- * A simple Inference API for Paddle.
+/** A simple Inference API for Paddle.
 */
 class PaddlePredictor {
  public:
   PaddlePredictor(const PaddlePredictor&) = delete;
   PaddlePredictor& operator=(const PaddlePredictor&) = delete;
 
-  // Predict an record.
-  // The caller should be responsible for allocating and releasing the memory of
-  // `inputs`. `inputs` should be available until Run returns. Caller should be
-  // responsible for the output tensor's buffer, either allocated or passed from
-  // outside.
+  /** Predict a record.
+   * The caller should be responsible for allocating and releasing the memory of
+   * `inputs`. `inputs` should be available until Run returns. The caller should
+   * be responsible for the output tensor's buffer, either allocated or passed
+   * from outside.
+   */
   virtual bool Run(const std::vector<PaddleTensor>& inputs,
                    std::vector<PaddleTensor>* output_data,
                    int batch_size = -1) = 0;
 
-  // Zero copy input and output optimization.
+  /** Zero copy input and output optimization.
+   * Get the input or output tensors, and operate on their memory directly,
+   * without copy.
+   */
   virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
       const std::string& name) {
     return nullptr;
   }
@@ -160,16 +180,19 @@ class PaddlePredictor {
   }
   virtual bool ZeroCopyRun() { return false; }
 
-  // Clone a predictor that share the model weights, the Cloned predictor should
-  // be thread-safe.
+  /** Clone a predictor that shares the model weights; the cloned predictor
+   * should be thread-safe.
+   */
   virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
 
-  // Destroy the Predictor.
+  /** Destroy the Predictor.
+   */
   virtual ~PaddlePredictor() = default;
 
-  // The common configs for all the predictors.
+  /** The common configs for all the predictors.
+   */
   struct Config {
-    std::string model_dir;  // path to the model directory.
+    std::string model_dir;  /*!< path to the model directory. */
   };
 };
 
@@ -177,17 +200,21 @@ struct NativeConfig : public PaddlePredictor::Config {
   // GPU related fields.
   bool use_gpu{false};
   int device{0};
-  float fraction_of_gpu_memory{-1.f};  // Change to a float in (0,1] if needed.
+  float fraction_of_gpu_memory{
+      -1.f};  /*!< Change to a float in (0,1] if needed. */
 
   // Specify the exact path of program and parameter files.
   std::string prog_file;
   std::string param_file;
 
-  // Specify the variable's name of each input if input tensors don't follow the
-  // `feeds` and `fetches` of the phase `save_inference_model`.
+  /** Specify the variable's name of each input if the input tensors don't
+   * follow the `feeds` and `fetches` of the phase `save_inference_model`.
+   */
   bool specify_input_name{false};
 
-  // Set and get the number of cpu math library threads.
+  /** Set and get the number of cpu math library threads.
+   */
   void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) {
     cpu_math_library_num_threads_ = cpu_math_library_num_threads;
   }
@@ -201,28 +228,33 @@ struct NativeConfig : public PaddlePredictor::Config {
   int cpu_math_library_num_threads_{1};
 };
 
-// A factory to help create different predictors.
-//
-// Usage:
-//
-// NativeConfig config;
-// ...  // change the configs.
-// auto native_predictor = CreatePaddlePredictor<NativeConfig>(config);
-//
-// FOR EXTENSION DEVELOPER:
-// Different predictors are designated by config type. Similar configs can be
-// merged, but there shouldn't be a huge config containing different fields for
-// more than one kind of predictors.
+/*! \fn std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT&
+ * config);
+ *
+ * \brief A factory to help create different predictors.
+ *
+ * Usage:
+ *
+ * NativeConfig config;
+ * ...  // change the configs.
+ * auto native_predictor = CreatePaddlePredictor<NativeConfig>(config);
+ *
+ * FOR EXTENSION DEVELOPERS:
+ * Different predictors are designated by config type. Similar configs can be
+ * merged, but there shouldn't be a huge config containing different fields for
+ * more than one kind of predictor.
+ */
 template <typename ConfigT>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
 
-// NOTE The following APIs are too trivial, we will discard it in the following
-// versions.
+/** NOTE The following APIs are too trivial; we will discard them in the
+ * following versions.
+ */
 enum class PaddleEngineKind {
-  kNative = 0,         // Use the native Fluid facility.
-  kAutoMixedTensorRT,  // Automatically mix Fluid with TensorRT.
-  kAnalysis,           // More optimization.
-  kAnakin              // Use Anakin for inference, not mature yet.
+  kNative = 0,         /*!< Use the native Fluid facility. */
+  kAutoMixedTensorRT,  /*!< Automatically mix Fluid with TensorRT. */
+  kAnalysis,           /*!< More optimization. */
+  kAnakin              /*!< Use Anakin for inference, not mature yet. */
 };
 
 template
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index b4cbc40e0f..9337ae55b7 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -18,30 +18,39 @@
 #include
 #include
 
+/*! \file */
+
+/*! \namespace paddle */
 namespace paddle {
 
-/*
- * This is a pass builder based on string. It is part of inference API.
+
+/** This is a pass builder based on string. It is part of the inference API.
 */
 class PaddlePassBuilder {
  public:
   explicit PaddlePassBuilder(const std::vector<std::string> &passes)
       : passes_(passes) {}
 
+  /** Append a pass to the end of the passes. */
   void AppendPass(const std::string &pass_type);
 
+  /** Insert a pass at a specific position.
+   * @param idx the position to insert.
+   * @param pass_type the pass key.
+   */
   void InsertPass(size_t idx, const std::string &pass_type);
 
-  // Delete the `idx`-th pass.
+  /** Delete the `idx`-th pass. */
   void DeletePass(size_t idx);
 
-  // Delete all the passes that has type `pass_type`.
+  /** Delete all the passes that have type `pass_type`. */
   void DeletePass(const std::string &pass_type);
 
-  // Visualize the computation graph after each pass by generating a DOT
-  // language file, one can draw them with the Graphviz toolkit.
+  /** Visualize the computation graph after each pass by generating a DOT
+   * language file; one can draw them with the Graphviz toolkit.
+   */
   void TurnOnDebug();
 
-  // Human-readible information.
+  /** Human-readable information. */
   std::string DebugString();
 
   const std::vector<std::string> &AllPasses() const { return passes_; }
 
@@ -50,16 +59,16 @@ class PaddlePassBuilder {
   std::vector<std::string> passes_;
 };
 
-/*
- * Pass strategy to help control the IR passes.
+/** Pass strategy to help control the IR passes.
 */
 class PassStrategy : public PaddlePassBuilder {
  public:
   explicit PassStrategy(const std::vector<std::string> &passes)
       : PaddlePassBuilder(passes) {}
 
-  // The MKLDNN control exists in both CPU and GPU mode, because there can be
-  // still some CPU kernels running in CPU mode.
+  /** The MKLDNN control exists in both CPU and GPU mode, because there can
+   * still be some CPU kernels running in CPU mode.
+   */
   virtual void EnableMKLDNN() = 0;
 
   bool use_gpu() const { return use_gpu_; }
 
@@ -70,8 +79,7 @@ class PassStrategy : public PaddlePassBuilder {
   bool use_gpu_{false};
 };
 
-/*
- * The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
+/** The CPU passes controller; it is used in AnalysisPredictor with CPU mode.
 */
 class CpuPassStrategy : public PassStrategy {
  public:
@@ -117,8 +125,7 @@ class CpuPassStrategy : public PassStrategy {
   CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {}
 };
 
-/*
- * The GPU passes strategy, it is used in
+/** The GPU passes strategy; it is used in
 AnalysisPredictor with GPU mode.
*/ class GpuPassStrategy : public PassStrategy { public: From 23bdd0a223cc3e88c62fb8f48155c83455c9fede Mon Sep 17 00:00:00 2001 From: superjomn Date: Tue, 8 Jan 2019 15:11:48 +0800 Subject: [PATCH 165/197] fix analysis_tester bug test=develop --- paddle/fluid/inference/analysis/analyzer_tester.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index f84e1ab6b8..4c84d02d86 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -80,8 +80,8 @@ void TestWord2vecPrediction(const std::string& model_path) { i++) { LOG(INFO) << "data: " << static_cast(outputs.front().data.data())[i] << " result: " << result[i]; - PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], - result[i]); + EXPECT_NEAR(static_cast(outputs.front().data.data())[i], result[i], + 1e-3); } } From 69fd3fdb5206045cfcee90d98b52cf070f1dcae1 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 8 Jan 2019 09:11:39 +0000 Subject: [PATCH 166/197] fix debug build error test=develop --- paddle/fluid/inference/analysis/passes/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index d3ea511d8f..add9b70f2c 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -7,4 +7,5 @@ set(analysis_deps ${analysis_deps} ir_graph_build_pass ir_analysis_pass analysis_passes + subgraph_detector CACHE INTERNAL "") From bc205ef37453e0f7ab1f74abb123c3367ceee3c7 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 8 Jan 2019 10:28:01 +0000 Subject: [PATCH 167/197] fix same name func test=develop --- paddle/fluid/framework/var_type_traits.cc | 8 +++++--- paddle/fluid/framework/var_type_traits.h | 4 ++-- paddle/fluid/framework/var_type_traits_test.cc | 9 +++++---- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index c3c5bab23b..a37b1fbab8 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -105,13 +105,15 @@ struct VarIdToTypeIndexMapHolder { } // namespace detail -const std::type_index &ToTypeIndex(int var_id) { +const std::type_index &VarTraitIdToTypeIndex(int var_id) { return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id); } -const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); } +const char *ToTypeName(int var_id) { + return VarTraitIdToTypeIndex(var_id).name(); +} -int ToTypeId(const std::type_index &type) { +int TypeIndexToVarTraitId(const std::type_index &type) { return detail::VarIdToTypeIndexMapHolder::ToTypeId(type); } diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index cc68cf2ab8..733542e497 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -66,8 +66,8 @@ namespace paddle { namespace framework { const char *ToTypeName(int var_id); -const std::type_index &ToTypeIndex(int var_id); -int ToTypeId(const std::type_index &type); +const std::type_index &VarTraitIdToTypeIndex(int var_id); +int TypeIndexToVarTraitId(const std::type_index &type); namespace detail { diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc index 
00840d634d..a47275e1ca 100644 --- a/paddle/fluid/framework/var_type_traits_test.cc +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -45,10 +45,11 @@ struct TypeIndexChecker { constexpr auto kId = VarTypeTrait::kId; std::type_index actual_type(typeid(Type)); EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name())); - EXPECT_EQ(ToTypeIndex(kId), actual_type); - EXPECT_EQ(ToTypeId(actual_type), kId); - EXPECT_EQ(ToTypeIndex(ToTypeId(actual_type)), actual_type); - EXPECT_EQ(ToTypeId(ToTypeIndex(kId)), kId); + EXPECT_EQ(VarTraitIdToTypeIndex(kId), actual_type); + EXPECT_EQ(TypeIndexToVarTraitId(actual_type), kId); + EXPECT_EQ(VarTraitIdToTypeIndex(TypeIndexToVarTraitId(actual_type)), + actual_type); + EXPECT_EQ(TypeIndexToVarTraitId(VarTraitIdToTypeIndex(kId)), kId); EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT From 55a0672378329764a1b1429d9cfc8def91317e63 Mon Sep 17 00:00:00 2001 From: chengduo Date: Tue, 8 Jan 2019 05:20:48 -0600 Subject: [PATCH 168/197] fix compute_75 of cuda_cmake (#15209) test=develop --- cmake/cuda.cmake | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 10ecdf0ea8..16432ce2b8 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -2,9 +2,11 @@ if(NOT WITH_GPU) return() endif() -set(paddle_known_gpu_archs "30 35 50 52 60 61 70 75") +set(paddle_known_gpu_archs "30 35 50 52 60 61 70") set(paddle_known_gpu_archs7 "30 35 50 52") set(paddle_known_gpu_archs8 "30 35 50 52 60 61") +set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70") +set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75") ###################################################################################### # A function for automatic detection of GPUs installed (if autodetection is enabled) @@ -155,6 +157,16 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x # warning for now. 
list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") add_definitions("-DPADDLE_CUDA_BINVER=\"80\"") +elseif (${CUDA_VERSION} LESS 10.0) # CUDA 9.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs9}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + add_definitions("-DPADDLE_CUDA_BINVER=\"90\"") +elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs10}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + add_definitions("-DPADDLE_CUDA_BINVER=\"100\"") endif() include_directories(${CUDA_INCLUDE_DIRS}) From 72d2a1801e92cf441752a9701114c9584ccfcb10 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 7 Jan 2019 07:36:48 +0000 Subject: [PATCH 169/197] add seqpool concat fuse pass test=develop --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../framework/ir/seqpool_concat_fuse_pass.cc | 194 ++++++++++++++++++ .../framework/ir/seqpool_concat_fuse_pass.h | 38 ++++ .../fluid/inference/api/paddle_pass_builder.h | 1 + .../tests/api/analyzer_seq_pool1_tester.cc | 6 +- 5 files changed, 239 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 6e6db3d3ef..f71a3d0f2e 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -42,6 +42,7 @@ pass_library(seq_concat_fc_fuse_pass inference) pass_library(multi_batch_merge_pass base) pass_library(conv_bn_fuse_pass inference) pass_library(seqconv_eltadd_relu_fuse_pass inference) +pass_library(seqpool_concat_fuse_pass inference) pass_library(is_test_pass base) pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference) diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc new file mode 100644 index 0000000000..20b8220033 --- /dev/null +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc @@ -0,0 +1,194 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h" +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" + +#define MAX_CONCAT_INPUTS 200 + +namespace paddle { +namespace framework { +namespace ir { + +PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern, + const std::string& name_scope, + int num_inputs) { + auto is_concat_op_with_inputs = [](Node* x, int num) -> bool { + return x && x->IsOp() && x->Op()->Type() == "concat" && + x->Op()->Input("X").size() == static_cast(num); + }; + + auto is_nth_input_var_of_concat = [=](Node* x, int idx) -> bool { + return x && x->IsVar() && VarLinksToOp(x, "concat") && + x->outputs.size() == 1 && IsNthInput(x, x->outputs[0], "X", idx) && + is_concat_op_with_inputs(x->outputs[0], num_inputs); + }; + + auto is_seqpool_op_with_pootype_of_nth_input_of_concat = [=]( + Node* x, const std::string& type, int idx) -> bool { + bool ok = x && x->IsOp() && x->Op()->Type() == "sequence_pool" && + x->Op()->HasAttr("pooltype") && + boost::get(x->Op()->GetAttr("pooltype")) == type && + x->outputs.size() == 2; // seqpool should only have 2 outputs + if (ok) { + // only one output of seqpool_op is nth_input_var of concat + // the other one should be unused empty var + if (is_nth_input_var_of_concat(x->outputs[0], idx)) { + ok = ok && x->outputs[1]->IsVar() && x->outputs[1]->outputs.size() == 0; + } else { + ok = ok && is_nth_input_var_of_concat(x->outputs[1], idx) && + x->outputs[0]->IsVar() && x->outputs[0]->outputs.size() == 0; + } + } + return ok; + }; + + auto* concat_op = pattern->NewNode( + [=](Node* x) { return is_concat_op_with_inputs(x, num_inputs); }, + name_scope + "/concat_op"); + concat_op->assert_op_attr("axis", 1); + + auto* concat_out_var = pattern->NewNode( + [=](Node* x) { + return x && x->IsVar() && VarLinksFromOp(x, "concat") && + x->inputs.size() == 1 && + is_concat_op_with_inputs(x->inputs[0], num_inputs); + }, + name_scope + "/concat_out_var"); + concat_out_var->assert_is_only_output_of_op("concat"); + + std::vector seqpool_ops_input_var(num_inputs); + std::vector seqpool_ops_output_var(num_inputs); + std::vector seqpool_ops(num_inputs); + + for (int i = 0; i < num_inputs; ++i) { + seqpool_ops_output_var[i] = pattern->NewNode( + [=](Node* x) { + return x && x->IsVar() && is_nth_input_var_of_concat(x, i) && + x->inputs.size() == 1 && + is_seqpool_op_with_pootype_of_nth_input_of_concat(x->inputs[0], + "SUM", i); + }, + name_scope + "/sequence_pool_out_" + std::to_string(i)); + + seqpool_ops[i] = pattern->NewNode( + [=](Node* x) { + return x && x->IsOp() && + is_seqpool_op_with_pootype_of_nth_input_of_concat(x, "SUM", i); + }, + name_scope + "/sequence_pool_op_" + std::to_string(i)); + + seqpool_ops_input_var[i] = pattern->NewNode( + [=](Node* x) { + return x && x->IsVar() && x->outputs.size() >= 1 && + is_seqpool_op_with_pootype_of_nth_input_of_concat( + x->outputs[0], "SUM", i); + }, + name_scope + "/sequence_pool_in_" + std::to_string(i)); + + // Links + seqpool_ops[i] + ->LinksFrom({seqpool_ops_input_var[i]}) + .LinksTo({seqpool_ops_output_var[i]}); + } + concat_op->LinksFrom(seqpool_ops_output_var).LinksTo({concat_out_var}); + return concat_out_var; +} + +int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, + int num_inputs) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + BuildSeqPoolConcatPattern(pattern, name_scope, num_inputs); + + auto retrieve_node = [](const std::string& name, + const GraphPatternDetector::subgraph_t& subgraph, + const PDPattern& pat) -> Node* 
{ + PADDLE_ENFORCE(subgraph.count(pat.RetrieveNode(name)), + "pattern has no Node called %s", name.c_str()); + Node* p = subgraph.at(pat.RetrieveNode(name)); + PADDLE_ENFORCE_NOT_NULL(p, "subgraph has no node %s", name.c_str()); + return p; + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle SeqPool Concat fuse"; + std::vector input_names(num_inputs); + std::vector input_vars(num_inputs); + auto& fused_pattern = gpd.pattern(); + for (int i = 0; i < num_inputs; ++i) { + input_vars[i] = + retrieve_node(name_scope + "/sequence_pool_in_" + std::to_string(i), + subgraph, fused_pattern); + input_names[i] = input_vars[i]->Name(); + } + auto* concat_op = + retrieve_node(name_scope + "/concat_op", subgraph, fused_pattern); + auto* concat_out_var = + retrieve_node(name_scope + "/concat_out_var", subgraph, fused_pattern); + auto* seqpool_op0 = retrieve_node(name_scope + "/sequence_pool_op_0", + subgraph, fused_pattern); + + // Create New OpDesc + OpDesc op_desc; + op_desc.SetType("fusion_seqpool_concat"); + op_desc.SetInput("X", input_names); + op_desc.SetAttr("pooltype", seqpool_op0->Op()->GetAttr("pooltype")); + op_desc.SetAttr("axis", concat_op->Op()->GetAttr("axis")); + op_desc.SetOutput("Out", {concat_out_var->Name()}); + auto* op = graph->CreateOpNode(&op_desc); + for (size_t i = 0; i < input_vars.size(); ++i) { + IR_NODE_LINK_TO(input_vars[i], op); + } + IR_NODE_LINK_TO(op, concat_out_var); + + std::unordered_set marked_nodes; + for (auto& item : subgraph) { + marked_nodes.insert(item.second); + } + for (size_t i = 0; i < input_vars.size(); ++i) { + marked_nodes.erase(input_vars[i]); + } + marked_nodes.erase(concat_out_var); + GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + + gpd(graph, handler); + return fusion_count; +} + +std::unique_ptr SeqPoolConcatFusePass::ApplyImpl( + std::unique_ptr graph) const { + FusePassBase::Init(name_scope_, graph.get()); + int fusion_count = 0; + for (int i = MAX_CONCAT_INPUTS; i > 0; --i) { + fusion_count += BuildFusion( + graph.get(), name_scope_ + "/" + std::to_string(i), param_scope(), i); + } + AddStatis(fusion_count); + + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(seqpool_concat_fuse_pass, + paddle::framework::ir::SeqPoolConcatFusePass); diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h new file mode 100644 index 0000000000..59730fde55 --- /dev/null +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class SeqPoolConcatFusePass : public FusePassBase { + public: + virtual ~SeqPoolConcatFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + + const std::string name_scope_{"seqpool_concat_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 9337ae55b7..1e5712e163 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -89,6 +89,7 @@ class CpuPassStrategy : public PassStrategy { passes_.assign({ "infer_clean_graph_pass", // "attention_lstm_fuse_pass", // + "seqpool_concat_fuse_pass", // "seqconv_eltadd_relu_fuse_pass", // // "embedding_fc_lstm_fuse_pass", // "fc_lstm_fuse_pass", // diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index a1742f6068..083bdf15e9 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -177,8 +177,12 @@ TEST(Analyzer_seq_pool1, fuse_statis) { auto predictor = CreatePaddlePredictor(cfg); auto fuse_statis = GetFuseStatis( static_cast(predictor.get()), &num_ops); + + ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse")); + EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2); + LOG(INFO) << "num_ops: " << num_ops; - EXPECT_EQ(num_ops, 349); + EXPECT_EQ(num_ops, 195); } } // namespace analysis From 71d9097a89ac42e7943f77f4371c633b7df7c3fa Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Tue, 8 Jan 2019 19:15:53 +0800 Subject: [PATCH 170/197] fix analyzer_test runs error in native_config test=develop --- .../inference/tests/api/config_printer.h | 2 +- .../fluid/inference/tests/api/tester_helper.h | 20 +++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index cf0f1d5c18..ecc10bafd6 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -62,7 +62,7 @@ std::ostream &operator<<(std::ostream &os, const contrib::AnalysisConfig &config) { os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n"; num_spaces++; - os << *reinterpret_cast(&config); + os << config.ToNativeConfig(); if (!config.model_from_memory()) { os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file() << "\n"; os << GenSpaces(num_spaces) << "param_file: " << config.params_file() diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 41d033df85..524b5fa0ee 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -54,11 +54,13 @@ namespace paddle { namespace inference { void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { + const auto *analysis_config = + reinterpret_cast(config); if (use_analysis) { - LOG(INFO) << *reinterpret_cast(config); + LOG(INFO) << *analysis_config; return; } - LOG(INFO) << *reinterpret_cast(config); + LOG(INFO) << analysis_config->ToNativeConfig(); } void 
CompareResult(const std::vector &outputs, @@ -96,12 +98,13 @@ void CompareResult(const std::vector &outputs, std::unique_ptr CreateTestPredictor( const PaddlePredictor::Config *config, bool use_analysis = true) { + const auto *analysis_config = + reinterpret_cast(config); if (use_analysis) { - return CreatePaddlePredictor( - *(reinterpret_cast(config))); + return CreatePaddlePredictor(*analysis_config); } - return CreatePaddlePredictor( - *(reinterpret_cast(config))); + auto native_config = analysis_config->ToNativeConfig(); + return CreatePaddlePredictor(native_config); } size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); } @@ -328,10 +331,7 @@ void CompareNativeAndAnalysis( const std::vector> &inputs) { PrintConfig(config, true); std::vector native_outputs, analysis_outputs; - const auto *analysis_config = - reinterpret_cast(config); - auto native_config = analysis_config->ToNativeConfig(); - TestOneThreadPrediction(&native_config, inputs, &native_outputs, false); + TestOneThreadPrediction(config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &analysis_outputs, true); CompareResult(analysis_outputs, native_outputs); } From 3ace486ebd78fd3aeeb4670dab7c1a5d0205c073 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 8 Jan 2019 22:51:03 +0800 Subject: [PATCH 171/197] fix sum_op selected rows test=develop --- paddle/fluid/operators/sum_op.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 71fcaafe6b..7abfbbd3cb 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -52,10 +52,12 @@ class SumOp : public framework::OperatorWithKernel { framework::DDim in_dim({0}); for (size_t i = 0; i < x_dims.size(); ++i) { - if (x_var_types[i] == framework::proto::VarType::SELECTED_ROWS) { + auto& x_dim = x_dims[i]; + // x_dim.size() == 1 means the real dim of selected rows is [0] + if (x_var_types[i] == framework::proto::VarType::SELECTED_ROWS && + x_dim.size() == 1) { continue; } - auto& x_dim = x_dims[i]; if (framework::product(x_dim) == 0) { continue; } From e4184008a4e4aa60fbd21d43209256ec1114186f Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Tue, 8 Jan 2019 16:37:03 +0100 Subject: [PATCH 172/197] PADDLE_WITH_NGRAPH was removed from the code test=develop --- paddle/fluid/operators/ngraph/ops/binary_unnary_op.h | 2 -- paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h | 2 -- paddle/fluid/operators/ngraph/ops/fill_constant_op.h | 2 -- paddle/fluid/operators/ngraph/ops/mean_op.h | 2 -- paddle/fluid/operators/ngraph/ops/mul_op.h | 2 -- paddle/fluid/operators/ngraph/ops/scale_op.h | 2 -- paddle/fluid/operators/ngraph/ops/top_k_op.h | 2 -- 7 files changed, 14 deletions(-) diff --git a/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h b/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h index 6610380fcf..0c0d25d0cd 100644 --- a/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h +++ b/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef PADDLE_WITH_NGRAPH #pragma once #include @@ -48,4 +47,3 @@ static void BuildUnaryNode( } // namespace ngraphs } // namespace operators } // namespace paddle -#endif diff --git a/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h b/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h index 15fbd58b02..8f5092963c 100644 --- a/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h +++ b/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_NGRAPH #pragma once #include @@ -58,4 +57,3 @@ std::shared_ptr ElementwiseScalar( } // namespace ngraphs } // namespace operators } // namespace paddle -#endif diff --git a/paddle/fluid/operators/ngraph/ops/fill_constant_op.h b/paddle/fluid/operators/ngraph/ops/fill_constant_op.h index 5eff69e7b1..406a4314f8 100644 --- a/paddle/fluid/operators/ngraph/ops/fill_constant_op.h +++ b/paddle/fluid/operators/ngraph/ops/fill_constant_op.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_NGRAPH #pragma once #include @@ -58,4 +57,3 @@ void BuildFillConstantNode( } // namespace ngraphs } // namespace operators } // namespace paddle -#endif diff --git a/paddle/fluid/operators/ngraph/ops/mean_op.h b/paddle/fluid/operators/ngraph/ops/mean_op.h index 7fcf8f09cd..4c44bc4c11 100644 --- a/paddle/fluid/operators/ngraph/ops/mean_op.h +++ b/paddle/fluid/operators/ngraph/ops/mean_op.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_NGRAPH #pragma once #include @@ -65,4 +64,3 @@ void BuildMeanGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle -#endif diff --git a/paddle/fluid/operators/ngraph/ops/mul_op.h b/paddle/fluid/operators/ngraph/ops/mul_op.h index 9e12e5d7c3..4a6cbebe24 100644 --- a/paddle/fluid/operators/ngraph/ops/mul_op.h +++ b/paddle/fluid/operators/ngraph/ops/mul_op.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_NGRAPH #pragma once #include @@ -131,4 +130,3 @@ static void BuildMulGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle -#endif diff --git a/paddle/fluid/operators/ngraph/ops/scale_op.h b/paddle/fluid/operators/ngraph/ops/scale_op.h index 24ab0702aa..91a57d0be6 100644 --- a/paddle/fluid/operators/ngraph/ops/scale_op.h +++ b/paddle/fluid/operators/ngraph/ops/scale_op.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef PADDLE_WITH_NGRAPH #pragma once #include @@ -38,4 +37,3 @@ void BuildScaleNode( } // namespace ngraphs } // namespace operators } // namespace paddle -#endif diff --git a/paddle/fluid/operators/ngraph/ops/top_k_op.h b/paddle/fluid/operators/ngraph/ops/top_k_op.h index 2b7254497c..ea66953a12 100644 --- a/paddle/fluid/operators/ngraph/ops/top_k_op.h +++ b/paddle/fluid/operators/ngraph/ops/top_k_op.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_NGRAPH #pragma once #include @@ -48,4 +47,3 @@ void BuildTopKNode( } // namespace ngraphs } // namespace operators } // namespace paddle -#endif From 810439a993b20c649fa19a30d95369b25395f016 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 8 Jan 2019 23:42:13 +0800 Subject: [PATCH 173/197] fix style test=develop --- python/paddle/fluid/transpiler/distribute_transpiler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 8d11db376d..ea5a4cf7cd 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -1280,8 +1280,9 @@ class DistributeTranspiler(object): # create table param and grad var in pserver program # create table optimize block in pserver program table_opt_op = [ - op for op in self.optimize_ops if 'Param' in op.input_names and - op.input("Param")[0] == self.table_name + op for op in self.optimize_ops + if 'Param' in op.input_names and op.input("Param")[0] == + self.table_name ][0] origin_param_var = self.origin_program.global_block().vars[ From a037378fdb96773f44e0c12c14d2119b7e76996a Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Wed, 9 Jan 2019 10:16:40 +0800 Subject: [PATCH 174/197] Fix error with cuDNN version less than 7.1. 
(#15219) Since conv_fusion_op is not exposed into Python, remote the env flag in __init__.py test=develop --- python/paddle/fluid/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index f9f3807b15..2c17716500 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -155,7 +155,7 @@ def __bootstrap__(): 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', - 'cudnn_exhaustive_search_times', 'sync_nccl_allreduce' + 'sync_nccl_allreduce' ] core.init_gflags([sys.argv[0]] + From f23a257e905e61f513c2a68cdfd9fb39d8ff16db Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Wed, 9 Jan 2019 11:26:14 +0800 Subject: [PATCH 175/197] use the new MKLDNN repo url test=develop --- cmake/external/mkldnn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index a9b99e9ab8..03f0dee859 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -55,7 +55,7 @@ ExternalProject_Add( ${MKLDNN_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${MKLDNN_DEPENDS} - GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" + GIT_REPOSITORY "https://github.com/intel/mkl-dnn.git" GIT_TAG "830a10059a018cd2634d94195140cf2d8790a75a" PREFIX ${MKLDNN_SOURCES_DIR} UPDATE_COMMAND "" From c3b9edf95881b1409534fec691197ba110388015 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 9 Jan 2019 12:39:32 +0800 Subject: [PATCH 176/197] follow comment test=develop --- paddle/fluid/operators/math/selected_rows_functor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 5f169dda22..b99115e44b 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -195,7 +195,7 @@ struct SelectedRowsAddToTensor { void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, framework::Tensor* input2) { - if (input1.rows().size() == 0) { + if (UNLIKELY(input1.rows().size() == 0)) { LOG(WARNING) << "input selected rows is empty!"; return; } From 197d0f2431cb0628b58adf38017b2ddec6b10619 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Wed, 9 Jan 2019 13:06:33 +0800 Subject: [PATCH 177/197] fix trt_model_tester to pass the ci test=develop --- .../inference/tests/api/trt_models_tester.cc | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 21df6eab81..9725c19032 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -99,24 +99,12 @@ void compare(std::string model_dir, bool use_tensorrt) { SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); } - std::vector native_outputs; - NativeConfig native_config; - SetConfig(&native_config, model_dir, true, false, - FLAGS_batch_size); - TestOneThreadPrediction( - reinterpret_cast(&native_config), inputs_all, - &native_outputs, false); - - std::vector analysis_outputs; contrib::AnalysisConfig analysis_config; - analysis_config.EnableUseGpu(50, 0); SetConfig(&analysis_config, model_dir, true, use_tensorrt, FLAGS_batch_size); - 
TestOneThreadPrediction( - reinterpret_cast(&analysis_config), inputs_all, - &analysis_outputs, true); - - CompareResult(native_outputs, analysis_outputs); + CompareNativeAndAnalysis( + reinterpret_cast(&analysis_config), + inputs_all); } TEST(TensorRT_mobilenet, compare) { From 999a05b04bdb6eb62f8de8fe106e2df10388157c Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 9 Jan 2019 04:40:31 +0000 Subject: [PATCH 178/197] polish code test=develop --- python/paddle/fluid/data_feeder.py | 11 ++++++++--- python/paddle/fluid/tests/test_data_feeder.py | 6 ++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 1301525914..7b70d19de5 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -71,7 +71,7 @@ class DataToLoDTensorConverter(object): for each_data in data: self._feed_impl_(each_data, lod[1:], lod_level - 1) - def _check_shape_(self, shape): + def _check_shape(self, shape): for s1, s2 in zip(self.shape, shape): if s1 != s2 and s1 >= 0 and s2 >= 0: raise ValueError( @@ -82,9 +82,14 @@ class DataToLoDTensorConverter(object): arr = numpy.array(self.data, dtype=self.dtype) if self.shape: if len(arr.shape) != len(self.shape): - arr = arr.reshape(self.shape) + try: + arr = arr.reshape(self.shape) + except ValueError: + raise ValueError( + "Reshape error. What is defined in data layer is {}, but receive {}" + .format(self.shape, arr.shape)) else: - self._check_shape_(arr.shape) + self._check_shape(arr.shape) t = core.LoDTensor() t.set(arr, self.place) if self.lod_level > 0: diff --git a/python/paddle/fluid/tests/test_data_feeder.py b/python/paddle/fluid/tests/test_data_feeder.py index 01de564aa4..16a33fd3ab 100644 --- a/python/paddle/fluid/tests/test_data_feeder.py +++ b/python/paddle/fluid/tests/test_data_feeder.py @@ -30,6 +30,12 @@ class TestDataFeeder(unittest.TestCase): self.assertEqual(result['image'].recursive_sequence_lengths(), []) self.assertEqual(result['label'].recursive_sequence_lengths(), []) + try: + result = feeder.feed([([0] * 783, [9]), ([1] * 783, [1])]) + self.assertTrue(False) + except ValueError: + self.assertTrue(True) + def test_lod_level_1_converter(self): # lod_level = 1 # each sentence has a different number of words From bb9f7a14a0bd11c8dfe046c5ca16af6be14cfd0a Mon Sep 17 00:00:00 2001 From: baojun-nervana Date: Tue, 8 Jan 2019 23:35:52 -0800 Subject: [PATCH 179/197] Fix cmake warning test=develop --- cmake/external/ngraph.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake index 799d9c309f..508f3e5257 100644 --- a/cmake/external/ngraph.cmake +++ b/cmake/external/ngraph.cmake @@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs) INCLUDE(ExternalProject) SET(NGRAPH_PROJECT "extern_ngraph") -SET(NGRAPH_GIT_TAG "08851c2c45fcf9fa9c74871dd3dbc3fe38f37cc9") +SET(NGRAPH_GIT_TAG "20bd8bbc79ae3a81c57313846a2be7313e5d1dab") SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) From 5907d837c8f9dfc0511953451e98889edd3cc78a Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 9 Jan 2019 16:17:14 +0800 Subject: [PATCH 180/197] merge test_dist_ctr_with_l2_decay.py into test_dist_ctr.py test=develop --- .../fluid/tests/unittests/test_dist_ctr.py | 14 ++++++++ .../unittests/test_dist_ctr_with_l2_decay.py | 36 ------------------- 2 files changed, 14 insertions(+), 36 deletions(-) delete mode 
100644 python/paddle/fluid/tests/unittests/test_dist_ctr_with_l2_decay.py diff --git a/python/paddle/fluid/tests/unittests/test_dist_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_ctr.py index 390393e04f..cc11764d55 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_ctr.py @@ -27,5 +27,19 @@ class TestDistCTR2x2(TestDistBase): self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False) +class TestDistCTRWithL2Decay2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" + + def test_dist_ctr(self): + need_envs = {"USE_L2_DECAY": "1"} + self.check_with_place( + "dist_ctr.py", + delta=1e-7, + check_error_log=False, + need_envs=need_envs) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_ctr_with_l2_decay.py b/python/paddle/fluid/tests/unittests/test_dist_ctr_with_l2_decay.py deleted file mode 100644 index 558aee3653..0000000000 --- a/python/paddle/fluid/tests/unittests/test_dist_ctr_with_l2_decay.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from __future__ import print_function - -import os -import unittest -from test_dist_base import TestDistBase - - -class TestDistCTR2x2(TestDistBase): - def _setup_config(self): - self._sync_mode = True - self._enforce_place = "CPU" - - def test_dist_ctr(self): - need_envs = {"USE_L2_DECAY": "1"} - self.check_with_place( - "dist_ctr.py", - delta=1e-7, - check_error_log=False, - need_envs=need_envs) - - -if __name__ == "__main__": - unittest.main() From d43983b61de9bb57335c297c7e7fe074a8c48f6c Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Wed, 9 Jan 2019 19:36:34 +0800 Subject: [PATCH 181/197] reduce threads number to avoid hang in CI test=develop --- paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 3c52afbfb8..7e7c386f97 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -283,7 +283,7 @@ TEST(Analyzer_rnn1, multi_thread) { std::vector> input_slots_all; SetInput(&input_slots_all); TestPrediction(reinterpret_cast(&cfg), - input_slots_all, &outputs, 4 /* multi_thread */); + input_slots_all, &outputs, 2 /* multi_thread */); } // Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing From e7d83389e61fdbfbf5f16db3fc7dd972b7589bd5 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 9 Jan 2019 12:53:59 +0000 Subject: [PATCH 182/197] fix demo ci bug 1. trt_demo bug 2. 
trigger exit when exists a bug test=develop --- paddle/fluid/inference/api/demo_ci/run.sh | 4 ++++ paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index a94ccfa924..9811fe2cd0 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -116,6 +116,10 @@ D --modeldir=$DATA_DIR/mobilenet/model \ --data=$DATA_DIR/mobilenet/data.txt \ --refer=$DATA_DIR/mobilenet/result.txt + if [ $? -ne 0 ]; then + echo "trt demo trt_mobilenet_demo runs fail." + exit 1 + fi fi done set +x diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc index 30215e480f..338a0cec16 100644 --- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -38,8 +38,8 @@ void Main() { std::unique_ptr predictor; paddle::contrib::AnalysisConfig config; config.EnableUseGpu(100, 0); - config.SetModel(FLAGS_modeldir + "/__params__", - FLAGS_modeldir + "/__model__"); + config.SetModel(FLAGS_modeldir + "/__model__", + FLAGS_modeldir + "/__params__"); config.EnableTensorRtEngine(); predictor = CreatePaddlePredictor(config); From 7461356723bbafc2670b9095c720076f47b1d26e Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 9 Jan 2019 11:10:28 +0000 Subject: [PATCH 183/197] add zerocopy for seqpool test --- .../tests/api/analyzer_seq_pool1_tester.cc | 87 ++++++++++++++++--- 1 file changed, 73 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index 083bdf15e9..cd0fcedb9a 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -121,14 +121,6 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data) { } } -void SetConfig(AnalysisConfig *cfg) { - cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); - cfg->DisableGpu(); - cfg->SwitchSpecifyInputNames(); - cfg->pass_builder()->TurnOnDebug(); - cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); -} - void SetInput(std::vector> *inputs) { DataRecord data(FLAGS_infer_data, FLAGS_batch_size); std::vector input_slots; @@ -141,15 +133,22 @@ void SetInput(std::vector> *inputs) { } } +void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) { + cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->pass_builder()->TurnOnDebug(); + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); + if (use_mkldnn) { + cfg->EnableMKLDNN(); + } +} + void profile(bool use_mkldnn = false) { AnalysisConfig cfg; - SetConfig(&cfg); + SetConfig(&cfg, use_mkldnn); - if (use_mkldnn) { - cfg.EnableMKLDNN(); - } std::vector outputs; - std::vector> input_slots_all; SetInput(&input_slots_all); TestPrediction(reinterpret_cast(&cfg), @@ -178,13 +177,73 @@ TEST(Analyzer_seq_pool1, fuse_statis) { auto fuse_statis = GetFuseStatis( static_cast(predictor.get()), &num_ops); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + ASSERT_EQ(fuse_statis.at("fc_fuse"), 10); ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse")); EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2); - LOG(INFO) << "num_ops: " << num_ops; EXPECT_EQ(num_ops, 195); } +void 
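+// Fills the predictor's inputs directly through ZeroCopyTensor (no
+// feed ops): each slot's shape and LoD are set on the input tensor
+// named "<slot>_embed" before the batch data is copied in.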
PrepareZeroCopyInputs( + const std::unique_ptr &predictor, + std::vector> *inputs) { + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + // only feed one batch + const auto &one_batch = data.NextBatch(); + inputs->clear(); + for (size_t i = 0; i < one_batch.size(); ++i) { + auto &slot = one_batch[i]; + auto tensor = predictor->GetInputTensor(slot.name + "_embed"); + tensor->Reshape(slot.shape); + tensor->SetLoD({slot.lod}); + ZeroCopyTensorAssignData(tensor.get(), slot.data); + inputs->emplace_back(std::move(tensor)); + } +} + +std::unique_ptr zerocopy_profile(int repeat_times) { + AnalysisConfig config; + SetConfig(&config); + config.SwitchUseFeedFetchOps(false); + auto predictor = CreatePaddlePredictor(config); + std::vector> inputs; + PrepareZeroCopyInputs(predictor, &inputs); + auto output_tensor = predictor->GetOutputTensor("reduce_sum_0.tmp_0"); + Timer timer; + LOG(INFO) << "Warm up run..."; + timer.tic(); + predictor->ZeroCopyRun(); + PrintTime(FLAGS_batch_size, 1, 1, 0, timer.toc(), 1); + if (FLAGS_profile) { + paddle::platform::ResetProfiler(); + } + LOG(INFO) << "Run " << repeat_times << " times..."; + timer.tic(); + for (int i = 0; i < repeat_times; i++) { + predictor->ZeroCopyRun(); + } + PrintTime(FLAGS_batch_size, repeat_times, 1, 0, timer.toc() / repeat_times, + 1); + return output_tensor; +} + +TEST(Analyzer_seq_pool1, zerocopy_profile) { zerocopy_profile(FLAGS_repeat); } + +TEST(Analyzer_seq_pool1, zerocopy_fuse_statis) { + AnalysisConfig config; + SetConfig(&config); + config.SwitchUseFeedFetchOps(false); + auto predictor = CreatePaddlePredictor(config); + int num_ops; + auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + ASSERT_EQ(fuse_statis.at("fc_fuse"), 10); + ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse")); + EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2); + ASSERT_EQ(num_ops, 195); +} + } // namespace analysis } // namespace inference } // namespace paddle From 137060135e602500e9f94549caf6282da9ebd7c8 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 9 Jan 2019 13:00:49 +0000 Subject: [PATCH 184/197] fix zerocopy size --- paddle/fluid/inference/api/helper.h | 11 ++++++++--- paddle/fluid/inference/api/paddle_api.h | 3 ++- .../fluid/inference/tests/api/analyzer_rnn1_tester.cc | 4 ++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 7830e85956..cdd01cb9f0 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -204,11 +204,14 @@ static std::string DescribeTensor(const PaddleTensor &tensor) { os << to_string(l) << "; "; } os << "\n"; - os << " - data: "; + os << " - memory length: " << tensor.data.length(); + os << "\n"; + os << " - data: "; int dim = VecReduceToInt(tensor.shape); + float *pdata = static_cast(tensor.data.data()); for (int i = 0; i < dim; i++) { - os << static_cast(tensor.data.data())[i] << " "; + os << pdata[i] << " "; } os << '\n'; return os.str(); @@ -224,10 +227,12 @@ static std::string DescribeZeroCopyTensor(const ZeroCopyTensor &tensor) { os << to_string(l) << "; "; } os << "\n"; - os << " - data: "; PaddlePlace place; int size; const auto *data = tensor.data(&place, &size); + os << " - numel: " << size; + os << "\n"; + os << " - data: "; for (int i = 0; i < size; i++) { os << data[i] << " "; } diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 3642f36127..832c8cdf28 100644 --- 
a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -123,7 +123,8 @@ class ZeroCopyTensor { */ template T* mutable_data(PaddlePlace place); - /** Get the memory directly, will return the place and memory size by pointer. + /** Get the memory directly, will return the place and element size by + * pointer. * This is for reading the output tensor. */ template diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 3c52afbfb8..8e37df3cde 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -351,10 +351,10 @@ TEST(Analyzer_rnn1, ZeroCopy) { ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs)); LOG(INFO) << "native output " << DescribeTensor(native_outputs.front()); - int output_size{0}; + int output_size{0}; // this is the number of elements not memory size auto *zero_copy_data = output_tensor->data(&place, &output_size); auto *native_data = static_cast(native_outputs.front().data.data()); - for (size_t i = 0; i < output_size / sizeof(float); i++) { + for (int i = 0; i < output_size; i++) { EXPECT_NEAR(zero_copy_data[i], native_data[i], 1e-3); } } From 54afcb7ec663b4f87ee806072eb52d0693c93755 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 9 Jan 2019 13:01:34 +0000 Subject: [PATCH 185/197] add compare zerocopy test with native result test=develop --- .../tests/api/analyzer_seq_pool1_tester.cc | 53 +++++++++++++------ 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index cd0fcedb9a..1cf326fc89 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -168,15 +168,13 @@ TEST(Analyzer_seq_pool1, compare) { reinterpret_cast(&cfg), input_slots_all); } -// Check the fuse status -TEST(Analyzer_seq_pool1, fuse_statis) { +void analysis_fuse_statis(bool use_zerocopy) { AnalysisConfig cfg; SetConfig(&cfg); + cfg.SwitchUseFeedFetchOps(!use_zerocopy); int num_ops; auto predictor = CreatePaddlePredictor(cfg); - auto fuse_statis = GetFuseStatis( - static_cast(predictor.get()), &num_ops); - + auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse")); ASSERT_EQ(fuse_statis.at("fc_fuse"), 10); ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse")); @@ -185,6 +183,9 @@ TEST(Analyzer_seq_pool1, fuse_statis) { EXPECT_EQ(num_ops, 195); } +// Check the fuse status +TEST(Analyzer_seq_pool1, fuse_statis) { analysis_fuse_statis(false); } + void PrepareZeroCopyInputs( const std::unique_ptr &predictor, std::vector> *inputs) { @@ -202,7 +203,8 @@ void PrepareZeroCopyInputs( } } -std::unique_ptr zerocopy_profile(int repeat_times) { +// return the output values +std::vector zerocopy_profile(int repeat_times) { AnalysisConfig config; SetConfig(&config); config.SwitchUseFeedFetchOps(false); @@ -225,23 +227,40 @@ std::unique_ptr zerocopy_profile(int repeat_times) { } PrintTime(FLAGS_batch_size, repeat_times, 1, 0, timer.toc() / repeat_times, 1); - return output_tensor; + + VLOG(3) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor); + PaddlePlace place; + int output_size{0}; + auto *pdata = output_tensor->data(&place, &output_size); + std::vector res(output_size); + for (int i = 0; i < output_size; ++i) { + res[i] = 
pdata[i]; + } + return res; } TEST(Analyzer_seq_pool1, zerocopy_profile) { zerocopy_profile(FLAGS_repeat); } -TEST(Analyzer_seq_pool1, zerocopy_fuse_statis) { +TEST(Analyzer_seq_pool1, zerocopy_fuse_statis) { analysis_fuse_statis(true); } + +TEST(Analyzer_seq_pool1, zerocopy_compare_native) { AnalysisConfig config; SetConfig(&config); - config.SwitchUseFeedFetchOps(false); - auto predictor = CreatePaddlePredictor(config); - int num_ops; - auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); - ASSERT_TRUE(fuse_statis.count("fc_fuse")); - ASSERT_EQ(fuse_statis.at("fc_fuse"), 10); - ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse")); - EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2); - ASSERT_EQ(num_ops, 195); + config.SwitchUseFeedFetchOps(true); + auto predictor = CreatePaddlePredictor(config.ToNativeConfig()); + std::vector native_outputs; + std::vector> input_slots_all; + SetInput(&input_slots_all); + ASSERT_TRUE(predictor->Run(input_slots_all[0], &native_outputs)); + EXPECT_EQ(native_outputs.size(), 1UL); + + auto zerocopy_output = zerocopy_profile(1); + EXPECT_EQ(zerocopy_output.size() * sizeof(float), + native_outputs.front().data.length()); + auto *native_data = static_cast(native_outputs.front().data.data()); + for (size_t i = 0; i < zerocopy_output.size(); ++i) { + EXPECT_NEAR(zerocopy_output[i], native_data[i], 1e-3); + } } } // namespace analysis From 40330c2c23268ab4d602400170088f0ee49a8d48 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 9 Jan 2019 21:34:30 +0800 Subject: [PATCH 186/197] clean test_dist_ctr_with_l2_decay test=develop --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index e81632116c..ec8b19c7ba 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -18,7 +18,6 @@ if(NOT WITH_DISTRIBUTE) LIST(REMOVE_ITEM TEST_OPS test_dist_mnist) LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec) LIST(REMOVE_ITEM TEST_OPS test_dist_ctr) - LIST(REMOVE_ITEM TEST_OPS test_dist_ctr_with_l2_decay) LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow) LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge) LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification) @@ -102,7 +101,7 @@ if(WITH_DISTRIBUTE) # FIXME(typhoonzero): add these tests back # py_test_modules(test_dist_transformer MODULES test_dist_transformer) # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) - set_tests_properties(test_dist_ctr test_dist_ctr_with_l2_decay test_dist_mnist test_dist_mnist_batch_merge test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE) + set_tests_properties(test_dist_ctr test_dist_mnist test_dist_mnist_batch_merge test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE) endif(NOT APPLE) py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) endif() From 8e086a8521ab1aa8d7b2632b71b9193df630d0a4 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 9 Jan 2019 14:51:51 +0000 Subject: [PATCH 187/197] follow comment and fix typo test=develop --- .../framework/ir/seqpool_concat_fuse_pass.cc | 26 +++++++++++-------- .../framework/ir/seqpool_concat_fuse_pass.h | 14 ++++++++++ .../fused/fusion_seqpool_concat_op.cc | 10 ++++--- 3 
files changed, 35 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc index 20b8220033..7dd6f4880a 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc @@ -39,21 +39,25 @@ PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern, auto is_seqpool_op_with_pootype_of_nth_input_of_concat = [=]( Node* x, const std::string& type, int idx) -> bool { - bool ok = x && x->IsOp() && x->Op()->Type() == "sequence_pool" && - x->Op()->HasAttr("pooltype") && - boost::get(x->Op()->GetAttr("pooltype")) == type && - x->outputs.size() == 2; // seqpool should only have 2 outputs - if (ok) { - // only one output of seqpool_op is nth_input_var of concat - // the other one should be unused empty var + bool this_is_seqpool_op = + x && x->IsOp() && x->Op()->Type() == "sequence_pool" && + x->Op()->HasAttr("pooltype") && + boost::get(x->Op()->GetAttr("pooltype")) == type && + x->outputs.size() == 2; // seqpool should only have 2 outputs + bool satisfied_all = this_is_seqpool_op; + if (this_is_seqpool_op) { + // Only one output of seqpool_op is nth_input_var of concat, + // the other one should be unused empty var. if (is_nth_input_var_of_concat(x->outputs[0], idx)) { - ok = ok && x->outputs[1]->IsVar() && x->outputs[1]->outputs.size() == 0; + satisfied_all = satisfied_all && x->outputs[1]->IsVar() && + x->outputs[1]->outputs.size() == 0; } else { - ok = ok && is_nth_input_var_of_concat(x->outputs[1], idx) && - x->outputs[0]->IsVar() && x->outputs[0]->outputs.size() == 0; + satisfied_all = + satisfied_all && is_nth_input_var_of_concat(x->outputs[1], idx) && + x->outputs[0]->IsVar() && x->outputs[0]->outputs.size() == 0; } } - return ok; + return satisfied_all; }; auto* concat_op = pattern->NewNode( diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h index 59730fde55..ba2154045e 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h @@ -23,6 +23,20 @@ namespace paddle { namespace framework { namespace ir { +/** + * Fuse SequencePool(with sum pooltype yet) and Concat; + * + * Before fuse: + * | | | + * seq_pool, seq_pool, ... seq_pool + * \ | ... 
/ + * concat + * | + * After fuse: + * \ | / + * FusionSeqPoolConcat + * | + */ class SeqPoolConcatFusePass : public FusePassBase { public: virtual ~SeqPoolConcatFusePass() {} diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc index 578ff6b2d0..b181140db7 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc @@ -23,7 +23,7 @@ namespace operators { void FusionSeqPoolConcatOp::InferShape( framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL, - "Inputs(X) of FusionSeqPoolConcatOp should be empty."); + "Inputs(X) of FusionSeqPoolConcatOp should not be empty."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of FusionSeqPoolConcatOp should not be null."); int axis = ctx->Attrs().Get("axis"); @@ -54,12 +54,13 @@ void FusionSeqPoolConcatOpMaker::Make() { AddInput("X", "(LoDTensor) Input tensors of this operator.").AsDuplicable(); AddOutput("Out", "(LoDTensor) Output tensor of concat operator."); AddAttr("pooltype", - "(string, default 'AVERAGE') some of the pooling " + "(string, default 'SUM') some of the pooling " "pooltype of SequencePoolOp.") .SetDefault("SUM") .InEnum({"AVERAGE", "SUM", "SQRT"}); AddAttr("axis", - "The axis along which the input tensors will be concatenated.") + "The axis along which the input tensors will be concatenated. " + "Only supports concat axis=1 yet.") .SetDefault(1); AddComment(R"DOC( Fusion Sequence Pool of pooltype(sum, average and sqrt) and Concat Operator. @@ -100,6 +101,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { jit::Get, platform::CPUPlace>( attr); size_t n = ins.size(); + size_t dst_step_size = n * w; for (size_t i = 0; i < n; ++i) { auto x_dims = ins[i]->dims(); auto x_lod = ins[i]->lod()[0]; @@ -112,7 +114,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { for (size_t j = 0; j < bs; ++j) { attr.h = static_cast(x_lod[j + 1] - x_lod[j]); seqpool(src, dst, &attr); - dst += n * w; + dst += dst_step_size; src += attr.h * attr.w; } } From 9181dea9f353dc1df4e1b787ab366422711272a6 Mon Sep 17 00:00:00 2001 From: Sang Ik Lee Date: Wed, 9 Jan 2019 09:34:06 -0800 Subject: [PATCH 188/197] Set correct TBB library name in debug build and remove warning related to rpath dependency from symlink. test=develop --- cmake/external/ngraph.cmake | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake index 508f3e5257..14af98b2d7 100644 --- a/cmake/external/ngraph.cmake +++ b/cmake/external/ngraph.cmake @@ -44,7 +44,11 @@ SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) SET(NGRAPH_SHARED_LIB_NAME libngraph.so) SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so) -SET(NGRAPH_TBB_LIB_NAME libtbb.so.2) +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + SET(NGRAPH_TBB_LIB_NAME libtbb_debug.so.2) +else() + SET(NGRAPH_TBB_LIB_NAME libtbb.so.2) +endif() SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git") SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME}) SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME}) @@ -66,16 +70,7 @@ ExternalProject_Add( CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR} CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib -) - -# Workaround for nGraph expecting mklml to be in mkldnn install directory. 
-ExternalProject_Add_Step( - ${NGRAPH_PROJECT} - PrepareMKL - COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_LIB} ${MKLDNN_INSTALL_DIR}/lib/libmklml_intel.so - COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_IOMP_LIB} ${MKLDNN_INSTALL_DIR}/lib/libiomp5.so - DEPENDEES download - DEPENDERS configure + CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib ) add_dependencies(ngraph ${NGRAPH_PROJECT}) From a0a27bd240e45f617dc9538e34c17c27f202b1e7 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 9 Jan 2019 15:21:15 +0000 Subject: [PATCH 189/197] add seqpool concat fuse pass tester test=develop --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../framework/ir/seqpool_concat_fuse_pass.cc | 7 +- .../ir/seqpool_concat_fuse_pass_tester.cc | 114 ++++++++++++++++++ 3 files changed, 118 insertions(+), 4 deletions(-) create mode 100644 paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index f71a3d0f2e..a595a8ab42 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -69,6 +69,7 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) +cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto) cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) if (WITH_MKLDNN) cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc index 7dd6f4880a..96a60da518 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc @@ -112,8 +112,7 @@ PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern, return concat_out_var; } -int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, - int num_inputs) { +int BuildFusion(Graph* graph, const std::string& name_scope, int num_inputs) { GraphPatternDetector gpd; auto* pattern = gpd.mutable_pattern(); BuildSeqPoolConcatPattern(pattern, name_scope, num_inputs); @@ -182,8 +181,8 @@ std::unique_ptr SeqPoolConcatFusePass::ApplyImpl( FusePassBase::Init(name_scope_, graph.get()); int fusion_count = 0; for (int i = MAX_CONCAT_INPUTS; i > 0; --i) { - fusion_count += BuildFusion( - graph.get(), name_scope_ + "/" + std::to_string(i), param_scope(), i); + fusion_count += + BuildFusion(graph.get(), name_scope_ + "/" + std::to_string(i), i); } AddStatis(fusion_count); diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc new file mode 100644 index 0000000000..7d2739d84d --- /dev/null +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc @@ -0,0 +1,114 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h" +#include +#include "paddle/fluid/framework/op_proto_maker.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + if (type == "sequence_pool") { + op->SetInput("X", {inputs[0]}); + std::string pooltype = "SUM"; + op->SetAttr("pooltype", pooltype); + op->SetOutput("MaxIndex", {outputs[0]}); + op->SetOutput("Out", {outputs[1]}); + } else if (type == "concat") { + op->SetInput("X", inputs); + op->SetAttr("axis", 1); + op->SetOutput("Out", {outputs[0]}); + } + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); +} + +/* + * Before fuse: + * a b c + * | | | + * op1 op2 op3 + * / \ / \ / \ + * d e f g h i + * \ | / + * concat + * | + * j + * After fuse: + * a b c + * \ | / + * fusion_seqpool_concat + * | + * j + * unused nodes: d, f, h + */ +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + for (auto& v : std::vector( + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::LOD_TENSOR); + } + + SetOp(&prog, "sequence_pool", std::vector({"a"}), + std::vector({"d", "e"})); + SetOp(&prog, "sequence_pool", std::vector({"b"}), + std::vector({"f", "g"})); + SetOp(&prog, "sequence_pool", std::vector({"c"}), + std::vector({"h", "i"})); + SetOp(&prog, "concat", std::vector({"e", "g", "i"}), + std::vector({"j"})); + + return prog; +} + +TEST(SeqPoolConcatFusePass, basic) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("seqpool_concat_fuse_pass"); + + int pre_nodes = graph->Nodes().size(); + + graph = pass->Apply(std::move(graph)); + + int after_nodes = graph->Nodes().size(); + + // Remove 7 Nodes: op1, op2, op3, e, g, i, concat_op + // Add 1 Node: fusion_seqpool_concat + EXPECT_EQ(pre_nodes - 6, after_nodes); + + // Assert new op in newly generated graph + int count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "fusion_seqpool_concat") { + ++count; + } + } + EXPECT_EQ(count, 1); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(seqpool_concat_fuse_pass); From fb63cd89d4343270ad96598aac177a7ad8d36c21 Mon Sep 17 00:00:00 2001 From: flame Date: Thu, 10 Jan 2019 12:24:51 +0800 Subject: [PATCH 190/197] Add python ir graph API (#14917) --- .../details/multi_devices_graph_pass.cc | 2 +- paddle/fluid/framework/ir/graph.h | 1 - paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/fluid/pybind/ir.cc | 103 ++++++++++++ paddle/fluid/pybind/ir.h | 25 +++ paddle/fluid/pybind/pybind.cc | 11 +- .../fluid/tests/unittests/test_ir_graph.py | 146 ++++++++++++++++++ 7 files changed, 286 insertions(+), 4 deletions(-) create mode 100644 paddle/fluid/pybind/ir.cc create mode 100644 paddle/fluid/pybind/ir.h create mode 100644 python/paddle/fluid/tests/unittests/test_ir_graph.py diff 
--git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index d91993bd4f..75f922d2cc 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -226,7 +226,7 @@ std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( * Only variables should be the leaves of graph. */ AddOutputToLeafOps(&result); - result.Erase(kGraphOps); + result.Erase(kGraphOps); return graph; } diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 47fcf96a3f..8bb3c27bdd 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -109,7 +109,6 @@ class Graph { attr_dels_[attr_name] = []() {}; } - template void Erase(const std::string &attr_name) { PADDLE_ENFORCE(attrs_.count(attr_name) != 0, "%s not set in the graph", attr_name); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 72b0f216d3..2545f5312f 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -3,7 +3,7 @@ set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune fe if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) endif() -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc ir.cc) if(WITH_PYTHON) if(WITH_AMD_GPU) diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc new file mode 100644 index 0000000000..d32fe58f86 --- /dev/null +++ b/paddle/fluid/pybind/ir.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
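+
+// A rough usage sketch of the bindings defined below (informal; it
+// mirrors the checks in test_ir_graph.py added later in this patch):
+//   prog = fluid.core.ProgramDesc()
+//   graph = fluid.core.Graph(prog)
+//   graph.set("int", 1)
+//   assert graph.get_int("int") == 1
+//   names = {node.name() for node in graph.nodes()}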
+ +#include "paddle/fluid/pybind/ir.h" +#include +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/var_desc.h" +#include "pybind11/stl.h" + +namespace py = pybind11; +using paddle::framework::ir::Graph; +using paddle::framework::ir::Node; +using paddle::framework::OpDesc; +using paddle::framework::ProgramDesc; +using paddle::framework::VarDesc; +using pybind11::return_value_policy; + +namespace paddle { +namespace pybind { +void BindGraph(py::module *m) { + py::class_>( + *m, "Graph", + "The graph is a Directed Acyclic Single Static Assignment Graph, see " + "`paddle::ir::Graph` for details.") + .def(py::init()) + .def("has", &Graph::Has) + .def("get_int", &Graph::Get) + .def("get_float", &Graph::Get) + .def("get_double", &Graph::Get) + .def("get_string", &Graph::Get) + .def("set", [](Graph &self, const std::string &attr_name, + int attr) { return self.Set(attr_name, new int(attr)); }) + .def("set", + [](Graph &self, const std::string &attr_name, + const std::string &attr) { + return self.Set(attr_name, new std::string(attr)); + }) + .def("set", + [](Graph &self, const std::string &attr_name, float attr) { + return self.Set(attr_name, new float(attr)); + }) + .def("set", + [](Graph &self, const std::string &attr_name, double attr) { + return self.Set(attr_name, new double(attr)); + }) + .def("erase", &Graph::Erase) + .def("nodes", &Graph::Nodes, return_value_policy::reference) + .def("create_var_node", + [](Graph &self, VarDesc &var_desc) { + return self.CreateVarNode(&var_desc); + }, + return_value_policy::reference) + .def("create_op_node", + [](Graph &self, OpDesc &op_desc) { + return self.CreateOpNode(&op_desc); + }, + return_value_policy::reference) + .def("create_control_dep_var", &Graph::CreateControlDepVar, + return_value_policy::reference) + .def("create_empty_node", &Graph::CreateEmptyNode, + return_value_policy::reference) + .def("release_nodes", &Graph::ReleaseNodes) + .def("remove_node", + [](Graph &self, Node &node) { return self.RemoveNode(&node); }) + .def("retrieve_node", &Graph::RetrieveNode, + return_value_policy::reference) + .def("resolve_hazard", &Graph::ResolveHazard); +} + +void BindNode(py::module *m) { + py::class_ node(*m, "Node"); + node.def("name", &Node::Name) + .def("node_type", &Node::NodeType) + .def("var", &Node::Var) + .def("op", &Node::Op) + .def("id", &Node::id) + .def("is_op", &Node::IsOp) + .def("is_var", &Node::IsVar) + .def("is_ctrl_var", &Node::IsCtrlVar) + .def_readwrite("inputs", &Node::inputs) + .def_readwrite("outputs", &Node::outputs); + + py::enum_(node, "Type") + .value("Operation", Node::Type::kOperation) + .value("Variable", Node::Type::kVariable) + .export_values(); +} +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/ir.h b/paddle/fluid/pybind/ir.h new file mode 100644 index 0000000000..5bee70eba6 --- /dev/null +++ b/paddle/fluid/pybind/ir.h @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace pybind { +void BindGraph(pybind11::module *m); +void BindNode(pybind11::module *m); +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index a540c6fca1..1edff3a1f5 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -49,6 +49,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/pybind/ir.h" #include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/fluid/pybind/recordio.h" @@ -775,7 +776,12 @@ All parameter, weight, gradient are variables in Paddle. }) .def("set_int", [](ir::Pass &self, const std::string &name, int val) { self.Set(name, new int(val)); }) - .def("type", &ir::Pass::Type); + .def("type", &ir::Pass::Type) + .def("apply", [](ir::Pass &self, std::shared_ptr graph) { + std::unique_ptr origin_graph(graph.get()); + auto optim_graph = self.Apply(std::move(origin_graph)); + graph.reset(optim_graph.release()); + }); py::class_> pb( m, "PassBuilder"); @@ -1042,6 +1048,9 @@ All parameter, weight, gradient are variables in Paddle. BindRecordIOWriter(&m); BindAsyncExecutor(&m); + + BindGraph(&m); + BindNode(&m); } } // namespace pybind } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_ir_graph.py b/python/paddle/fluid/tests/unittests/test_ir_graph.py new file mode 100644 index 0000000000..ba6e4a8b2e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_ir_graph.py @@ -0,0 +1,146 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +import six +from paddle import fluid + + +class TestIRGraph(unittest.TestCase): + """ + TODO(fc500110): `resolve_hazard` api will be tested when it can be used. 
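+
+    Informal overview: these cases exercise the new pybind layer, where
+    fluid.core.Graph wraps the C++ ir::Graph. Graph attributes round-trip
+    through set()/get_<type>(), nodes are created from ProgramDesc-owned
+    VarDesc/OpDesc objects, and the build_graph helper at the bottom of
+    this file shows the minimal recipe (a two-input sum op fed into Graph).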
+ """ + + def test_nodes(self): + graph = build_graph() + self.assertTrue( + {node.name() + for node in graph.nodes()} == {"x1", "x2", "out", "sum"}) + + def test_has_set_get(self): + graph = build_graph() + for attr_name in ["int", "float", "string"]: + self.assertFalse(graph.has(attr_name)) + graph.set("int", 1) + graph.set("float", 0.5) + graph.set("string", "string") + for attr_name in ["int", "float", "string"]: + self.assertTrue(graph.has(attr_name)) + + self.assertTrue(graph.get_int("int") == 1) + self.assertTrue(graph.get_float("float") == 0.5) + self.assertTrue(graph.get_string("string") == "string") + + def test_erase(self): + graph = build_graph() + graph.set("test", 0) + self.assertTrue(graph.has("test")) + graph.erase("test") + self.assertFalse(graph.has("test")) + + def test_create_var_node(self): + prog = fluid.core.ProgramDesc() + block = prog.block(0) + shape = [10, 20] + x1 = block.var(six.b("x1")) + x1.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR) + x1.set_shape(shape) + graph = fluid.core.Graph(prog) + node = graph.create_var_node(x1) + self.assertTrue(node.node_type() == fluid.core.Node.Type.Variable) + + def test_create_op_node(self): + prog = fluid.core.ProgramDesc() + block = prog.block(0) + sum_op_desc = block.append_op() + graph = fluid.core.Graph(prog) + node = graph.create_op_node(sum_op_desc) + self.assertTrue(node.node_type() == fluid.core.Node.Type.Operation) + + def test_create_control_dep_var(self): + graph = build_graph() + name = "__control_var@{}".format(len(graph.nodes())) + node = graph.create_control_dep_var() + self.assertTrue(node.name() == name) + + def test_create_empty_node(self): + prog = fluid.core.ProgramDesc() + graph = fluid.core.Graph(prog) + n1 = graph.create_empty_node('x', fluid.core.Node.Type.Operation) + self.assertTrue(n1.name() == 'x') + n2 = graph.create_empty_node('y', fluid.core.Node.Type.Variable) + self.assertTrue(n2.name() == 'y') + + def test_release_nodes(self): + graph = build_graph() + nodes = graph.release_nodes() + self.assertTrue(len(graph.nodes()) == 0) + self.assertTrue({node.name() + for node in nodes} == {"x1", "x2", "out", "sum"}) + + def test_remove_node(self): + graph = build_graph() + nodes = graph.nodes() + for node in nodes: + if node.name() == "sum": + break + self.assertTrue({node.name() + for node in nodes} == {"x1", "x2", "out", "sum"}) + nodes.remove(node) + self.assertTrue({node.name() for node in nodes} == {"x1", "x2", "out"}) + + def test_retrieve_node(self): + graph = build_graph() + nodes = [] + for i in range(len(graph.nodes())): + nodes.append(graph.retrieve_node(i)) + + for node in nodes: + self.assertTrue(node in graph.nodes()) + + def resolve_hazard(self): + pass + + +def build_graph(): + prog = fluid.core.ProgramDesc() + block = prog.block(0) + + shape = [10, 20] + + # prepare input/output + x1 = block.var(six.b("x1")) + x1.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR) + x1.set_shape(shape) + x2 = block.var(six.b("x2")) + x2.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR) + x2.set_shape(shape) + + out = block.var(six.b("out")) + out.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR) + + sum_op_desc = block.append_op() + sum_op_desc.set_type("sum") + sum_op_desc.set_input("X", ["x1", "x2"]) + sum_op_desc.set_output("Out", ["out"]) + + sum_op_desc.check_attrs() + sum_op_desc.infer_shape(block) + graph = fluid.core.Graph(prog) + return graph + + +if __name__ == "__main__": + unittest.main() From e239558e562f37f9a5ae956793cb04beb6b51e8e Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 10 Jan 
2019 13:13:05 +0800 Subject: [PATCH 191/197] remove the dismatch enclosure to avoid warning message test=develop --- paddle/fluid/inference/api/demo_ci/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index f42ee9a697..fa2752e915 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -92,10 +92,10 @@ if(WITH_MKL) if(NOT WIN32) set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) - else(WIN32) + else() set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml${CMAKE_SHARED_LIBRARY_SUFFIX} ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5md${CMAKE_SHARED_LIBRARY_SUFFIX}) - endif(WIN32) + endif() set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn") if(EXISTS ${MKLDNN_PATH}) include_directories("${MKLDNN_PATH}/include") From ab9c4b2a9f8b1e88d6308369fa3f9f9f6c5914b1 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 10 Jan 2019 03:44:57 +0000 Subject: [PATCH 192/197] refine seqpool concat pass and remove unused nodes test=develop --- .../framework/ir/seqpool_concat_fuse_pass.cc | 25 +++- .../ir/seqpool_concat_fuse_pass_tester.cc | 130 ++++++++++++++---- 2 files changed, 128 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc index 96a60da518..96a3b7ee05 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc @@ -76,6 +76,7 @@ PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern, std::vector seqpool_ops_input_var(num_inputs); std::vector seqpool_ops_output_var(num_inputs); + std::vector seqpool_ops_output_unused_var(num_inputs); std::vector seqpool_ops(num_inputs); for (int i = 0; i < num_inputs; ++i) { @@ -88,6 +89,15 @@ PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern, }, name_scope + "/sequence_pool_out_" + std::to_string(i)); + seqpool_ops_output_unused_var[i] = pattern->NewNode( + [=](Node* x) { + return x && x->IsVar() && x->inputs.size() == 1 && + x->outputs.size() == 0 && + is_seqpool_op_with_pootype_of_nth_input_of_concat(x->inputs[0], + "SUM", i); + }, + name_scope + "/sequence_pool_unused_out_" + std::to_string(i)); + seqpool_ops[i] = pattern->NewNode( [=](Node* x) { return x && x->IsOp() && @@ -97,16 +107,23 @@ PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern, seqpool_ops_input_var[i] = pattern->NewNode( [=](Node* x) { - return x && x->IsVar() && x->outputs.size() >= 1 && - is_seqpool_op_with_pootype_of_nth_input_of_concat( - x->outputs[0], "SUM", i); + bool basic = x && x->IsVar() && x->outputs.size() >= 1; + bool next_is_fine = false; + for (auto* o : x->outputs) { + if (is_seqpool_op_with_pootype_of_nth_input_of_concat(o, "SUM", + i)) { + next_is_fine = true; + break; + } + } + return basic && next_is_fine; }, name_scope + "/sequence_pool_in_" + std::to_string(i)); // Links seqpool_ops[i] ->LinksFrom({seqpool_ops_input_var[i]}) - .LinksTo({seqpool_ops_output_var[i]}); + .LinksTo({seqpool_ops_output_var[i], seqpool_ops_output_unused_var[i]}); } concat_op->LinksFrom(seqpool_ops_output_var).LinksTo({concat_out_var}); return concat_out_var; diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc 
b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc index 7d2739d84d..456a03192c 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc @@ -35,11 +35,35 @@ void SetOp(ProgramDesc* prog, const std::string& type, op->SetInput("X", inputs); op->SetAttr("axis", 1); op->SetOutput("Out", {outputs[0]}); + } else { + op->SetInput("X", inputs); + op->SetOutput("Out", outputs); } op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), static_cast(OpRole::kForward)); } +int CountOpType(const ir::Graph* graph, + const std::string& op_type = "fusion_seqpool_concat") { + int count = 0; + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == op_type) { + ++count; + } + } + return count; +} + +std::unique_ptr GetNumNodesOfBeforeAfter( + std::unique_ptr graph, int* before, int* after, + const std::string& pass_type = "seqpool_concat_fuse_pass") { + auto pass = PassRegistry::Instance().Get(pass_type); + *before = graph->Nodes().size(); + graph = pass->Apply(std::move(graph)); + *after = graph->Nodes().size(); + return graph; +} + /* * Before fuse: * a b c @@ -51,15 +75,16 @@ void SetOp(ProgramDesc* prog, const std::string& type, * concat * | * j + * Type of op1, op2 and op3 are sequence_pool, with "SUM" pooltype attr + * * After fuse: * a b c * \ | / * fusion_seqpool_concat * | * j - * unused nodes: d, f, h */ -ProgramDesc BuildProgramDesc() { +TEST(SeqPoolConcatFusePass, basic) { ProgramDesc prog; for (auto& v : std::vector( {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"})) { @@ -76,35 +101,94 @@ ProgramDesc BuildProgramDesc() { SetOp(&prog, "concat", std::vector({"e", "g", "i"}), std::vector({"j"})); - return prog; -} - -TEST(SeqPoolConcatFusePass, basic) { - auto prog = BuildProgramDesc(); - std::unique_ptr graph(new ir::Graph(prog)); + int before, after; + graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after); + // Remove 10 Nodes: op1, op2, op3, d, e, f, g, h, i, concat_op + // Add 1 Node: fusion_seqpool_concat + EXPECT_EQ(after, before - 9); + EXPECT_EQ(CountOpType(graph.get()), 1); +} - auto pass = PassRegistry::Instance().Get("seqpool_concat_fuse_pass"); - - int pre_nodes = graph->Nodes().size(); - - graph = pass->Apply(std::move(graph)); +/* + * Before fuse: + * a b + * | / \ + * op1 op2 op3 + * / \ / \ \ + * c d e f g + * \ / + * concat + * | + * h + * Type of op1 and op2 are sequence_pool, with "SUM" pooltype attr + * + * After fuse: + * a b + * \ / \ + * fusion_seqpool_concat op3 + * | | + * h g + */ +TEST(SeqPoolConcatFusePass, advanced) { + ProgramDesc prog; + for (auto& v : + std::vector({"a", "b", "c", "d", "e", "f", "g", "h"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::LOD_TENSOR); + } - int after_nodes = graph->Nodes().size(); + SetOp(&prog, "sequence_pool", std::vector({"a"}), + std::vector({"c", "d"})); + SetOp(&prog, "sequence_pool", std::vector({"b"}), + std::vector({"e", "f"})); + SetOp(&prog, "op3", std::vector({"b"}), + std::vector({"g"})); + SetOp(&prog, "concat", std::vector({"d", "f"}), + std::vector({"h"})); - // Remove 7 Nodes: op1, op2, op3, e, g, i, concat_op + std::unique_ptr graph(new ir::Graph(prog)); + int before, after; + graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after); + // Remove 7 Nodes: op1, op2, c, d, e, f concat_op // Add 1 Node: fusion_seqpool_concat - EXPECT_EQ(pre_nodes - 6, after_nodes); + EXPECT_EQ(after, before - 6); + EXPECT_EQ(CountOpType(graph.get()), 1); +} - // 
Assert new op in newly generated graph - int count = 0; +ProgramDesc BuildProgramDesc(int num_inputs_of_concat) { + ProgramDesc prog; + auto new_var = [&](const std::string& name) { + auto* var = prog.MutableBlock(0)->Var(name); + var->SetType(proto::VarType::LOD_TENSOR); + }; + std::vector concat_inputs; + for (int i = 0; i < num_inputs_of_concat; ++i) { + std::string prefix = "seqpool_op_" + i; + new_var(prefix + "in"); + new_var(prefix + "out"); + new_var(prefix + "out_unused"); + SetOp(&prog, "sequence_pool", std::vector({prefix + "in"}), + std::vector({prefix + "out", prefix + "out_unused"})); + concat_inputs.push_back(prefix + "out"); + } + SetOp(&prog, "concat", concat_inputs, + std::vector({"concat_out"})); + return prog; +} - for (auto* node : graph->Nodes()) { - if (node->IsOp() && node->Op()->Type() == "fusion_seqpool_concat") { - ++count; - } +// test more inputs of concat +TEST(SeqPoolConcatFusePass, more_inputs) { + for (int num : {1, 2, 10}) { + ProgramDesc prog = BuildProgramDesc(num); + std::unique_ptr graph(new ir::Graph(prog)); + int before, after; + graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after); + // Remove Nodes: n * (seqpool_op, out, out_unused), and concat_op + // Add Node: fusion_seqpool_concat op + EXPECT_EQ(after, before - num * 3); + EXPECT_EQ(CountOpType(graph.get()), 1); } - EXPECT_EQ(count, 1); } } // namespace ir From 96786d37167f0e252dfea01474a5d71a92968cff Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 10 Jan 2019 07:44:02 +0000 Subject: [PATCH 193/197] add compare_determine of seqpool1 test test=develop --- .../inference/tests/api/analyzer_seq_pool1_tester.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index 1cf326fc89..d9de55ab76 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -168,6 +168,17 @@ TEST(Analyzer_seq_pool1, compare) { reinterpret_cast(&cfg), input_slots_all); } +// Compare Deterministic result +TEST(Analyzer_seq_pool1, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} + void analysis_fuse_statis(bool use_zerocopy) { AnalysisConfig cfg; SetConfig(&cfg); From fd854183295c0a8d6dc0682f135d7dcc13faa575 Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Thu, 10 Jan 2019 16:27:52 +0800 Subject: [PATCH 194/197] [Feature] support mix precision training for resnet (#14899) * clip softmax for fp16 * updates * fuse xent support fp16 test=develop * wip * wip * add simple row reduce * wip fp16 accurate softmax * add accurate softmax kernel for fp16 test=develop * update test=develop * fix cpu build test=develop * update api.spec test=develop * follow comments test=develop * fix build test=develop * fix trt build test=develop * fix inference build test=develop * fix merge test=develop * update test=develop * try fix build test=develop * fix build test=develop * rename real_exp test=develop * fortest * remove hacky kernels test=develop * clean up test=develop --- paddle/fluid/API.spec | 22 ++ paddle/fluid/operators/conv_cudnn_op.cu.cc | 15 ++ .../elementwise/elementwise_sub_op.cu | 5 + paddle/fluid/operators/math/softmax.h | 1 + .../softmax_with_cross_entropy_op.cu | 64 ++--- python/paddle/fluid/optimizer.py | 226 +++++++++++------- 
.../fluid/tests/unittests/test_optimizer.py | 70 ++++-- .../test_softmax_with_cross_entropy_op.py | 60 ++++- 8 files changed, 333 insertions(+), 130 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 9872631553..16d43f82d6 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -405,28 +405,50 @@ paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)) paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)) paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.optimizer.SGDOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.SGDOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)) +paddle.fluid.optimizer.MomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.MomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None)) +paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)) +paddle.fluid.optimizer.AdamOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.AdamOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 
'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)) +paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.AdamaxOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)) +paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.DecayedAdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)) +paddle.fluid.optimizer.FtrlOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.FtrlOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)) +paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.RMSPropOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)) 
+paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.AdadeltaOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)) paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.optimizer.ModelAverage.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.ModelAverage.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None) paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)) +paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.LarsMomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.LarsMomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)) diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index dbb6ffd5e2..25a723fc07 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -297,6 +297,21 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( layout, framework::vectorize2int(filter->dims()), groups); +#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) + // Enable Tensor Core for cudnn backward + if (dev_ctx.GetComputeCapability() >= 70 && + std::type_index(typeid(T)) == + std::type_index(typeid(platform::float16))) { + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + cudnn_conv_desc, CUDNN_TENSOR_OP_MATH)); + VLOG(5) << "use cudnn_tensor_op_math for backward"; + } else { + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + cudnn_conv_desc, 
CUDNN_DEFAULT_MATH)); + VLOG(5) << "NOT use cudnn_tensor_op_math for backward"; + } +#endif + int input_channels = input->dims()[1]; int input_height, input_width, input_depth; if (input->dims().size() == 5) { diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index 6f17d3292f..f2adf1c837 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" +#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( elementwise_sub, ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel); REGISTER_OP_CUDA_KERNEL( elementwise_sub_grad, ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel()(::Eigen::numext::log(x)); +} +static __device__ __forceinline__ float log_on_device(float x) { return math::TolerableValue()(logf(x)); } -static __device__ __forceinline__ double real_log(double x) { +static __device__ __forceinline__ double log_on_device(double x) { return math::TolerableValue()(log(x)); } @@ -72,25 +81,20 @@ static __device__ __forceinline__ double real_log(double x) { /* Supposing the x is `logits` and y is `labels`, the equations are as followings: - cross\_entropy_i = \sum_{j}[- y_i_j * log({e^{x_i_j}/\sum_{j}e^{x_i_j}})] = \sum_{j}[- y_i_j * log({e^{x_i_j - max_i}/\sum_{j}e^{x_i_j-max_i}})] = \sum_{j}[-y_i_j * (x_i_j - max_i - log\sum_{j}e^{x_i_j - max_i})] = \sum_{j}[-y_i_j * (x_i_j - max_i - logDiffMaxSum_i)] = \sum_{j}(-y_i_j * tmp_i_j) - softmax_i_j = e^{tmp_i_j} - where: max_i = \max_{j}{x_i_j} logDiffMaxSum_i = log\sum_{j}e^{x_i_j - max_i} tmp_i_j = x_i_j - max_i - logDiffMaxSum_i - Therefore, the calculation can be separated into 3 steps: Step 1: row-wise operation to calculate max_i Step 2: row-wise operation to calculate logDiffMaxSum_i Step 3: caculate tmp_i_j, and finally get softmax_i_j and cross\_entropy_i - To save memory, we can share memory among max_i, logDiffMaxSum_i and cross\_entropy_i. In this way, the 3 steps should be changed to: @@ -134,7 +138,8 @@ static __global__ void RowReductionForMax(const T* logits_data, T* max_data, cur_max = BlockReduce(temp_storage).Reduce(cur_max, cub::Max()); if (threadIdx.x == 0) { - max_data[blockIdx.x] = cur_max < -64 ? -64 : cur_max; + max_data[blockIdx.x] = + cur_max < static_cast(-64) ? 
static_cast(-64) : cur_max; } } @@ -151,17 +156,17 @@ static __global__ void RowReductionForDiffMaxSum(const T* logits_data, auto block_max = max_data[blockIdx.x]; softmax[beg_idx] = logits_data[beg_idx] - block_max; - T diff_max_sum = real_exp(softmax[beg_idx]); + T diff_max_sum = exp_on_device(softmax[beg_idx]); auto idx = beg_idx + BlockDim; while (idx < end_idx) { softmax[idx] = logits_data[idx] - block_max; - diff_max_sum += real_exp(softmax[idx]); + diff_max_sum += exp_on_device(softmax[idx]); idx += BlockDim; } diff_max_sum = BlockReduce(temp_storage).Reduce(diff_max_sum, cub::Sum()); - if (threadIdx.x == 0) max_data[blockIdx.x] = real_log(diff_max_sum); + if (threadIdx.x == 0) max_data[blockIdx.x] = log_on_device(diff_max_sum); if (!CalculateLogSoftmax) return; __syncthreads(); @@ -188,12 +193,12 @@ static __global__ void RowReductionForSoftmaxAndCrossEntropy( // log_diff_max_sum shares memory with loss auto block_log_diff_max_sum = loss_data[blockIdx.x]; auto tmp = softmax[beg_idx] - block_log_diff_max_sum; - softmax[beg_idx] = real_exp(tmp); + softmax[beg_idx] = exp_on_device(tmp); auto loss = -labels_data[beg_idx] * tmp; beg_idx += BlockDim; while (beg_idx < end_idx) { tmp = softmax[beg_idx] - block_log_diff_max_sum; - softmax[beg_idx] = real_exp(tmp); + softmax[beg_idx] = exp_on_device(tmp); loss -= (labels_data[beg_idx] * tmp); beg_idx += BlockDim; } @@ -218,10 +223,10 @@ struct HardLabelSoftmaxWithCrossEntropyFunctor { auto row_idx = idx / feature_size_; auto col_idx = idx % feature_size_; if (col_idx != labels_[row_idx]) { - log_softmax_[idx] = real_exp(log_softmax_[idx]); + log_softmax_[idx] = exp_on_device(log_softmax_[idx]); } else { auto softmax = log_softmax_[idx]; - log_softmax_[idx] = real_exp(softmax); + log_softmax_[idx] = exp_on_device(softmax); loss_[row_idx] = -softmax; } } @@ -253,10 +258,10 @@ struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx { auto row_idx = idx / feature_size_; auto col_idx = idx % feature_size_; if (col_idx != labels_[row_idx] || col_idx == ignore_idx_) { - log_softmax_[idx] = real_exp(log_softmax_[idx]); + log_softmax_[idx] = exp_on_device(log_softmax_[idx]); } else { auto softmax = log_softmax_[idx]; - log_softmax_[idx] = real_exp(softmax); + log_softmax_[idx] = exp_on_device(softmax); loss_[row_idx] = -softmax; } } @@ -464,9 +469,12 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy, - ops::SoftmaxWithCrossEntropyCUDAKernel, - ops::SoftmaxWithCrossEntropyCUDAKernel); -REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy_grad, - ops::SoftmaxWithCrossEntropyGradCUDAKernel, - ops::SoftmaxWithCrossEntropyGradCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyCUDAKernel, + ops::SoftmaxWithCrossEntropyCUDAKernel, + ops::SoftmaxWithCrossEntropyCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradCUDAKernel, + ops::SoftmaxWithCrossEntropyGradCUDAKernel, + ops::SoftmaxWithCrossEntropyGradCUDAKernel); diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 779cb5f961..bf3730ce51 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
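For readers of the kernel comments above, the three-step numerically stable scheme can be sketched in NumPy (soft-label case; the array shapes and helper name are illustrative, not part of this patch):

    import numpy as np

    def stable_softmax_xent(logits, labels):
        # Step 1: row-wise max_i
        max_i = logits.max(axis=1, keepdims=True)
        shifted = logits - max_i
        # Step 2: logDiffMaxSum_i = log(sum_j exp(x_ij - max_i))
        log_diff_max_sum = np.log(np.exp(shifted).sum(axis=1, keepdims=True))
        # Step 3: tmp_ij yields both softmax_ij and the cross entropy loss
        tmp = shifted - log_diff_max_sum
        return np.exp(tmp), -(labels * tmp).sum(axis=1)

Each step only needs the previous step's result, which is why the kernels above can share one buffer among max_i, logDiffMaxSum_i and the loss.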
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -195,22 +195,18 @@ class Optimizer(object): format(name, param.name)) return self._accumulators[name][param.name] - def _create_optimization_pass(self, - parameters_and_grads, - loss, - startup_program=None): + def _create_optimization_pass(self, parameters_and_grads): """Add optimization operators to update gradients to variables. Args: - loss(Variable): the target that this optimization is for. parameters_and_grads(list(tuple(Variable, Variable))): - a list of (variable, gradient) pair to update. + a list of (variable, gradient) pair to update. Returns: return_op_list: a list of operators that will complete one step of - optimization. This will include parameter update ops, global step - update ops and any other custom ops required by subclasses to manage - their internal state. + optimization. This will include parameter update ops, global step + update ops and any other custom ops required by subclasses to manage + their internal state. """ # This is a default implementation of create_optimization_pass that # can be shared by most optimizers. This implementation assumes that @@ -219,37 +215,33 @@ class Optimizer(object): # _create_accumulators method if it needs to create accumulators # for parameters and extend _finish_update method to add custom ops. - # Create any accumulators - program = loss.block.program - self._dtype = loss.dtype - with program_guard(program, startup_program): - global_block = framework.default_main_program().global_block() - start = len(global_block.ops) - self.helper = LayerHelper(self.__class__.__name__) - self._create_accumulators(loss.block, - [p[0] for p in parameters_and_grads]) - self._create_global_learning_rate() - - optimize_ops = [] - for param_and_grad in parameters_and_grads: - if param_and_grad[1] is None: - continue - with param_and_grad[0].block.program._optimized_guard( - param_and_grad), name_scope("optimizer"): - if param_and_grad[0].trainable is True: - optimize_op = self._append_optimize_op(loss.block, - param_and_grad) - optimize_ops.append(optimize_op) - - # Get custom finish ops for subclasses - # FIXME: Need to fix this once we figure out how to handle dependencies - self._finish_update(loss.block, parameters_and_grads) - - end = len(global_block.ops) - return global_block._slice_ops(start, end) - - def _process_distribute_lookuptable(self, param_grads, loss, - startup_program): + # Allways called under program_guard use global block as loss block + global_block = framework.default_main_program().global_block() + start = len(global_block.ops) + self.helper = LayerHelper(self.__class__.__name__) + self._create_accumulators(global_block, + [p[0] for p in parameters_and_grads]) + self._create_global_learning_rate() + + optimize_ops = [] + for param_and_grad in parameters_and_grads: + if param_and_grad[1] is None: + continue + with param_and_grad[0].block.program._optimized_guard( + param_and_grad), name_scope("optimizer"): + if param_and_grad[0].trainable is True: + optimize_op = self._append_optimize_op(global_block, + param_and_grad) + optimize_ops.append(optimize_op) + + # Get custom finish ops for subclasses + # FIXME: Need to fix this once we figure out how to handle dependencies + self._finish_update(global_block, parameters_and_grads) + + end = len(global_block.ops) + return global_block._slice_ops(start, end) + + def _process_distribute_lookuptable(self, param_grads): """ Because distribute 
lookup table only support SGD optimizer for now, not support other optimizer and regularization, so we should find the table parameter out, @@ -259,7 +251,8 @@ class Optimizer(object): :param loss: the loss variable. :param startup_program: the startup program """ - program = loss.block.program + program = framework.default_main_program() + global_block = framework.default_main_program().global_block() table_name = find_distributed_lookup_table(program) table_param = None table_grad = None @@ -275,38 +268,121 @@ class Optimizer(object): new_param_grads.append((p, g)) sgd_op = None if table_param is not None: - with program_guard(program, startup_program): - param_and_grad = [table_param, table_grad] - with table_param.block.program._optimized_guard(param_and_grad), \ - framework.name_scope("optimizer"): - self._create_global_learning_rate() - # create the optimize op - sgd_op = loss.block.append_op( - type='sgd', - inputs={ - "Param": table_param, - "Grad": table_grad, - "LearningRate": - self._create_param_lr(param_and_grad) - }, - outputs={"ParamOut": param_and_grad[0]}) + param_and_grad = [table_param, table_grad] + with table_param.block.program._optimized_guard(param_and_grad), \ + framework.name_scope("optimizer"): + self._create_global_learning_rate() + # create the optimize op + sgd_op = global_block.append_op( + type='sgd', + inputs={ + "Param": table_param, + "Grad": table_grad, + "LearningRate": self._create_param_lr(param_and_grad) + }, + outputs={"ParamOut": param_and_grad[0]}) return new_param_grads, (table_param, table_grad), sgd_op + def backward(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None, + callbacks=None): + """ + First part of `minimize`, do auto-diff to append backward ops for + the current program. + + Args: + loss (Variable): loss variable to run optimizations. + startup_program (Program): startup_program for initializing parameters + in `parameter_list`. + parameter_list (list): list of Variables to update. + no_grad_set (set|None): set of Variables should be ignored. + callbacks (list|None): list of callables to run when appending backward + operator for one parameter. + + Return: + list: list of (param, grad) pair, grad is the output of backward. + + Examples: + See examples in `apply_gradients`. + """ + if callbacks is None: + callbacks = [error_clip_callback] + else: + assert (isinstance(callbacks, list)) + callbacks.append(error_clip_callback) + return append_backward(loss, parameter_list, no_grad_set, callbacks) + + def apply_gradients(self, params_grads): + """ + Second part of `minimize`, appending optimization operators for + given `params_grads` pairs. + + Args: + params_grads (list): list of (param, grad) pair to do optimization. + + Returns: + list: A list of operators appended to the current program. + + Examples: + .. code-block:: python + + loss = network() + optimizer = fluid.optimizer.SGD(learning_rate=0.1) + params_grads = optimizer.backward(loss) + # you may append operations for params_grads here + # ... 
+ optimizer.apply_gradients(params_grads) + """ + params_grads = sorted(params_grads, key=lambda x: x[0].name) + + params_grads, table_param_and_grad, table_optimize_op = \ + self._process_distribute_lookuptable(params_grads) + + params_grads = append_gradient_clip_ops(params_grads) + + # Add regularization if any + params_grads = append_regularization_ops(params_grads, + self.regularization) + + optimize_ops = self._create_optimization_pass(params_grads) + if table_optimize_op is not None: + optimize_ops.append(table_optimize_op) + params_grads.append(table_param_and_grad) + + return optimize_ops + def minimize(self, loss, startup_program=None, parameter_list=None, no_grad_set=None): - """Add operations to minimize `loss` by updating `parameter_list`. + """ + Add operations to minimize `loss` by updating `parameter_list`. - This method combines interface `append_backward()` and - `create_optimization_pass()` into one. + This method combines interface `backward()` and + `apply_gradients()` into one. + + Args: + loss (Variable): loss variable to run optimizations. + startup_program (Program): startup_program for initializing parameters + in `parameter_list`. + parameter_list (list): list of Variables to update. + no_grad_set (set|None): set of Variables should be ignored. + + Returns: + tuple: (optimize_ops, params_grads) which are, list of operators appended; + and list of (param, grad) Variables pair for optimization. """ + self._dtype = loss.dtype + program = loss.block.program + optimize_ops = [] if imperative_base.enabled(): if parameter_list is not None: params_grads = parameter_list else: - program = loss.block.program parameters = program.global_block().all_parameters() params_grads = [] for param in parameters: @@ -317,29 +393,13 @@ class Optimizer(object): stop_gradient=True) grad_var._value = param._ivar.grad_value params_grads.append((param, grad_var)) - - optimize_ops = self._create_optimization_pass(params_grads, loss, - startup_program) + with program_guard(program, startup_program): + optimize_ops = self._create_optimization_pass(params_grads) else: - params_grads = append_backward(loss, parameter_list, no_grad_set, - [error_clip_callback]) - - params_grads = sorted(params_grads, key=lambda x: x[0].name) - - params_grads, table_param_and_grad, table_optimize_op = \ - self._process_distribute_lookuptable(params_grads, loss, startup_program) - - params_grads = append_gradient_clip_ops(params_grads) - - # Add regularization if any - params_grads = append_regularization_ops(params_grads, - self.regularization) - - optimize_ops = self._create_optimization_pass(params_grads, loss, - startup_program) - if table_optimize_op is not None: - optimize_ops.append(table_optimize_op) - params_grads.append(table_param_and_grad) + with program_guard(program, startup_program): + params_grads = self.backward(loss, startup_program, + parameter_list, no_grad_set) + optimize_ops = self.apply_gradients(params_grads) return optimize_ops, params_grads diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index 4374d198f2..34c9b7e006 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -61,6 +61,48 @@ class TestOptimizer(unittest.TestCase): self.assertEqual([op.type for op in opts], ["sgd"]) +class TestOptimizerBackwardApplygrad(unittest.TestCase): + def test_sgd_optimizer(self): + def check_sgd_optimizer(optimizer_attr): + init_program = 
framework.Program() + program = framework.Program() + block = program.global_block() + mul_x = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + optimize_attr=optimizer_attr) + mul_y = block.create_var( + dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") + mul_out = block.create_var( + dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") + mean_out = block.create_var( + dtype="float32", shape=[1], lod_level=0, name="mean.out") + block.append_op( + type="mul", + inputs={"X": mul_x, + "Y": mul_y}, + outputs={"Out": mul_out}, + attrs={"x_num_col_dims": 1}) + block.append_op( + type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) + sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01) + with framework.program_guard(program, init_program): + p_g = sgd_optimizer.backward(mean_out) + opts = sgd_optimizer.apply_gradients(p_g) + return opts + + opts = check_sgd_optimizer({'learning_rate': 1.1}) + self.assertEqual(len(opts), 3) + self.assertEqual([op.type for op in opts], + ["fill_constant", "elementwise_mul", "sgd"]) + + opts = check_sgd_optimizer({'learning_rate': 1.0}) + self.assertEqual(len(opts), 1) + self.assertEqual([op.type for op in opts], ["sgd"]) + + class TestMomentumOptimizer(unittest.TestCase): class MockMomentum(optimizer.MomentumOptimizer): def get_accumulators(self): @@ -99,8 +141,8 @@ class TestMomentumOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) - opts = momentum_optimizer._create_optimization_pass( - params_grads, mul_out, init_program) + with framework.program_guard(program, init_program): + opts = momentum_optimizer.apply_gradients(params_grads) self.assertEqual(len(opts), 3) sgd_op = opts[-1] self.assertEqual([op.type for op in opts], @@ -153,8 +195,8 @@ class TestMomentumOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) - opts = momentum_optimizer._create_optimization_pass( - params_grads, mul_out, init_program) + with framework.program_guard(program, init_program): + opts = momentum_optimizer.apply_gradients(params_grads) self.assertEqual(len(opts), 3) sgd_op = opts[-1] self.assertEqual([op.type for op in opts], @@ -216,8 +258,8 @@ class TestAdagradOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0) - opts = adagrad_optimizer._create_optimization_pass( - params_grads, mul_out, init_program) + with framework.program_guard(program, init_program): + opts = adagrad_optimizer.apply_gradients(params_grads) self.assertEqual(len(opts), 3) self.assertEqual([op.type for op in opts], ["fill_constant", "elementwise_mul", "adagrad"]) @@ -280,8 +322,8 @@ class TestAdamOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adam_optimizer.get_accumulators()), 0) - opts = adam_optimizer._create_optimization_pass(params_grads, mul_out, - init_program) + with framework.program_guard(program, init_program): + opts = adam_optimizer.apply_gradients(params_grads) self.assertEqual(len(opts), 5) self.assertEqual( [op.type for op in opts], @@ -347,8 +389,8 @@ class TestAdamaxOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) 
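A note on the pattern these test hunks keep repeating: after this refactor, _create_optimization_pass appends to the current default main program instead of receiving loss and startup_program, so callers must establish the programs with program_guard before calling apply_gradients. A minimal sketch of the intended call sequence, reusing the variable names from the surrounding tests (not standalone code):

    with framework.program_guard(program, init_program):
        p_g = sgd_optimizer.backward(mean_out)
        opts = sgd_optimizer.apply_gradients(p_g)
    # opts now lists the operators appended to `program`

Outside of tests, the same split lets user code inspect or rewrite p_g between the two calls, which is the point of exposing backward and apply_gradients publicly in the API spec above.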
self.assertEqual(len(adamax_optimizer.get_accumulators()), 0) - opts = adamax_optimizer._create_optimization_pass(params_grads, mul_out, - init_program) + with framework.program_guard(program, init_program): + opts = adamax_optimizer.apply_gradients(params_grads) self.assertEqual(len(opts), 4) self.assertEqual( [op.type for op in opts], @@ -411,8 +453,8 @@ class TestDecayedAdagradOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0) - opts = decayed_adagrad_optimizer._create_optimization_pass( - params_grads, mul_out, init_program) + with framework.program_guard(program, init_program): + opts = decayed_adagrad_optimizer.apply_gradients(params_grads) self.assertEqual(len(opts), 3) self.assertEqual( [op.type for op in opts], @@ -477,8 +519,8 @@ class TestFtrlOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0) - opts = ftrl_optimizer._create_optimization_pass(params_grads, mul_out, - init_program) + with framework.program_guard(program, init_program): + opts = ftrl_optimizer.apply_gradients(params_grads) self.assertEqual(len(opts), 3) self.assertEqual([op.type for op in opts], ["fill_constant", "elementwise_mul", "ftrl"]) diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py index 37ee880970..b0494f114c 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py @@ -28,6 +28,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): def initParams(self): self.numeric_stable_mode = False + self.dtype = np.float64 def setUp(self): self.initParams() @@ -36,19 +37,19 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): class_num = 37 logits = np.random.uniform(0.1, 1.0, - [batch_size, class_num]).astype("float64") + [batch_size, class_num]).astype(self.dtype) softmax = np.apply_along_axis(stable_softmax, 1, logits) labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64") cross_entropy = np.asmatrix( [[-np.log(softmax[i][labels[i][0]])] for i in range(softmax.shape[0])], - dtype="float64") + dtype=self.dtype) self.inputs = {"Logits": logits, "Label": labels} self.outputs = { - "Softmax": softmax.astype("float64"), - "Loss": cross_entropy.astype("float64") + "Softmax": softmax.astype(self.dtype), + "Loss": cross_entropy.astype(self.dtype) } self.attrs = {"numeric_stable_mode": self.numeric_stable_mode} @@ -56,7 +57,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): self.check_output() def test_check_grad(self): - self.check_grad(["Logits"], "Loss") + self.check_grad(["Logits"], "Loss", max_relative_error=0.05) class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp): @@ -64,6 +65,55 @@ class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp): self.numeric_stable_mode = True +class TestSoftmaxWithCrossEntropyOpFp16(TestSoftmaxWithCrossEntropyOp): + def initParams(self): + self.numeric_stable_mode = False + self.dtype = np.float16 + + def setUp(self): + self.initParams() + self.op_type = "softmax_with_cross_entropy" + batch_size = 41 + class_num = 37 + + # NOTE: numpy float16 have very low accuracy, use float32 for numpy check. 
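On the NOTE above about float16 precision: the fp16 test below computes its reference values in float32 and hands the op raw half-precision bits through a uint16 view, which reinterprets the buffer without converting values. A quick illustration of that NumPy idiom (the sample values are assumptions):

    import numpy as np

    x = np.array([0.1, 1.0], dtype=np.float16)
    bits = x.view(np.uint16)            # same memory, reinterpreted dtype
    assert bits.nbytes == x.nbytes      # no copy and no value conversion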
+ logits = np.random.uniform(0.1, 1.0, + [batch_size, class_num]).astype(np.float32) + softmax = np.apply_along_axis(stable_softmax, 1, logits) + labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64") + + cross_entropy = np.asmatrix( + [[-np.log(softmax[i][labels[i][0]])] + for i in range(softmax.shape[0])], + dtype=np.float32) + + self.inputs = { + "Logits": logits.astype(self.dtype).view(np.uint16), + "Label": labels + } + self.outputs = { + "Softmax": softmax.astype(self.dtype), + "Loss": cross_entropy.astype(self.dtype) + } + self.attrs = {"numeric_stable_mode": self.numeric_stable_mode} + + def test_check_output(self): + self.check_output(atol=1e-2) + + def test_check_grad(self): + self.check_grad(["Logits"], "Loss", max_relative_error=0.1) + + +class TestSoftmaxWithCrossEntropyOpNoCudnnFp16( + TestSoftmaxWithCrossEntropyOpFp16): + def initParams(self): + self.numeric_stable_mode = True + self.dtype = np.float16 + + def test_check_grad(self): + self.check_grad(["Logits"], "Loss", max_relative_error=0.1) + + class TestSoftmaxWithCrossEntropyOp2(OpTest): """ Test softmax with cross entropy operator with soft labels. From 0e178033d320e31626f645ebf514224e6b3744cf Mon Sep 17 00:00:00 2001 From: chengduo Date: Thu, 10 Jan 2019 03:02:03 -0600 Subject: [PATCH 195/197] open compare_reduce_and_allreduce test (#15258) test=develop --- .../unittests/test_parallel_executor_mnist.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 9768f7db26..ac69c95853 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -74,7 +74,11 @@ class TestMNIST(TestParallelExecutorBase): label = np.ones(shape=[32, 1], dtype='int64') return img, label - def _compare_reduce_and_allreduce(self, model, use_cuda): + def _compare_reduce_and_allreduce(self, + model, + use_cuda, + delta1=1e-6, + delta2=1e-4): if use_cuda and not core.is_compiled_with_cuda(): return @@ -95,9 +99,9 @@ class TestMNIST(TestParallelExecutorBase): use_reduce=True) for loss in zip(all_reduce_first_loss, reduce_first_loss): - self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) + self.assertAlmostEqual(loss[0], loss[1], delta=delta1) for loss in zip(all_reduce_last_loss, reduce_last_loss): - self.assertAlmostEqual(loss[0], loss[1], delta=1e-4) + self.assertAlmostEqual(loss[0], loss[1], delta=delta2) # simple_fc def check_simple_fc_convergence(self, use_cuda, use_reduce=False): @@ -174,8 +178,9 @@ class TestMNIST(TestParallelExecutorBase): self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor) def test_batchnorm_fc_with_new_strategy(self): - # FIXME(zcd): close this test temporally. 
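The delta1/delta2 parameters introduced above encode how tightly the reduce and all_reduce losses must agree: first-iteration losses almost exactly, last-iteration losses more loosely once non-deterministic summation order has compounded. Written out as a plain sketch with the relaxed CUDA tolerances this patch passes for the batchnorm case just below (list names taken from the test above):

    for a, b in zip(all_reduce_first_loss, reduce_first_loss):
        assert abs(a - b) < 1e-5    # delta1 for the CUDA case
    for a, b in zip(all_reduce_last_loss, reduce_last_loss):
        assert abs(a - b) < 1e-3    # delta2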
- # self._compare_reduce_and_allreduce(fc_with_batchnorm, True) + # NOTE: the computation result of nccl_reduce is non-deterministic, + # related issue: https://github.com/NVIDIA/nccl/issues/157 + self._compare_reduce_and_allreduce(fc_with_batchnorm, True, 1e-5, 1e-3) self._compare_reduce_and_allreduce(fc_with_batchnorm, False) From 8f17c714de6e2a99c6f7ebf1dc895d65047b372a Mon Sep 17 00:00:00 2001 From: xiaolil1 <39753926+xiaolil1@users.noreply.github.com> Date: Thu, 10 Jan 2019 18:14:30 +0800 Subject: [PATCH 196/197] Conv int8 residual (#15145) * Enable basic MKL-DNN INT8 Conv OP test=develop * Modify test case test=develop * Clean unittest code test=develop * Fix test test=develop * Modify test test=develop * Enable MKL-DNN INT8 Conv with Relu Fusion OP test=develop * Enable INT8 Conv with residual fusion OP test=develop * Modify code. test=develop * Modify basic INT8 Conv test=develop * Modify Conv. test=develop * fix style test=develop * Fix style test=develop * Fix test test=develop * Modify code. test=develop * Fix test test=develop --- paddle/fluid/operators/conv_mkldnn_op.cc | 137 +++++++++++++---- paddle/fluid/platform/mkldnn_reuse.h | 49 ++++-- .../unittests/test_conv2d_int8_mkldnn_op.py | 140 +++++++++++++++--- 3 files changed, 266 insertions(+), 60 deletions(-) diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 03d9d466c3..16ffc11419 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -318,10 +318,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); - bool fuse_relu = ctx.Attr("fuse_relu"); + bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); bool force_fp32_output = ctx.Attr("force_fp32_output"); + if (fuse_residual_conn) { + PADDLE_ENFORCE(force_fp32_output != true, + "residual fusion does not support force output with fp32"); + } bool is_conv3d = strides.size() == 3U; // TODO(tpatejko): add support for dilation @@ -355,14 +359,23 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { framework::DataTypeTrait::DataType); } + if (fuse_residual_conn) { + auto residual = ctx.Input("ResidualData"); + auto residual_dt = paddle::framework::ToMKLDNNDataType(residual->type()); + if (dst_dt != residual_dt) dst_dt = residual_dt; + } + // Get unique name for storing MKLDNN primitives std::string key; key.reserve(MaxKeyLength); platform::ConvMKLDNNHandler::AppendKey( &key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt, - input->format(), dst_dt, ctx.op().Output("Output")); + input->format(), fuse_relu, fuse_residual_conn, + ctx.op().Output("Output")); const std::string key_conv_pd = key + "@conv_pd"; + bool need_s8_to_u8 = false; + std::shared_ptr conv_p = nullptr; std::shared_ptr src_memory_p = nullptr; std::shared_ptr user_src_memory_p = nullptr; @@ -377,14 +390,20 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto src_key = key + "@src_mem_p"; auto user_src_key = key + "@user_src_mem_p"; auto src_reorder_key = key + "@src_mem_preorder_p"; + auto residual_reorder_key = key + "@residual_data_mem_preorder_p"; + conv_p = std::static_pointer_cast( dev_ctx.GetBlob(prim_key)); + if (conv_p == nullptr || !is_test) { const K* filter_data = filter->data(); auto scale_in_data = ctx.Attr("Scale_in"); + auto scale_in_eltwise_data = ctx.Attr("Scale_in_eltwise"); auto scale_weights_data = 
ctx.Attr>("Scale_weights"); auto scale_out_data = force_fp32_output ? 1.0f : ctx.Attr("Scale_out"); + float sum_scale = + fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; bool is_multi_channel = scale_weights_data.size() > 1; @@ -427,6 +446,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { weights_tz, memory::data_type::s8, chosen_memory_format); auto dst_md = platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format); + // create a conv primitive descriptor and save it for usage in backward if (bias) { bias_tz = paddle::framework::vectorize2int(bias->dims()); @@ -434,11 +454,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { memory::format::x); conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine, - fuse_relu, output_shift_scale, is_test); + fuse_relu, fuse_residual_conn, + output_shift_scale, sum_scale, is_test); } else { - conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, - paddings, mkldnn_engine, fuse_relu, - output_shift_scale, is_test); + conv_pd = + ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, + mkldnn_engine, fuse_relu, fuse_residual_conn, + output_shift_scale, sum_scale, is_test); } // Save conv_pd/src_memory/weights_memory for backward pass dev_ctx.SetBlob(key_conv_pd, conv_pd); @@ -463,7 +485,41 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { user_weights_memory_p, pipeline, is_test, true, scale_weights_data, mask_reorder); - if (!force_fp32_output) { + if (fuse_residual_conn) { + auto residual_param = ctx.Input("ResidualData"); + PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(), + "Output and elementwise parameter need to have the " + "same dimension sizes"); + auto residual_dt = + paddle::framework::ToMKLDNNDataType(residual_param->type()); + if (residual_param->format() != handler->GetDstFormat()) { + auto residual_data_tz = + paddle::framework::vectorize2int(residual_param->dims()); + + auto user_residual_md = platform::MKLDNNMemDesc( + residual_data_tz, residual_dt, residual_param->format()); + + if (residual_dt == mkldnn::memory::data_type::u8) { + dst_memory_p = platform::SetDstMemory( + ctx, output, residual_param, user_residual_md, handler, + &pipeline); + } else { + need_s8_to_u8 = fuse_relu; + dst_memory_p = platform::SetDstMemory( + ctx, output, residual_param, user_residual_md, handler, + &pipeline); + } + } else { + output->ShareDataWith(*residual_param); + if (residual_dt == mkldnn::memory::data_type::u8) { + dst_memory_p = + platform::SetDstMemory(ctx, output, handler); + } else { + need_s8_to_u8 = fuse_relu; + dst_memory_p = platform::SetDstMemory(ctx, output, handler); + } + } + } else if (!force_fp32_output) { if (fuse_relu) { dst_memory_p = platform::SetDstMemory(ctx, output, handler); } else { @@ -476,11 +532,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { // create convolution op primitive auto scale_bias_key = key + "@scale_bias"; if (bias) { - const float* bias_data = bias->data(); + const K* bias_data = bias->data(); auto user_bias_md = platform::MKLDNNMemDesc( - {bias_tz}, platform::MKLDNNGetDataType(), memory::format::x); + {bias_tz}, platform::MKLDNNGetDataType(), memory::format::x); auto user_bias_memory_p = handler->AcquireBiasMemory( - user_bias_md, to_void_cast(bias_data)); + user_bias_md, to_void_cast(bias_data)); std::shared_ptr bias_memory_p; int mask_reorder = is_multi_channel ? 
1 << 0 : 1; int count = @@ -526,26 +582,51 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, mkldnn_engine, key)); } - if (!force_fp32_output) { + + if (fuse_residual_conn) { + auto residual_param = ctx.Input("ResidualData"); + auto residual_dt = + paddle::framework::ToMKLDNNDataType(residual_param->type()); + output->ShareDataWith(*residual_param); + if (residual_dt == mkldnn::memory::data_type::u8) { + platform::SetDstMemoryHandler(ctx, output, handler, + &dst_memory_p); + } else { + platform::SetDstMemoryHandler(ctx, output, handler, + &dst_memory_p); + } + } else if (!force_fp32_output) { if (fuse_relu) { - dst_memory_p = - platform::SetDstMemoryHandler(ctx, output, handler); + platform::SetDstMemoryHandler(ctx, output, handler, + &dst_memory_p); } else { - dst_memory_p = - platform::SetDstMemoryHandler(ctx, output, handler); + platform::SetDstMemoryHandler(ctx, output, handler, + &dst_memory_p); } } else { - dst_memory_p = - platform::SetDstMemoryHandler(ctx, output, handler); + platform::SetDstMemoryHandler(ctx, output, handler, + &dst_memory_p); } + if (src_memory_reorder_p) { pipeline.push_back(*src_memory_reorder_p); } + + auto residual_reorder_p = std::static_pointer_cast( + dev_ctx.GetBlob(residual_reorder_key)); + if (residual_reorder_p) { + pipeline.push_back(*residual_reorder_p); + } + pipeline.push_back(*conv_p); } // push primitive to stream and wait until it's executed stream(stream::kind::eager).submit(pipeline).wait(); + if (need_s8_to_u8) { + output->mutable_data(ctx.GetPlace()); + } + output->set_layout(DataLayout::kMKLDNN); output->set_format(GetMKLDNNFormat(*dst_memory_p)); } @@ -577,11 +658,15 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } mkldnn::primitive_attr CreatePostOps( - bool fuse_relu, const std::vector output_shift_scale) const { + bool fuse_relu, bool fuse_residual_conn, + const std::vector output_shift_scale, float sum_scale) const { mkldnn::primitive_attr conv_attr; mkldnn::post_ops post_operations; int mask = output_shift_scale.size() > 1 ? 
1 << 1 : 0; conv_attr.set_output_scales(mask, output_shift_scale); + if (fuse_residual_conn) { + post_operations.append_sum(sum_scale); + } if (fuse_relu) { constexpr float scale = 1.0f; constexpr float negative_slope = 0.0f; @@ -622,8 +707,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const memory::desc& dst, const std::vector& strides, const std::vector& paddings, const mkldnn::engine& engine, const bool fuse_relu, + const bool fuse_residual_conn, const std::vector output_shift_scale, - bool is_test) const { + const float sum_scale, bool is_test) const { memory::dims stride_dims = {strides[0], strides[1]}; memory::dims padding_dims = {paddings[0], paddings[1]}; @@ -634,8 +720,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { propagation, mkldnn::convolution_direct, src, weights, dst, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - mkldnn::primitive_attr conv_attr = - CreatePostOps(fuse_relu, output_shift_scale); + mkldnn::primitive_attr conv_attr = CreatePostOps( + fuse_relu, fuse_residual_conn, output_shift_scale, sum_scale); auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( conv_desc, conv_attr, engine); @@ -675,8 +761,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const std::vector& strides, const std::vector& paddings, const mkldnn::engine& engine, const bool fuse_relu, + const bool fuse_residual_conn, const std::vector output_shift_scale, - bool is_test) const { + const float sum_scale, bool is_test) const { memory::dims stride_dims = {strides[0], strides[1]}; memory::dims padding_dims = {paddings[0], paddings[1]}; @@ -687,8 +774,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { propagation, mkldnn::convolution_direct, src, weights, bias, dst, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - mkldnn::primitive_attr conv_attr = - CreatePostOps(fuse_relu, output_shift_scale); + mkldnn::primitive_attr conv_attr = CreatePostOps( + fuse_relu, fuse_residual_conn, output_shift_scale, sum_scale); auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( conv_desc, conv_attr, engine); @@ -891,7 +978,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p)); } stream(stream::kind::eager).submit(pipeline).wait(); - } // Compute() + } }; } // namespace operators diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index b3d20736a8..faac6a12c6 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -210,13 +210,15 @@ class MKLDNNHandler { dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast(output_data))); } - static void AppendKey( - std::string* key, const mkldnn::memory::dims& input_dims, - const mkldnn::memory::dims& weights_dims, const std::vector& strides, - const std::vector& paddings, const std::vector& dilations, - const int& groups, const mkldnn::memory::data_type& srcdt, - const mkldnn::memory::format& format, - const mkldnn::memory::data_type& dstdt, const std::string& suffix) { + static void AppendKey(std::string* key, + const mkldnn::memory::dims& input_dims, + const mkldnn::memory::dims& weights_dims, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, const int& groups, + const mkldnn::memory::data_type& srcdt, + const mkldnn::memory::format& format, const bool& relu, + const bool& residual, const std::string& suffix) { 
AppendKeyDims(key, input_dims); AppendKeyDims(key, weights_dims); AppendKeyVec(key, strides); @@ -225,7 +227,8 @@ class MKLDNNHandler { AppendKey(key, std::to_string(groups)); AppendKey(key, std::to_string(srcdt)); AppendKey(key, std::to_string(format)); - AppendKey(key, std::to_string(dstdt)); + AppendKey(key, std::to_string(relu)); + AppendKey(key, std::to_string(residual)); AppendKey(key, suffix); } @@ -664,15 +667,35 @@ static std::shared_ptr SetDstMemory( } template -static std::shared_ptr SetDstMemoryHandler( +static std::shared_ptr SetDstMemory( const framework::ExecutionContext& ctx, framework::Tensor* output, - const std::shared_ptr& handler) { + const framework::Tensor* residual_param, + const mkldnn::memory::desc& user_residual_md, + const std::shared_ptr& handler, + std::vector* pipeline) { + const T* residual_param_data = residual_param->data(); + PADDLE_ENFORCE(residual_param_data != nullptr, + "Provide data if you want MKLDNN conv+elementwise_add fusion"); + std::shared_ptr user_residual_memory_p = + handler->AcquireResidualDataMemory(user_residual_md, + to_void_cast(residual_param_data)); + T* output_data = output->mutable_data(ctx.GetPlace()); + std::shared_ptr dst_memory_p = + handler->AcquireDstMemoryFromResidualDataMemory( + user_residual_memory_p, to_void_cast(output_data), *pipeline); + return dst_memory_p; +} + +template +static void SetDstMemoryHandler( + const framework::ExecutionContext& ctx, framework::Tensor* output, + const std::shared_ptr& handler, + std::shared_ptr* dst_memory_p) { T* output_data = output->mutable_data( ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); - std::shared_ptr dst_memory_p; - dst_memory_p->set_data_handle(to_void_cast(output_data)); - return dst_memory_p; + (*dst_memory_p)->set_data_handle(to_void_cast(output_data)); } + } // namespace platform } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py index def188bfa6..5ad376cb08 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py @@ -25,6 +25,15 @@ from test_conv2d_op import conv2d_forward_naive, TestConv2dOp def conv2d_forward_refer(input, filter, group, conv_param): out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group, conv_param) + size = [in_n, out_c, out_h, out_w] + return format_reorder(out, size) + + +def format_reorder(out, size): + in_n = size[0] + out_h = size[2] + out_w = size[3] + out_c = size[1] out_tmp = np.zeros((in_n, out_h, out_w, out_c)) for n in range(in_n): for i in range(out_h): @@ -48,6 +57,7 @@ class TestConv2dInt8Op(TestConv2dOp): self.init_dilation() self.init_test_case() self.init_fuse_relu() + self.init_fuse_residual() self.init_data_type() conv2d_param = { @@ -79,11 +89,24 @@ class TestConv2dInt8Op(TestConv2dOp): np.round((input_shift) * self.scale_in).astype(np.int32), filter_int, self.groups, conv2d_param).astype(np.float32) * scale_output_shift - if self.fuse_relu: - output = np.maximum(np.round(output1 - output2), - 0).astype(self.dsttype) + if self.fuse_residual: + input_residual = np.random.randint( + -5, 5, self.input_residual_size).astype(self.srctype) + output_tmp = np.round(output1 - output2 + format_reorder( + input_residual, self.input_residual_size).astype( + self.srctype) * (self.scale_out / self.scale_in_eltwise + )) + if self.fuse_relu: + output = np.maximum(output_tmp, 
0).astype(self.dsttype) + else: + output = output_tmp.astype(self.dsttype) else: - output = np.round(output1 - output2).astype(self.dsttype) + if self.fuse_relu: + output = np.maximum(np.round(output1 - output2), + 0).astype(self.dsttype) + else: + output = np.round(output1 - output2).astype(self.dsttype) + else: filter_int = np.round(filter * self.scale_weights[0]).astype(np.int32) @@ -92,21 +115,35 @@ class TestConv2dInt8Op(TestConv2dOp): output1 = conv2d_forward_refer( input.astype(np.int32), filter_int, self.groups, conv2d_param).astype(np.float32) - if self.fuse_relu: - output = np.maximum( - np.round(output1 * (self.scale_out / ( - self.scale_in * self.scale_weights[0]))), - 0).astype(self.dsttype) + if self.fuse_residual: + input_residual = np.random.randint( + 0, 10, self.input_residual_size).astype(self.srctype) + output_tmp = np.round(output1 * (self.scale_out / ( + self.scale_in * self.scale_weights[0])) + format_reorder( + input_residual, self.input_residual_size).astype( + np.int32) * (self.scale_out / self.scale_in_eltwise + )) + output_tmp2 = np.round(output1 * ( + self.scale_out / (self.scale_in * self.scale_weights[0]))) + if self.fuse_relu: + output = np.maximum(output_tmp, 0).astype(self.dsttype) + else: + output = output_tmp.astype(self.dsttype) else: - output = np.round(output1 * (self.scale_out / ( - self.scale_in * - self.scale_weights[0]))).astype(self.dsttype) + if self.fuse_relu: + output = np.maximum(output_tmp2, 0).astype(self.dsttype) + else: + output = output_tmp2.astype(self.dsttype) self.inputs = { 'Input': OpTest.np_dtype_to_fluid_dtype(input.astype(self.srctype)), 'Filter': OpTest.np_dtype_to_fluid_dtype(filter) } + if self.fuse_residual: + self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype( + input_residual) + self.attrs = { 'strides': self.stride, 'paddings': self.pad, @@ -119,7 +156,9 @@ class TestConv2dInt8Op(TestConv2dOp): 'Scale_in': self.scale_in, 'Scale_out': self.scale_out, 'Scale_weights': self.scale_weights, - 'fuse_relu': self.fuse_relu + 'Scale_in_eltwise': self.scale_in_eltwise, + 'fuse_relu': self.fuse_relu, + 'fuse_residual_connection': self.fuse_residual } self.outputs = {'Output': output} @@ -137,11 +176,14 @@ class TestConv2dInt8Op(TestConv2dOp): def init_test_case(self): TestConv2dOp.init_test_case(self) + self.input_size = [1, 1, 5, 5] # NCHW f_c = self.input_size[1] // self.groups - self.filter_size = [1, f_c, 3, 3] + self.input_residual_size = [1, 2, 3, 3] + self.filter_size = [2, f_c, 3, 3] self.scale_in = 1.0 self.scale_out = 0.5 self.scale_weights = [10.0] + self.scale_in_eltwise = 0.6 def init_data_type(self): self.srctype = np.uint8 @@ -150,8 +192,11 @@ class TestConv2dInt8Op(TestConv2dOp): def init_fuse_relu(self): self.fuse_relu = True + def init_fuse_residual(self): + self.fuse_residual = True + -#--------------------test conv2d u8 in and u8 out-------------------- +#--------------------test conv2d u8 in and u8 out with residual fuse-------------------- class TestConv2d(TestConv2dInt8Op): @@ -159,18 +204,21 @@ class TestConv2d(TestConv2dInt8Op): self.pad = [0, 0] self.stride = [1, 1] self.input_size = [2, 3, 5, 5] # NCHW + self.input_residual_size = [2, 6, 3, 3] assert np.mod(self.input_size[1], self.groups) == 0 f_c = self.input_size[1] // self.groups self.filter_size = [6, f_c, 3, 3] self.scale_in = 1.0 self.scale_out = 0.5 self.scale_weights = [10.0] + self.scale_in_eltwise = 0.6 class TestWithPad(TestConv2d): def init_test_case(self): TestConv2d.init_test_case(self) self.pad = [1, 1] + self.input_residual_size = 
class TestWithGroup(TestConv2d): @@ -183,12 +231,14 @@ class TestWithStride(TestConv2dInt8Op): self.pad = [1, 1] self.stride = [2, 2] self.input_size = [2, 3, 6, 6] + self.input_residual_size = [2, 6, 3, 3] assert np.mod(self.input_size[1], self.groups) == 0 f_c = self.input_size[1] // self.groups self.filter_size = [6, f_c, 3, 3] self.scale_in = 1.0 self.scale_out = 0.8 self.scale_weights = [10.0] + self.scale_in_eltwise = 0.5 class TestWith1x1(TestConv2dInt8Op): @@ -196,12 +246,14 @@ class TestWith1x1(TestConv2dInt8Op): self.pad = [0, 0] self.stride = [1, 1] self.input_size = [1, 3, 5, 5] + self.input_residual_size = [1, 6, 5, 5] assert np.mod(self.input_size[1], self.groups) == 0 f_c = self.input_size[1] // self.groups self.filter_size = [6, f_c, 1, 1] self.scale_in = 1.0 self.scale_out = 0.5 self.scale_weights = [12.0] + self.scale_in_eltwise = 0.5 class TestWithInput1x1Filter1x1(TestConv2dInt8Op): @@ -209,24 +261,29 @@ class TestWithInput1x1Filter1x1(TestConv2dInt8Op): self.pad = [0, 0] self.stride = [1, 1] self.input_size = [2, 3, 1, 1] + self.input_residual_size = [2, 6, 1, 1] assert np.mod(self.input_size[1], self.groups) == 0 f_c = self.input_size[1] // self.groups self.filter_size = [6, f_c, 1, 1] self.scale_in = 1.0 self.scale_out = 0.5 self.scale_weights = [10.0] + self.scale_in_eltwise = 0.8 def init_group(self): self.groups = 3 -def init_data_type_with_fusion(self, input_dt, fuse_relu): +def init_data_type_with_fusion(self, input_dt, fuse_relu, fuse_residual): self.srctype = input_dt self.dsttype = np.uint8 if fuse_relu else np.int8 def init_fuse_relu(self): self.fuse_relu = fuse_relu + def init_fuse_residual(self): + self.fuse_residual = fuse_residual + def create_test_int8_class(parent): @@ -234,29 +291,68 @@ class TestS8U8Case(parent): def init_data_type(self): - init_data_type_with_fusion(self, np.int8, True) + init_data_type_with_fusion(self, np.int8, True, False) #--------------------test conv2d s8 in and s8 out-------------------- class TestS8S8Case(parent): def init_data_type(self): - init_data_type_with_fusion(self, np.int8, False) + init_data_type_with_fusion(self, np.int8, False, False) #--------------------test conv2d u8 in and s8 out-------------------- class TestU8S8Case(parent): def init_data_type(self): - init_data_type_with_fusion(self, np.uint8, False) + init_data_type_with_fusion(self, np.uint8, False, False) + + #--------------------test conv2d u8 in and u8 out without residual fuse-------------------- + + class TestU8U8Case(parent): + def init_data_type(self): + init_data_type_with_fusion(self, np.uint8, True, False) - cls_name_s8u8 = "{0}_relu_{1}".format(parent.__name__, "1") - cls_name_s8s8 = "{0}_relu_{1}".format(parent.__name__, "0") - cls_name_u8s8 = "{0}_relu_{1}".format(parent.__name__, "0") + #--------------------test conv2d s8 in and u8 out with residual fuse-------------------- + + class TestS8U8ResCase(parent): + def init_data_type(self): + init_data_type_with_fusion(self, np.int8, True, True) + + #--------------------test conv2d s8 in and s8 out with residual fuse-------------------- + + class TestS8S8ResCase(parent): + def init_data_type(self): + init_data_type_with_fusion(self, np.int8, False, True) + + #--------------------test conv2d u8 in and s8 out with residual fuse-------------------- + + class TestU8S8ResCase(parent): + def init_data_type(self): + init_data_type_with_fusion(self, np.uint8, False, True) +
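+ # Give each generated class a unique name: duplicate keys in globals() would silently replace an earlier test class.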
+ cls_name_s8u8 = "{0}_s8u8_relu_1_residual_0".format(parent.__name__) + cls_name_s8s8 = "{0}_s8s8_relu_0_residual_0".format(parent.__name__) + cls_name_u8s8 = "{0}_u8s8_relu_0_residual_0".format(parent.__name__) + cls_name_u8u8 = "{0}_u8u8_relu_1_residual_0".format(parent.__name__) + cls_name_s8u8_re_1 = "{0}_s8u8_relu_1_residual_1".format(parent.__name__) + cls_name_s8s8_re_1 = "{0}_s8s8_relu_0_residual_1".format(parent.__name__) + cls_name_u8s8_re_1 = "{0}_u8s8_relu_0_residual_1".format(parent.__name__) TestS8U8Case.__name__ = cls_name_s8u8 TestS8S8Case.__name__ = cls_name_s8s8 TestU8S8Case.__name__ = cls_name_u8s8 + TestU8U8Case.__name__ = cls_name_u8u8 + TestS8U8ResCase.__name__ = cls_name_s8u8_re_1 + TestS8S8ResCase.__name__ = cls_name_s8s8_re_1 + TestU8S8ResCase.__name__ = cls_name_u8s8_re_1 globals()[cls_name_s8u8] = TestS8U8Case globals()[cls_name_s8s8] = TestS8S8Case globals()[cls_name_u8s8] = TestU8S8Case + globals()[cls_name_u8u8] = TestU8U8Case + globals()[cls_name_s8u8_re_1] = TestS8U8ResCase + globals()[cls_name_s8s8_re_1] = TestS8S8ResCase + globals()[cls_name_u8s8_re_1] = TestU8S8ResCase create_test_int8_class(TestConv2dInt8Op) From 064512aa47b9ea35c0b5479b32c1653512c8b7c4 Mon Sep 17 00:00:00 2001 From: chengduo Date: Thu, 10 Jan 2019 18:41:40 -0600 Subject: [PATCH 197/197] Remove workspace_handle in conv_cudnn (#15186) * remove workspace_handle in conv2d_cudnn test=develop * remove workspace_handle test=develop * fix bug test=develop * make test_conv2d_op SERIAL test=develop * save memory in conv_cudnn test=develop * enhance thread safety test=develop * enhance temporary allocator test=develop * Add excess fraction test=develop * follow comments test=develop * fix bug and code refine test=develop * fix memory size check test=develop * rename reuse_tmp_allocation_excess_fraction test=develop --- paddle/fluid/framework/operator.h | 2 +- paddle/fluid/operators/conv_cudnn_op.cu.cc | 149 ++++++++++-------- paddle/fluid/platform/device_context.cc | 28 ++-- paddle/fluid/platform/device_context.h | 2 +- paddle/fluid/platform/temporary_allocator.cc | 63 ++++++-- paddle/fluid/platform/temporary_allocator.h | 10 +- .../platform/temporary_allocator_test.cc | 58 ++++++- python/paddle/fluid/__init__.py | 3 +- 8 files changed, 208 insertions(+), 107 deletions(-) diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 4d29564aee..041187665a 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -391,7 +391,7 @@ class ExecutionContext { PADDLE_ENFORCE( dynamic_cast(allocation_ptr) != nullptr, "The AllocationPtr must be TemporaryAllocation."); - PADDLE_ENFORCE_EQ(allocation_ptr->size(), + PADDLE_ENFORCE_GE(allocation_ptr->size(), framework::product(dim) * sizeof(T)); paddle::framework::Tensor temp_tensor( diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 25a723fc07..f5208e7a60 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -137,7 +137,6 @@ class CUDNNConvOpKernel : public framework::OpKernel { // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionFwdAlgo_t algo; auto handle = dev_ctx.cudnn_handle(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); bool half_float = false; #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) @@ -158,6 +157,8 @@ class CUDNNConvOpKernel : public framework::OpKernel { VLOG(5) << "NOT use cudnn_tensor_op_math"; } #endif + Tensor cudnn_workspace; + void* 
cudnn_workspace_ptr = nullptr; auto x_dims = framework::vectorize(input->dims()); auto f_dims = framework::vectorize(filter->dims()); @@ -180,21 +181,26 @@ class CUDNNConvOpKernel : public framework::OpKernel { .Var(kCUDNNFwdAlgoCache) ->GetMutable>(); } + cudnn_workspace = + ctx.AllocateTmpTensor( + framework::make_ddim( + {static_cast(workspace_size_limit)}), + dev_ctx); + cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); + algo = algo_cache->GetAlgorithm( x_dims, f_dims, strides, paddings, dilations, 0, [&]() { int returned_algo_count; std::array fwd_perf_stat; - auto cudnn_find_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE( - platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( - handle, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, cudnn_output_desc, - output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count, - fwd_perf_stat.data(), cudnn_workspace, - workspace_size_limit)); - }; - workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit); + + CUDNN_ENFORCE( + platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( + handle, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, cudnn_output_desc, + output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count, + fwd_perf_stat.data(), cudnn_workspace_ptr, + workspace_size_limit)); VLOG(3) << "Perf result: (algo: stat, time, memory)"; for (int i = 0; i < returned_algo_count; ++i) { @@ -219,17 +225,23 @@ class CUDNNConvOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); + // Allocate on GPU memory + if (!cudnn_workspace_ptr) { + cudnn_workspace = + ctx.AllocateTmpTensor( + framework::make_ddim( + {static_cast(workspace_size_in_bytes)}), + dev_ctx); + cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); + } // ------------------- cudnn conv forward --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; for (int i = 0; i < groups; i++) { - auto cudnn_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, - cudnn_filter_desc, filter_data + i * group_offset_filter, - cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, - &beta, cudnn_output_desc, output_data + i * group_offset_out)); - }; - workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, + cudnn_filter_desc, filter_data + i * group_offset_filter, + cudnn_conv_desc, algo, cudnn_workspace_ptr, workspace_size_in_bytes, + &beta, cudnn_output_desc, output_data + i * group_offset_out)); } } }; @@ -353,10 +365,20 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { workspace_size_limit = max_user_size * 1024 * 1024; } + Tensor cudnn_workspace; + void* cudnn_workspace_ptr = nullptr; + if ((input_data || filter_data) && exhaustive_search) { + cudnn_workspace = + ctx.AllocateTmpTensor( + framework::make_ddim( + {static_cast(workspace_size_limit)}), + dev_ctx); + cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); + } + auto x_dims = framework::vectorize(input->dims()); auto f_dims = framework::vectorize(filter->dims()); auto handle = dev_ctx.cudnn_handle(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); if (exhaustive_search) { 
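Both the forward and backward kernels now follow the same two-phase workspace strategy, summarized in this compressed sketch (types and names here are illustrative stand-ins, not the real framework API):

    // Phase 1: exhaustive search must benchmark every algorithm, so it needs
    // a limit-sized buffer before any algorithm is chosen. Phase 2: the
    // heuristic path defers allocation until the chosen algorithm fixes the
    // exact workspace size.
    #include <cstddef>
    #include <vector>

    void conv_kernel(bool exhaustive_search, std::size_t workspace_size_limit,
                     std::size_t workspace_size_in_bytes) {
      std::vector<char> cudnn_workspace;  // stands in for the temporary Tensor
      void* cudnn_workspace_ptr = nullptr;
      if (exhaustive_search) {
        cudnn_workspace.resize(workspace_size_limit);
        cudnn_workspace_ptr = cudnn_workspace.data();
        // ... Find*AlgorithmEx benchmarks algorithms with this buffer ...
      }
      // ... algorithm selection fixes workspace_size_in_bytes here ...
      if (cudnn_workspace_ptr == nullptr) {
        cudnn_workspace.resize(workspace_size_in_bytes);
        cudnn_workspace_ptr = cudnn_workspace.data();
      }
      // ... every grouped cuDNN call then reuses cudnn_workspace_ptr ...
    }

    int main() { conv_kernel(true, 4096, 1024); }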
@@ -374,25 +396,22 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { ->GetMutable< AlgorithmsCache>(); } + data_algo = data_algo_cache->GetAlgorithm( x_dims, f_dims, strides, paddings, dilations, 0, [&]() { int returned_algo_count; std::array data_perf_stat; - auto cudnn_find_bd_data_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE( - platform::dynload:: - cudnnFindConvolutionBackwardDataAlgorithmEx( - handle, cudnn_filter_desc, filter_data, - cudnn_output_grad_desc, output_grad_data, - cudnn_conv_desc, cudnn_input_desc, input_grad_data, - kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count, - data_perf_stat.data(), cudnn_workspace, - workspace_size_limit)); - }; - workspace_handle.RunFunc(cudnn_find_bd_data_func, - workspace_size_limit); + + CUDNN_ENFORCE(platform::dynload:: + cudnnFindConvolutionBackwardDataAlgorithmEx( + handle, cudnn_filter_desc, filter_data, + cudnn_output_grad_desc, output_grad_data, + cudnn_conv_desc, cudnn_input_desc, + input_grad_data, kNUM_CUDNN_BWD_DATA_ALGS, + &returned_algo_count, data_perf_stat.data(), + cudnn_workspace_ptr, workspace_size_limit)); VLOG(3) << "Perf result: (algo: stat, time, memory)"; for (int i = 0; i < returned_algo_count; ++i) { @@ -443,25 +462,23 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { ->GetMutable< AlgorithmsCache>(); } + filter_algo = f_algo_cache->GetAlgorithm( x_dims, f_dims, strides, paddings, dilations, 0, [&]() { int returned_algo_count; std::array filter_perf_stat; - auto cudnn_find_bd_f_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE( - platform::dynload:: - cudnnFindConvolutionBackwardFilterAlgorithmEx( - handle, cudnn_input_desc, input_data, - cudnn_output_grad_desc, output_grad_data, - cudnn_conv_desc, cudnn_filter_desc, - filter_grad_data, kNUM_CUDNN_BWD_FILTER_ALGS, - &returned_algo_count, filter_perf_stat.data(), - cudnn_workspace, workspace_size_limit)); - }; - workspace_handle.RunFunc(cudnn_find_bd_f_func, - workspace_size_limit); + + CUDNN_ENFORCE( + platform::dynload:: + cudnnFindConvolutionBackwardFilterAlgorithmEx( + handle, cudnn_input_desc, input_data, + cudnn_output_grad_desc, output_grad_data, + cudnn_conv_desc, cudnn_filter_desc, filter_grad_data, + kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, + filter_perf_stat.data(), cudnn_workspace_ptr, + workspace_size_limit)); return filter_perf_stat[0].algo; }); VLOG(3) << "cuDNN backward filter algo " << filter_algo; @@ -482,6 +499,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); } + // ------------------- cudnn conv workspace --------------------- + if (!cudnn_workspace_ptr) { + cudnn_workspace = + ctx.AllocateTmpTensor( + framework::make_ddim( + {static_cast(workspace_size_in_bytes)}), + dev_ctx); + cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); + } + // ------------------- cudnn conv backward data --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; if (input_grad) { @@ -489,15 +516,12 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset input_grad. 
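// The grouped backward-data and backward-filter calls below all reuse the single cudnn_workspace_ptr allocated above; workspace_size_in_bytes was taken as the max over both chosen algorithms, so one buffer serves every call.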
for (int i = 0; i < groups; i++) { - auto cudnn_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, - filter_data + i * group_offset_filter, cudnn_output_grad_desc, - output_grad_data + i * group_offset_out, cudnn_conv_desc, - data_algo, cudnn_workspace, workspace_size_in_bytes, &beta, - cudnn_input_desc, input_grad_data + i * group_offset_in)); - }; - workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, + filter_data + i * group_offset_filter, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, + cudnn_workspace_ptr, workspace_size_in_bytes, &beta, + cudnn_input_desc, input_grad_data + i * group_offset_in)); } } // ------------------- cudnn conv backward filter --------------------- @@ -505,15 +529,12 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset filter_grad. for (int i = 0; i < groups; i++) { - auto cudnn_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_input_desc, - input_data + i * group_offset_in, cudnn_output_grad_desc, - output_grad_data + i * group_offset_out, cudnn_conv_desc, - filter_algo, cudnn_workspace, workspace_size_in_bytes, &beta, - cudnn_filter_desc, filter_grad_data + i * group_offset_filter)); - }; - workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, + cudnn_output_grad_desc, output_grad_data + i * group_offset_out, + cudnn_conv_desc, filter_algo, cudnn_workspace_ptr, + workspace_size_in_bytes, &beta, cudnn_filter_desc, + filter_grad_data + i * group_offset_filter)); } } } diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 09f3d3de54..8f80a2d782 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -92,26 +92,24 @@ platform::TemporaryAllocator& DeviceTemporaryAllocator::Get( const platform::Place& place, const cudaStream_t& stream) { PADDLE_ENFORCE(platform::is_gpu_place(place)); auto place_stream = std::make_pair(place, stream); - { - std::unique_lock lock(mtx_); - if (!device_allocator_.count(place_stream)) { - device_allocator_[place_stream].reset(new TemporaryAllocator(place)); - device_allocator_[place_stream]->SetCallback([stream]() { - PADDLE_ENFORCE(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE(cudaGetLastError()); - }); - } + std::unique_lock lock(mtx_); + auto it = device_allocator_.find(place_stream); + if (it == device_allocator_.end()) { + auto tmp_allocator = new TemporaryAllocator(place); + tmp_allocator->SetCallback([stream]() { + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE(cudaGetLastError()); + }); + device_allocator_[place_stream].reset(tmp_allocator); + return *tmp_allocator; + } else { + return *it->second; } - return *device_allocator_.at(place_stream); } template <> platform::TemporaryAllocator& DeviceTemporaryAllocator::Get( const platform::CUDADeviceContext& dev_ctx) { - auto place_stream = std::make_pair(dev_ctx.GetPlace(), dev_ctx.stream()); - if (device_allocator_.count(place_stream)) { - return *device_allocator_.at(place_stream); - 
} return Get(dev_ctx.GetPlace(), dev_ctx.stream()); } #endif @@ -325,7 +323,7 @@ Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { auto& allocator = DeviceTemporaryAllocator::Instance().Get(*this); - allocator.Release([=]() { + allocator.Release([this]() { PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaGetLastError()); }); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index c81d17380c..d376f90ad5 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -61,7 +61,7 @@ namespace platform { * the allocations of temp_allocation_queue: * - when the Stream calls cudaStreamSynchronize; * - when the allocation size of opportunities exceeds a certain threshold - * (defined by FLAGS_limit_of_temporary_allocation). + * (defined by FLAGS_limit_of_tmp_allocation). * * */ class DeviceTemporaryAllocator { diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc index 0be017f75b..9cbdfe46e7 100644 --- a/paddle/fluid/platform/temporary_allocator.cc +++ b/paddle/fluid/platform/temporary_allocator.cc @@ -15,8 +15,15 @@ #include "paddle/fluid/platform/temporary_allocator.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" -DEFINE_double(limit_of_temporary_allocation, -1, - "The up limit of temporary_allocation size."); +DEFINE_int64(limit_of_tmp_allocation, -1, + "The upper limit of temporary allocation size, in bytes."); +DEFINE_double(times_excess_than_required_tmp_allocation, 2, + "times_excess_than_required_tmp_allocation indicates the " + "max size the TemporaryAllocator can return. For example, " + "if the required memory size is N, and " + "times_excess_than_required_tmp_allocation is 2.0, " + "the TemporaryAllocator may return an available allocation " + "whose size lies in the range N ~ 2*N."); namespace paddle { namespace platform { @@ -29,24 +36,25 @@ TemporaryAllocation::TemporaryAllocation( underlying_allocation_(std::move(underlying_allocation)) {} TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) { - temp_mem_queue_.reset(new std::deque()); + temp_mem_map_.reset(new std::multimap()); } bool TemporaryAllocator::IsAllocThreadSafe() const { return true; } void TemporaryAllocator::Release(const std::function &callback) { - std::shared_ptr> t_allocations; + std::unique_ptr> t_allocations; { std::unique_lock lock(mtx_); callback(); - t_allocations = temp_mem_queue_; - temp_mem_queue_.reset(new std::deque()); + t_allocations.swap(temp_mem_map_); + temp_mem_map_.reset(new std::multimap()); wait_delete_mem_ = 0; } + for (auto tmp : *t_allocations) { - VLOG(10) << "Delete temporary allocation " << tmp->ptr() - << " size: " << tmp->size(); - delete tmp; + VLOG(10) << "Delete temporary allocation " << tmp.second->ptr() + << " size: " << tmp.second->size(); + delete tmp.second; } } @@ -54,28 +62,34 @@ void TemporaryAllocator::Free(alloc::Allocation *allocation) { auto *temp_allocation = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL(temp_allocation); if (platform::is_gpu_place(temp_allocation->place())) { + PADDLE_ENFORCE(platform::is_same_place(temp_allocation->place(), place_), + "The place should be the same."); size_t wait_delete_mem = 0; { std::unique_lock lock(mtx_); - temp_mem_queue_->emplace_back(temp_allocation); + temp_mem_map_->emplace(temp_allocation->size(), temp_allocation); wait_delete_mem_ += temp_allocation->size(); wait_delete_mem = wait_delete_mem_; 
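+ // wait_delete_mem is a snapshot taken under the lock; the FLAGS_limit_of_tmp_allocation check below runs after the critical section against this local copy.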
VLOG(10) << "Move temporary allocation: " << temp_allocation->ptr() << " to delete queue: " << temp_allocation->size() << "; " - << "wait_delete_mem: " << wait_delete_mem_; + << "wait_delete_mem: " << wait_delete_mem; } - if (FLAGS_limit_of_temporary_allocation > 0 && - wait_delete_mem > FLAGS_limit_of_temporary_allocation) { + + if (FLAGS_limit_of_tmp_allocation > 0 && + wait_delete_mem > static_cast(FLAGS_limit_of_tmp_allocation)) { + PADDLE_ENFORCE(callback_ != nullptr, "The callback is non-initialized."); Release(callback_); } return; } + VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr() + << " size: " << temp_allocation->size(); delete temp_allocation; } size_t TemporaryAllocator::TemporaryAllocationQueueSize() { std::unique_lock lock(mtx_); - return temp_mem_queue_ ? temp_mem_queue_->size() : 0; + return temp_mem_map_ ? temp_mem_map_->size() : 0; } void TemporaryAllocator::SetCallback(const std::function &callback) { @@ -84,6 +98,27 @@ void TemporaryAllocator::SetCallback(const std::function &callback) { alloc::Allocation *TemporaryAllocator::AllocateImpl( size_t size, alloc::Allocator::Attr attr) { + { + // Find available allocation in temp_mem_map. + std::unique_lock lock(mtx_); + if (temp_mem_map_->size()) { + auto it = temp_mem_map_->lower_bound(size); + // FIXME(zcd): Not sure the best value of excess fraction. + if (it != temp_mem_map_->end() && + it->first < + static_cast( + size * FLAGS_times_excess_than_required_tmp_allocation)) { + auto tmp_ptr = it->second; + temp_mem_map_->erase(it); + wait_delete_mem_ -= tmp_ptr->size(); + VLOG(10) << "Reuse temporary allocation: " << tmp_ptr->ptr() << ": " + << tmp_ptr->size(); + return tmp_ptr; + } + } + } + // If not find the the available allocation, get allocation from + // AllocatorFacadeInstance. auto raw_allocation = alloc::AllocatorFacade::Instance().Alloc(place_, size, attr); auto temp_mem = new TemporaryAllocation(std::move(raw_allocation)); diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h index 812c4a3331..d657a14223 100644 --- a/paddle/fluid/platform/temporary_allocator.h +++ b/paddle/fluid/platform/temporary_allocator.h @@ -15,6 +15,7 @@ #pragma once #include // NOLINT #include +#include #include // NOLINT #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/lock_guard_ptr.h" @@ -39,7 +40,7 @@ class TemporaryAllocation : public memory::allocation::Allocation { * * There is one opportunity to free the allocations of temp_allocation_queue: * - when the allocation size of opportunities exceeds a certain threshold - * (defined by FLAGS_limit_of_temporary_allocation). + * (defined by FLAGS_limit_of_tmp_allocation). * * */ class TemporaryAllocator : public memory::allocation::Allocator { @@ -62,11 +63,10 @@ class TemporaryAllocator : public memory::allocation::Allocator { private: platform::Place place_; - // When the allocation is not held by any variable, it should be placed - // to temp_mem_queue immediately. - std::shared_ptr> temp_mem_queue_{nullptr}; - + // to temp_mem_map immediately. 
+ std::unique_ptr> temp_mem_map_{ + nullptr}; std::mutex mtx_; size_t wait_delete_mem_{0}; std::function callback_; diff --git a/paddle/fluid/platform/temporary_allocator_test.cc b/paddle/fluid/platform/temporary_allocator_test.cc index 35d1d92981..3879cd5400 100644 --- a/paddle/fluid/platform/temporary_allocator_test.cc +++ b/paddle/fluid/platform/temporary_allocator_test.cc @@ -18,7 +18,8 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor_util.h" -DECLARE_double(limit_of_temporary_allocation); +DECLARE_int64(limit_of_tmp_allocation); +DECLARE_double(times_excess_than_required_tmp_allocation); namespace paddle { namespace platform { @@ -35,7 +36,7 @@ class DummyOp : public framework::OperatorBase { const platform::Place& place) const override {} }; -TEST(temporary_allocator, temporary_allocator) { +TEST(temporary_allocator, test_base_function) { platform::CPUPlace cpu_place; TemporaryAllocator alloc(cpu_place); alloc.Allocate(100); @@ -59,10 +60,10 @@ TEST(temporary_allocator, temporary_allocator) { #endif } -TEST(temporary_allocator, add_callback) { +TEST(temporary_allocator, test_flags_function) { #ifdef PADDLE_WITH_CUDA - const double limit = FLAGS_limit_of_temporary_allocation; - FLAGS_limit_of_temporary_allocation = 10; + const int64_t limit = FLAGS_limit_of_tmp_allocation; + FLAGS_limit_of_tmp_allocation = 10; platform::CUDAPlace gpu_place(0); TemporaryAllocator gpu_alloc(gpu_place); @@ -78,7 +79,52 @@ TEST(temporary_allocator, add_callback) { }); { gpu_alloc.Allocate(100); } PADDLE_ENFORCE(deleted); - FLAGS_limit_of_temporary_allocation = limit; + FLAGS_limit_of_tmp_allocation = limit; +#endif +} + +TEST(temporary_allocator, test_reuse_tmp_allocation) { +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu_place(0); + TemporaryAllocator gpu_alloc(gpu_place); + gpu_alloc.SetCallback([]() {}); + + void* tmp_allocation_ptr1 = nullptr; + { + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); + auto tmp_allocation1 = gpu_alloc.Allocate(100); + tmp_allocation_ptr1 = tmp_allocation1->ptr(); + } + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); + auto tmp_allocation2 = gpu_alloc.Allocate(100); + void* tmp_allocation_ptr2 = tmp_allocation2->ptr(); + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); + PADDLE_ENFORCE_EQ(tmp_allocation_ptr1, tmp_allocation_ptr2); + + auto tmp_allocation3 = gpu_alloc.Allocate(100); + void* tmp_allocation_ptr3 = tmp_allocation2->ptr(); + PADDLE_ENFORCE_EQ(tmp_allocation_ptr1, tmp_allocation_ptr3); +#endif +} + +TEST(temporary_allocator, test_times_excess_than_required_tmp_allocation) { +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu_place(0); + TemporaryAllocator gpu_alloc(gpu_place); + gpu_alloc.SetCallback([]() {}); + double excess_fraction = FLAGS_times_excess_than_required_tmp_allocation; + void* tmp_allocation_ptr1 = nullptr; + { + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); + auto tmp_allocation1 = + gpu_alloc.Allocate(static_cast(100 * excess_fraction - 1)); + tmp_allocation_ptr1 = tmp_allocation1->ptr(); + } + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); + auto tmp_allocation2 = gpu_alloc.Allocate(100); + void* tmp_allocation_ptr2 = tmp_allocation2->ptr(); + PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); + PADDLE_ENFORCE_EQ(tmp_allocation_ptr1, tmp_allocation_ptr2); #endif } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 2c17716500..686550a3c8 100644 --- 
a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -155,7 +155,8 @@ def __bootstrap__(): 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', - 'sync_nccl_allreduce' + 'sync_nccl_allreduce', 'limit_of_tmp_allocation', + 'times_excess_than_required_tmp_allocation' ] core.init_gflags([sys.argv[0]] +